# Gradient Boosting

## Libraries

In [14]:
#import warnings
#warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

from sklego.preprocessing import RepeatingBasisFunction

from scipy.sparse import coo_matrix

import optuna
import xgboost as xgb

## Parameters

In [15]:
RANDOM_STATE_SEED = 0
N_FOLDS = 5

## Methods

In [16]:
# this function takes the military time format and calculates the total hours passed. 
#  Military time seems a linear continuous variable, but they are not in 10-based regular numbers.
#  calculating the total hours. This is needed for our transformation approach. 
def military_to_hours(military_time):
    hour = military_time // 100
    minute = military_time % 100
    total_hours = round((hour * 60 + minute) / 60, 2)
    return total_hours

In [17]:
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


In [18]:
def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

## Data

In [19]:
# for Jupyter-book, we copy data from GitHub, locally, to save Internet traffic,
# you can specify the data/ folder from the root of your cloned
# https://github.com/Yorko/mlcourse.ai repo, to save Internet traffic
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

In [20]:
train = pd.read_csv(DATA_PATH + "flight_delays_train.csv")
test = pd.read_csv(DATA_PATH + "flight_delays_test.csv")

In [21]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [22]:
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


## Optuna Method

In [23]:
def objective(trial):

    param = {
    'eval_metric': ['auc', 'aucpr'],
    'device': 'cuda',
    'objective': 'binary:logistic',
    "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
    "lambda": trial.suggest_float("lambda", 1e-3, 1.0, log=True),
    "alpha": trial.suggest_float("alpha", 1e-3, 1.0, log=True),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 10)
        param["eta"] = trial.suggest_float("eta", 0.001, 1.0)
        param["gamma"] = trial.suggest_float("gamma", 0.001, 1.0)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
        
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-4, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-4, 1.0, log=True)

    # Add a callback for pruning.
    #pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")

    xgb_cv_results = xgb.cv(
        params=param,
        dtrain=dtrain,
        nfold=N_FOLDS,
        stratified=True,
        verbose_eval=False,
        seed=RANDOM_STATE_SEED
        #callbacks=[pruning_callback]
    )

     # Extract the best score.
    best_score = xgb_cv_results["test-auc-mean"].values[-1]
    return best_score

## First Benchmark

In [24]:
X_train = train[["Distance", "DepTime"]]
X_test = test[["Distance", "DepTime"]]
y_train = train["dep_delayed_15min"].map({"Y": 1, "N": 0})

dtrain = xgb.DMatrix(X_train, label=y_train)

In [25]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2024-07-17 10:34:33,080] A new study created in memory with name: no-name-23d45e98-8c99-4b84-940b-43769a08c134
[I 2024-07-17 10:34:33,954] Trial 0 finished with value: 0.6973826737557418 and parameters: {'booster': 'dart', 'lambda': 0.04244377469427122, 'alpha': 0.07673951144298143, 'max_depth': 8, 'eta': 0.5245335441204217, 'gamma': 0.033067294787675845, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.0007465904165394481, 'skip_drop': 0.00046920083736380104}. Best is trial 0 with value: 0.6973826737557418.
[I 2024-07-17 10:34:34,171] Trial 1 finished with value: 0.69917596649899 and parameters: {'booster': 'gbtree', 'lambda': 0.003938131593297287, 'alpha': 0.005416439057373545, 'max_depth': 4, 'eta': 0.7538281757423603, 'gamma': 0.15546934989166664, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.69917596649899.
[I 2024-07-17 10:34:34,558] Trial 2 finished with value: 0.6974023796129838 and parameters: {'booster': 'dart', 'l

## Second Benchmark

In [26]:
train_ = train.copy()
train_['Month'] = train_['Month'].apply(lambda x: x.split('-')[1])
train_['DayofMonth'] = train_['DayofMonth'].apply(lambda x: x.split('-')[1])
train_['DayOfWeek'] = train_['DayOfWeek'].apply(lambda x: x.split('-')[1])
train_['Flight'] = train_['Origin'] + '-' + train_['Dest']
train_['Hour'] = military_to_hours(train['DepTime'])

In [27]:
obj_features = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Flight']
numeric_features = ['Hour', 'Distance']

In [28]:
# Get one hot encoding of object columns
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

for i, feature in enumerate(obj_features):
    if i == 0:
        x_lb = lb.fit_transform(train_[feature])
    x_lb = np.hstack((x_lb, lb.fit_transform(train_[feature])))


In [29]:
bins = [0, 6, 11, 18, 24]
labels = ['Night', 'Morning','Day','Evening']

time = train_['Hour']
time = pd.cut(time, bins=bins, labels=labels, include_lowest=False, ordered=False).astype('str')

x_lb = np.hstack((x_lb, lb.fit_transform(time)))

In [30]:
X_train = np.hstack((train_[numeric_features].values, x_lb))

In [31]:
X_sparse = coo_matrix(X_train)

In [32]:
dtrain = xgb.DMatrix(X_sparse, label=y_train)

In [33]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2024-07-17 10:35:45,575] A new study created in memory with name: no-name-e8cbf85c-205f-44ef-b4b6-b4383fecdbcd
[I 2024-07-17 10:35:45,980] Trial 0 finished with value: 0.7127592217593622 and parameters: {'booster': 'gbtree', 'lambda': 0.23780106761581138, 'alpha': 0.9860357694640004, 'max_depth': 4, 'eta': 0.871477064628787, 'gamma': 0.7120331226029998, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.7127592217593622.
[I 2024-07-17 10:35:46,302] Trial 1 finished with value: 0.713511554051664 and parameters: {'booster': 'gbtree', 'lambda': 0.008858922894078993, 'alpha': 0.002650838224838845, 'max_depth': 4, 'eta': 0.6784688270787601, 'gamma': 0.8013513519078058, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.713511554051664.
[I 2024-07-17 10:35:47,387] Trial 2 finished with value: 0.7155574328205738 and parameters: {'booster': 'gbtree', 'lambda': 0.47976680233331837, 'alpha': 0.02533045648650242, 'max_depth': 8, 'eta': 0.5699172867766469, 'gamma': 0.97535839608

In [34]:
study.best_value

0.7205793314658188

## Third Benchmark

In [35]:
time_features = ['Month', 'DayofMonth', 'DayOfWeek', 'Hour']
obj_features = ['UniqueCarrier', 'Flight']
fourier_features = ['sin_Month', 'cos_Month', 'sin_DayofMonth', 'cos_DayofMonth', 'sin_DayOfWeek', 'cos_DayOfWeek']

In [36]:
X_train = pd.DataFrame([])

#X_train['sin_Month'] = cos_transformer(12).fit_transform(train_['Month'].astype('int'))
X_train['cos_Month'] = sin_transformer(12).fit_transform(train_['Month'].astype('int'))

#X_train['sin_DayofMonth'] = cos_transformer(30).fit_transform(train_['DayofMonth'].astype('int'))
X_train['cos_DayofMonth'] = sin_transformer(30).fit_transform(train_['DayofMonth'].astype('int'))

X_train['sin_DayOfWeek'] = cos_transformer(30).fit_transform(train_['DayOfWeek'].astype('int'))
#X_train['cos_DayOfWeek'] = sin_transformer(30).fit_transform(train_['DayOfWeek'].astype('int'))

X_train['Hour'] = train_['Hour']

X_train.head()

Unnamed: 0,cos_Month,cos_DayofMonth,sin_DayOfWeek,Hour
0,-0.866025,-0.951057,0.104528,19.57
1,0.866025,-0.866025,0.809017,15.8
2,-1.0,0.406737,0.5,14.37
3,-0.5,-0.866025,0.309017,10.25
4,-0.866025,0.994522,0.309017,18.47


In [37]:
rbf = RepeatingBasisFunction(
    n_periods=24,
    column='Hour',
    input_range=(0, 24),
    remainder='drop'
)

X_rbf = rbf.fit_transform(X_train)
X_rbf = pd.DataFrame(X_rbf)
X_rbf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,2.999261e-09,1.566284e-13,1.106976e-18,1.058806e-24,1.370585e-31,2.40108e-39,5.692699e-48,1.8265889999999999e-57,7.298147e-59,3.009487e-49,...,3.357818e-14,8.507524e-10,2.91716e-06,0.001353719,0.08501734,0.7225996,0.8311874,0.1293931,0.002726054,7.772645e-06
1,6.281148e-30,1.743071e-37,6.546393e-46,3.327363e-55,3.379375e-61,2.2074539999999998e-51,1.951452e-42,2.334723e-34,3.780278e-27,8.283677e-21,...,0.0391639,0.5272924,0.9607894,0.2369278,0.007907054,3.571285e-05,2.182958e-08,1.805831e-12,2.0217160000000002e-17,3.0631910000000006e-23
2,5.30733e-41,8.434703999999999e-50,1.814156e-59,7.173906e-57,1.982977e-47,7.418068000000001e-39,3.7555620000000004e-31,2.573179e-24,2.386032e-18,2.994287e-13,...,0.8720574,0.6724013,0.0701654,0.000990897,1.893848e-06,4.898607e-10,1.714791e-14,8.123830999999999e-20,5.208607e-26,4.51953e-33
3,2.354702e-46,6.929124999999999e-38,2.759509e-30,1.4872920000000003e-23,1.084855e-17,1.070923e-12,1.430724e-08,2.58681e-05,0.006329715,0.2096114,...,7.811489e-07,1.589391e-10,4.376619e-15,1.631014e-20,8.225981e-27,5.614728e-34,5.186577e-42,6.484014e-51,1.0970290000000001e-60,1.082941e-55
4,5.234605e-14,3.028952e-19,2.371985e-25,2.5138690000000003e-32,3.605656e-40,6.999016e-49,1.838657e-58,7.308481e-58,2.467448e-48,1.127405e-39,...,2.1009e-09,5.897984e-06,0.00224085,0.1152214,0.8017969,0.7551038,0.09624098,0.001660063,3.875251e-06,1.224296e-09


In [38]:
X_train = pd.concat([X_train, X_rbf], axis=1)
X_train = X_train.drop(columns='Hour')
X_train.head()

Unnamed: 0,cos_Month,cos_DayofMonth,sin_DayOfWeek,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,-0.866025,-0.951057,0.104528,2.999261e-09,1.566284e-13,1.106976e-18,1.058806e-24,1.370585e-31,2.40108e-39,5.692699e-48,...,3.357818e-14,8.507524e-10,2.91716e-06,0.001353719,0.08501734,0.7225996,0.8311874,0.1293931,0.002726054,7.772645e-06
1,0.866025,-0.866025,0.809017,6.281148e-30,1.743071e-37,6.546393e-46,3.327363e-55,3.379375e-61,2.2074539999999998e-51,1.951452e-42,...,0.0391639,0.5272924,0.9607894,0.2369278,0.007907054,3.571285e-05,2.182958e-08,1.805831e-12,2.0217160000000002e-17,3.0631910000000006e-23
2,-1.0,0.406737,0.5,5.30733e-41,8.434703999999999e-50,1.814156e-59,7.173906e-57,1.982977e-47,7.418068000000001e-39,3.7555620000000004e-31,...,0.8720574,0.6724013,0.0701654,0.000990897,1.893848e-06,4.898607e-10,1.714791e-14,8.123830999999999e-20,5.208607e-26,4.51953e-33
3,-0.5,-0.866025,0.309017,2.354702e-46,6.929124999999999e-38,2.759509e-30,1.4872920000000003e-23,1.084855e-17,1.070923e-12,1.430724e-08,...,7.811489e-07,1.589391e-10,4.376619e-15,1.631014e-20,8.225981e-27,5.614728e-34,5.186577e-42,6.484014e-51,1.0970290000000001e-60,1.082941e-55
4,-0.866025,0.994522,0.309017,5.234605e-14,3.028952e-19,2.371985e-25,2.5138690000000003e-32,3.605656e-40,6.999016e-49,1.838657e-58,...,2.1009e-09,5.897984e-06,0.00224085,0.1152214,0.8017969,0.7551038,0.09624098,0.001660063,3.875251e-06,1.224296e-09


In [39]:
X_train = pd.concat([X_train, pd.get_dummies(train_[obj_features], drop_first=True, dtype='int')], axis=1)
X_train.head()

Unnamed: 0,cos_Month,cos_DayofMonth,sin_DayOfWeek,0,1,2,3,4,5,6,...,Flight_XNA-IAH,Flight_XNA-LAX,Flight_XNA-LGA,Flight_XNA-ORD,Flight_XNA-SLC,Flight_YAK-CDV,Flight_YAK-JNU,Flight_YUM-IPL,Flight_YUM-LAX,Flight_YUM-PHX
0,-0.866025,-0.951057,0.104528,2.999261e-09,1.566284e-13,1.106976e-18,1.058806e-24,1.370585e-31,2.40108e-39,5.692699e-48,...,0,0,0,0,0,0,0,0,0,0
1,0.866025,-0.866025,0.809017,6.281148e-30,1.743071e-37,6.546393e-46,3.327363e-55,3.379375e-61,2.2074539999999998e-51,1.951452e-42,...,0,0,0,0,0,0,0,0,0,0
2,-1.0,0.406737,0.5,5.30733e-41,8.434703999999999e-50,1.814156e-59,7.173906e-57,1.982977e-47,7.418068000000001e-39,3.7555620000000004e-31,...,0,0,0,0,0,0,0,0,0,0
3,-0.5,-0.866025,0.309017,2.354702e-46,6.929124999999999e-38,2.759509e-30,1.4872920000000003e-23,1.084855e-17,1.070923e-12,1.430724e-08,...,0,0,0,0,0,0,0,0,0,0
4,-0.866025,0.994522,0.309017,5.234605e-14,3.028952e-19,2.371985e-25,2.5138690000000003e-32,3.605656e-40,6.999016e-49,1.838657e-58,...,0,0,0,0,0,0,0,0,0,0


In [40]:
X_sparse = coo_matrix(X_train)

dtrain = xgb.DMatrix(X_sparse, label=y_train)

In [41]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000, show_progress_bar=True)

[I 2024-07-17 10:38:45,438] A new study created in memory with name: no-name-669e42fa-3c69-48bd-b8d0-1fc789a3c611


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-07-17 10:38:46,485] Trial 0 finished with value: 0.7091134116278504 and parameters: {'booster': 'dart', 'lambda': 0.0032954670283376008, 'alpha': 0.0011823923475935997, 'max_depth': 6, 'eta': 0.6953938038135438, 'gamma': 0.9646884206683435, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.0021323908654855266, 'skip_drop': 0.026273227741524706}. Best is trial 0 with value: 0.7091134116278504.
[I 2024-07-17 10:38:46,937] Trial 1 finished with value: 0.6793140200186263 and parameters: {'booster': 'dart', 'lambda': 0.3172120026764565, 'alpha': 0.3162508452792691, 'max_depth': 1, 'eta': 0.5978436075562276, 'gamma': 0.10989057549959584, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.6675112117852242, 'skip_drop': 0.0015935105682986074}. Best is trial 0 with value: 0.7091134116278504.
[I 2024-07-17 10:38:47,507] Trial 2 finished with value: 0.7103216164893749 and parameters: {'booster': 'gbtr

In [42]:
study.best_value

0.7149659679973253