

Your task is to beat all benchmarks in this competition. Here you won’t be provided with detailed instructions. Hopefully, at this stage of the course, it's enough for you to take a quick look at the data in order to understand that this is the type of task where gradient boosting will do. Most likely it will be LightGBM. But you can try Xgboost or Catboost as well.

<img src="https://habrastorage.org/webt/fs/42/ms/fs42ms0r7qsoj-da4x7yfntwrbq.jpeg" width=30% />

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb

In [223]:
train_df = pd.read_csv('../../data/flight_delays_train.csv')
test_df = pd.read_csv('../../data/flight_delays_test.csv')

In [3]:
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [4]:
test_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


Given flight departure time, carrier's code, departure airport, destination location, and flight distance, you have to predict departure delay for more than 15 minutes. As the simplest benchmark, let's take logistic regression and two features that are easiest to take: DepTime and Distance. This will correspond to **"simple logit baseline"** on Public LB.

In [5]:
X_train, y_train = train_df[['Distance', 'DepTime']].values, train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
X_test = test_df[['Distance', 'DepTime']].values

X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(X_train, y_train, 
                     test_size=0.3, random_state=17)

In [6]:
logit_pipe = Pipeline([('scaler', StandardScaler()),
                       ('logit', LogisticRegression(C=1, random_state=17, solver='liblinear'))])

In [7]:
logit_pipe.fit(X_train_part, y_train_part)
logit_valid_pred = logit_pipe.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, logit_valid_pred)

0.6795691465352607

XGB

In [293]:
df_airport[df_airport['iata_code'] == 'OHE']

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
15026,CN-0011,closed,[Duplicate] Gu-Lian Airport,1836.0,AS,CN,CN-23,Mohe,ZYMH,OHE,,"122.427768, 52.913917"
37175,OHE,closed,[Duplicate] Gu-Lian Airport,1836.0,AS,CN,CN-23,Mohe,ZYMH,OHE,,"122.429056, 52.913244"
54723,ZYMH,medium_airport,Gu-Lian Airport,1836.0,AS,CN,CN-23,Mohe,ZYMH,OHE,,"122.43, 52.912777777799995"


In [295]:
#df_airport = pd.read_csv('airports.dat', names=['ID', 'Name', 'City', 'Country', 'Code', 'Code2', 'Lat', 'Long', 'Altitude', 'Timezone', 'DST', 'Type', 'Source'], index_col=False)
df_airport = pd.read_csv('airport-codes.csv')
df_airport = df_airport[df_airport['type'] != 'closed']

In [324]:
df = pd.concat([train_df.iloc[:, :-1], test_df])
df['Month'] = df['Month'].str.replace('c-', '').astype(int)
df['DayofMonth'] = df['DayofMonth'].str.replace('c-', '').astype(int)
df['DayOfWeek'] = df['DayOfWeek'].str.replace('c-', '').astype(int)
df['dow_sin'] = df['DayOfWeek'].apply(lambda x: np.sin(2*np.pi*x/7))
df['dow_cos'] = df['DayOfWeek'].apply(lambda x: np.cos(2*np.pi*x/7))
df['month_sin'] = df['Month'].apply(lambda x: np.sin(2*np.pi*x/12))
df['month_cos'] = df['Month'].apply(lambda x: np.cos(2*np.pi*x/12))
df['dom_sin'] = df['DayofMonth'].apply(lambda x: np.sin(2*np.pi*x/31))
df['dom_cos'] = df['DayofMonth'].apply(lambda x: np.cos(2*np.pi*x/31))
df['isWeekend'] = (df['DayOfWeek'] >= 6).astype(int)
df['Winter'] = ((df['Month'] >= 12) & (df['Month'] <= 2)).astype(int)
df['Spring'] = ((df['Month'] >= 3) & (df['Month'] <= 5)).astype(int)
df['Summer'] = ((df['Month'] >= 6) & (df['Month'] <= 8)).astype(int)
df['Autumn'] = ((df['Month'] >= 9) & (df['Month'] <= 11)).astype(int)
df = pd.merge(df, df_airport.loc[:, ['iata_code', 'iso_country', 'iso_region']], how='left', left_on='Origin', right_on='iata_code', sort=False)
df = pd.merge(df, df_airport.loc[:, ['iata_code', 'iso_country', 'iso_region']], how='left', left_on='Dest', right_on='iata_code', suffixes=('_origin', '_dest'), sort=False)

In [325]:
codes = df['Origin'].append(df['Dest'])
regions = df['iso_region_origin'].append(df['iso_region_dest'])
code_dict = {**{x: x for x in codes.value_counts()[(codes.value_counts() >= 1000)].index.values}, 
             **{x : 'Other' for x in codes.value_counts()[(codes.value_counts() < 1000)].index.values}}
region_dict = {**{x: x for x in regions.value_counts()[(regions.value_counts() >= 1000)].index.values}, 
               **{x : 'Other' for x in regions.value_counts()[(regions.value_counts() < 1000)].index.values}}

In [326]:
df['Origin'] = df['Origin'].map(code_dict)
df['Dest'] = df['Dest'].map(code_dict)
df['iso_region_origin'] = df['iso_region_origin'].map(region_dict)
df['iso_region_dest'] = df['iso_region_dest'].map(region_dict)
df.drop(['Month', 'DayofMonth', 'DayOfWeek', 'iata_code_origin', 'iata_code_dest', 'iso_country_origin', 'iso_country_dest'], axis=1, inplace=True)
df = pd.get_dummies(df, columns=['UniqueCarrier', 'Origin', 'Dest', 'iso_region_origin', 'iso_region_dest'])

In [327]:
df_train = df.iloc[:train_df.shape[0]]
df_test = df.iloc[train_df.shape[0]:]
y_train = train_df['dep_delayed_15min'].map({'Y':1, 'N':0}).values
X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(df_train, y_train, 
                     test_size=0.3, random_state=17)

In [328]:
scaler = StandardScaler()
scaled = scaler.fit_transform(X_train_part.loc[:,['DepTime', 'Distance']])
X_train_part['DepTime'] = scaled[:, 0]
X_train_part['Distance'] = scaled[:, 1]

In [329]:
scaled_valid = scaler.transform(X_valid.loc[:,['DepTime', 'Distance']])
X_valid['DepTime'] = scaled_valid[:, 0]
X_valid['Distance'] = scaled_valid[:, 1]

In [330]:
lgbtree = lgb.LGBMClassifier(n_estimators=700, subsample=.75, colsample_bytree=.5, reg_lambda=10, num_leaves=75)

In [331]:
%%time
lgbtree.fit(X_train_part, y_train_part)

CPU times: user 25.8 s, sys: 2.47 s, total: 28.2 s
Wall time: 4.82 s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=700, n_jobs=-1, num_leaves=75, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=10, silent=True,
        subsample=0.75, subsample_for_bin=200000, subsample_freq=0)

In [332]:
lgbtree_valid_pred = lgbtree.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, lgbtree_valid_pred)

0.7502031931097103

In [333]:
xgbtree = xgb.XGBClassifier(max_depth=7, n_estimators=700, colsample_bytree=.5,
                            subsample=.75, reg_lambda=10)

In [334]:
%%time
xgbtree.fit(X_train_part, y_train_part)

CPU times: user 10min 48s, sys: 3.44 s, total: 10min 51s
Wall time: 10min 54s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=700,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=10, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.75)

In [335]:
xgbtree_valid_pred = xgbtree.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, xgbtree_valid_pred)

0.7498250461390371

In [336]:
ensemble_valid_pred = np.mean(np.vstack([xgbtree_valid_pred, lgbtree_valid_pred]), axis=0)
roc_auc_score(y_valid, ensemble_valid_pred)

0.7528888377895263

Fit final model as ensemble of XGBoost and LightGBM

In [337]:
X_train = df_train
X_test = df_test

In [338]:
scaler_fin = StandardScaler()
scaled_fin = scaler_fin.fit_transform(X_train.loc[:,['DepTime', 'Distance']])
X_train['DepTime'] = scaled_fin[:, 0]
X_train['Distance'] = scaled_fin[:, 1]

In [339]:
scaled_test = scaler_fin.transform(X_test.loc[:,['DepTime', 'Distance']])
X_test['DepTime'] = scaled_test[:, 0]
X_test['Distance'] = scaled_test[:, 1]

In [340]:
xgb_fin = xgb.XGBClassifier(max_depth=7, n_estimators=700, colsample_bytree=.5,
                            subsample=.75, reg_lambda=10)

In [341]:
%%time
xgb_fin.fit(X_train, y_train)

CPU times: user 15min 18s, sys: 4.54 s, total: 15min 23s
Wall time: 15min 28s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=700,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=10, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.75)

In [342]:
lgb_fin = lgb.LGBMClassifier(**lgb_mdl.best_params_)

In [343]:
%%time
lgb_fin.fit(X_train, y_train)

CPU times: user 46.2 s, sys: 4.11 s, total: 50.3 s
Wall time: 8.12 s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
        importance_type='split', learning_rate=0.01, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=700, n_jobs=-1, num_leaves=91, objective=None,
        random_state=None, reg_alpha=1.0, reg_lambda=1.0, silent=True,
        subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [344]:
xgb_pred = xgb_fin.predict_proba(X_test)[:, 1]
lgb_pred = lgb_fin.predict_proba(X_test)[:, 1]
ensemble_pred = np.mean(np.vstack([xgb_pred, lgb_pred]), axis=0)
pd.Series(ensemble_pred, name='dep_delayed_15min').to_csv('ensemble.csv', index_label='id', header=True)

Cross-validation to tune XGBoost params

In [43]:
cv = StratifiedKFold(n_splits=3)
params = {'max_depth' : [5, 7],
          'n_estimators' : [500],
          'colsample_bytree' : [.5, .75],
          'subsample' : [.75, 1],
          'reg_lambda' : np.logspace(0, 2, 3),
          'reg_alpha' : np.logspace(0, 2, 3),
          'n_jobs' : [-1]}
xgbtree = xgb.XGBClassifier()
xgb_mdl = RandomizedSearchCV(xgbtree, params, cv=cv)

In [44]:
%%time
xgb_mdl.fit(X_train_part, y_train_part)

Wall time: 2h 44min 5s


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
          error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'max_depth': [5, 7], 'n_estimators': [500], 'colsample_bytree': [0.5, 0.75], 'subsample': [0.75, 1], 'reg_lambda': array([  1.,  10., 100.]), 'reg_alpha': array([  1.,  10., 100.]), 'n_jobs': [-1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [45]:
xgbmdl_valid_pred = xgb_mdl.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, xgbmdl_valid_pred)

0.7461536754093123

In [93]:
cv = StratifiedKFold(n_splits=5)
params = {'num_leaves' : [31, 51, 71, 91],
          'n_estimators' : [700],
          'colsample_bytree' : [.5, .75, 1],
          'subsample' : [.75, 1],
          'reg_lambda' : np.logspace(0, 2, 3),
          'reg_alpha' : np.logspace(0, 2, 3),
          'min_child_samples' : [20, 35, 50],
          'learning_rate' : [.01, .05, .1]}
lgbtree2 = lgb.LGBMClassifier()
lgb_mdl = RandomizedSearchCV(lgbtree2, params, cv=cv)

In [94]:
%%time
lgb_mdl.fit(X_train_part, y_train_part)

CPU times: user 25min 19s, sys: 2min 7s, total: 27min 26s
Wall time: 5min 49s


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
          error_score='raise-deprecating',
          estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'num_leaves': [31, 51, 71, 91], 'n_estimators': [700], 'colsample_bytree': [0.5, 0.75, 1], 'subsample': [0.75, 1], 'reg_lambda': array([  1.,  10., 100.]), 'reg_alpha': array([  1.,  10., 100.]), 'min_child_samples': [20, 35, 50], 'learning_rate': [0.01, 0.05, 0.1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,

In [96]:
lgb_mdl.best_params_

{'subsample': 1,
 'reg_lambda': 1.0,
 'reg_alpha': 1.0,
 'num_leaves': 91,
 'n_estimators': 700,
 'min_child_samples': 20,
 'learning_rate': 0.01,
 'colsample_bytree': 0.5}

In [95]:
lgb_valid_pred = lgb_mdl.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, lgb_valid_pred)

0.7467952532963773