In [349]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import catboost
from matplotlib import pyplot as plt
import seaborn as sns
import operator

**Read the data**

In [350]:
train_df = pd.read_csv('flight_delays_train.csv')
test_df = pd.read_csv('flight_delays_test.csv')

In [351]:
dataset = pd.concat([train_df, test_df], axis = 0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [352]:
for col in ['Month', 'DayofMonth', 'DayOfWeek'] : dataset[col] = dataset[col].apply(lambda x: x.split('-')[1]).astype(np.int32) - 1


In [353]:
# Hour and minute
dataset['hour'] = dataset['DepTime'] // 100
#dataset['Dep_hour_flag'] = ((dataset['hour'] >= 6) & (dataset['hour'] < 23)).astype('int')

dataset.loc[dataset['hour'] == 24, 'hour'] = 0
dataset.loc[dataset['hour'] == 25, 'hour'] = 1
dataset['minute'] = dataset['DepTime'] % 100

# Season
#dataset['summer'] = (dataset['Month'].isin([6, 7, 8])).astype(np.int32)
# dataset['autumn'] = (dataset['Month'].isin([9, 10, 11])).astype(np.int32)
# dataset['winter'] = (dataset['Month'].isin([12, 1, 2])).astype(np.int32)
# dataset['spring'] = (dataset['Month'].isin([3, 4, 5])).astype(np.int32)

# Daytime
dataset['daytime'] = pd.cut(dataset['hour'], bins=[0,6,10,12,14,16,18,20,23], include_lowest=True)
dataset['daytime']  = pd.factorize(dataset.daytime)[0]
dataset['Dist'] = pd.cut(dataset['Distance'], bins=[0, 200, 400, 600, 800, 1000,1200,1400,1600, 2000, 3000], include_lowest=True)
dataset['Dist']  = pd.factorize(dataset.Dist)[0]


# Extract the labels
dataset['dep_delayed_15min'] = dataset['dep_delayed_15min'].map({'N' : 0, 'Y' : 1})

dataset['flight'] = dataset['Origin'] + '-->' + dataset['Dest']  # correlate with Distance

carriers_list = dataset.UniqueCarrier.value_counts().head(10).index.tolist()
dataset['UniqueCarrier'] = dataset.UniqueCarrier.apply(lambda x: x if x in carriers_list else 'other')

In [354]:
## killing features

dataset['new1'] = dataset.hour.astype(str) + dataset.Origin.astype(str)
dataset['new2'] = dataset.hour.astype(str) + dataset.Dest.astype(str)
dataset['new3'] = dataset['UniqueCarrier'].astype(str) + dataset['Dist'].astype(str)
dataset['new4'] = dataset['Origin'].astype(str) + dataset['Dist'].astype(str)
dataset['new5'] = dataset['Dest'].astype(str) + dataset['Dist'].astype(str)
dataset['new6'] = dataset['UniqueCarrier'].astype(str) + dataset['hour'].astype(str)

dataset['new7'] = dataset['hour'].astype(str) + dataset['DayOfWeek'].astype(str)
dataset['new8'] = dataset['hour'].astype(str) + dataset['Month'].astype(str)
dataset['new9'] = dataset['new6'].astype(str) + dataset['new1'].astype(str)


In [355]:
#dataset = pd.concat([dataset,pd.get_dummies(dataset.UniqueCarrier)],axis=1)
#dataset = pd.concat([dataset,pd.get_dummies(dataset.Origin)],axis=1)
#dataset = pd.concat([dataset,pd.get_dummies(dataset.Dest)],axis=1)

In [356]:
# for i,col in enumerate(train_df.columns):
#     if train_df[col].dtype == 'int32':
        #plt.figure(i)
# sns.distplot(dataset.spring[dataset.dep_delayed_15min == 1], hist=False, label = 'delayed')
# sns.distplot(dataset.spring[dataset.dep_delayed_15min == 0], hist=False, label = 'non delayed')

**Allocate a hold-out set (a.k.a. a validation set) to validate the model**

In [357]:
dataset.drop(columns = ['Dist', 'flight', 'daytime'] ,axis = 1, inplace = True)

In [367]:
cols = ['DayOfWeek', 'DayofMonth', 'Dest', 'Month', 'Origin',
       'UniqueCarrier', 'hour', 'minute', 
       'new1', 'new2','new3','new4','new5','new6','new7','new8','new9']
for col in cols:
    dataset[col] = dataset[col].astype('category')
    

In [368]:
X_train = dataset[pd.notna(dataset.dep_delayed_15min) == True].drop('dep_delayed_15min', axis=1)
y_train = dataset[pd.notna(dataset.dep_delayed_15min) == True]['dep_delayed_15min']
X_test = dataset[pd.notna(dataset.dep_delayed_15min) == False].drop('dep_delayed_15min', axis=1)

In [369]:
categ_feat_idx = np.where(X_train.dtypes == 'category')[0]
print(categ_feat_idx)

[ 0  1  3  5  6  7  8  9 10 11 12 13 14 15 16 17 18]


In [370]:
X_train_part, X_valid, y_train_part, y_valid = train_test_split(X_train, y_train, 
                                                                test_size=0.3, 
                                                                )

In [371]:
ctb = CatBoostClassifier(iterations = 1500 ,random_seed=17, learning_rate = 0.2, depth = 4, silent = True, cat_features=categ_feat_idx, task_type="GPU", border_count = 32)

In [None]:
ctb.fit(X_train_part, y_train_part,cat_features=categ_feat_idx)
parameters = {'learning_rate' : np.arange(0.1, 0.4, 0.1), 'depth' : np.arange(3,6,1)}
cv = GridSearchCV(ctb, parameters, cv=2, verbose = 3, scoring = 'roc_auc')
cv.fit(X_train_part, y_train_part)

In [373]:
%%time
ctb.fit(X_train_part, y_train_part)
ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]
print(roc_auc_score(y_valid, ctb_valid_pred))

0.788808806176471
Wall time: 3min 51s


In [374]:
pd.options.display.max_rows = 10000
ctb.get_feature_importance(prettified=True)#['Feature Id'].values

Unnamed: 0,Feature Id,Importances
0,hour,10.386204
1,minute,7.633185
2,new9,7.26494
3,new5,5.911647
4,DepTime,5.67917
5,Dest,5.513651
6,new1,5.218234
7,Origin,5.17375
8,new8,5.079302
9,new6,4.72537


In [None]:
ctb_test_pred = ctb.predict_proba(X_test)[:, 1]

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sample_sub = pd.read_csv('sample_submission.csv', index_col='id')
    sample_sub['dep_delayed_15min'] = ctb_test_pred
    sample_sub.to_csv('ctb_pred.csv')