In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# Data Exploration

In [50]:
df_use = pd.read_csv('flights.csv')
df = df_use.copy()
del df_use

In [51]:
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-06-10,AA,AA_CODESHARE,AA,5393,OH,N580NN,5393,14685,SAV,...,213.0,,,,,,,,,
1,2019-05-08,WN,WN,WN,1065,WN,N267WN,1065,13232,MDW,...,283.0,27.0,0.0,0.0,0.0,0.0,,,,
2,2018-11-09,UA,UA,UA,739,UA,N13110,739,13204,MCO,...,1546.0,,,,,,,,,
3,2018-01-27,AA,AA,AA,2283,AA,N972TW,2283,15624,VPS,...,641.0,,,,,,,,,
4,2019-11-05,DL,DL,DL,1678,DL,N321US,1678,12892,LAX,...,1235.0,,,,,,,,,


In [52]:
df.groupby('arr_delay').size()

arr_delay
-200.0     1
-73.0      1
-72.0      1
-71.0      1
-68.0      1
          ..
 1486.0    1
 1557.0    1
 1608.0    1
 1734.0    1
 1792.0    1
Length: 703, dtype: int64

In [53]:
df.groupby('cancelled').size()

cancelled
0.0    157250
1.0      2750
dtype: int64

In [54]:
df = df.loc[~df.cancelled.isin([1])]

In [55]:
df['arr_delay_pred'] = (df.arr_delay >= 0).astype('int')

In [56]:
df.groupby('arr_delay_pred').size()

arr_delay_pred
0    99449
1    57801
dtype: int64

# Feature Engineering

## Numerical Features

In [57]:
df[['dep_delay','crs_dep_time','dep_time','crs_arr_time','arr_time','arr_delay','arr_delay_pred']].head()

Unnamed: 0,dep_delay,crs_dep_time,dep_time,crs_arr_time,arr_time,arr_delay,arr_delay_pred
0,-4.0,1249,1245.0,1403,1400.0,-3.0,0
1,42.0,2255,2337.0,105,132.0,27.0,1
2,-8.0,807,759.0,1005,1007.0,2.0,1
3,-9.0,1326,1317.0,1543,1511.0,-32.0,0
4,-8.0,1755,1747.0,2256,2239.0,-17.0,0


In [58]:
cols_num = ['crs_dep_time','dep_time','distance','crs_arr_time']
cols_num

['crs_dep_time', 'dep_time', 'distance', 'crs_arr_time']

In [59]:
df[cols_num].isnull().sum()

crs_dep_time    0
dep_time        0
distance        0
crs_arr_time    0
dtype: int64

## Categorical Features

In [60]:
cols_cat = ['mkt_carrier','origin', 'dest']
cols_cat

['mkt_carrier', 'origin', 'dest']

In [61]:
df[cols_cat].nunique()

mkt_carrier     11
origin         375
dest           372
dtype: int64

In [62]:
df[cols_cat].isnull().sum()

mkt_carrier    0
origin         0
dest           0
dtype: int64

In [63]:
pd.get_dummies(df['mkt_carrier'],prefix = 'mkt_carrier').head()

Unnamed: 0,mkt_carrier_AA,mkt_carrier_AS,mkt_carrier_B6,mkt_carrier_DL,mkt_carrier_F9,mkt_carrier_G4,mkt_carrier_HA,mkt_carrier_NK,mkt_carrier_UA,mkt_carrier_VX,mkt_carrier_WN
0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0


In [64]:
df_cat = pd.get_dummies(df[cols_cat],drop_first = False)

In [65]:
df_cat.head()

Unnamed: 0,mkt_carrier_AA,mkt_carrier_AS,mkt_carrier_B6,mkt_carrier_DL,mkt_carrier_F9,mkt_carrier_G4,mkt_carrier_HA,mkt_carrier_NK,mkt_carrier_UA,mkt_carrier_VX,...,dest_VEL,dest_VLD,dest_VPS,dest_WRG,dest_WYS,dest_XNA,dest_XWA,dest_YAK,dest_YKM,dest_YUM
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
df = pd.concat([df,df_cat], axis = 1)

In [67]:
cols_all_cat = list(df_cat.columns)
print('number of categorical:',len(cols_all_cat))

number of categorical: 758


## Engineering Features Summary

Let's make a new dataframe that only has the columns of interest

In [68]:
cols_input = cols_num + cols_all_cat
df_data = df[cols_input + ['arr_delay_pred']]
df_data.head()

Unnamed: 0,crs_dep_time,dep_time,distance,crs_arr_time,mkt_carrier_AA,mkt_carrier_AS,mkt_carrier_B6,mkt_carrier_DL,mkt_carrier_F9,mkt_carrier_G4,...,dest_VLD,dest_VPS,dest_WRG,dest_WYS,dest_XNA,dest_XWA,dest_YAK,dest_YKM,dest_YUM,arr_delay_pred
0,1249,1245.0,213.0,1403,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2255,2337.0,283.0,105,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,807,759.0,1546.0,1005,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1326,1317.0,641.0,1543,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1755,1747.0,1235.0,2256,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Training/Validation/Test

In [69]:
df_data = df_data.sample(n = len(df_data))
df_data = df_data.reset_index(drop = True)

In [70]:
df_valid_test=df_data.sample(frac=0.30)
print('Split size: %.3f'%(len(df_valid_test)/len(df_data)))

Split size: 0.300


split into test and validation using 50% fraction.

In [71]:
df_test = df_valid_test.sample(frac = 0.5)
df_valid = df_valid_test.drop(df_test.index)

In [72]:
df_train_all=df_data.drop(df_valid_test.index)

In [73]:
def calc_prevalence(y_actual):
    return (sum(y_actual)/len(y_actual))

In [74]:
print('Prevalence:%.3f'%calc_prevalence(df['arr_delay_pred'].values))

Prevalence:0.368


In [75]:
print('Test prevalence(n = %d):%.3f'%(len(df_test),calc_prevalence(df_test.arr_delay_pred.values)))
print('Valid prevalence(n = %d):%.3f'%(len(df_valid),calc_prevalence(df_valid.arr_delay_pred.values)))
print('Train all prevalence(n = %d):%.3f'%(len(df_train_all), calc_prevalence(df_train_all.arr_delay_pred.values)))

Test prevalence(n = 23588):0.367
Valid prevalence(n = 23587):0.365
Train all prevalence(n = 110075):0.368


In [76]:
df_train_all.to_csv('df_train_all.csv',index=False)
df_valid.to_csv('df_valid.csv',index=False)
df_test.to_csv('df_test.csv',index=False)

# Data preparation

 Here, we will create a balanced training data set that has 50% positive and 50% negative samples each.

In [77]:
rows_pos = df_train_all.arr_delay_pred == 1
df_train_pos = df_train_all.loc[rows_pos]
df_train_neg = df_train_all.loc[~rows_pos]

df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos))],axis = 0)

df_train = df_train.sample(n = len(df_train)).reset_index(drop = True)

print('Train balanced prevalence(n = %d):%.3f'%(len(df_train), calc_prevalence(df_train.arr_delay_pred.values)))

Train balanced prevalence(n = 81066):0.500


In [78]:
col2use = [c for c in list(df_train_all.columns) if c != 'arr_delay_pred']
print(len(col2use))

762


In [79]:
X_train = df_train[col2use].values
X_train_all = df_train_all[col2use].values
X_valid = df_valid[col2use].values

y_train = df_train['arr_delay_pred'].values
y_valid = df_valid['arr_delay_pred'].values

print('Training All shapes:',X_train_all.shape)
print('Training shapes:',X_train.shape, y_train.shape)
print('Validation shapes:',X_valid.shape, y_valid.shape)

Training All shapes: (110075, 762)
Training shapes: (81066, 762) (81066,)
Validation shapes: (23587, 762) (23587,)


In [80]:
from sklearn.preprocessing import StandardScaler

scaler  = StandardScaler()
scaler.fit(X_train_all)

StandardScaler()

In [81]:
import pickle
scalerfile = 'scaler.sav'
pickle.dump(scaler, open(scalerfile, 'wb'))

Load it back:

In [82]:
scaler = pickle.load(open(scalerfile, 'rb'))

Now we can transform our data matrices

In [83]:
X_train_tf = scaler.transform(X_train)
X_valid_tf = scaler.transform(X_valid)

# Machine Learning Algorithms

Default parameters

In [96]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh):
    # calculates specificity
    return sum((y_pred < thresh) & (y_actual == 0)) /sum(y_actual ==0)

def print_report(y_actual, y_pred, thresh):
    
    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, (y_pred > thresh))
    recall = recall_score(y_actual, (y_pred > thresh))
    precision = precision_score(y_actual, (y_pred > thresh))
    specificity = calc_specificity(y_actual, y_pred, thresh)
    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return auc, accuracy, recall, precision, specificity 

In [97]:
thresh = 0.5

## Logistic Regression

In [98]:
# logistic regression
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_train_tf, y_train)

LogisticRegression()

In [99]:
y_train_preds = lr.predict_proba(X_train_tf)[:,1]
y_valid_preds = lr.predict_proba(X_valid_tf)[:,1]

print('Logistic Regression')
print('Training:')
lr_train_auc, lr_train_accuracy, lr_train_recall, lr_train_precision, lr_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lr_valid_auc, lr_valid_accuracy, lr_valid_recall, lr_valid_precision, lr_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Logistic Regression
Training:
AUC:0.690
accuracy:0.638
recall:0.628
precision:0.641
specificity:0.648
prevalence:0.500
 
Validation:
AUC:0.668
accuracy:0.627
recall:0.611
precision:0.497
specificity:0.636
prevalence:0.370
 


## Stochastic Gradient Descent

In [100]:
from sklearn.linear_model import SGDClassifier
sgdc=SGDClassifier(loss = 'log',alpha = 0.1)
sgdc.fit(X_train_tf, y_train)

SGDClassifier(alpha=0.1, loss='log')

In [101]:
y_train_preds = sgdc.predict_proba(X_train_tf)[:,1]
y_valid_preds = sgdc.predict_proba(X_valid_tf)[:,1]

print('Stochastic Gradient Descend')
print('Training:')
sgdc_train_auc, sgdc_train_accuracy, sgdc_train_recall, sgdc_train_precision, sgdc_train_specificity =print_report(y_train,y_train_preds, thresh)
print('Validation:')
sgdc_valid_auc, sgdc_valid_accuracy, sgdc_valid_recall, sgdc_valid_precision, sgdc_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Stochastic Gradient Descend
Training:
AUC:0.620
accuracy:0.588
recall:0.592
precision:0.587
specificity:0.584
prevalence:0.500
 
Validation:
AUC:0.601
accuracy:0.577
recall:0.579
precision:0.446
specificity:0.576
prevalence:0.370
 


## Naive Bayes

In [102]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train_tf, y_train)

GaussianNB()

In [103]:
y_train_preds = nb.predict_proba(X_train_tf)[:,1]
y_valid_preds = nb.predict_proba(X_valid_tf)[:,1]

print('Naive Bayes')
print('Training:')
nb_train_auc, nb_train_accuracy, nb_train_recall, nb_train_precision, nb_train_specificity =print_report(y_train,y_train_preds, thresh)
print('Validation:')
nb_valid_auc, nb_valid_accuracy, nb_valid_recall, nb_valid_precision, nb_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Naive Bayes
Training:
AUC:0.522
accuracy:0.516
recall:0.943
precision:0.508
specificity:0.089
prevalence:0.500
 
Validation:
AUC:0.508
accuracy:0.397
recall:0.932
precision:0.374
specificity:0.082
prevalence:0.370
 


## Decision Tree Classifier

In [104]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=10)
tree.fit(X_train_tf, y_train)

DecisionTreeClassifier(max_depth=10)

In [105]:
y_train_preds = tree.predict_proba(X_train_tf)[:,1]
y_valid_preds = tree.predict_proba(X_valid_tf)[:,1]

print('Decision Tree')
print('Training:')
tree_train_auc, tree_train_accuracy, tree_train_recall, tree_train_precision, tree_train_specificity =print_report(y_train,y_train_preds, thresh)
print('Validation:')
tree_valid_auc, tree_valid_accuracy, tree_valid_recall, tree_valid_precision, tree_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Decision Tree
Training:
AUC:0.761
accuracy:0.690
recall:0.570
precision:0.751
specificity:0.811
prevalence:0.500
 
Validation:
AUC:0.738
accuracy:0.707
recall:0.552
precision:0.616
specificity:0.798
prevalence:0.370
 


## Random Forest

In [106]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(max_depth = 6)
rf.fit(X_train_tf, y_train)

RandomForestClassifier(max_depth=6)

In [107]:
y_train_preds = rf.predict_proba(X_train_tf)[:,1]
y_valid_preds = rf.predict_proba(X_valid_tf)[:,1]

print('Random Forest')
print('Training:')
rf_train_auc, rf_train_accuracy, rf_train_recall, rf_train_precision, rf_train_specificity =print_report(y_train,y_train_preds, thresh)
print('Validation:')
rf_valid_auc, rf_valid_accuracy, rf_valid_recall, rf_valid_precision, rf_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Random Forest
Training:
AUC:0.623
accuracy:0.584
recall:0.602
precision:0.581
specificity:0.566
prevalence:0.500
 
Validation:
AUC:0.612
accuracy:0.574
recall:0.593
precision:0.444
specificity:0.563
prevalence:0.370
 


## Gradient Boosting Classifier 

In [108]:
from sklearn.ensemble import GradientBoostingClassifier
gbc =GradientBoostingClassifier(n_estimators=100,
     max_depth=3, learning_rate=1.0)
gbc.fit(X_train_tf, y_train)

GradientBoostingClassifier(learning_rate=1.0)

In [109]:
y_train_preds = gbc.predict_proba(X_train_tf)[:,1]
y_valid_preds = gbc.predict_proba(X_valid_tf)[:,1]

print('Gradient Boosting Classifier')
print('Training:')
gbc_train_auc, gbc_train_accuracy, gbc_train_recall, gbc_train_precision, gbc_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
gbc_valid_auc, gbc_valid_accuracy, gbc_valid_recall, gbc_valid_precision, gbc_valid_specificity = print_report(y_valid,y_valid_preds, thresh)

Gradient Boosting Classifier
Training:
AUC:0.847
accuracy:0.780
recall:0.673
precision:0.855
specificity:0.886
prevalence:0.500
 
Validation:
AUC:0.821
accuracy:0.787
recall:0.651
precision:0.741
specificity:0.866
prevalence:0.370
 
