# Flight delay model: Gradient Boosting

- **Installing necessary modules**

In [2]:
!pip install seaborn



- **Importing modules**

In [28]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV

- **Loading data and showing information about the dataset**

In [4]:
flight_delay_train_set = pd.read_csv('flight_delays_train.csv')
flight_delay_train_set.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [5]:
flight_delay_train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
Month                100000 non-null object
DayofMonth           100000 non-null object
DayOfWeek            100000 non-null object
DepTime              100000 non-null int64
UniqueCarrier        100000 non-null object
Origin               100000 non-null object
Dest                 100000 non-null object
Distance             100000 non-null int64
dep_delayed_15min    100000 non-null object
dtypes: int64(2), object(7)
memory usage: 6.9+ MB


In [6]:
flight_delay_train_set.describe()

Unnamed: 0,DepTime,Distance
count,100000.0,100000.0
mean,1341.52388,729.39716
std,476.378445,574.61686
min,1.0,30.0
25%,931.0,317.0
50%,1330.0,575.0
75%,1733.0,957.0
max,2534.0,4962.0


- **Manipulating the data**

In [7]:
df = flight_delay_train_set
df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [14]:
df_1.corr()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,Distance,dep_delayed_15min
Month,1.0,0.007106,0.004717,-0.001544,0.006853,0.016998
DayofMonth,0.007106,1.0,-0.015537,0.001116,0.00151,0.025842
DayOfWeek,0.004717,-0.015537,1.0,0.009392,0.011937,0.010439
DepTime,-0.001544,0.001116,0.009392,1.0,-0.020681,0.243819
Distance,0.006853,0.00151,0.011937,-0.020681,1.0,-0.002734
dep_delayed_15min,0.016998,0.025842,0.010439,0.243819,-0.002734,1.0


In [8]:
def toInt(dataFrame):
    df_copy = dataFrame.copy()
    df_copy['Month'] = df_copy['Month'].str.replace('c-', '').astype(int)
    df_copy['DayofMonth'] = df_copy['DayofMonth'].str.replace('c-', '').astype(int)
    df_copy['DayOfWeek'] = df_copy['DayOfWeek'].str.replace('c-', '').astype(int)

    return df_copy

def joinOriginWithDest(dataFrame):
    df_copy = dataFrame.copy()
    df_copy['Itinerary'] = df_copy['Origin'] + '-' + df_copy['Dest']
    
    return df_copy

def replaceDelayedWithNum(dataFrame):
    df_copy = dataFrame.copy()
    df_copy['dep_delayed_15min'] = df_copy['dep_delayed_15min'].replace(['Y', 'N'], [1, 0])

    return df_copy

def dropOriginDest(dataFrame):
    df_copy = dataFrame.copy()
    df_copy = df_copy.drop(labels=['Origin', 'Dest'], axis=1)

    return df_copy 

df_1 = replaceDelayedWithNum(toInt(df))
df_1.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,8,21,7,1934,AA,ATL,DFW,732,0
1,4,20,3,1548,US,PIT,MCO,834,0
2,9,2,5,1422,XE,RDU,CLE,416,0
3,11,25,6,1015,OO,DEN,MEM,872,0
4,10,7,6,1828,WN,MDW,OMA,423,1


In [16]:
def topListOfUniqueCarriers(dataFrame):
    df_copy = dataFrame.copy()
    uniqueCarrier_list = df_copy['UniqueCarrier'].value_counts().head(12).index.tolist()
    df_copy['UniqueCarrier'] = df_copy['UniqueCarrier'].apply(lambda x: x if x in uniqueCarrier_list else 'other')
    return df_copy

def topListOfOrigins(dataFrame):
    df_copy = dataFrame.copy()
    origin_list = df_copy['Origin'].value_counts().head(12).index.tolist()
    df_copy['Origin'] = df_copy['Origin'].apply(lambda x: x if x in origin_list else 'other')
    return df_copy

def topListOfDest(dataFrame):
    df_copy = dataFrame.copy()
    dest_list = df_copy['Dest'].value_counts().head(12).index.tolist()
    df_copy['Dest'] = df_copy['Dest'].apply(lambda x: x if x in dest_list else 'other')
    return df_copy 

df_2 = topListOfDest(topListOfOrigins(topListOfUniqueCarriers(df_1)))
df_2 = pd.get_dummies(df_2)   
df_2.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,Distance,dep_delayed_15min,UniqueCarrier_AA,UniqueCarrier_CO,UniqueCarrier_DL,UniqueCarrier_EV,...,Dest_DFW,Dest_DTW,Dest_EWR,Dest_IAH,Dest_LAS,Dest_LAX,Dest_ORD,Dest_PHX,Dest_SLC,Dest_other
0,8,21,7,1934,732,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,4,20,3,1548,834,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,9,2,5,1422,416,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,11,25,6,1015,872,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,10,7,6,1828,423,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


- **Parameter searching** 

In [31]:
parameters = {'loss': ['ls', 'lad', 'huber', 'quantile']
            , 'learning_rate': [0.05, 0.1, 0.15]
            , 'n_estimators': [100, 200, 300]
            , 'subsample': [0.5, 1.0, 1.5]
            , 'criterion': ['friedman_mse', 'mse', 'mae']
            , 'min_samples_split': [2, 4, 6]
            , 'min_samples_leaf': [1, 2, 3]
            , 'min_weight_fraction_leaf': [0.0, 0.05, 0.1]
            , 'max_depth': [3, 4, 5]
            , 'min_impurity_decrease': [0.0, 0.05, 0.1]
            , 'max_features': ['auto', 'sqrt', 'log2']
            , 'n_iter_no_change': [1, 2, 3]}

gb = GradientBoostingRegressor()
gscv = RandomizedSearchCV(gb, parameters, n_iter=20)
X, y = df_2.drop('dep_delayed_15min', axis=1), df_2['dep_delayed_15min']
result = gscv.fit(X, y) 
result.best_params_

Traceback (most recent call last):
  File "/opt/venv/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/venv/lib/python3.7/site-packages/sklearn/ensemble/_gb.py", line 441, in fit
    self._check_params()
  File "/opt/venv/lib/python3.7/site-packages/sklearn/ensemble/_gb.py", line 252, in _check_params
    "was %r" % self.subsample)
ValueError: subsample must be in (0,1] but was 1.5

Traceback (most recent call last):
  File "/opt/venv/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/venv/lib/python3.7/site-packages/sklearn/ensemble/_gb.py", line 441, in fit
    self._check_params()
  File "/opt/venv/lib/python3.7/site-packages/sklearn/ensemble/_gb.py", line 252, in _check_params
    "was %r" % self.subsample)
ValueError: subsample must be in (0,1] but was 1.5

Tracebac

{'subsample': 1.0,
 'n_iter_no_change': 1,
 'n_estimators': 100,
 'min_weight_fraction_leaf': 0.1,
 'min_samples_split': 6,
 'min_samples_leaf': 3,
 'min_impurity_decrease': 0.0,
 'max_features': 'sqrt',
 'max_depth': 4,
 'loss': 'ls',
 'learning_rate': 0.15,
 'criterion': 'mse'}

In [35]:
gb_best= GradientBoostingRegressor(**result.best_params_)
gb_best.fit(X,y)

GradientBoostingRegressor(criterion='mse', learning_rate=0.15, max_depth=4,
                          max_features='sqrt', min_samples_leaf=3,
                          min_samples_split=6, min_weight_fraction_leaf=0.1,
                          n_iter_no_change=1)

- **Test set treatment**

In [32]:
df_test = pd.read_csv('flight_delays_test.csv')
df_test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [34]:
df_3 = pd.get_dummies(topListOfDest(topListOfOrigins(topListOfUniqueCarriers(toInt(df_test)))))
df_3.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,Distance,UniqueCarrier_AA,UniqueCarrier_CO,UniqueCarrier_DL,UniqueCarrier_EV,UniqueCarrier_MQ,...,Dest_DTW,Dest_EWR,Dest_IAH,Dest_LAS,Dest_LAX,Dest_MSP,Dest_ORD,Dest_PHX,Dest_SLC,Dest_other
0,7,25,3,615,598,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,4,17,2,739,1235,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,12,2,7,651,577,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,3,25,7,1614,377,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,6,6,3,1505,258,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [38]:
X_test = df_3.copy()
gb_best.predict(X_test)
my_submission = pd.DataFrame({'id': X_test.index
                            , 'dep_delayed_15min': gb_best.predict(X_test)})
my_submission.to_csv('asf_ohs_submission.csv', index=False)