# Flight Delay Predictive Models

In [15]:
##### Imports

# ML Standards
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Data Processing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import MiniBatchSparsePCA

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBRFClassifier

# Evaluation Metrics
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report 
from sklearn.metrics import cohen_kappa_score
from scipy.stats import fisher_exact
from statsmodels.stats.contingency_tables import mcnemar


In [16]:
#Import the data (the file was initially processed in data-collection)
flight_delay = pd.read_csv("flight_information.csv")

#Test data
# flight_delay = pd.read_csv("flight_test.csv")

In [17]:
# See what we're working with
flight_delay


Unnamed: 0.1,Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,0,2018-07-25,B6,B6,B6,695,B6,N239JB,695,12197,...,972.0,10.0,0.0,4.0,0.0,31.0,,,,
1,1,2019-02-16,UA,UA_CODESHARE,UA,5617,OO,N160SY,5617,11298,...,801.0,,,,,,,,,
2,2,2019-02-28,DL,DL,DL,1730,DL,N915AT,1730,13930,...,606.0,,,,,,,,,
3,3,2018-10-28,AA,AA_CODESHARE,AA,4688,YX,N104HQ,4688,12124,...,207.0,,,,,,,,,
4,4,2018-04-27,WN,WN,WN,1775,WN,N8632A,1775,15016,...,687.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,199995,2019-05-18,AA,AA,AA,1555,AA,N562UW,1555,12889,...,236.0,0.0,0.0,0.0,0.0,23.0,,,,
199996,199996,2019-06-24,AS,AS_CODESHARE,AS,3449,OO,N196SY,3449,14869,...,599.0,,,,,,,,,
199997,199997,2019-01-13,NK,NK,NK,858,NK,N507NK,858,14679,...,1303.0,,,,,,,,,
199998,199998,2019-04-08,UA,UA_CODESHARE,UA,5430,OO,N124SY,5430,14771,...,599.0,,,,,,,,,


In [18]:
flight_delay.columns

Index(['Unnamed: 0', 'fl_date', 'mkt_unique_carrier', 'branded_code_share',
       'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name'],
      dtype='object')

### Feature Engineering

We will extract some useful features here (and eliminate non useful features).

The features that will have primary importance are:

- Time of day and time of year
- Departing and arrival airports
- Airlines


In [19]:
# Note: not required for test data

# # Cleaning things: flights should take longer than 0 minutes
#  flight_delay = flight_delay[flight_delay['air_time']>0]

# #Don't consider diverted or cancelled flights
#  flight_delay.drop(flight_delay[(flight_delay['cancelled']==1) | (flight_delay['diverted']==1)].index, inplace=True)


In [25]:
# Extract desired features into new df
model_df = flight_delay[['fl_date', 'mkt_unique_carrier', 'crs_dep_time', 'crs_arr_time', 'origin', 'dest', 'arr_delay', 'distance']]
# model_df = flight_delay[['fl_date', 'mkt_unique_carrier', 'crs_dep_time', 'crs_arr_time', 'origin', 'dest', 'distance']]

#Only consider flights that are at no more than 60 minutes early (earlier flights may simply be misclassified )
model_df = model_df[model_df['arr_delay']>-60]

# As this is will be a classification, we will have two broad categories: not delayed and delayed.
# Classify all early flights as arriving on time (the 'not delayed' category) 
model_df['arr_delay'] = model_df['arr_delay'].where(model_df['arr_delay'] > 0, 0)
model_df = model_df.fillna(0) # For missing values

### Categorizing hours and time of day of flights
# For departure/arrival times, extract only the hour of departure/arrival
model_df['dep_hr'] = (model_df['crs_dep_time'] // 100).astype('str')
model_df['arr_hr'] = (model_df['crs_arr_time'] // 100).astype('str')
model_df = model_df.drop(columns=['crs_dep_time', 'crs_arr_time'])

# Aggregate based on time of day (morning, afternoon, evening)
model_df['dep_time_of_day'] = model_df['dep_hr'] # just create the column and fill to start
model_df['arr_time_of_day'] = model_df['arr_hr'] 
model_df['dep_hr'] = pd.to_numeric(model_df['dep_hr']) # Needs to be an integer for our filtering below
model_df['arr_hr'] = pd.to_numeric(model_df['arr_hr']) 

# Assigning categories
model_df.loc[(model_df['dep_hr'] >=5) & (model_df['dep_hr'] <12), 'dep_time_of_day'] = 'morn'
model_df.loc[(model_df['dep_hr'] >=12) & (model_df['dep_hr'] <17), 'dep_time_of_day'] = 'aft'
model_df.loc[(model_df['dep_hr'] >=17) | (model_df['dep_hr'] <5), 'dep_time_of_day'] = 'eve'

model_df.loc[(model_df['arr_hr'] >=5) & (model_df['arr_hr'] <12), 'arr_time_of_day'] = 'morn'
model_df.loc[(model_df['arr_hr'] >=12) & (model_df['arr_hr'] <17), 'arr_time_of_day'] = 'aft'
model_df.loc[(model_df['arr_hr'] >=17) |  (model_df['arr_hr'] <5), 'arr_time_of_day'] = 'eve'


### Categorizing day, month and time of year of flights 
# Extract month and day of week from the flight dates 
model_df['month'] = pd.DatetimeIndex(model_df['fl_date']).month
model_df['weekday'] = pd.DatetimeIndex(model_df['fl_date']).weekday
model_df = model_df.drop(columns=['fl_date'])

# Aggregate into season, based on months
model_df['season'] = model_df['month'] # just create column and fill to start
model_df['month'] = pd.to_numeric(model_df['month'] ) # Needs to be an integer for our filtering below

#Assigning categories
model_df.loc[(model_df['month'] >= 3) & (model_df['month'] < 6), 'season' ] = 'spr'
model_df.loc[(model_df['month'] >= 6) & (model_df['month'] < 9), 'season'] = 'sum'
model_df.loc[(model_df['month'] >= 9) & (model_df['month'] < 12), 'season'] = 'aut'
model_df.loc[(model_df['month'] == 12) | (model_df['month'] < 3), 'season'] = 'win'

# and make ready for categorical
model_df['month'] = model_df['month'].replace({1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr",
                            5: "May", 6: "Jun", 7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"})
model_df['weekday'] = model_df['weekday'].replace({0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu",
                            4: "Fri", 5: "Sat", 6: "Sun"})


#### While we're here, let's categorize airlines into size: "large" and "small" 
# (Airlines chosen based on exploratory analysis)
model_df['carrier_size'] = model_df['mkt_unique_carrier'] # just create the column to start
model_df.loc[(model_df['mkt_unique_carrier'] == 'UA') | (model_df['mkt_unique_carrier'] == 'AA') | \
            (model_df['mkt_unique_carrier'] == 'WN') | (model_df['mkt_unique_carrier'] == 'DL') | \
            (model_df['mkt_unique_carrier'] == 'AS'), 'carrier_size'] = 'large' 

model_df.loc[(model_df['mkt_unique_carrier'] == 'VX') | (model_df['mkt_unique_carrier'] == 'B6') | \
            (model_df['mkt_unique_carrier'] == 'HA') | (model_df['mkt_unique_carrier'] == 'F9') | \
            (model_df['mkt_unique_carrier'] == 'G4')| (model_df['mkt_unique_carrier'] == 'NK'), 'carrier_size'] = 'small'

#If needed, we can also create a 'medium' category: F9, B6, NK, AS
 

#### You know what, let's categorize flight distance too 
# Categories again chosen based on exploratory analysis
model_df['distance_cat'] = model_df['distance']
model_df['distance'] = pd.to_numeric(model_df['distance'] ) # Needs to be an integer for our filtering below

model_df.loc[(model_df['distance'] < 400) , 'distance_cat' ] = 'short'
model_df.loc[(model_df['distance'] >= 400) & (model_df['distance'] < 800), 'distance_cat'] = 'med'
model_df.loc[(model_df['distance'] >= 800) , 'distance_cat' ] = 'long'
model_df.drop('distance', axis=1, inplace=True)

In [26]:
model_df

Unnamed: 0,mkt_unique_carrier,origin,dest,arr_delay,dep_hr,arr_hr,dep_time_of_day,arr_time_of_day,month,weekday,season,carrier_size,distance_cat
0,B6,HPN,MCO,45.0,20,23,eve,eve,Jul,Wed,sum,small,long
1,UA,DFW,ORD,7.0,14,17,aft,eve,Feb,Sat,win,large,long
2,DL,ORD,ATL,0.0,17,20,eve,eve,Feb,Thu,win,large,med
3,AA,HHH,CLT,0.0,6,8,morn,morn,Oct,Sun,aut,large,short
4,WN,STL,HOU,0.0,21,23,eve,eve,Apr,Fri,spr,large,med
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,AA,LAS,LAX,23.0,14,16,aft,aft,May,Sat,spr,large,short
199996,AS,SLC,SFO,0.0,7,8,morn,morn,Jun,Mon,sum,large,med
199997,NK,SAN,IAH,0.0,7,12,morn,aft,Jan,Sun,win,small,long
199998,UA,SFO,SLC,0.0,15,18,aft,eve,Apr,Mon,spr,large,med


### Feature Selection / Dimensionality Reduction

Try Sparse PCA (since our matrix is very sparse); Mini Batch runs quicker, but sacrifices accuracy.

In [27]:
# from sklearn.model_selection import train_test_split

# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### Modeling

#### Random forest classifiers 

Baseline calculations

Try it with all of our features to start.

In [28]:
def feature_target(df):
    '''Creates feature and target arrays ready for processing:
    Params:
        - df: expected dataframe 
    Returns:
        - X: array of features (turns categorical features into dummy variables)
        - X_columns: array of feature names
        - y: array of targets (in this case, arrival delays)
    '''
    # Extract target variable
    # Create labels for data (Rather than treat it as continuous)

    y_clas = df[['arr_delay']]
    y_clas.loc[( (y_clas['arr_delay']  >= 0) & (y_clas['arr_delay'] <15)), 'arr_delay'] = 0
    y_clas.loc[(y_clas['arr_delay']  >= 15), 'arr_delay'] = 1
    df = df.drop(columns=['arr_delay'])

    # ...and create categorical features
    X = pd.get_dummies(df)
    X_columns = X.columns
    return X, X_columns, y_clas

In [29]:
feature_df, X_columns, y_clas = feature_target(model_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [30]:
# Might make sense to use the same split for all the analysis...


X_train, X_test, Y_train, Y_test = train_test_split(feature_df, y_clas, test_size = 0.3)

#### Evaluation Preamble

We want to be able to evaluate how our model is performing; looking at accuracy (*i.e.* % of correct predictions over all results) is not necessarily the best way to evaluate our model. For example, let's see how many delays there are in our test data:

In [31]:
y_clas.sum()/y_clas.count()

arr_delay    0.19052
dtype: float64

So about 19% of the flights in our test data is delayed. Well, we could just say *all* of our flights are delayed and get 81% accuracy! Which is not ideal; we would like to be able to predict at least *some* of the delays. So how do we assess whether our model is able to do that?

We can use a confusion matrix (which tells us [precision and recall](https://developers.google.com/machine-learning/crash-course/classification/precision-and-recall)), but there's also a chance our model just happens to accurately predict delays well *by chance alone*; it would be nice if we could see if this is the case. 

To do this, we will use two metrics: McNemar's Test and the Cohen $\kappa$ score. 

[McNemar's Test](https://en.wikipedia.org/wiki/McNemar's_test), which is commonly used for diagnostic tests in medical sciences, tells us if our model disagrees in the same way; for example, if it makes errors in a similar way with all data, it may be doing so by chance (accept null hypothesis), whereas if it disagrees with expected results  (reject null hypothesis), there are differences in how it disagrees (*i.e.* it's not due to chance . Here, we use the p-value to determine how strongly the associations are (generally, values of 0.05 are used to reject/accept the null hypothesis). [More here](https://machinelearningmastery.com/mcnemars-test-for-machine-learning/).

The [Cohen $\kappa$ score](https://en.wikipedia.org/wiki/Cohen's_kappa) is a measure of the agreement between two models that classify items. A score of around 1 tells us the models are in perfect agreement, and a score of around 0 means no agreement. 

Below, we will look at some toy models to give us a baseline for what types of results to expect:



1. Perfect agreement

In [32]:
#Proportion of total flights delayed

print('Results when match everything perfectly:')

# print(accuracy_score(y_test, y_pred))
print(classification_report(Y_test, Y_test))

conf_mat = confusion_matrix(Y_test, Y_test)

print(conf_mat)


mc_result = mcnemar(conf_mat, exact=True)

print(f"The p-value of the Mcnemar test (which I should use?) is {mc_result.pvalue*100:.1f}%.")

# So McNemar. Null hypothesis is that the model makes errors in the same proportion for both predictions. 
# If we reject null (p-val less than alpha, say, 5%), then the proportions are different in different ways, i.e.
# there's a reason for the disagreement, i think

print(f"The Cohen kappa score is {cohen_kappa_score(Y_test, Y_test)}.")

Results when match everything perfectly:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     48635
         1.0       1.00      1.00      1.00     11354

    accuracy                           1.00     59989
   macro avg       1.00      1.00      1.00     59989
weighted avg       1.00      1.00      1.00     59989

[[48635     0]
 [    0 11354]]
The p-value of the Mcnemar test (which I should use?) is 100.0%.
The Cohen kappa score is 1.0.


2. Random chance.

In [33]:
#Proportion of total flights delayed

prop_delays = (Y_test.sum()/Y_test.count()).values[0]

# If we just randomly guessed flights that were delayed, based on that calculated proportion, how well would we do?
# See documentation; first parameter of 2 is like 'np.arange(2)'
y_pred = np.random.choice(2, Y_test.shape[0])
# y_pred = np.random.choice(2, size = y_test.shape[0], p=[1-prop_delays, prop_delays])

print('Results when we just randomly guess which flights will be delayed:')

print(f'{prop_delays*100:.2f}% of all flights in the sample are delayed, based on our criteria for "delayed".')

print(accuracy_score(Y_test, y_pred))

print(classification_report(Y_test, y_pred))

conf_mat = confusion_matrix(Y_test, y_pred)

print(conf_mat)

oddsr, p = fisher_exact(conf_mat)

print(f"The probability that we got these results from random chance is {p*100:.1f}%.")

mc_result = mcnemar(conf_mat, exact=True)

print(f"The p-value of the Mcnemar test (which I should use?) is {mc_result.pvalue*100:.1f}%.")

print(f"The Cohen kappa score is {cohen_kappa_score(Y_test, y_pred)}.")


Results when we just randomly guess which flights will be delayed:
18.93% of all flights in the sample are delayed, based on our criteria for "delayed".
0.4974411975528847
              precision    recall  f1-score   support

         0.0       0.81      0.50      0.62     48635
         1.0       0.19      0.50      0.27     11354

    accuracy                           0.50     59989
   macro avg       0.50      0.50      0.44     59989
weighted avg       0.69      0.50      0.55     59989

[[24177 24458]
 [ 5690  5664]]
The probability that we got these results from random chance is 44.1%.
The p-value of the Mcnemar test (which I should use?) is 0.0%.
The Cohen kappa score is -0.0024693754933267087.


3. Randomness based on proportion of true results (i.e. 80% chance of on-time; 20% chance of delay)

In [34]:
y_pred = np.random.choice(2, size = Y_test.shape[0], p=[1-prop_delays, prop_delays])

print('Results when we bias the probabilities of our random guessing based on the actual proportion of delayed flights:')

print(f'{prop_delays*100:.2f}% of all flights in the sample are delayed, based on our criteria for "delayed".')

print(accuracy_score(Y_test, y_pred))

print(classification_report(Y_test, y_pred))

conf_mat = confusion_matrix(Y_test, y_pred)

print(conf_mat)

oddsr, p = fisher_exact(conf_mat)

print(f"The probability that we got these results from random chance is {p*100:.1f}%.")

mc_result = mcnemar(conf_mat, exact=True)

print(f"The p-value of the Mcnemar test (which I should use?) is {mc_result.pvalue*100:.1f}%.")

print(f"The Cohen kappa score is {cohen_kappa_score(Y_test, y_pred)}.")

Results when we bias the probabilities of our random guessing based on the actual proportion of delayed flights:
18.93% of all flights in the sample are delayed, based on our criteria for "delayed".
0.6923936054943406
              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81     48635
         1.0       0.18      0.18      0.18     11354

    accuracy                           0.69     59989
   macro avg       0.50      0.50      0.50     59989
weighted avg       0.69      0.69      0.69     59989

[[39462  9173]
 [ 9280  2074]]
The probability that we got these results from random chance is 14.6%.
The p-value of the Mcnemar test (which I should use?) is 43.5%.
The Cohen kappa score is -0.005963665439225219.


We need to apply different selection techniques to find out which one will be the best for our problems.

- Original Features vs. PCA conponents?

In [35]:
#Try Sparse PCA, since this is sparse data...

def pca_processing(X, n):
    '''Runs the MiniBatchSparsePCA on our data and returns the transformed features
    Params:
        - X: array of data features; assumed to be a sparse matrix (lots of 0s and 1s)
        - n: Number of PCA components to fit with
    Returns:
        - x: array of transformed data with n features  
        - x_columns: array of features names (just PCA number, in this case)
        ''' 
    pca_trans = MiniBatchSparsePCA(n_components=n, batch_size=30, random_state=0)
    pca_trans.fit(X)
    x = pca_trans.transform(X)
    x_columns = range(n)
    return x, x_columns

Let's try something, based on the above results: remove ...things

best model initially is mkt_unique_carrier, arr_hr and month

**In cell below, commented out features will remain in the model**

In [36]:
feature_df.drop(list(feature_df.filter(regex = 'origin')), axis = 1, inplace=True) 
feature_df.drop(list(feature_df.filter(regex = 'dest')), axis = 1, inplace=True) 

# feature_df.drop(list(feature_df.filter(regex = 'mkt_unique_carrier')), axis = 1, inplace=True) 
feature_df.drop(list(feature_df.filter(regex = 'carrier_size')), axis = 1, inplace=True) 

feature_df.drop(list(feature_df.filter(regex = 'dep_hr')), axis = 1, inplace=True) 
feature_df.drop(list(feature_df.filter(regex = 'dep_time_of_day')), axis = 1, inplace=True) 
feature_df.drop(list(feature_df.filter(regex = 'arr_hr')), axis = 1, inplace=True) 
# feature_df.drop(list(feature_df.filter(regex = 'arr_time_of_day')), axis = 1, inplace=True) 

feature_df.drop(list(feature_df.filter(regex = 'month')), axis = 1, inplace=True) 
# feature_df.drop(list(feature_df.filter(regex = 'season')), axis = 1, inplace=True) 
feature_df.drop(list(feature_df.filter(regex = 'weekday')), axis = 1, inplace=True) 

feature_df.drop(list(feature_df.filter(regex = 'distance_cat')), axis = 1, inplace=True) 

In [37]:
# Check to see we have the desired features
feature_df.columns

Index(['mkt_unique_carrier_AA', 'mkt_unique_carrier_AS',
       'mkt_unique_carrier_B6', 'mkt_unique_carrier_DL',
       'mkt_unique_carrier_F9', 'mkt_unique_carrier_G4',
       'mkt_unique_carrier_HA', 'mkt_unique_carrier_NK',
       'mkt_unique_carrier_UA', 'mkt_unique_carrier_VX',
       'mkt_unique_carrier_WN', 'arr_time_of_day_aft', 'arr_time_of_day_eve',
       'arr_time_of_day_morn', 'season_aut', 'season_spr', 'season_sum',
       'season_win'],
      dtype='object')

In [38]:
X, x_columns = pca_processing(feature_df, 5)

#--------------------------------
##### Commenting out the above line and uncommenting the two lines below will run model without PCA 

# X = feature_df
# x_columns = feature_df.columns
#--------------------------------

x_train, x_test, y_train, y_test = train_test_split(X, y_clas, test_size = 0.3)

rfc = RandomForestClassifier(n_estimators=100, class_weight = 'balanced') # class_weights gives higher weight to our small number of delays
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

feature_val = rfc.feature_importances_
feature_name = x_columns
# print(feature_name)

features = pd.DataFrame({"Feature names": pd.Series(dtype='str'),
                         "Importance": pd.Series(dtype='int')})
features['Feature names'] = feature_name
features['Importance'] = feature_val
print(features.sort_values(by=['Importance'], ascending=False).head())

print('Results when we remove origin/destination information:')

print('----------------------------------')

print(f'{prop_delays*100:.2f}% of all flights in the sample are delayed, based on our criteria for "delayed".')

print(f'The accuracy is {accuracy_score(y_test, y_pred):.3f}.')

print(classification_report(y_test, y_pred))

conf_mat = confusion_matrix(y_test, y_pred)

print(conf_mat)

oddsr, p = fisher_exact(conf_mat)

print(f"The probability that we got these results from random chance is {p*100:.1f}%.")

mc_result = mcnemar(conf_mat, exact=True)

print(f"The p-value of the Mcnemar test (which I should use?) is {mc_result.pvalue*100:.1f}%.")

print(f'ROC-AUC Score: {roc_auc_score(y_test, rfc.predict_proba(x_test)[:,1]):.3f}')

print(f"The Cohen kappa score is {cohen_kappa_score(y_test, y_pred)}.")

  rfc.fit(x_train, y_train)


   Feature names  Importance
0              0    0.443876
1              1    0.313069
2              2    0.141765
4              4    0.062863
3              3    0.038427
Results when we remove origin/destination information:
----------------------------------
18.93% of all flights in the sample are delayed, based on our criteria for "delayed".
The accuracy is 0.610.
              precision    recall  f1-score   support

         0.0       0.86      0.62      0.72     48491
         1.0       0.26      0.56      0.35     11498

    accuracy                           0.61     59989
   macro avg       0.56      0.59      0.54     59989
weighted avg       0.74      0.61      0.65     59989

[[30134 18357]
 [ 5064  6434]]
The probability that we got these results from random chance is 0.0%.
The p-value of the Mcnemar test (which I should use?) is 0.0%.
ROC-AUC Score: 0.624
The Cohen kappa score is 0.12561533107024492.


Decision Tree Classifier

In [39]:

dtc = DecisionTreeClassifier(class_weight = 'balanced') # class_weights gives higher weight to our small number of delays
dtc.fit(x_train, y_train)
y_pred = dtc.predict(x_test)

print(f'The accuracy is {accuracy_score(y_test, y_pred):.3f}.')

print(classification_report(y_test, y_pred))

conf_mat = confusion_matrix(y_test, y_pred)

print(conf_mat)

oddsr, p = fisher_exact(conf_mat)

print(f"The probability that we got these results from random chance is {p*100:.1f}%.")

mc_result = mcnemar(conf_mat, exact=True)

print(f"The p-value of the Mcnemar test (which I should use?) is {mc_result.pvalue*100:.1f}%.")

print(f'ROC-AUC Score: {roc_auc_score(y_test, dtc.predict_proba(x_test)[:,1]):.3f}')

print(f"The Cohen kappa score is {cohen_kappa_score(y_test, y_pred)}.")

The accuracy is 0.610.
              precision    recall  f1-score   support

         0.0       0.86      0.62      0.72     48491
         1.0       0.26      0.56      0.35     11498

    accuracy                           0.61     59989
   macro avg       0.56      0.59      0.54     59989
weighted avg       0.74      0.61      0.65     59989

[[30134 18357]
 [ 5064  6434]]
The probability that we got these results from random chance is 0.0%.
The p-value of the Mcnemar test (which I should use?) is 0.0%.
ROC-AUC Score: 0.623
The Cohen kappa score is 0.12561533107024492.


#### XGBoost

In [40]:

xgbrf = XGBRFClassifier(colsample_bynode=0.8, max_depth= 40, eta= 1,  objective =  'binary:logistic' ,  subsample =0.7,  max_delta_step = 1, \
      tree_method =  'hist' ,  num_parallel_tree = 150,   scale_pos_weight = 4).fit(x_train, y_train)
# dtrain = xgb.DMatrix(x_train, label=y_train)
# dtest = xgb.DMatrix(x_test, label=y_test)

In [41]:
# num_round = 10
bst = xgbrf.fit( x_train, y_train)

In [42]:
ypred = bst.predict(x_test)
y_pred=np.round(ypred)

print(f'The accuracy is {accuracy_score(y_test, y_pred):.3f}.')

print(classification_report(y_test, y_pred))

conf_mat = confusion_matrix(y_test, y_pred)

print(conf_mat)

oddsr, p = fisher_exact(conf_mat)

print(f"The probability that we got these results from random chance is {p*100:.1f}%.")

mc_result = mcnemar(conf_mat, exact=True)

print(f"The p-value of the Mcnemar test (which I should use?) is {mc_result.pvalue*100:.1f}%.")

print(f'ROC-AUC Score: {roc_auc_score(y_test, dtc.predict_proba(x_test)[:,1]):.3f}')

print(f"The Cohen kappa score is {cohen_kappa_score(y_test, y_pred)}.")

The accuracy is 0.617.
              precision    recall  f1-score   support

         0.0       0.86      0.63      0.73     48491
         1.0       0.26      0.55      0.35     11498

    accuracy                           0.62     59989
   macro avg       0.56      0.59      0.54     59989
weighted avg       0.74      0.62      0.66     59989

[[30687 17804]
 [ 5188  6310]]
The probability that we got these results from random chance is 0.0%.
The p-value of the Mcnemar test (which I should use?) is 0.0%.
ROC-AUC Score: 0.623
The Cohen kappa score is 0.12804102112546378.
