In [1]:
pip install imblearn


Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [5]:
columns = [
    "Year", "Quarter", "Month", "DayofMonth",
    "DayOfWeek", "DOT_ID_Reporting_Airline", "Flight_Number_Reporting_Airline", "OriginAirportID",
    "OriginAirportSeqID", "OriginCityMarketID", "OriginStateFips", "OriginWac",
    "DestAirportID", "DestAirportSeqID", "DestCityMarketID", "DestStateFips",
    "DestWac", "CRSDepTime", "DepTime", "DepDelay", "DepDelayMinutes", "DepDel15", "DepartureDelayGroups", "DepTimeBlk",
    "TaxiOut", "WheelsOff","CRSArrTime","ArrDel15", "Cancelled"
]

target = ["ArrDel15"]

In [6]:
# Load the data
file_path = Path('../Resources/Delays_Cancellations.csv')
df = pd.read_excel(file_path)[:-2]
print(df.columns)
df = df.loc[:, columns].copy()

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'DOT_ID_Reporting_Airline', 'Flight_Number_Reporting_Airline',
       'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID',
       'OriginStateFips', 'OriginWac', 'DestAirportID', 'DestAirportSeqID',
       'DestCityMarketID', 'DestStateFips', 'DestWac', 'CRSDepTime', 'DepTime',
       'DepDelay', 'DepDelayMinutes', 'DepDel15', 'DepartureDelayGroups',
       'DepTimeBlk', 'TaxiOut', 'WheelsOff', 'CRSArrTime', 'ArrDel15',
       'Cancelled'],
      dtype='object')


In [7]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

In [8]:
# Drop the null rows
df = df.dropna()

In [9]:
df.shape

(54441, 29)

In [10]:
df.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Reporting_Airline,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,...,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,CRSArrTime,ArrDel15,Cancelled
0,2022,1,1,2,7,20452,4922,12266,1226603,31453,...,39.0,39.0,1.0,2.0,1100-1159,20.0,1235.0,1600,1.0,0
1,2022,1,1,3,1,20452,4922,12266,1226603,31453,...,-3.0,0.0,0.0,-1.0,1100-1159,18.0,1151.0,1600,0.0,0
2,2022,1,1,4,2,20452,4922,12266,1226603,31453,...,-4.0,0.0,0.0,-1.0,1000-1059,14.0,1104.0,1529,0.0,0
3,2022,1,1,5,3,20452,4922,12266,1226603,31453,...,76.0,76.0,1.0,5.0,1000-1059,21.0,1231.0,1529,1.0,0
4,2022,1,1,6,4,20452,4922,12266,1226603,31453,...,19.0,19.0,1.0,1.0,1000-1059,19.0,1132.0,1529,0.0,0


In [10]:
# Create our features
X = df.drop("ArrDel15", axis=1)

X = pd.get_dummies(X)

# Create our target
y = df["ArrDel15"]

In [11]:
X.describe()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Reporting_Airline,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,...,DepTimeBlk_1400-1459,DepTimeBlk_1500-1559,DepTimeBlk_1600-1659,DepTimeBlk_1700-1759,DepTimeBlk_1800-1859,DepTimeBlk_1900-1959,DepTimeBlk_2000-2059,DepTimeBlk_2100-2159,DepTimeBlk_2200-2259,DepTimeBlk_2300-2359
count,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0,...,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0,54441.0
mean,2022.0,1.0,1.0,16.174905,4.021105,19921.983964,2641.249848,11652.929979,1165297.0,30822.33596,...,0.075605,0.049889,0.071307,0.046178,0.074392,0.064951,0.057255,0.032806,0.019177,0.000386
std,0.0,0.0,0.0,8.917965,2.083078,380.356508,1720.419222,963.197055,96319.69,934.514051,...,0.264367,0.217717,0.257339,0.209873,0.262411,0.246442,0.232331,0.178131,0.137147,0.019637
min,2022.0,1.0,1.0,1.0,1.0,19393.0,1.0,10136.0,1013603.0,30136.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2022.0,1.0,1.0,9.0,2.0,19790.0,1250.0,11298.0,1129806.0,30194.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2022.0,1.0,1.0,16.0,4.0,19805.0,2365.0,11298.0,1129806.0,30194.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2022.0,1.0,1.0,24.0,6.0,20378.0,3900.0,12266.0,1226603.0,31453.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2022.0,1.0,1.0,31.0,7.0,20452.0,8812.0,15569.0,1556902.0,35569.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Check the balance of our target values
y.value_counts()

0.0    45420
1.0     9021
Name: ArrDel15, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
#Balanced Random Forest Classifier

In [15]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfc_model = BalancedRandomForestClassifier(n_estimators =100, random_state=1)
brfc_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [16]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brfc_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9147433618638738

In [17]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[10623,   726],
       [  241,  2021]])

In [18]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.98      0.94      0.89      0.96      0.91      0.84     11349
        1.0       0.74      0.89      0.94      0.81      0.91      0.83      2262

avg / total       0.94      0.93      0.90      0.93      0.91      0.84     13611



In [19]:
# List the features sorted in descending order by feature importance
priority_features = pd.Series(data=brfc_model.feature_importances_,index=X.columns)
priority_features.sort_values(ascending=False,inplace=True)
print(priority_features)

DepDelay                           0.207383
DepDelayMinutes                    0.188194
DepDel15                           0.157933
DepartureDelayGroups               0.109518
TaxiOut                            0.067802
DayofMonth                         0.033743
DepTime                            0.026726
WheelsOff                          0.024754
Flight_Number_Reporting_Airline    0.019921
CRSArrTime                         0.019370
CRSDepTime                         0.017799
DestWac                            0.015295
DestCityMarketID                   0.013432
DestAirportID                      0.013154
DestAirportSeqID                   0.013109
DayOfWeek                          0.012466
DestStateFips                      0.011758
DOT_ID_Reporting_Airline           0.010138
OriginAirportID                    0.007202
OriginAirportSeqID                 0.006951
OriginCityMarketID                 0.005350
DepTimeBlk_1000-1059               0.001835
DepTimeBlk_1200-1259            

In [20]:
#Easy Ensemble AdaBoost Classifier

In [21]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
eec_model = EasyEnsembleClassifier(n_estimators =100, random_state=1)
eec_model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [22]:
# Calculated the balanced accuracy score
y_pred = eec_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9111603331297609

In [23]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[10637,   712],
       [  260,  2002]])

In [24]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.98      0.94      0.89      0.96      0.91      0.83     11349
        1.0       0.74      0.89      0.94      0.80      0.91      0.83      2262

avg / total       0.94      0.93      0.89      0.93      0.91      0.83     13611



In [25]:
#Naive Random Oversampling

In [26]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0.0: 34071, 1.0: 34071})

In [27]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [28]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6958732502635809

In [29]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[8540, 2809],
       [ 816, 1446]])

In [30]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.91      0.75      0.64      0.82      0.69      0.49     11349
        1.0       0.34      0.64      0.75      0.44      0.69      0.48      2262

avg / total       0.82      0.73      0.66      0.76      0.69      0.48     13611



In [31]:
#SMOTE Oversampling

In [32]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

In [33]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [34]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6977540175193926

In [35]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[8091, 3258],
       [ 718, 1544]])

In [36]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.92      0.71      0.68      0.80      0.70      0.49     11349
        1.0       0.32      0.68      0.71      0.44      0.70      0.49      2262

avg / total       0.82      0.71      0.69      0.74      0.70      0.49     13611



In [37]:
#Undersampling

In [38]:
# Resample the data using the ClusterCentroids resampler
from collections import Counter
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0.0: 6759, 1.0: 6759})

In [39]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [40]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6977540175193926

In [41]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[6080, 5269],
       [ 818, 1444]])

In [42]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.88      0.54      0.64      0.67      0.58      0.34     11349
        1.0       0.22      0.64      0.54      0.32      0.58      0.35      2262

avg / total       0.77      0.55      0.62      0.61      0.58      0.34     13611



In [10]:
from sqlalchemy import create_engine

In [11]:
from config import db_password

In [12]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Issues_Air_Travel"
engine = create_engine(db_string)
df.to_sql(name='Delays_ML', con=engine, if_exists='replace')