In [14]:
pip install imblearn


Note: you may need to restart the kernel to use updated packages.


In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [17]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [18]:
columns = [
    "Year", "Quarter", "Month", "DayofMonth",
    "DayOfWeek", "DOT_ID_Reporting_Airline", "Flight_Number_Reporting_Airline", "OriginAirportID",
    "OriginAirportSeqID", "OriginCityMarketID", "OriginStateFips", "OriginWac",
    "DestAirportID", "DestAirportSeqID", "DestCityMarketID", "DestStateFips",
    "DestWac", "CRSDepTime","DepTimeBlk","CRSArrTime","Cancelled"
]

target = ["Cancelled"]

In [19]:
# Load the data
file_path = Path('../Resources/Cancellations.csv')
df = pd.read_excel(file_path)[:-2]
print(df.columns)
df = df.loc[:, columns].copy()

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'DOT_ID_Reporting_Airline', 'Flight_Number_Reporting_Airline',
       'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID',
       'OriginStateFips', 'OriginWac', 'DestAirportID', 'DestAirportSeqID',
       'DestCityMarketID', 'DestStateFips', 'DestWac', 'CRSDepTime',
       'DepTimeBlk', 'CRSArrTime', 'Cancelled'],
      dtype='object')


In [20]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

In [21]:
# Drop the null rows
df = df.dropna()

In [25]:
df.shape

(56820, 21)

In [26]:
df.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Reporting_Airline,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,...,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,DestStateFips,DestWac,CRSDepTime,DepTimeBlk,CRSArrTime,Cancelled
0,2022,1,1,2,7,20452,4922,12266,1226603,31453,...,74,12953,1295304,31703,36,22,1136,1100-1159,1600,0
1,2022,1,1,3,1,20452,4922,12266,1226603,31453,...,74,12953,1295304,31703,36,22,1136,1100-1159,1600,0
2,2022,1,1,4,2,20452,4922,12266,1226603,31453,...,74,12953,1295304,31703,36,22,1054,1000-1059,1529,0
3,2022,1,1,5,3,20452,4922,12266,1226603,31453,...,74,12953,1295304,31703,36,22,1054,1000-1059,1529,0
4,2022,1,1,6,4,20452,4922,12266,1226603,31453,...,74,12953,1295304,31703,36,22,1054,1000-1059,1529,0


In [27]:
# Create our features
X = df.drop("Cancelled", axis=1)

X = pd.get_dummies(X)

# Create our target
y = df["Cancelled"]

In [28]:
X.describe()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Reporting_Airline,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,...,DepTimeBlk_1400-1459,DepTimeBlk_1500-1559,DepTimeBlk_1600-1659,DepTimeBlk_1700-1759,DepTimeBlk_1800-1859,DepTimeBlk_1900-1959,DepTimeBlk_2000-2059,DepTimeBlk_2100-2159,DepTimeBlk_2200-2259,DepTimeBlk_2300-2359
count,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0,...,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0,56820.0
mean,2022.0,1.0,1.0,15.968867,4.036677,19923.901109,2670.855702,11657.501056,1165754.0,30829.781855,...,0.076012,0.049472,0.071577,0.046762,0.074094,0.065312,0.057216,0.032629,0.01906,0.000405
std,0.0,0.0,0.0,8.959082,2.084403,381.451222,1743.752729,964.438626,96443.84,935.353668,...,0.26502,0.216853,0.257788,0.21113,0.261926,0.247077,0.232256,0.177666,0.136738,0.020115
min,2022.0,1.0,1.0,1.0,1.0,19393.0,1.0,10136.0,1013603.0,30136.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2022.0,1.0,1.0,8.0,2.0,19790.0,1252.0,11298.0,1129806.0,30194.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2022.0,1.0,1.0,16.0,4.0,19805.0,2385.0,11298.0,1129806.0,30279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2022.0,1.0,1.0,24.0,6.0,20378.0,3941.0,12266.0,1226603.0,31453.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2022.0,1.0,1.0,31.0,7.0,20452.0,8812.0,15569.0,1556902.0,35569.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Check the balance of our target values
y.value_counts()

0    54539
1     2281
Name: Cancelled, dtype: int64

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [31]:
#Balanced Random Forest Classifier

In [32]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfc_model = BalancedRandomForestClassifier(n_estimators =100, random_state=1)
brfc_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [33]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brfc_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7494007266078573

In [34]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[10041,  3544],
       [  149,   471]])

In [35]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.74      0.76      0.84      0.75      0.56     13585
          1       0.12      0.76      0.74      0.20      0.75      0.56       620

avg / total       0.95      0.74      0.76      0.82      0.75      0.56     14205



In [36]:
# List the features sorted in descending order by feature importance
priority_features = pd.Series(data=brfc_model.feature_importances_,index=X.columns)
priority_features.sort_values(ascending=False,inplace=True)
print(priority_features)

DayofMonth                         0.179910
Flight_Number_Reporting_Airline    0.123099
CRSArrTime                         0.076584
DayOfWeek                          0.072627
CRSDepTime                         0.071137
DestCityMarketID                   0.055248
DestWac                            0.054123
DestAirportSeqID                   0.052173
DestAirportID                      0.051671
DOT_ID_Reporting_Airline           0.049297
DestStateFips                      0.041638
OriginAirportSeqID                 0.027043
OriginAirportID                    0.026879
OriginCityMarketID                 0.024699
DepTimeBlk_1400-1459               0.006981
DepTimeBlk_1000-1059               0.006972
DepTimeBlk_1200-1259               0.006895
DepTimeBlk_0700-0759               0.006467
DepTimeBlk_1100-1159               0.006048
DepTimeBlk_1700-1759               0.005931
DepTimeBlk_1600-1659               0.005591
DepTimeBlk_0800-0859               0.005580
DepTimeBlk_1800-1859            

In [37]:
#Easy Ensemble AdaBoost Classifier

In [38]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
eec_model = EasyEnsembleClassifier(n_estimators =100, random_state=1)
eec_model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [39]:
# Calculated the balanced accuracy score
y_pred = eec_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7358661711802629

In [40]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[9739, 3846],
       [ 152,  468]])

In [41]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.72      0.75      0.83      0.74      0.54     13585
          1       0.11      0.75      0.72      0.19      0.74      0.54       620

avg / total       0.95      0.72      0.75      0.80      0.74      0.54     14205



In [42]:
#Naive Random Oversampling

In [43]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 40954, 1: 40954})

In [44]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [45]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5971297802367412

In [46]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[8818, 4767],
       [ 282,  338]])

In [48]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.65      0.55      0.78      0.59      0.36     13585
          1       0.07      0.55      0.65      0.12      0.59      0.35       620

avg / total       0.93      0.64      0.55      0.75      0.59      0.36     14205



In [49]:
#SMOTE Oversampling

In [50]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

In [51]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [52]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5942450164436581

In [53]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[8893, 4692],
       [ 289,  331]])

In [54]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.65      0.53      0.78      0.59      0.35     13585
          1       0.07      0.53      0.65      0.12      0.59      0.35       620

avg / total       0.93      0.65      0.54      0.75      0.59      0.35     14205



In [55]:
#Undersampling

In [56]:
# Resample the data using the ClusterCentroids resampler
from collections import Counter
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 1661, 1: 1661})

In [57]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [58]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.5942450164436581

In [59]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[7826, 5759],
       [ 293,  327]])

In [60]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.96      0.58      0.53      0.72      0.55      0.31     13585
          1       0.05      0.53      0.58      0.10      0.55      0.30       620

avg / total       0.92      0.57      0.53      0.69      0.55      0.31     14205



In [10]:
from sqlalchemy import create_engine

In [11]:
from config import db_password

In [12]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Issues_Air_Travel"
engine = create_engine(db_string)
df.to_sql(name='Delays_ML', con=engine, if_exists='replace')