In [None]:
# Mount Google Drive inside the Colab runtime; the dataset read later in this
# notebook lives under this mount point ("/content/drive").
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted — confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from scipy.stats import pointbiserialr

Mounted at /content/drive


In [None]:
# Load the combined flight and weather data; the first CSV column is used as
# the row index.
filepath = "/content/drive/MyDrive/Data/flight_and_weather.csv"
df = pd.read_csv(filepath, index_col=0)

# Inspect the column names and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() only appends a stray "None" line.
df.info()
print(f"\nShape: {df.shape}")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1851436 entries, 0 to 1851435
Data columns (total 33 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Year             int64  
 1   Quarter          int64  
 2   Month            int64  
 3   DayofMonth       int64  
 4   FlightDate       object 
 5   OriginAirportID  int64  
 6   Origin           object 
 7   DestAirportID    int64  
 8   Dest             object 
 9   CRSDepTime       int64  
 10  DepTime          float64
 11  DepDelayMinutes  float64
 12  DepDel15         float64
 13  CRSArrTime       int64  
 14  ArrTime          float64
 15  ArrDelayMinutes  float64
 16  ArrDel15         float64
 17  Time_new         int64  
 18  windspeedKmph    int64  
 19  winddirDegree    int64  
 20  weatherCode      int64  
 21  precipMM         float64
 22  visibility       int64  
 23  pressure         int64  
 24  cloudcover       int64  
 25  DewPointF        int64  
 26  WindGustKmph     int64  
 27  tempF       

In [None]:
# Drop columns holding redundant or duplicate data.
# NOTE(review): "ArrDelayMinutes" presumably determines the "ArrDel15" target
# directly, so it must not stay among the model features — confirm.
# The result is reassigned instead of using inplace=True, so the statement
# reads as an explicit transformation of `df`.
df = df.drop(columns=["FlightDate",
                      "OriginAirportID",
                      "DestAirportID",
                      "CRSArrTime",
                      "ArrTime",
                      "ArrDelayMinutes",
                      "Time_new",
                      "date",
                      "airport"])
print(f"\nShape: {df.shape}", end="\n\n")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the spurious trailing "None" that print(df.info()) emits.
df.info()


Shape: (1851436, 24)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1851436 entries, 0 to 1851435
Data columns (total 24 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Year             int64  
 1   Quarter          int64  
 2   Month            int64  
 3   DayofMonth       int64  
 4   Origin           object 
 5   Dest             object 
 6   CRSDepTime       int64  
 7   DepTime          float64
 8   DepDelayMinutes  float64
 9   DepDel15         float64
 10  ArrDel15         float64
 11  windspeedKmph    int64  
 12  winddirDegree    int64  
 13  weatherCode      int64  
 14  precipMM         float64
 15  visibility       int64  
 16  pressure         int64  
 17  cloudcover       int64  
 18  DewPointF        int64  
 19  WindGustKmph     int64  
 20  tempF            int64  
 21  WindChillF       int64  
 22  humidity         int64  
 23  time             int64  
dtypes: float64(5), int64(17), object(2)
memory usage: 353.1+ MB
None


In [None]:
# Encode the airport names (strings) as integers; for the 15 airports in this
# dataset the codes are expected to span 0 - 14.
# A single encoder is fitted ONCE on the union of both columns, so that:
#   * the same airport always gets the same code in "Origin" and in "Dest",
#     even if one column is missing an airport that the other contains;
#   * `labelEncoder` still holds the complete mapping afterwards (refitting it
#     on "Dest" would overwrite the mapping learned from "Origin").
labelEncoder = LabelEncoder()
airport_codes = pd.concat([df["Origin"], df["Dest"]], ignore_index=True).unique()
labelEncoder.fit(airport_codes)
df["Origin"] = labelEncoder.transform(df["Origin"])
df["Dest"] = labelEncoder.transform(df["Dest"])

# Every column except the target "ArrDel15" is a model feature.
features = df.loc[:, df.columns != "ArrDel15"]
labels = np.asarray(df["ArrDel15"])
print(features.columns)
# Last expression of the cell: display the encoded columns.
df[["Origin", "Dest"]]

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'Origin', 'Dest',
       'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'DepDel15', 'windspeedKmph',
       'winddirDegree', 'weatherCode', 'precipMM', 'visibility', 'pressure',
       'cloudcover', 'DewPointF', 'WindGustKmph', 'tempF', 'WindChillF',
       'humidity', 'time'],
      dtype='object')


Unnamed: 0,Origin,Dest
0,13,6
1,13,12
2,13,2
3,13,5
4,13,0
...,...,...
1851431,12,8
1851432,12,13
1851433,12,14
1851434,2,14


In [None]:
# The dataset holds well over 100,000 rows; 20% of them are held out for
# evaluation, and the fixed random_state makes the split repeatable.
print(f"\nDataset shape: {df.shape}")
(
    features_train,
    features_test,
    labels_train,
    labels_test,
) = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)
print(f"features_train shape: {features_train.shape} | features_test shape: {features_test.shape}")
print(f"labels_train shape: {labels_train.shape} | labels_test shape: {labels_test.shape}")

# Write the encoded frame to Drive, then drop the notebook's reference to it.
df.to_csv("/content/drive/MyDrive/Data/flight_and_weather_encoded.csv")
del df


Dataset shape: (1851436, 24)
features_train shape: (1481148, 23) | features_test shape: (370288, 23)
labels_train shape: (1481148,) | labels_test shape: (370288,)


Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the training split (fit() returns the
# estimator itself), then report per-class metrics on the held-out split.
model = LogisticRegression(n_jobs=-1).fit(features_train, labels_train)
model_pred = model.predict(features_test)
print(classification_report(y_true=labels_test, y_pred=model_pred))
# Drop the references to the fitted estimator and its predictions.
del model, model_pred

              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95    292530
         1.0       0.89      0.68      0.77     77758

    accuracy                           0.92    370288
   macro avg       0.91      0.83      0.86    370288
weighted avg       0.91      0.92      0.91    370288



Decision Tree Classifier

In [None]:
# Fit a single decision tree on the training split and report per-class metrics
# on the held-out split.
# random_state is fixed (same seed value as the train/test split) so that the
# tree, and therefore the reported metrics, are reproducible across runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(features_train, labels_train)
model_pred = model.predict(features_test)
print(classification_report(labels_test, model_pred))
# Drop the references to the fitted estimator and its predictions.
del model
del model_pred

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92    292530
         1.0       0.68      0.71      0.69     77758

    accuracy                           0.87    370288
   macro avg       0.80      0.81      0.80    370288
weighted avg       0.87      0.87      0.87    370288



Gradient Boosting Classifier (scikit-learn)

In [None]:
# Fit scikit-learn's GradientBoostingClassifier on the training split and
# report per-class metrics on the held-out split.
# random_state is fixed (same seed value as the train/test split) so that
# repeated runs of this cell produce the same model and metrics.
model = GradientBoostingClassifier(random_state=42)
model.fit(features_train, labels_train)
model_pred = model.predict(features_test)
print(classification_report(labels_test, model_pred))
# Drop the references to the fitted estimator and its predictions.
del model
del model_pred

              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95    292530
         1.0       0.90      0.69      0.78     77758

    accuracy                           0.92    370288
   macro avg       0.91      0.83      0.86    370288
weighted avg       0.92      0.92      0.91    370288



Random Forest

In [None]:
# Fit a random forest on the training split (n_jobs=-1 as in the original cell)
# and report per-class metrics on the held-out split.
# random_state is fixed (same seed value as the train/test split): the forest
# is built with randomness, so without a seed the metrics change between runs.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(features_train, labels_train)
model_pred = model.predict(features_test)
print(classification_report(labels_test, model_pred))
# Drop the references to the fitted estimator and its predictions.
del model
del model_pred

              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95    292530
         1.0       0.88      0.70      0.78     77758

    accuracy                           0.92    370288
   macro avg       0.90      0.84      0.87    370288
weighted avg       0.92      0.92      0.91    370288



Extra Trees Classifier

In [None]:
# Fit an extra-trees ensemble on the training split (n_jobs=-1 as in the
# original cell) and report per-class metrics on the held-out split.
# random_state is fixed (same seed value as the train/test split): the trees
# are built with randomness, so without a seed the metrics change between runs.
model = ExtraTreesClassifier(n_jobs=-1, random_state=42)
model.fit(features_train, labels_train)
model_pred = model.predict(features_test)
print(classification_report(labels_test, model_pred))
# NOTE(review): `model` and `model_pred` are not deleted in this cell —
# presumably later cells reuse them; verify before adding a `del`.

              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94    292530
         1.0       0.81      0.74      0.77     77758

    accuracy                           0.91    370288
   macro avg       0.87      0.84      0.86    370288
weighted avg       0.91      0.91      0.91    370288

