In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

# Question 11:
### How does the accuracy of predictions change with the addition of more information?
To begin, the standard cleaned dataset is loaded in full.

In [2]:
df = pd.read_csv("AllData_Mergedfinal_removedoutliers1000.csv")
display(df.head())
print(df.shape)
print(df.columns)

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),...,Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes),Delay Unknown (Minutes),DEPARTURE_AIRPORT,Arrival Delay (Minutes),Total Delay (Minutes),Num of Delay Reasons,Primary Delay Reason,Primary Delay Percentage
0,B6,01/12/2024,JFK,17:50,18:08,164.0,212.0,18.0,19:14,66.0,...,48.0,0.0,0.0,0.0,MCI,48.0,66.0,2,Aviation System,0.727273
1,B6,01/24/2024,JFK,17:50,17:36,164.0,161.0,-14.0,17:48,12.0,...,0.0,0.0,0.0,0.0,MCI,-3.0,-17.0,0,No Reason,0.0
2,B6,02/12/2024,JFK,17:50,00:00,164.0,0.0,0.0,00:00,0.0,...,0.0,0.0,0.0,0.0,MCI,-164.0,-164.0,0,No Reason,0.0
3,B6,02/24/2024,JFK,16:43,16:35,166.0,171.0,-8.0,16:51,16.0,...,0.0,0.0,0.0,0.0,MCI,5.0,-3.0,0,No Reason,0.0
4,B6,03/12/2024,JFK,16:56,17:09,169.0,164.0,13.0,17:22,13.0,...,0.0,0.0,0.0,8.0,MCI,-5.0,8.0,1,Unknown,1.0


(43841, 22)
Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Destination Airport',
       'Scheduled departure time', 'Actual departure time',
       'Scheduled elapsed time (Minutes)', 'Actual elapsed time (Minutes)',
       'Departure delay (Minutes)', 'Wheels-off time',
       'Taxi-Out time (Minutes)', 'Delay Carrier (Minutes)',
       'Delay Weather (Minutes)', 'Delay National Aviation System (Minutes)',
       'Delay Security (Minutes)', 'Delay Late Aircraft Arrival (Minutes)',
       'Delay Unknown (Minutes)', 'DEPARTURE_AIRPORT',
       'Arrival Delay (Minutes)', 'Total Delay (Minutes)',
       'Num of Delay Reasons', 'Primary Delay Reason',
       'Primary Delay Percentage'],
      dtype='object')


From this dataset, modifications are made to enable the usage of a decision tree classifier. First, the date and time features are translated into simple integers to numerically represent their respective information. Second, each of the relevant categorical features are encoded to further numeric values. Third, the relevant features are sorted (based on domain knowledge) by importance to be incrementallu added to decision tree models.

In [3]:
df["temp_date"] = pd.to_datetime(df["Date (MM/DD/YYYY)"], format = "%m/%d/%Y")
df["day_of_year"] = df["temp_date"].dt.dayofyear

df["temp_scheduled_departure_time"] = pd.to_datetime(df["Scheduled departure time"], format = "%H:%M")
df["minutes_since_midnight_scheduled_departure_time"] = df["temp_scheduled_departure_time"].dt.hour * 60 + df["temp_scheduled_departure_time"].dt.minute

le = LabelEncoder()
df["primary_delay_reason_encoded"] = le.fit_transform(df["Primary Delay Reason"])
df["departure_airport_encoded"] = le.fit_transform(df["DEPARTURE_AIRPORT"])
df["destination_airport_encoded"] = le.fit_transform(df["Destination Airport"])

features_unordered = ["day_of_year", "Destination Airport", "minutes_since_midnight_scheduled_departure_time", "Delay Carrier (Minutes)", "Delay Weather (Minutes)", "Delay National Aviation System (Minutes)", "Delay Security (Minutes)", "Delay Late Aircraft Arrival (Minutes)", "Delay Unknown (Minutes)", "DEPARTURE_AIRPORT", "Arrival Delay (Minutes)", "Num of Delay Reasons", "Primary Delay Reason"]
# This ordering is based on domain knowledge for the expected importance of features
# The order itself is not particularly important compared to the differences in accuracy resulting from adding each feature to the model
features = ["Num of Delay Reasons", "primary_delay_reason_encoded", "departure_airport_encoded", "minutes_since_midnight_scheduled_departure_time", "day_of_year", "destination_airport_encoded"]

df["binary_delayed"] = (df["Departure delay (Minutes)"] > 0).astype(bool)
target = "binary_delayed"

df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),...,Primary Delay Reason,Primary Delay Percentage,temp_date,day_of_year,temp_scheduled_departure_time,minutes_since_midnight_scheduled_departure_time,primary_delay_reason_encoded,departure_airport_encoded,destination_airport_encoded,binary_delayed
0,B6,01/12/2024,JFK,17:50,18:08,164.0,212.0,18.0,19:14,66.0,...,Aviation System,0.727273,2024-01-12,12,1900-01-01 17:50:00,1070,0,2,101,True
1,B6,01/24/2024,JFK,17:50,17:36,164.0,161.0,-14.0,17:48,12.0,...,No Reason,0.0,2024-01-24,24,1900-01-01 17:50:00,1070,3,2,101,False
2,B6,02/12/2024,JFK,17:50,00:00,164.0,0.0,0.0,00:00,0.0,...,No Reason,0.0,2024-02-12,43,1900-01-01 17:50:00,1070,3,2,101,False
3,B6,02/24/2024,JFK,16:43,16:35,166.0,171.0,-8.0,16:51,16.0,...,No Reason,0.0,2024-02-24,55,1900-01-01 16:43:00,1003,3,2,101,False
4,B6,03/12/2024,JFK,16:56,17:09,169.0,164.0,13.0,17:22,13.0,...,Unknown,1.0,2024-03-12,72,1900-01-01 16:56:00,1016,5,2,101,True


A decision tree model is trained first on the most important feature, then on the two most important features, and onwards until all six relevant features are utilized. Then, the process is repeated with the list of features reversed, such that the least important features are used in training first.

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size = 0.2, random_state = 0)

print("Decision Tree Test:")
for i in range(1, len(features) + 1):
    current_features = features[:i]
    
    model = DecisionTreeClassifier(random_state = 0)
    model.fit(X_train[current_features], y_train)
    
    y_pred = model.predict(X_test[current_features])
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    print(f"Using first {i} features: Accuracy = {acc:.4f}, F1-Score = {f1:.4f}, ROC AUC = {roc_auc:.4f}")



print("\nIterating over the features list in reverse:")
features_reversed = features[::-1]
for i in range(1, len(features_reversed) + 1):
    current_features = features_reversed[:i]
    
    model = DecisionTreeClassifier(random_state = 0)
    model.fit(X_train[current_features], y_train)
    
    y_pred = model.predict(X_test[current_features])
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    print(f"Using first {i} features: Accuracy = {acc:.4f}, F1-Score = {f1:.4f}, ROC AUC = {roc_auc:.4f}")

importances = pd.Series(model.feature_importances_, index = X_test.columns)
importances.sort_values(ascending = False)

Decision Tree Test:
Using first 1 features: Accuracy = 0.7944, F1-Score = 0.7325, ROC AUC = 0.7794
Using first 2 features: Accuracy = 0.8010, F1-Score = 0.7298, ROC AUC = 0.7795
Using first 3 features: Accuracy = 0.8010, F1-Score = 0.7298, ROC AUC = 0.7795
Using first 4 features: Accuracy = 0.7944, F1-Score = 0.7061, ROC AUC = 0.7649
Using first 5 features: Accuracy = 0.7635, F1-Score = 0.6859, ROC AUC = 0.7439
Using first 6 features: Accuracy = 0.7436, F1-Score = 0.6821, ROC AUC = 0.7344

Iterating over the features list in reverse:
Using first 1 features: Accuracy = 0.5994, F1-Score = 0.1136, ROC AUC = 0.5098
Using first 2 features: Accuracy = 0.6031, F1-Score = 0.3908, ROC AUC = 0.5555
Using first 3 features: Accuracy = 0.5811, F1-Score = 0.4766, ROC AUC = 0.5638
Using first 4 features: Accuracy = 0.5842, F1-Score = 0.4844, ROC AUC = 0.5683
Using first 5 features: Accuracy = 0.7349, F1-Score = 0.6688, ROC AUC = 0.7240
Using first 6 features: Accuracy = 0.7438, F1-Score = 0.6823, ROC

destination_airport_encoded                        0.364892
departure_airport_encoded                          0.219225
Num of Delay Reasons                               0.193406
primary_delay_reason_encoded                       0.148692
day_of_year                                        0.041248
minutes_since_midnight_scheduled_departure_time    0.032538
dtype: float64

# Question 7:
### What specific variables are most informative on how long a flight will be delayed for?

In [5]:
features = ["Num of Delay Reasons", "primary_delay_reason_encoded", "departure_airport_encoded", "minutes_since_midnight_scheduled_departure_time", "day_of_year", "destination_airport_encoded", "Delay Carrier (Minutes)", "Delay Weather (Minutes)", "Delay National Aviation System (Minutes)", "Delay Security (Minutes)", "Delay Late Aircraft Arrival (Minutes)", "Delay Unknown (Minutes)"]
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size = 0.2, random_state = 0)

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# Predict on test
y_pred = rf.predict(X_test)


# Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Random Forest Regressor metrics:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

importances = pd.Series(rf.feature_importances_, index = X_test.columns)
importances.sort_values(ascending = False)

Random Forest Regressor metrics:
MAE: 0.26394583138645883
RMSE: 0.3661385041174869
R²: 0.4411825569683533


Num of Delay Reasons                               0.568286
minutes_since_midnight_scheduled_departure_time    0.136118
destination_airport_encoded                        0.095893
day_of_year                                        0.065967
Delay Unknown (Minutes)                            0.030454
Delay Carrier (Minutes)                            0.029685
Delay Late Aircraft Arrival (Minutes)              0.025984
departure_airport_encoded                          0.021943
Delay National Aviation System (Minutes)           0.012518
primary_delay_reason_encoded                       0.011142
Delay Weather (Minutes)                            0.002002
Delay Security (Minutes)                           0.000008
dtype: float64