<a href="https://colab.research.google.com/github/carolineb3/US-FlightDelayModeling/blob/main/SHAP_Analysis_and_Final_Results_Table.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Essential imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)

# XGBoost
!pip install xgboost
from xgboost import XGBClassifier

In [None]:
# Colab-only step: attach Google Drive to the runtime's filesystem so the
# project CSVs stored there can be read with ordinary file paths.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Folder on the mounted Drive holding the pre-split classification data.
base = "/content/drive/MyDrive/MIS 545 Project/"

# Feature matrices are kept as full DataFrames.
X_train, X_test = (
    pd.read_csv(base + csv_name)
    for csv_name in ("X_train_clf.csv", "X_test_clf.csv")
)

# Targets: only the first column of each file is kept, yielding a Series.
# NOTE(review): assumes the target is the first column (i.e. the CSVs were
# written without an index column) - confirm against the export notebook.
y_train, y_test = (
    pd.read_csv(base + csv_name).iloc[:, 0]
    for csv_name in ("y_train_clf.csv", "y_test_clf.csv")
)

In [None]:
# Show the dimensions of every split (train/test features, then train/test targets).
print("Shapes:", *(split.shape for split in (X_train, X_test, y_train, y_test)))

In [None]:
# Input columns, grouped by the preprocessing branch that handles them.
categorical_features = ['OP_UNIQUE_CARRIER', 'DEP_TIME_BLK']
numeric_features = [
    'CRS_DEP_TIME', 'CRS_ELAPSED_TIME', 'DISTANCE', 'MONTH',
    'DAY_OF_WEEK', 'HOUR', 'ORIGIN_FREQ', 'DEST_FREQ',
]

# Numeric branch: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode to a dense array. Levels unseen during fit are ignored at transform
# time instead of raising.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group through its branch. Output order is numeric first,
# then the one-hot columns; the step names 'num'/'cat'/'onehot' are looked up
# again later when feature names are recovered for SHAP.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

In [None]:
# Random forest placed behind the shared preprocessor, so imputation, scaling
# and encoding are re-fitted inside every CV fold (no leakage from held-out data).
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=37, n_jobs=-1))
])

# Single-point grid: the hyperparameters are fixed, so the search below
# amounts to a 3-fold cross-validated F1 estimate for this one configuration.
rf_grid = {
    "model__n_estimators": [200],
    "model__max_depth": [10]
}

rf_gs = GridSearchCV(
    rf_model,
    rf_grid,
    scoring="f1",
    cv=3,
    refit=True,  # explicit: the code below relies on best_estimator_ being refitted
    n_jobs=-1
)

rf_gs.fit(X_train, y_train)

# With refit=True, GridSearchCV has already retrained the winning pipeline on
# the whole of (X_train, y_train). Calling .fit() on it again would only repeat
# that training (same data, same random_state) and double the run time.
best_rf_model = rf_gs.best_estimator_

Calculate SHAP Values

In [None]:
import shap

# 1. Transform X_test using the pipeline's fitted preprocessing step
X_test_pre = best_rf_model.named_steps["preprocessor"].transform(X_test)

# 2. Extract real feature names (numeric + one-hot encoded).
#    The order matches the ColumnTransformer output: 'num' block, then 'cat' block.
ohe = best_rf_model.named_steps["preprocessor"].named_transformers_["cat"]["onehot"]
ohe_feature_names = ohe.get_feature_names_out(categorical_features)

feature_names = numeric_features + list(ohe_feature_names)

# 3. Build the SHAP explainer on the final RF model only (not the whole pipeline)
rf = best_rf_model.named_steps["model"]
explainer = shap.TreeExplainer(rf)

# 4. Compute SHAP values on the transformed data.
#    NOTE: this explains every row of X_test and can be slow on a large test
#    set; the cells that follow repeat the analysis on a 1000-row sample.
shap_values = explainer.shap_values(X_test_pre)

# Older SHAP releases return a list with one (n_samples, n_features) array per
# class; newer releases return a single (n_samples, n_features, n_classes)
# array, for which `shap_values[1]` would select sample 1 instead of class 1.
# Normalize to the per-class list so the indexing below is correct either way.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[..., k] for k in range(shap_values.shape[-1])]

# 5. Summary plot (beeswarm) for the positive class (index 1)
shap.summary_plot(
    shap_values[1],
    X_test_pre,
    feature_names=feature_names
)

# 6. Optional bar plot (mean |SHAP| per feature) for the positive class
shap.summary_plot(
    shap_values[1],
    X_test_pre,
    feature_names=feature_names,
    plot_type="bar"
)

In [None]:
# Explain a fixed-seed subsample to keep TreeExplainer fast. The sample size is
# capped at the number of available rows, because DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without replacement).
X_test_sampled = X_test.sample(n=min(1000, len(X_test)), random_state=37)

# Pull the already-fitted pieces out of the pipeline; nothing is re-fitted here.
preprocessor_fitted = best_rf_model.named_steps["preprocessor"]
rf_model_fitted = best_rf_model.named_steps["model"]

# Apply the training-time preprocessing to the sampled rows.
X_test_preprocessed = preprocessor_fitted.transform(X_test_sampled)

print("Raw X_test_sampled shape:", X_test_sampled.shape)
print("Preprocessed shape:", X_test_preprocessed.shape)

In [None]:
# Explain the fitted forest on the preprocessed sample.
explainer = shap.TreeExplainer(rf_model_fitted)
shap_values = explainer.shap_values(X_test_preprocessed)

# Older SHAP releases return a list with one (n_samples, n_features) array per
# class; newer releases return a single (n_samples, n_features, n_classes)
# array. On the array form, len(...) would report the number of samples and
# `shap_values[1]` would pick sample 1 rather than class 1. Convert to the
# per-class list so this cell and the plotting cells index by class.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[..., k] for k in range(shap_values.shape[-1])]

print("Num classes in SHAP:", len(shap_values))
print("SHAP class 1 shape:", shap_values[1].shape)

In [None]:
# Output column names straight from the fitted ColumnTransformer, in the same
# order as the columns of the transformed matrix. This replaces the manually
# assembled `feature_names` list from the earlier SHAP cell.
feature_names = best_rf_model.named_steps["preprocessor"].get_feature_names_out()


In [None]:
# Sanity check before plotting: these three numbers are expected to be equal
# (SHAP columns, transformed-data columns, and number of feature labels).
for _label, _count in (
    ("SHAP feature count:", shap_values[1].shape[1]),
    ("Data feature count:", X_test_preprocessed.shape[1]),
    ("Feature names count:", len(feature_names)),
):
    print(_label, _count)

In [None]:
# Bar summary plot (mean |SHAP| per feature) for the positive class.
# The feature matrix must be the same sampled, preprocessed rows the SHAP
# values were computed on (X_test_preprocessed), not the full-test-set matrix
# X_test_pre left over from the earlier cell, whose row count differs.
shap.summary_plot(
    shap_values[1],
    X_test_preprocessed,
    feature_names=feature_names,
    plot_type="bar"
)

In [None]:
# Beeswarm summary for the positive class (index 1) on the sampled rows:
# one point per row and feature, colored by the feature's transformed value.
shap.summary_plot(
    shap_values[1],
    features=X_test_preprocessed,
    feature_names=feature_names,
)

Final Results Table

In [None]:
import pandas as pd
import numpy as np

def _classification_row(accuracy, precision, recall, f1, roc_auc):
    """Metric dict for a classifier; regression metrics are left as NaN."""
    return {
        'Accuracy': accuracy,
        'Macro Avg Precision': precision,
        'Macro Avg Recall': recall,
        'Macro Avg F1-score': f1,
        'ROC-AUC': roc_auc,
        'MAE': np.nan,
        'RMSE': np.nan,
        'R^2': np.nan,
    }


def _regression_row(mae, rmse, r2):
    """Metric dict for a regressor; classification metrics are left as NaN."""
    return {
        'Accuracy': np.nan,
        'Macro Avg Precision': np.nan,
        'Macro Avg Recall': np.nan,
        'Macro Avg F1-score': np.nan,
        'ROC-AUC': np.nan,
        'MAE': mae,
        'RMSE': rmse,
        'R^2': r2,
    }


# Final scores per model, entered by hand (they are not computed in this
# notebook). Every row carries the same metric keys in the same order so the
# table built from this dict has a stable column layout.
results = {
    'Tuned Random Forest': _classification_row(0.62, 0.61, 0.64, 0.60, 0.698),
    'Tuned XGBoost': _classification_row(0.62, 0.61, 0.64, 0.60, 0.696),
    'Linear Regression': _regression_row(28.3346, 61.2507, 0.0333),
    'Ridge Regression': _regression_row(28.3346, 61.2507, 0.0333),
    'LASSO Regression': _regression_row(28.3394, 61.2525, 0.0332),
    'Gradient Boosting Regressor': _regression_row(27.9429, 60.9452, 0.0429),
}

# The dict-of-dicts constructor puts each model in a column; transpose so the
# table has one row per model and one column per metric, then show it rounded
# to four decimals (results_df itself keeps the unrounded values).
results_df = pd.DataFrame(data=results).transpose()
display(results_df.round(decimals=4))
