# **Model Building**

In [1]:
# Change CWD to repo base for imports
import os
from pathlib import Path
notebook_path = Path().resolve()
parent_directory = notebook_path.parent
os.chdir(parent_directory)

# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from etl import get_flight_data
import datetime as dt

import logging
log = logging.getLogger("modeling")
log.setLevel(logging.INFO)

df = get_flight_data()

reading data/202312_flights.csv
reading data/202307_flights.csv
reading data/202402_flights.csv
reading data/202308_flights.csv
reading data/202404_flights.csv
skipped data/state_region_map.csv
reading data/202401_flights.csv
reading data/202311_flights.csv
reading data/202406_flights.csv
reading data/202403_flights.csv
skipped data/airport_id_map.csv
reading data/202310_flights.csv
skipped data/airline_codes_map.csv
reading data/202405_flights.csv
reading data/202309_flights.csv


## **Feature Pipeline**

In [33]:
CANCELLED_COL = "Cancelled"
DELAYED_COL = "ArrDel15"
DELAY_DURATION_COL = "ArrDelay"
TARGET_COLS = [CANCELLED_COL, DELAYED_COL, DELAY_DURATION_COL]

feature_base_cols = [
    # origin
    "OriginAirportShortName",
    "OriginDivision",
    # destination
    "DestAirportShortName",
    "DestDivision",
    # flight features
    "Reporting_Airline",
    "Distance",
    "ScheduledDurationMinutes",
    # time features
    "FlightDate",
    "CRSDepTime",
]

df_train_test = df[feature_base_cols + TARGET_COLS].sample(frac=0.30, random_state=42)

y = df_train_test[TARGET_COLS].fillna(0)

In [34]:
df_train_test.head()

Unnamed: 0,OriginAirportShortName,OriginDivision,DestAirportShortName,DestDivision,Reporting_Airline,Distance,ScheduledDurationMinutes,FlightDate,CRSDepTime,Cancelled,ArrDel15,ArrDelay
1631682,Baltimore/Washington International Thurgood Ma...,South Atlantic,Buffalo Niagara International,Middle Atlantic,WN,281.0,75.0,2024-02-07,21:35:00,0.0,1.0,17.0
746271,Salt Lake City International,Mountain,San Francisco International,Pacific,DL,599.0,64.0,2023-07-13,11:30:00,0.0,0.0,4.0
3216311,St Louis Lambert International,West North Central,Long Beach Airport,Pacific,WN,1581.0,130.0,2024-01-05,13:25:00,0.0,0.0,-26.0
5497385,Harry Reid International,Mountain,Dallas/Fort Worth International,West South Central,NK,1055.0,285.0,2023-10-14,08:10:00,0.0,0.0,3.0
3486061,Cleveland-Hopkins International,East North Central,Charlotte Douglas International,South Atlantic,AA,430.0,99.0,2023-11-03,17:18:00,0.0,0.0,-6.0


In [35]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

# Sample data (replace this with your actual dataframe)

class DateTimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, flight_date_col, dep_time_col):
        self.flight_date_col = flight_date_col
        self.dep_time_col = dep_time_col
        self.derived_columns = ["is_weekend", "month", "day_of_week", "hour_of_day"]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Ensure FlightDate and CRSDepTime are in appropriate datetime formats
        X[self.flight_date_col] = pd.to_datetime(X[self.flight_date_col], errors='coerce')
        X[self.dep_time_col] = pd.to_datetime(X[self.dep_time_col], format='%H:%M', errors='coerce')

        # Extract new features
        X["is_weekend"] = X[self.flight_date_col].dt.dayofweek.isin([5, 6])
        X["month"] = X[self.flight_date_col].dt.month.astype(str)
        X["day_of_week"] = X[self.flight_date_col].dt.dayofweek.astype(str) + "_" + X[self.flight_date_col].dt.day_name()
        X["hour_of_day"] = X[self.dep_time_col].dt.hour.astype(str)

        # Drop the original datetime columns
        X.drop(columns=[self.flight_date_col, self.dep_time_col], inplace=True)
        return X


# Custom transformer for top-k preprocessing
class TopKPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, columns, max_n_categories=10):
        self.columns = columns
        self.max_n_categories = max_n_categories
        self.top_categories = {}

    def fit(self, X, y=None):
        # Store top-k categories for each column
        for col in self.columns:
            self.top_categories[col] = X[col].value_counts().nlargest(self.max_n_categories).index
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].where(X[col].isin(self.top_categories[col]), "Other")
        return X

# Define columns for transformers
one_hot_columns = [
    "OriginAirportShortName", "DestAirportShortName",
    "OriginDivision", "DestDivision", "Reporting_Airline",
]
numerical_columns = ["Distance", "ScheduledDurationMinutes"]

# Define the pipeline
pipeline = Pipeline([
    ('datetime_features', DateTimeFeatureExtractor(flight_date_col="FlightDate", dep_time_col="CRSDepTime")),
    ('top_k_preprocess', TopKPreprocessor(columns=["OriginAirportShortName", "DestAirportShortName"], max_n_categories=10)),
    ('preprocess', ColumnTransformer([
        ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), 
            one_hot_columns + ["month", "is_weekend", "day_of_week", "hour_of_day"]
        ),
        ('scaler', StandardScaler(), numerical_columns)
    ], remainder='passthrough')),
    ('final_scaler', StandardScaler())  # Standard scaling applied to all columns after preprocessing
])

# Fit the pipeline fully
pipeline.fit(df_train_test[feature_base_cols])

# Function to get feature names after transformation
def get_feature_names(column_transformer, input_features):
    feature_names = []
    for name, transformer, columns in column_transformer.transformers_:
        if name == 'remainder' and transformer == 'passthrough':
            feature_names.extend(columns)  # for passthrough columns, retain original names
        elif isinstance(transformer, OneHotEncoder):
            feature_names.extend(transformer.get_feature_names_out(columns))
        elif isinstance(transformer, StandardScaler):
            feature_names.extend(columns)  # StandardScaler retains original column names
    return feature_names

# Get column names after transformation
preprocessor = pipeline.named_steps['preprocess']
column_names = get_feature_names(preprocessor, df_train_test[feature_base_cols].columns)

# Transform the data and create a DataFrame with feature names
X_transformed = pipeline.transform(df_train_test[feature_base_cols])
X_transformed = pd.DataFrame(X_transformed, columns=column_names)

# Display transformed DataFrame with column names
print("Transformed DataFrame with Column Names:")
X_transformed.head()

Transformed DataFrame with Column Names:


Unnamed: 0,OriginAirportShortName_Charlotte Douglas International,OriginAirportShortName_Chicago O'Hare International,OriginAirportShortName_Dallas/Fort Worth International,OriginAirportShortName_Denver International,OriginAirportShortName_Harry Reid International,OriginAirportShortName_Hartsfield-Jackson Atlanta International,OriginAirportShortName_Los Angeles International,OriginAirportShortName_Orlando International,OriginAirportShortName_Other,OriginAirportShortName_Phoenix Sky Harbor International,...,day_of_week_0_Monday,day_of_week_1_Tuesday,day_of_week_2_Wednesday,day_of_week_3_Thursday,day_of_week_4_Friday,day_of_week_5_Saturday,day_of_week_6_Sunday,hour_of_day_nan,Distance,ScheduledDurationMinutes
0,-0.174608,-0.199277,-0.212685,-0.212003,-0.167896,-0.226097,-0.169496,-0.156492,0.705974,-0.166258,...,-0.415878,-0.398753,2.478771,-0.416108,-0.417717,-0.389837,-0.415637,0.0,-0.930822,-0.7097
1,-0.174608,-0.199277,-0.212685,-0.212003,-0.167896,-0.226097,-0.169496,-0.156492,0.705974,-0.166258,...,-0.415878,-0.398753,-0.403426,2.403223,-0.417717,-0.389837,-0.415637,0.0,-0.39805,-0.816541
2,-0.174608,-0.199277,-0.212685,-0.212003,-0.167896,-0.226097,-0.169496,-0.156492,0.705974,-0.166258,...,-0.415878,-0.398753,-0.403426,-0.416108,2.393964,-0.389837,-0.415637,0.0,1.247178,-0.175495
3,-0.174608,-0.199277,-0.212685,-0.212003,5.95608,-0.226097,-0.169496,-0.156492,-1.416483,-0.166258,...,-0.415878,-0.398753,-0.403426,-0.416108,-0.417717,2.565173,-0.415637,0.0,0.365926,1.329992
4,-0.174608,-0.199277,-0.212685,-0.212003,-0.167896,-0.226097,-0.169496,-0.156492,0.705974,-0.166258,...,-0.415878,-0.398753,-0.403426,-0.416108,2.393964,-0.389837,-0.415637,0.0,-0.68119,-0.476592


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.1, random_state=42)

## **Model Fitting**

In [37]:
from abc import ABC, abstractmethod
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pandas as pd

# Constants for target columns
CANCELLED_COL = "Cancelled"
DELAYED_COL = "ArrDel15"
DELAY_DURATION_COL = "ArrDelay"
TARGET_COLS = [CANCELLED_COL, DELAYED_COL, DELAY_DURATION_COL]
TARGETS = {
    CANCELLED_COL: {"type": "classification"},
    DELAYED_COL: {"type": "classification"},
    DELAY_DURATION_COL: {"type": "regression"}
}

class FlightPerformanceModel():
    def __init__(
            self,
            classification_estimator,
            regression_estimator,
            classification_estimator_params={},
            regression_estimator_params={},
            targets = TARGETS
        ):
        self.classification_estimator = classification_estimator
        self.regression_estimator = regression_estimator
        self.classification_estimator_params = classification_estimator_params
        self.regression_estimator_params = regression_estimator_params
        self.targets = targets
        self.estimators = {}

    def fit(self, X_train, y_train):
        """Fit models for each target column."""
        for col in [CANCELLED_COL, DELAYED_COL]:
            log.info(f"{dt.datetime.now():%H:%M:%S}:Training model for target {col}")
            self.estimators[col] = self.classification_estimator(**self.classification_estimator_params).fit(X_train, y_train[DELAYED_COL])
            log.info(f"{dt.datetime.now():%H:%M:%S}:Model for target {col} trained.")
        
        log.info(f"{dt.datetime.now():%H:%M:%S}:Training model for target {DELAY_DURATION_COL}")
        self.estimators[DELAY_DURATION_COL] = self.regression_estimator(**self.regression_estimator_params).fit(
            X_train.loc[(y_train[DELAYED_COL] == 1.0).values],
            y_train.loc[y_train[DELAYED_COL] == 1.0][DELAY_DURATION_COL]
        )
        log.info(f"{dt.datetime.now():%H:%M:%S}:Model for target {DELAY_DURATION_COL} trained.")

    def predict_proba(self, X_test, target_col):
        """Predict probabilities for a specific target column on the test set."""
        model = self.estimators.get(target_col)
        if model is None:
            raise ValueError(f"Model for target {target_col} has not been trained.")
        return model.predict_proba(X_test)[:, 1]


### **Logistic Regression**

In [38]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score

model = FlightPerformanceModel(
    classification_estimator=LogisticRegression,
    classification_estimator_params=dict(max_iter=1000, random_state=42),
    regression_estimator=LinearRegression,
    regression_estimator_params={},
)

# Fit the logistic regression model
model.fit(X_train, y_train)

# Predict probabilities and calculate AUC for logistic regression model
for target_col in [CANCELLED_COL, DELAYED_COL]:
    y_pred_proba = model.predict_proba(X_test, target_col=target_col)
    logistic_auc = roc_auc_score(y_test[target_col], y_pred_proba)
    print(f"Logistic Regression AUC for {target_col}: {logistic_auc:.4f} vs. 0.50 baseline")

# Predict delay duration and calculate RMSE for linear regression model
y_pred_delay_duration = model.estimators[DELAY_DURATION_COL].predict(X_test.loc[(y_test[DELAYED_COL] == 1.0).values])
linear_rmse = np.sqrt(np.mean((y_test.loc[y_test[DELAYED_COL] == 1.0][DELAY_DURATION_COL] - y_pred_delay_duration)**2))
baseline_rmse = np.sqrt(np.mean(y_test.loc[y_test[DELAYED_COL] == 1.0][DELAY_DURATION_COL]**2))
print(f"Linear Regression RMSE for {DELAY_DURATION_COL}: {linear_rmse:.4f} vs. {baseline_rmse:.4f} baseline")

Logistic Regression AUC for Cancelled: 0.6247 vs. 0.50 baseline
Logistic Regression AUC for ArrDel15: 0.6182 vs. 0.50 baseline
Linear Regression RMSE for ArrDelay: 103.4035 vs. 126.3519 baseline


### **RandomForest**

In [123]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

model = FlightPerformanceModel(
    classification_estimator=RandomForestClassifier,
    classification_estimator_params=dict(n_estimators=100, random_state=42),
    regression_estimator=RandomForestRegressor,
    regression_estimator_params=dict(n_estimators=100, random_state=42)
)

# Fit the logistic regression model
model.fit(X_train, y_train)

# Predict probabilities and calculate AUC for logistic regression model
for target_col in [CANCELLED_COL, DELAYED_COL]:
    y_pred_proba = model.predict_proba(X_test, target_col=target_col)
    logistic_auc = roc_auc_score(y_test[target_col], y_pred_proba)
    print(f"RandomForest AUC for {target_col}: {logistic_auc:.4f} vs. 0.50 baseline")

# Predict delay duration and calculate RMSE for linear regression model
y_pred_delay_duration = model.estimators[DELAY_DURATION_COL].predict(X_test.loc[(y_test[DELAYED_COL] == 1.0).values])
linear_rmse = np.sqrt(np.mean((y_test.loc[y_test[DELAYED_COL] == 1.0][DELAY_DURATION_COL] - y_pred_delay_duration)**2))
baseline_rmse = np.sqrt(np.mean(y_test.loc[y_test[DELAYED_COL] == 1.0][DELAY_DURATION_COL]**2))
print(f"RandomForest RMSE for {DELAY_DURATION_COL}: {linear_rmse:.4f} vs. {baseline_rmse:.4f} baseline")

22:19:22:Training model for target Cancelled
22:19:22:Training model for target Cancelled
22:20:28:Model for target Cancelled trained.
22:20:28:Model for target Cancelled trained.
22:20:28:Training model for target ArrDel15
22:20:28:Training model for target ArrDel15
22:21:34:Model for target ArrDel15 trained.
22:21:34:Model for target ArrDel15 trained.
22:21:34:Training model for target ArrDelay
22:21:34:Training model for target ArrDelay
22:22:40:Model for target ArrDelay trained.
22:22:40:Model for target ArrDelay trained.


Logistic Regression AUC for Cancelled: 0.5842 vs. 0.50 baseline
Logistic Regression AUC for ArrDel15: 0.6388 vs. 0.50 baseline
Linear Regression RMSE for ArrDelay: 102.9580 vs. 120.7444 baseline
