# **Model Building**

In [90]:
# Change CWD to repo base for imports
import os
from pathlib import Path
notebook_path = Path().resolve()
parent_directory = notebook_path.parent
os.chdir(parent_directory)

# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from etl import get_flight_data

import logging
log = logging.getLogger("modeling")
log.setLevel(logging.INFO)
handler = logging.StreamHandler()  # This will print to the console
handler.setLevel(logging.INFO)
log.addHandler(handler)


df = get_flight_data()

## **Feature Pipeline**

In [None]:
df["month"] = df["FlightDate"].dt.month
df["is_weekend"] = df["FlightDate"].dt.dayofweek.isin([5, 6])

In [87]:
CANCELLED_COL = "Cancelled"
DELAYED_COL = "ArrDel15"
DELAY_DURATION_COL = "ArrDelay"
TARGET_COLS = [CANCELLED_COL, DELAYED_COL, DELAY_DURATION_COL]

feature_base_cols = [
    # origin
    "OriginAirportShortName",
    "OriginDivision",
    # destination
    "DestAirportShortName",
    "DestDivision",
    # flight features
    "Reporting_Airline",
    "Distance",
    "ScheduledDurationMinutes",
    # seasonality features
    "month",
    "is_weekend",
    "day_of_week",
    "hour_of_day",
]

def preprocess_feature_cols(df, max_n_categories=10):
    for col in ["OriginAirportShortName", "DestAirportShortName"]:
        top_categories = df[col].value_counts().nlargest(max_n_categories).index
        df[col] = df[col].where(df[col].isin(top_categories), "Other")
    return df

df_train_test = df[feature_base_cols + TARGET_COLS].sample(frac=0.10, random_state=42)
df_train_test = preprocess_feature_cols(df_train_test)

y = df_train_test[TARGET_COLS].fillna(0)

In [88]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

# Sample data (replace this with your actual dataframe)

# Custom transformer for top-k preprocessing
class TopKPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, columns, max_n_categories=10):
        self.columns = columns
        self.max_n_categories = max_n_categories
        self.top_categories = {}

    def fit(self, X, y=None):
        # Store top-k categories for each column
        for col in self.columns:
            self.top_categories[col] = X[col].value_counts().nlargest(self.max_n_categories).index
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].where(X[col].isin(self.top_categories[col]), "Other")
        return X

# Define columns for transformers
one_hot_columns = [
    "OriginAirportShortName", "DestAirportShortName",
    "OriginDivision", "DestDivision", "Reporting_Airline", 
    "month", "day_of_week", "hour_of_day"
]
numerical_columns = ["Distance", "ScheduledDurationMinutes", "is_weekend"]

# Define the pipeline
pipeline = Pipeline([
    ('top_k_preprocess', TopKPreprocessor(columns=["OriginAirportShortName", "DestAirportShortName"], max_n_categories=10)),
    ('preprocess', ColumnTransformer([
        ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), one_hot_columns),
        ('scaler', StandardScaler(), numerical_columns)
    ], remainder='passthrough')),
    ('final_scaler', StandardScaler())  # Standard scaling applied to all columns after preprocessing
])

# Fit the pipeline fully
pipeline.fit(df_train_test[feature_base_cols])

# Function to get feature names after transformation
def get_feature_names(column_transformer, input_features):
    feature_names = []
    for name, transformer, columns in column_transformer.transformers_:
        if name == 'remainder' and transformer == 'passthrough':
            feature_names.extend(columns)  # for passthrough columns, retain original names
        elif isinstance(transformer, OneHotEncoder):
            feature_names.extend(transformer.get_feature_names_out(columns))
        elif isinstance(transformer, StandardScaler):
            feature_names.extend(columns)  # StandardScaler retains original column names
    return feature_names

# Get column names after transformation
preprocessor = pipeline.named_steps['preprocess']
column_names = get_feature_names(preprocessor, df_train_test[feature_base_cols].columns)

# Transform the data and create a DataFrame with feature names
X_transformed = pipeline.transform(df_train_test[feature_base_cols])
X_transformed = pd.DataFrame(X_transformed, columns=column_names)

# Display transformed DataFrame with column names
print("Transformed DataFrame with Column Names:")
X_transformed.head()

Transformed DataFrame with Column Names:


Unnamed: 0,OriginAirportShortName_Charlotte Douglas International,OriginAirportShortName_Chicago O'Hare International,OriginAirportShortName_Dallas/Fort Worth International,OriginAirportShortName_Denver International,OriginAirportShortName_Harry Reid International,OriginAirportShortName_Hartsfield-Jackson Atlanta International,OriginAirportShortName_Los Angeles International,OriginAirportShortName_Orlando International,OriginAirportShortName_Other,OriginAirportShortName_Phoenix Sky Harbor International,...,hour_of_day_17.0,hour_of_day_18.0,hour_of_day_19.0,hour_of_day_20.0,hour_of_day_21.0,hour_of_day_22.0,hour_of_day_23.0,Distance,ScheduledDurationMinutes,is_weekend
0,-0.173815,-0.200266,-0.211908,-0.212466,-0.167627,-0.225977,-0.169079,-0.156511,0.66779,-0.16575,...,-0.25704,-0.257569,-0.239979,-0.217424,5.235773,-0.159289,-0.093926,-0.930684,-0.710513,-0.622034
1,-0.173815,-0.200266,-0.211908,-0.212466,-0.167627,-0.225977,-0.169079,-0.156511,0.66779,-0.16575,...,-0.25704,-0.257569,-0.239979,-0.217424,-0.190994,-0.159289,-0.093926,-0.397644,-0.817752,-0.622034
2,-0.173815,-0.200266,-0.211908,-0.212466,-0.167627,-0.225977,-0.169079,-0.156511,0.66779,-0.16575,...,-0.25704,-0.257569,-0.239979,-0.217424,-0.190994,-0.159289,-0.093926,1.248412,-0.174315,-0.622034
3,-0.173815,-0.200266,-0.211908,-0.212466,5.965612,-0.225977,-0.169079,-0.156511,-1.497478,-0.16575,...,-0.25704,-0.257569,-0.239979,-0.217424,-0.190994,-0.159289,-0.093926,0.366716,1.336788,1.60763
4,-0.173815,-0.200266,-0.211908,-0.212466,-0.167627,-0.225977,-0.169079,-0.156511,0.66779,-0.16575,...,3.890452,-0.257569,-0.239979,-0.217424,-0.190994,-0.159289,-0.093926,-0.680926,-0.476536,-0.622034


In [91]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.1, random_state=42)

## **Model Fitting**

In [111]:
from abc import ABC, abstractmethod
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pandas as pd

# Constants for target columns
CANCELLED_COL = "Cancelled"
DELAYED_COL = "ArrDel15"
DELAY_DURATION_COL = "ArrDelay"
TARGET_COLS = [CANCELLED_COL, DELAYED_COL, DELAY_DURATION_COL]
TARGETS = {
    CANCELLED_COL: {"type": "classification"},
    DELAYED_COL: {"type": "classification"},
    DELAY_DURATION_COL: {"type": "regression"}
}

class FlightPerformanceModel():
    def __init__(
            self,
            classification_estimator,
            regression_estimator,
            classification_estimator_params={},
            regression_estimator_params={},
            targets = TARGETS
        ):
        self.classification_estimator = classification_estimator
        self.regression_estimator = regression_estimator
        self.classification_estimator_params = classification_estimator_params
        self.regression_estimator_params = regression_estimator_params
        self.targets = targets
        self.estimators = {}

    def fit(self, X_train, y_train):
        """Fit models for each target column."""
        for col in [CANCELLED_COL, DELAYED_COL]:
            self.estimators[DELAYED_COL] = self.classification_estimator(**self.classification_estimator_params).fit(X_train, y_train[DELAYED_COL])
            log.info(f"Model for target {col} trained.")
        self.estimators[DELAY_DURATION_COL] = self.regression_estimator(**self.regression_estimator_params).fit(
            X_train.loc[(y_train[DELAYED_COL] == 1.0).values],
            y_train.loc[y_train[DELAYED_COL] == 1.0][DELAY_DURATION_COL]
        )
        log.info(f"Model for target {DELAY_DURATION_COL} trained.")

    def predict_proba(self, X_test, target_col):
        """Predict probabilities for a specific target column on the test set."""
        model = self.estimators.get(target_col)
        if model is None:
            raise ValueError(f"Model for target {target_col} has not been trained.")
        return model.predict_proba(X_test)[:, 1]


### **Logistic Regression**

In [109]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score

model = FlightPerformanceModel(
    classification_estimator=LogisticRegression,
    classification_estimator_params=dict(max_iter=1000, random_state=42),
    regression_estimator=LinearRegression,
    regression_estimator_params={},
)

# Fit the logistic regression model
model.fit(X_train, y_train)

# Predict probabilities and calculate AUC for logistic regression model
for target_col in [CANCELLED_COL, DELAYED_COL]:
    y_pred_proba = model.predict_proba(X_test, target_col=target_col)
    logistic_auc = roc_auc_score(y_test[target_col], y_pred_proba)
    print(f"Logistic Regression AUC for {target_col}: {logistic_auc:.4f} vs. 0.50 baseline")

# Predict delay duration and calculate RMSE for linear regression model
y_pred_delay_duration = model.estimators[DELAY_DURATION_COL].predict(X_test.loc[(y_test[DELAYED_COL] == 1.0).values])
linear_rmse = np.sqrt(np.mean((y_test.loc[y_test[DELAYED_COL] == 1.0][DELAY_DURATION_COL] - y_pred_delay_duration)**2))
baseline_rmse = np.sqrt(np.mean(y_test.loc[y_test[DELAYED_COL] == 1.0][DELAY_DURATION_COL]**2))
print(f"Linear Regression RMSE for {DELAY_DURATION_COL}: {linear_rmse:.4f} vs. {baseline_rmse:.4f} baseline")

Logistic Regression AUC for Cancelled: 0.7531 vs. 0.50 baseline
Logistic Regression AUC for ArrDel15: 0.6633 vs. 0.50 baseline
Linear Regression RMSE for ArrDelay: 97.3364 vs. 120.7444 baseline


### **RandomForest**

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

model = FlightPerformanceModel(
    classification_estimator=RandomForestClassifier,
    classification_estimator_params=dict(n_estimators=100, random_state=42),
    regression_estimator=RandomForestRegressor,
    regression_estimator_params=dict(n_estimators=100, random_state=42)
)

# Fit the logistic regression model
model.fit(X_train, y_train)

# Predict probabilities and calculate AUC for logistic regression model
for target_col in [CANCELLED_COL, DELAYED_COL]:
    y_pred_proba = model.predict_proba(X_test, target_col=target_col)
    logistic_auc = roc_auc_score(y_test[target_col], y_pred_proba)
    print(f"Logistic Regression AUC for {target_col}: {logistic_auc:.4f} vs. 0.50 baseline")

# Predict delay duration and calculate RMSE for linear regression model
y_pred_delay_duration = model.estimators[DELAY_DURATION_COL].predict(X_test.loc[(y_test[DELAYED_COL] == 1.0).values])
linear_rmse = np.sqrt(np.mean((y_test.loc[y_test[DELAYED_COL] == 1.0][DELAY_DURATION_COL] - y_pred_delay_duration)**2))
baseline_rmse = np.sqrt(np.mean(y_test.loc[y_test[DELAYED_COL] == 1.0][DELAY_DURATION_COL]**2))
print(f"Linear Regression RMSE for {DELAY_DURATION_COL}: {linear_rmse:.4f} vs. {baseline_rmse:.4f} baseline")