# **Nonlinear Model Testing**
This notebook tests several nonlinear models.

In [8]:
# Change CWD to repo base for imports
import os
from pathlib import Path

notebook_path = Path().resolve()
parent_directory = notebook_path.parent
if notebook_path.cwd().__str__().split("/")[-1] != "theory_of_ml_group4":
    os.chdir(parent_directory)

# library imports
from etl import get_flight_data
from models import FlightPerformanceModel
import constants as c
# computation
import numpy as np
# model utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
# models
from sklearn.linear_model import ElasticNet
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVR
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import logging

log = logging.getLogger("modeling")
log.setLevel(logging.INFO)

In [2]:
df = get_flight_data(sample_frac=0.50)
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

reading data/202312_flights.csv
reading data/202307_flights.csv
reading data/202402_flights.csv
reading data/202209_flights.csv
reading data/202302_flights.csv
reading data/202210_flights.csv
reading data/202308_flights.csv
reading data/202404_flights.csv
reading data/202301_flights.csv
skipped data/state_region_map.csv
reading data/202401_flights.csv
reading data/202304_flights.csv
reading data/202311_flights.csv
reading data/202406_flights.csv
reading data/202211_flights.csv
reading data/202303_flights.csv
reading data/202208_flights.csv
reading data/202403_flights.csv
reading data/202306_flights.csv
skipped data/airport_id_map.csv
reading data/202310_flights.csv
reading data/202305_flights.csv
reading data/202212_flights.csv
skipped data/airline_codes_map.csv
reading data/202405_flights.csv
reading data/202309_flights.csv
reading data/202207_flights.csv


In [3]:
def compute_classifier_metrics(model, targets, name):
    """Compute accuracy and AUC for a binary classifier model"""
    y_pred = model.predict(df_test, targets)
    accuracy = accuracy_score(df_test[targets].fillna(0), y_pred)
    print(f"{name} accuracy for {targets}: {accuracy:.4f} vs. 0.50 baseline")
    y_pred_proba = model.predict_proba(df_test, targets)
    auc = roc_auc_score(df_test[targets].fillna(0), y_pred_proba)
    print(f"{name} AUC for {targets}: {auc:.4f} vs. 0.50 baseline")


def compute_regression_metrics(model, name):
    """Compute RMSE for a regression model"""
    y_pred_delay_duration = model.predict(df_test.loc[(df_test[c.DELAYED_COL] == 1.0).values],
                                          target_col=c.DELAY_DURATION_COL)
    model_rmse = np.sqrt(
        np.mean((df_test.loc[df_test[c.DELAYED_COL] == 1.0][c.DELAY_DURATION_COL] - y_pred_delay_duration) ** 2))
    baseline_rmse = np.sqrt(np.mean(df_test.loc[df_test[c.DELAYED_COL] == 1.0][c.DELAY_DURATION_COL] ** 2))
    print(f"{name} RMSE for {c.DELAY_DURATION_COL}: {model_rmse:.4f} vs. {baseline_rmse:.4f} baseline")

## ElasticNet and Naive Bayes

In [5]:
bayes_elastic_model = FlightPerformanceModel(
    targets={
        c.CANCELLED_COL: {
            "type": "classification",
            "estimator": GaussianNB,
            "params": {}
        },
        c.DELAYED_COL: {
            "type": "classification",
            "estimator": GaussianNB,
            "params": {}
        },
        c.DELAY_DURATION_COL: {
            "type": "regression",
            "estimator": ElasticNet,
            "params": {}
        },
    },
)
bayes_elastic_model.fit(df_train.sample(frac=0.5, random_state=42))

Number of train rows with NaN: 0
13:49:59:Training model for target Cancelled
13:50:01:Model for target Cancelled trained.
13:50:01:Training model for target ArrDel15
13:50:03:Model for target ArrDel15 trained.
13:50:03:Training model for target ArrDelay
13:50:03:Model for target ArrDelay trained.


In [6]:
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(bayes_elastic_model, target_col, "Naive Bayes")
compute_regression_metrics(bayes_elastic_model, "ElasticNet")

Naive Bayes accuracy for Cancelled: 0.7873 vs. 0.50 baseline
Naive Bayes AUC for Cancelled: 0.6477 vs. 0.50 baseline
Naive Bayes accuracy for ArrDel15: 0.6635 vs. 0.50 baseline
Naive Bayes AUC for ArrDel15: 0.5878 vs. 0.50 baseline
ElasticNet RMSE for ArrDelay: 98.5478 vs. 121.0417 baseline


## Linear SVM and QDA

In [10]:
svm_qda_model = FlightPerformanceModel(
    targets={
        c.CANCELLED_COL: {
            "type": "classification",
            "estimator": QuadraticDiscriminantAnalysis,
            "params": {}
        },
        c.DELAYED_COL: {
            "type": "classification",
            "estimator": QuadraticDiscriminantAnalysis,
            "params": {}
        },
        c.DELAY_DURATION_COL: {
            "type": "regression",
            "estimator": LinearSVR,
            "params": {"random_state": 42}
        },
    },
)
svm_qda_model.fit(df_train.sample(frac=0.5, random_state=42))

Number of train rows with NaN: 0
13:51:58:Training model for target Cancelled
13:52:06:Model for target Cancelled trained.
13:52:06:Training model for target ArrDel15
13:52:13:Model for target ArrDel15 trained.
13:52:13:Training model for target ArrDelay
13:52:30:Model for target ArrDelay trained.


In [11]:
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(svm_qda_model, target_col, "QDA")
compute_regression_metrics(svm_qda_model, "Linear SVM")

QDA accuracy for Cancelled: 0.0272 vs. 0.50 baseline
QDA AUC for Cancelled: 0.6580 vs. 0.50 baseline
QDA accuracy for ArrDel15: 0.2989 vs. 0.50 baseline
QDA AUC for ArrDel15: 0.5468 vs. 0.50 baseline
Linear SVM RMSE for ArrDelay: 102.7457 vs. 121.0417 baseline


## Random Forest

In [12]:
# Uses 100 trees
rf_model = FlightPerformanceModel(
    targets={
        c.CANCELLED_COL: {
            "type": "classification",
            "estimator": RandomForestClassifier,
            "params": {"n_jobs": -1, "random_state": 42}
        },
        c.DELAYED_COL: {
            "type": "classification",
            "estimator": RandomForestClassifier,
            "params": {"n_jobs": -1, "random_state": 42}
        },
        c.DELAY_DURATION_COL: {
            "type": "regression",
            "estimator": RandomForestRegressor,
            "params": {"n_jobs": -1, "random_state": 42}
        },
    },
)
rf_model.fit(df_train.sample(frac=0.5, random_state=42))

Number of train rows with NaN: 0
13:54:03:Training model for target Cancelled
13:55:03:Model for target Cancelled trained.
13:55:03:Training model for target ArrDel15
13:56:08:Model for target ArrDel15 trained.
13:56:08:Training model for target ArrDelay
13:57:04:Model for target ArrDelay trained.


In [13]:
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(rf_model, target_col, "Random Forest")
compute_regression_metrics(rf_model, "Random Forest")

Random Forest accuracy for Cancelled: 0.9793 vs. 0.50 baseline
Random Forest AUC for Cancelled: 0.6343 vs. 0.50 baseline
Random Forest accuracy for ArrDel15: 0.7321 vs. 0.50 baseline
Random Forest AUC for ArrDel15: 0.5789 vs. 0.50 baseline
Random Forest RMSE for ArrDelay: 107.8420 vs. 121.0417 baseline


## Bagging

In [14]:
# Uses 10 trees
bagging_model = FlightPerformanceModel(
    targets={
        c.CANCELLED_COL: {
            "type": "classification",
            "estimator": BaggingClassifier,
            "params": {"n_jobs": -1, "random_state": 42}
        },
        c.DELAYED_COL: {
            "type": "classification",
            "estimator": BaggingClassifier,
            "params": {"n_jobs": -1, "random_state": 42}
        },
        c.DELAY_DURATION_COL: {
            "type": "regression",
            "estimator": BaggingRegressor,
            "params": {"n_jobs": -1, "random_state": 42}
        },
    },
)
bagging_model.fit(df_train.sample(frac=0.5, random_state=42))

Number of train rows with NaN: 0
13:57:48:Training model for target Cancelled
13:58:45:Model for target Cancelled trained.
13:58:45:Training model for target ArrDel15
13:59:35:Model for target ArrDel15 trained.
13:59:35:Training model for target ArrDelay
13:59:43:Model for target ArrDelay trained.


In [15]:
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(bagging_model, target_col, "Bagging")
compute_regression_metrics(bagging_model, "Bagging")

Bagging accuracy for Cancelled: 0.9792 vs. 0.50 baseline
Bagging AUC for Cancelled: 0.5934 vs. 0.50 baseline
Bagging accuracy for ArrDel15: 0.7355 vs. 0.50 baseline
Bagging AUC for ArrDel15: 0.5760 vs. 0.50 baseline
Bagging RMSE for ArrDelay: 110.9502 vs. 121.0417 baseline


## Histogram Based Gradient Boosting

In [16]:
boost_model = FlightPerformanceModel(
    targets={
        c.CANCELLED_COL: {
            "type": "classification",
            "estimator": HistGradientBoostingClassifier,
            "params": {"random_state": 42}
        },
        c.DELAYED_COL: {
            "type": "classification",
            "estimator": HistGradientBoostingClassifier,
            "params": {"random_state": 42}
        },
        c.DELAY_DURATION_COL: {
            "type": "regression",
            "estimator": HistGradientBoostingRegressor,
            "params": {"random_state": 42}
        },
    },
)
boost_model.fit(df_train.sample(frac=0.5, random_state=42))

Number of train rows with NaN: 0
14:00:17:Training model for target Cancelled
14:00:39:Model for target Cancelled trained.
14:00:39:Training model for target ArrDel15
14:01:04:Model for target ArrDel15 trained.
14:01:04:Training model for target ArrDelay
14:01:11:Model for target ArrDelay trained.


In [17]:
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(boost_model, target_col, "Histogram Gradient Boosting")
compute_regression_metrics(boost_model, "Histogram Gradient Boosting")

Histogram Gradient Boosting accuracy for Cancelled: 0.9846 vs. 0.50 baseline
Histogram Gradient Boosting AUC for Cancelled: 0.7645 vs. 0.50 baseline
Histogram Gradient Boosting accuracy for ArrDel15: 0.7941 vs. 0.50 baseline
Histogram Gradient Boosting AUC for ArrDel15: 0.6215 vs. 0.50 baseline
Histogram Gradient Boosting RMSE for ArrDelay: 98.1281 vs. 121.0417 baseline


## XGBoost

In [18]:
# Uses 100 trees
xgboost_model = FlightPerformanceModel(
    targets={
        c.CANCELLED_COL: {
            "type": "classification",
            "estimator": XGBClassifier,
            "params": {"random_state": 42}
        },
        c.DELAYED_COL: {
            "type": "classification",
            "estimator": XGBClassifier,
            "params": {"random_state": 42}
        },
        c.DELAY_DURATION_COL: {
            "type": "regression",
            "estimator": XGBRegressor,
            "params": {"random_state": 42}
        },
    },
)
xgboost_model.fit(df_train.sample(frac=0.5, random_state=42))

Number of train rows with NaN: 0
14:01:38:Training model for target Cancelled
14:01:43:Model for target Cancelled trained.
14:01:43:Training model for target ArrDel15
14:01:48:Model for target ArrDel15 trained.
14:01:48:Training model for target ArrDelay
14:01:50:Model for target ArrDelay trained.


In [19]:
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(xgboost_model, target_col, "XGBoost")
compute_regression_metrics(xgboost_model, "XGBoost")

XGBoost accuracy for Cancelled: 0.9846 vs. 0.50 baseline
XGBoost AUC for Cancelled: 0.7856 vs. 0.50 baseline
XGBoost accuracy for ArrDel15: 0.7941 vs. 0.50 baseline
XGBoost AUC for ArrDel15: 0.6319 vs. 0.50 baseline
XGBoost RMSE for ArrDelay: 98.1731 vs. 121.0417 baseline


## XGBoost with 1000 trees on larger training set

In [20]:
# Uses 1000 trees
full_xgboost_model = FlightPerformanceModel(
    targets={
        c.CANCELLED_COL: {
            "type": "classification",
            "estimator": XGBClassifier,
            "params": {"n_estimators": 1000, "random_state": 42}
        },
        c.DELAYED_COL: {
            "type": "classification",
            "estimator": XGBClassifier,
            "params": {"n_estimators": 1000, "random_state": 42}
        },
        c.DELAY_DURATION_COL: {
            "type": "regression",
            "estimator": XGBRegressor,
            "params": {"n_estimators": 1000, "random_state": 42}
        },
    },
)
full_xgboost_model.fit(df_train)

Number of train rows with NaN: 1
14:02:33:Training model for target Cancelled
14:03:49:Model for target Cancelled trained.
14:03:49:Training model for target ArrDel15
14:05:06:Model for target ArrDel15 trained.
14:05:06:Training model for target ArrDelay
14:05:24:Model for target ArrDelay trained.


In [21]:
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(full_xgboost_model, target_col, "Full XGBoost")
compute_regression_metrics(full_xgboost_model, "Full XGBoost")

Full XGBoost accuracy for Cancelled: 0.9846 vs. 0.50 baseline
Full XGBoost AUC for Cancelled: 0.8101 vs. 0.50 baseline
Full XGBoost accuracy for ArrDel15: 0.7944 vs. 0.50 baseline
Full XGBoost AUC for ArrDel15: 0.6494 vs. 0.50 baseline
Full XGBoost RMSE for ArrDelay: 99.4478 vs. 121.0417 baseline


## CatBoost

In [22]:
catboost_model = FlightPerformanceModel(
    targets={
        c.CANCELLED_COL: {
            "type": "classification",
            "estimator": CatBoostClassifier,
            "params": {"random_state": 42, "verbose": False}
        },
        c.DELAYED_COL: {
            "type": "classification",
            "estimator": CatBoostClassifier,
            "params": {"random_state": 42, "verbose": False}
        },
        c.DELAY_DURATION_COL: {
            "type": "regression",
            "estimator": CatBoostRegressor,
            "params": {"random_state": 42, "verbose": False}
        },
    },
)
catboost_model.fit(df_train.sample(frac=0.5, random_state=42))

Number of train rows with NaN: 0
14:05:52:Training model for target Cancelled
14:08:00:Model for target Cancelled trained.
14:08:00:Training model for target ArrDel15
14:10:04:Model for target ArrDel15 trained.
14:10:04:Training model for target ArrDelay
14:10:25:Model for target ArrDelay trained.


In [23]:
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(catboost_model, target_col, "CatBoost")
compute_regression_metrics(catboost_model, "CatBoost")

CatBoost accuracy for Cancelled: 0.9846 vs. 0.50 baseline
CatBoost AUC for Cancelled: 0.7958 vs. 0.50 baseline
CatBoost accuracy for ArrDel15: 0.7942 vs. 0.50 baseline
CatBoost AUC for ArrDel15: 0.6395 vs. 0.50 baseline
CatBoost RMSE for ArrDelay: 98.1762 vs. 121.0417 baseline


## CatBoost on larger training set

In [24]:
full_catboost_model = FlightPerformanceModel(
    targets={
        c.CANCELLED_COL: {
            "type": "classification",
            "estimator": CatBoostClassifier,
            "params": {"random_state": 42, "verbose": False}
        },
        c.DELAYED_COL: {
            "type": "classification",
            "estimator": CatBoostClassifier,
            "params": {"random_state": 42, "verbose": False}
        },
        c.DELAY_DURATION_COL: {
            "type": "regression",
            "estimator": CatBoostRegressor,
            "params": {"random_state": 42, "verbose": False}
        },
    },
)
full_catboost_model.fit(df_train)

Number of train rows with NaN: 1
14:11:51:Training model for target Cancelled
14:15:59:Model for target Cancelled trained.
14:15:59:Training model for target ArrDel15
14:19:59:Model for target ArrDel15 trained.
14:19:59:Training model for target ArrDelay
14:20:38:Model for target ArrDelay trained.


In [25]:
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(full_catboost_model, target_col, "Full CatBoost")
compute_regression_metrics(full_catboost_model, "Full CatBoost")

Full CatBoost accuracy for Cancelled: 0.9846 vs. 0.50 baseline
Full CatBoost AUC for Cancelled: 0.8047 vs. 0.50 baseline
Full CatBoost accuracy for ArrDel15: 0.7943 vs. 0.50 baseline
Full CatBoost AUC for ArrDel15: 0.6434 vs. 0.50 baseline
Full CatBoost RMSE for ArrDelay: 97.9780 vs. 121.0417 baseline
