# **Nonlinear Model Testing**
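This notebook trains and evaluates several nonlinear models (random forest, bagging, and gradient-boosting variants), alongside Gaussian Naive Bayes and ElasticNet, on the flight data.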

In [None]:
# Change CWD to repo base for imports
import os
from pathlib import Path

# Path().resolve() is the absolute current working directory at this point.
notebook_path = Path().resolve()
parent_directory = notebook_path.parent
# Compare the last path component via Path.name instead of splitting the
# string on "/": with backslash separators (Windows) the split never matches,
# so the CWD would be moved up one level even when already at the repo base.
if Path.cwd().name != "theory_of_ml_group4":
    os.chdir(parent_directory)

# library imports
from etl import get_flight_data
from models import FlightPerformanceModel
import constants as c
# computation
import numpy as np
# model utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
# models
from sklearn.linear_model import ElasticNet
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import logging

# Fetch the logger named "modeling" and set its threshold to INFO.
# NOTE(review): presumably the project's modeling code logs under this name — confirm.
log = logging.getLogger("modeling")
log.setLevel(logging.INFO)

In [None]:
# Load the flight dataset through the project's ETL helper.
df = get_flight_data()
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
def compute_classifier_metrics(model, targets, name):
    """Print test-set accuracy and ROC AUC for a binary classifier model.

    Scoring is done against the module-level ``df_test`` frame.

    Args:
        model: Fitted model exposing ``predict(df, target)`` and
            ``predict_proba(df, target)``.
        targets: Name of the single target column to score (despite the
            plural parameter name).
        name: Display label used as the prefix of both printed lines.
    """
    predicted_labels = model.predict(df_test, targets)
    # Rows with a missing ground-truth value are scored as class 0.
    true_labels = df_test[targets].fillna(0)
    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f"{name} accuracy for {targets}: {accuracy:.4f} vs. 0.50 baseline")

    predicted_scores = model.predict_proba(df_test, targets)
    auc = roc_auc_score(true_labels, predicted_scores)
    print(f"{name} AUC for {targets}: {auc:.4f} vs. 0.50 baseline")


def compute_regression_metrics(model, name):
    """Print the delay-duration RMSE of a regression model on delayed test flights.

    Only rows of the module-level ``df_test`` whose delayed flag equals 1.0
    are scored. The printed baseline is the RMSE obtained by predicting a
    duration of zero for every one of those rows.

    Args:
        model: Fitted model exposing ``predict(df, target_col=...)``.
        name: Display label used as the prefix of the printed line.
    """
    is_delayed = df_test[c.DELAYED_COL] == 1.0

    y_pred_delay_duration = model.predict(
        df_test.loc[is_delayed.values],
        target_col=c.DELAY_DURATION_COL,
    )
    actual_duration = df_test.loc[is_delayed][c.DELAY_DURATION_COL]

    squared_error = (actual_duration - y_pred_delay_duration) ** 2
    model_rmse = np.sqrt(np.mean(squared_error))
    # Root mean square of the actual durations == error of an all-zero prediction.
    baseline_rmse = np.sqrt(np.mean(actual_duration ** 2))
    print(f"{name} RMSE for {c.DELAY_DURATION_COL}: {model_rmse:.4f} vs. {baseline_rmse:.4f} baseline")

## ElasticNet and Naive Bayes

In [None]:
# Gaussian Naive Bayes for both classification targets and ElasticNet for the
# delay-duration regression; no hyperparameter overrides (empty params).
bayes_elastic_model = FlightPerformanceModel(
    targets={
        target: {"type": task, "estimator": estimator_cls, "params": {}}
        for target, task, estimator_cls in (
            (c.CANCELLED_COL, "classification", GaussianNB),
            (c.DELAYED_COL, "classification", GaussianNB),
            (c.DELAY_DURATION_COL, "regression", ElasticNet),
        )
    },
)
# Fit on a seeded 50% subsample of the training split.
bayes_elastic_model.fit(df_train.sample(frac=0.5, random_state=42))

In [None]:
# Score on the held-out test set: accuracy/AUC for each classification target,
# then RMSE for the delay-duration regression.
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(bayes_elastic_model, target_col, "Naive Bayes")
compute_regression_metrics(bayes_elastic_model, "ElasticNet")

## Random Forest

In [None]:
# Uses 100 trees (n_estimators is not overridden below).
# Every target gets the same settings: all cores (n_jobs=-1) and a fixed seed.
rf_model = FlightPerformanceModel(
    targets={
        target: {
            "type": task,
            "estimator": estimator_cls,
            "params": {"n_jobs": -1, "random_state": 42},
        }
        for target, task, estimator_cls in (
            (c.CANCELLED_COL, "classification", RandomForestClassifier),
            (c.DELAYED_COL, "classification", RandomForestClassifier),
            (c.DELAY_DURATION_COL, "regression", RandomForestRegressor),
        )
    },
)
# Fit on a seeded 50% subsample of the training split.
rf_model.fit(df_train.sample(frac=0.5, random_state=42))

In [None]:
# Score on the held-out test set: accuracy/AUC for each classification target,
# then RMSE for the delay-duration regression.
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(rf_model, target_col, "Random Forest")
compute_regression_metrics(rf_model, "Random Forest")

## Bagging

In [None]:
# Uses 10 trees (n_estimators is not overridden below).
# Every target gets the same settings: all cores (n_jobs=-1) and a fixed seed.
bagging_model = FlightPerformanceModel(
    targets={
        target: {
            "type": task,
            "estimator": estimator_cls,
            "params": {"n_jobs": -1, "random_state": 42},
        }
        for target, task, estimator_cls in (
            (c.CANCELLED_COL, "classification", BaggingClassifier),
            (c.DELAYED_COL, "classification", BaggingClassifier),
            (c.DELAY_DURATION_COL, "regression", BaggingRegressor),
        )
    },
)
# Fit on a seeded 50% subsample of the training split.
bagging_model.fit(df_train.sample(frac=0.5, random_state=42))

In [None]:
# Score on the held-out test set: accuracy/AUC for each classification target,
# then RMSE for the delay-duration regression.
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(bagging_model, target_col, "Bagging")
compute_regression_metrics(bagging_model, "Bagging")

## Histogram-Based Gradient Boosting

In [None]:
# Histogram-based gradient boosting for all three targets; only the seed is set.
boost_model = FlightPerformanceModel(
    targets={
        target: {
            "type": task,
            "estimator": estimator_cls,
            "params": {"random_state": 42},
        }
        for target, task, estimator_cls in (
            (c.CANCELLED_COL, "classification", HistGradientBoostingClassifier),
            (c.DELAYED_COL, "classification", HistGradientBoostingClassifier),
            (c.DELAY_DURATION_COL, "regression", HistGradientBoostingRegressor),
        )
    },
)
# Fit on a seeded 50% subsample of the training split.
boost_model.fit(df_train.sample(frac=0.5, random_state=42))

In [None]:
# Score on the held-out test set: accuracy/AUC for each classification target,
# then RMSE for the delay-duration regression.
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(boost_model, target_col, "Histogram Gradient Boosting")
compute_regression_metrics(boost_model, "Histogram Gradient Boosting")

## XGBoost

In [None]:
# Uses 100 trees (n_estimators is not overridden below); only the seed is set.
xgboost_model = FlightPerformanceModel(
    targets={
        target: {
            "type": task,
            "estimator": estimator_cls,
            "params": {"random_state": 42},
        }
        for target, task, estimator_cls in (
            (c.CANCELLED_COL, "classification", XGBClassifier),
            (c.DELAYED_COL, "classification", XGBClassifier),
            (c.DELAY_DURATION_COL, "regression", XGBRegressor),
        )
    },
)
# Fit on a seeded 50% subsample of the training split.
xgboost_model.fit(df_train.sample(frac=0.5, random_state=42))

In [None]:
# Score on the held-out test set: accuracy/AUC for each classification target,
# then RMSE for the delay-duration regression.
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(xgboost_model, target_col, "XGBoost")
compute_regression_metrics(xgboost_model, "XGBoost")

## XGBoost with 1000 trees on full training set

In [None]:
# Uses 1000 trees (n_estimators=1000) with a fixed seed for every target.
full_xgboost_model = FlightPerformanceModel(
    targets={
        target: {
            "type": task,
            "estimator": estimator_cls,
            "params": {"n_estimators": 1000, "random_state": 42},
        }
        for target, task, estimator_cls in (
            (c.CANCELLED_COL, "classification", XGBClassifier),
            (c.DELAYED_COL, "classification", XGBClassifier),
            (c.DELAY_DURATION_COL, "regression", XGBRegressor),
        )
    },
)
# Unlike the subsampled runs, this one fits on the entire training split.
full_xgboost_model.fit(df_train)

In [None]:
# Score on the held-out test set: accuracy/AUC for each classification target,
# then RMSE for the delay-duration regression.
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(full_xgboost_model, target_col, "Full XGBoost")
compute_regression_metrics(full_xgboost_model, "Full XGBoost")

## CatBoost

In [None]:
# CatBoost for all three targets: fixed seed, training output silenced.
catboost_model = FlightPerformanceModel(
    targets={
        target: {
            "type": task,
            "estimator": estimator_cls,
            "params": {"random_state": 42, "verbose": False},
        }
        for target, task, estimator_cls in (
            (c.CANCELLED_COL, "classification", CatBoostClassifier),
            (c.DELAYED_COL, "classification", CatBoostClassifier),
            (c.DELAY_DURATION_COL, "regression", CatBoostRegressor),
        )
    },
)
# Fit on a seeded 50% subsample of the training split.
catboost_model.fit(df_train.sample(frac=0.5, random_state=42))

In [None]:
# Score on the held-out test set: accuracy/AUC for each classification target,
# then RMSE for the delay-duration regression.
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(catboost_model, target_col, "CatBoost")
compute_regression_metrics(catboost_model, "CatBoost")

## CatBoost on full training set

In [None]:
# Same CatBoost configuration as the subsampled run: fixed seed, output silenced.
full_catboost_model = FlightPerformanceModel(
    targets={
        target: {
            "type": task,
            "estimator": estimator_cls,
            "params": {"random_state": 42, "verbose": False},
        }
        for target, task, estimator_cls in (
            (c.CANCELLED_COL, "classification", CatBoostClassifier),
            (c.DELAYED_COL, "classification", CatBoostClassifier),
            (c.DELAY_DURATION_COL, "regression", CatBoostRegressor),
        )
    },
)
# This run fits on the entire training split rather than a 50% subsample.
full_catboost_model.fit(df_train)

In [None]:
# Score on the held-out test set: accuracy/AUC for each classification target,
# then RMSE for the delay-duration regression.
for target_col in [c.CANCELLED_COL, c.DELAYED_COL]:
    compute_classifier_metrics(full_catboost_model, target_col, "Full CatBoost")
compute_regression_metrics(full_catboost_model, "Full CatBoost")