In [None]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Literal
from numpy import array, ndarray, argsort, arange

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelBinarizer

import sys
sys.path.insert(1, '../../../utils')
from dslabs_functions import CLASS_EVAL_METRICS, DELTA_IMPROVE, plot_bar_chart, plot_multiline_chart, plot_evaluation_results, plot_horizontal_bar_chart, HEIGHT, plot_line_chart

import seaborn as sns

from matplotlib.pyplot import figure, savefig, show, subplots

import pandas as pd


In [None]:
# Read the combined 2022 flights CSV into a DataFrame, then preview it.
# head(n=5) is the pandas default preview length, spelled out explicitly.
flight_df = pd.read_csv(filepath_or_buffer="../data/raw/Combined_Flights_2022.csv")
flight_df.head(n=5)

In [None]:
# Build the feature matrix from everything except the "Cancelled" label,
# discarding incomplete columns in a single chain.
flight_features = (
    flight_df
    .drop(columns=["Cancelled"])   # the label must not appear among the features
    .dropna(axis=1, how="all")     # drop columns that are entirely NaN
    .dropna(axis=1, how="any")     # drop columns that contain at least one NaN
)

# Both dropna calls use axis=1 (columns), so no rows were removed above; this
# index-based selection keeps the target aligned with the features regardless.
flight_target = flight_df["Cancelled"].loc[flight_features.index]

# Keep only numeric columns for the models.
flight_features = flight_features.select_dtypes(include=["number"])

print("Final target distribution:")
print(flight_target.value_counts(dropna=False))

In [None]:
# Work on a fixed-size random subset: the full data set is huge and the models
# take too long to train on it.
sample_size = 200000

# Sample the feature rows reproducibly, then pick the labels of the same rows.
flight_features_sampled = flight_features.sample(random_state=42, n=sample_size)
flight_target_sampled = flight_target.loc[flight_features_sampled.index]

print(f"\nSampled features shape: {flight_features_sampled.shape}")
print("Sampled target distribution:")
print(flight_target_sampled.value_counts(dropna=False))

# 70/30 split, stratified on the target so both parts keep the class ratio.
(
    flight_features_train,
    flight_features_test,
    flight_target_train,
    flight_target_test,
) = train_test_split(
    flight_features_sampled,
    flight_target_sampled,
    stratify=flight_target_sampled,
    random_state=42,
    test_size=0.3,
)

**NAIVE BAYES ANALYSIS - FLIGHT DATA**

In [None]:
# Integer-encoded copies of the train/test targets used by the Naive Bayes study.
flight_target_train_bin = flight_target_train.astype(int)
flight_target_test_bin = flight_target_test.astype(int)

file_tag = "flights"
eval_metric = "accuracy"

# NOTE(review): naive_Bayes_study is neither defined nor imported in the cells
# shown here; it must be defined elsewhere in the notebook before this cell runs.

# Study 1: select the Naive Bayes variant by accuracy.
figure()
flight_nb_best_model_acc, flight_nb_params_acc = naive_Bayes_study(
    flight_features_train,
    flight_target_train_bin,
    flight_features_test,
    flight_target_test_bin,
    metric=eval_metric
)
# Save next to the other lab charts. The original wrote this one figure to the
# current working directory, unlike every other savefig call in the notebook.
savefig(f"../charts/lab1_baseline/{file_tag}_nb_{eval_metric}_study.png")
show()

# Study 2: same comparison, selecting by recall instead.
figure()
flight_nb_best_model_rec, flight_nb_params_rec = naive_Bayes_study(
    flight_features_train,
    flight_target_train_bin,
    flight_features_test,
    flight_target_test_bin,
    metric="recall"
)
savefig(f"../charts/lab1_baseline/{file_tag}_nb_recall_study.png")
show()

In [None]:
# Predict with the accuracy-selected Naive Bayes model on both splits.
prd_trn_nb_f = flight_nb_best_model_acc.predict(flight_features_train)
prd_tst_nb_f = flight_nb_best_model_acc.predict(flight_features_test)

# np.unique already returns the distinct labels in ascending order.
nb_labels_f = list(np.unique(flight_target_train_bin))

# Convert truths and predictions to plain arrays, in the order the plotting
# helper receives them: train truth, train prediction, test truth, test prediction.
nb_eval_arrays_f = [
    array(values)
    for values in (
        flight_target_train_bin,
        prd_trn_nb_f,
        flight_target_test_bin,
        prd_tst_nb_f,
    )
]

figure()
plot_evaluation_results(flight_nb_params_acc, *nb_eval_arrays_f, nb_labels_f)

# The chart file name embeds the model name and metric stored in the params dict.
nb_model_name_f = flight_nb_params_acc["name"]
nb_metric_name_f = flight_nb_params_acc["metric"]
savefig(f'../charts/lab1_baseline/{file_tag}_{nb_model_name_f}_best_{nb_metric_name_f}_eval.png')
show()

**LOGISTIC REGRESSION ANALYSIS - FLIGHT DATA**

In [None]:
# Run the logistic regression study on the flight split. The call returns two
# values: a collection of best models (indexed by "l1" in the next cell) and
# the final metrics.
# NOTE(review): logistic_regression_study is not defined or imported in the
# cells shown here; it must be available from elsewhere in the notebook.
(lr_flight_best_models, lr_flight_final_metrics) = logistic_regression_study(
    flight_features_train, flight_target_train,
    flight_features_test, flight_target_test,
    lag=5,
    nr_max_iterations=100,
)

In [None]:
# Evaluate the "l1" model returned by the logistic regression study on both
# splits: summary metrics, a grouped bar chart and a test-set confusion matrix.
best_lr_flight = lr_flight_best_models["l1"]

pred_train_lr_flight = best_lr_flight.predict(flight_features_train)
pred_test_lr_flight = best_lr_flight.predict(flight_features_test)

# 0/1 encodings of the true labels and of the hard predictions.
# The .ravel() calls assume a binary target (LabelBinarizer yields one column).
lb_lr_flight = LabelBinarizer()
y_train_bin_lr_flight = lb_lr_flight.fit_transform(flight_target_train).ravel()
y_test_bin_lr_flight = lb_lr_flight.transform(flight_target_test).ravel()
pred_train_bin_lr_flight = lb_lr_flight.transform(pred_train_lr_flight).ravel()
pred_test_bin_lr_flight = lb_lr_flight.transform(pred_test_lr_flight).ravel()

# ROC AUC needs a continuous score. Feeding it hard 0/1 predictions collapses
# the curve to a single operating point, so use the positive-class probability.
# Column 1 of predict_proba is classes_[1], the same (larger) label that
# LabelBinarizer encodes as 1, since both order the labels by sorting.
# NOTE(review): assumes the study returns an estimator exposing predict_proba
# (e.g. sklearn LogisticRegression) - confirm.
score_train_lr_flight = best_lr_flight.predict_proba(flight_features_train)[:, 1]
score_test_lr_flight = best_lr_flight.predict_proba(flight_features_test)[:, 1]


def _lr_flight_scores(y_true, y_pred, y_true_bin, y_score):
    """Return the five summary metrics for one data split.

    Parameters:
        y_true: true labels of the split.
        y_pred: hard label predictions for the split.
        y_true_bin: 0/1 encoding of ``y_true``.
        y_score: positive-class scores used for ROC AUC.

    Returns:
        dict mapping "Accuracy", "Recall", "Precision", "F1" and "AUC" (in
        that order) to their values.
    """
    # NOTE(review): with average="weighted", recall is mathematically equal to
    # accuracy; consider the positive-class (binary) recall if the aim is to
    # measure how many cancelled flights are detected.
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred, average="weighted", zero_division=0),
        "Precision": precision_score(y_true, y_pred, average="weighted", zero_division=0),
        "F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "AUC": roc_auc_score(y_true_bin, y_score),
    }


lr_flight_train_metrics = _lr_flight_scores(
    flight_target_train, pred_train_lr_flight, y_train_bin_lr_flight, score_train_lr_flight
)
lr_flight_test_metrics = _lr_flight_scores(
    flight_target_test, pred_test_lr_flight, y_test_bin_lr_flight, score_test_lr_flight
)

# A bare expression in the middle of a cell is never displayed, so print the
# two metric dictionaries explicitly.
print("Train metrics:", lr_flight_train_metrics)
print("Test metrics:", lr_flight_test_metrics)

# Grouped bar chart: one group per split, one bar per metric.
metrics = list(lr_flight_train_metrics.keys())
train_vals = [lr_flight_train_metrics[m] for m in metrics]
test_vals  = [lr_flight_test_metrics[m] for m in metrics]

groups = np.array([0, 1])
group_labels = ["Train", "Test"]

n_metrics = len(metrics)
group_width = 0.8
bar_width = group_width / n_metrics

colors = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple"][:n_metrics]

plt.figure(figsize=(10, 6))
for i, metric in enumerate(metrics):
    # Centre bar i inside each group's horizontal slot.
    positions = groups - group_width/2 + (i + 0.5) * bar_width
    split_vals = [train_vals[i], test_vals[i]]
    plt.bar(positions,
            split_vals,
            width=bar_width,
            label=metric,
            color=colors[i])
    # Write each value just above its bar.
    for x, val in zip(positions, split_vals):
        plt.text(x, val + 0.01, f"{val:.3f}", ha="center", fontsize=8)

plt.xticks(groups, group_labels)
plt.ylim(0, 1.05)
plt.ylabel("Score")
plt.title("Logistic Regression (L1, Flights) – Train vs Test Performance")
plt.legend(title="Metric")
plt.tight_layout()
plt.show()


# Confusion matrix of the hard predictions on the test split.
cm_lr_flight = confusion_matrix(flight_target_test, pred_test_lr_flight)

plt.figure(figsize=(6, 5))
sns.heatmap(cm_lr_flight, annot=True, fmt='d', cmap='Blues')
plt.title("Logistic Regression (L1, Flights) – Confusion Matrix (Test Set)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()
