In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw traffic-accident records and preview the first rows.
# (pandas is imported in the notebook's import cell, not here.)
traf_df = pd.read_csv("data/traffic_accidents.csv")
traf_df.head()


In [None]:
# Separate the label from the predictors before any column filtering,
# so the target survives the numeric-only selection below.
traf_target = traf_df["crash_type"]
traf_features = traf_df.drop(columns=["crash_type"])

traf_features = traf_features.dropna(axis=1, how="all")  # drop columns that are entirely empty
traf_features = traf_features.dropna(axis=0, how="any")  # drop rows with any missing value

# NOTE(review): the row filter above also discards rows whose only gaps are in
# non-numeric columns, which are thrown away just below -- confirm this is intended.
traf_features = traf_features.select_dtypes(include=["number"])  # keep only numeric columns

# Re-align the target with the surviving rows, then drop rows whose label is
# missing (a NaN label cannot be used for stratified splitting or training).
traf_target = traf_target.loc[traf_features.index]
traf_has_label = traf_target.notna()
if not traf_has_label.all():
    traf_features = traf_features.loc[traf_has_label]
    traf_target = traf_target.loc[traf_has_label]

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
traf_features.info()
print("Features shape:", traf_features.shape)
print("Target shape:", traf_target.shape)


In [None]:
# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class proportions similar in both partitions; the fixed seed makes the
# split repeatable.
traf_split = train_test_split(
    traf_features, traf_target,
    test_size=0.3, random_state=42, stratify=traf_target,
)
traf_features_train, traf_features_test, traf_target_train, traf_target_test = traf_split

# Sanity check: print train/test shapes for the features, then for the target.
for traf_train_part, traf_test_part in (
    (traf_features_train, traf_features_test),
    (traf_target_train, traf_target_test),
):
    print(traf_train_part.shape, traf_test_part.shape)

In [None]:
# Evaluation helper shared by the traffic and flight model runs.

def evaluate_model(model_name, model, features_train, target_train, features_test, target_test,
                   average="weighted"):
    """Fit ``model`` on the training split, score it on the test split, and print a report.

    Parameters
    ----------
    model_name : str
        Label printed in the report header.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    features_train, target_train
        Training predictors and labels passed to ``model.fit``.
    features_test, target_test
        Held-out predictors passed to ``model.predict`` and the true labels
        the predictions are scored against.
    average : str, default "weighted"
        Averaging mode forwarded to ``precision_score`` and ``recall_score``.
        With the default ``"weighted"``, recall is mathematically equal to
        accuracy, so pass e.g. ``"macro"`` to get a recall figure that is
        sensitive to minority-class performance.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, predictions)`` where ``predictions``
        is whatever ``model.predict(features_test)`` returned.
    """
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)

    accuracy = accuracy_score(target_test, predictions)
    # zero_division=0: a class with an undefined score contributes 0 instead
    # of triggering a warning.
    precision = precision_score(target_test, predictions, average=average, zero_division=0)
    recall = recall_score(target_test, predictions, average=average, zero_division=0)

    print(f"--- {model_name} ---")
    print(f"Accuracy : {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")
    print()

    return accuracy, precision, recall, predictions

In [None]:
# Traffic classifiers, kept under their own names so later cells can inspect
# the fitted estimators.
traf_naive_bayes = GaussianNB()
traf_logistic_regression = LogisticRegression(max_iter=1000)
traf_knn = KNeighborsClassifier(n_neighbors=5)
traf_decision_tree = DecisionTreeClassifier(random_state=42)
traf_mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, random_state=42)

# (results key, report title, estimator) -- evaluated in this order.
traf_model_specs = [
    ('Naive Bayes', "Naive Bayes (Traffic)", traf_naive_bayes),
    ('Logistic Regression', "Logistic Regression (Traffic)", traf_logistic_regression),
    ('KNN', "KNN (Traffic, k=5)", traf_knn),
    ('Decision Tree', "Decision Tree (Traffic)", traf_decision_tree),
    ('MLP', "MLP (Traffic, 50 hidden units)", traf_mlp),
]

# Maps each model's short name to its test metrics and predictions.
traf_results = {}
for traf_key, traf_title, traf_model in traf_model_specs:
    acc, prec, rec, preds = evaluate_model(
        traf_title,
        traf_model,
        traf_features_train,
        traf_target_train,
        traf_features_test,
        traf_target_test,
    )
    traf_results[traf_key] = {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'predictions': preds,
    }

In [None]:
# Performance comparison for Traffic Accidents dataset:
# grouped bars (accuracy / precision / recall) per model.
models = list(traf_results.keys())
accuracy_scores = [traf_results[m]['accuracy'] for m in models]
precision_scores = [traf_results[m]['precision'] for m in models]
recall_scores = [traf_results[m]['recall'] for m in models]

x = np.arange(len(models))
width = 0.25  # three bars per model, each a quarter of the slot

fig, ax = plt.subplots(figsize=(12, 6))
bars1 = ax.bar(x - width, accuracy_scores, width, label='Accuracy', alpha=0.8)
bars2 = ax.bar(x, precision_scores, width, label='Precision', alpha=0.8)
bars3 = ax.bar(x + width, recall_scores, width, label='Recall', alpha=0.8)

ax.set_xlabel('Models', fontsize=12)
ax.set_ylabel('Scores', fontsize=12)
ax.set_title('Traffic Accidents: Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(models, rotation=45, ha='right')
ax.legend()

# Zoom the y-axis onto the observed scores instead of a hard-coded window, so
# no bar is clipped or hidden when a rerun produces different numbers.
# The axis is deliberately truncated (it does not start at 0).
all_scores = accuracy_scores + precision_scores + recall_scores
if all_scores:
    y_pad = 0.02
    ax.set_ylim(max(0.0, min(all_scores) - y_pad), min(1.0, max(all_scores) + y_pad))
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
# Create the output folder on a fresh checkout; savefig does not create it.
chart_path = Path('charts/traffic_performance_comparison.png')
chart_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Confusion Matrices for Traffic Accidents dataset: one heatmap per model.
n_models = len(traf_results)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols))  # ceiling division; at least one row

# Fix the class order once so every matrix shares the same axes and the ticks
# show real class names. traf_target is the full label set the train/test
# split was drawn from, so it covers every true and predicted label.
class_labels = np.unique(traf_target).tolist()

# squeeze=False always yields a 2-D axes array, so ravel() works for any grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.ravel()

for idx, (model_name, results) in enumerate(traf_results.items()):
    cm = confusion_matrix(traf_target_test, results['predictions'], labels=class_labels)

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                xticklabels=class_labels, yticklabels=class_labels,
                cbar_kws={'label': 'Count'})
    axes[idx].set_title(f'{model_name}\nAccuracy: {results["accuracy"]:.4f}',
                        fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Predicted Label', fontsize=10)
    axes[idx].set_ylabel('True Label', fontsize=10)
    axes[idx].tick_params(labelsize=8)

# Hide whichever grid cells were not used by a model.
for unused_ax in axes[n_models:]:
    unused_ax.axis('off')

plt.suptitle('Traffic Accidents: Confusion Matrices', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
# Create the output folder on a fresh checkout; savefig does not create it.
chart_path = Path('charts/traffic_confusion_matrices.png')
chart_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Read the combined 2022 flights table and preview its first rows.
flight_csv = "data/Combined_Flights_2022.csv"
flight_df = pd.read_csv(flight_csv)
flight_df.head()

In [None]:
# Separate the label from the predictors.
flight_target = flight_df["Cancelled"]
flight_features = flight_df.drop(columns=["Cancelled"])

# Drop every column that has at least one missing value. This also removes
# fully-empty columns and leaves no row with gaps, so no separate
# all-empty-column pass or row-wise dropna is needed afterwards.
flight_features = flight_features.dropna(axis=1, how="any")

# Keep only numeric predictors.
flight_features = flight_features.select_dtypes(include=["number"])

# Align the target with the feature rows, then drop rows whose label is
# missing (a NaN label cannot be used for stratified splitting or training).
flight_target = flight_target.loc[flight_features.index]
flight_has_label = flight_target.notna()
if not flight_has_label.all():
    flight_features = flight_features.loc[flight_has_label]
    flight_target = flight_target.loc[flight_has_label]

print("Final target distribution:")
print(flight_target.value_counts(dropna=False))

In [None]:
sample_size = 200000  # smaller because data set is huge, models take too long

# Never ask for more rows than exist: DataFrame.sample raises ValueError when
# n exceeds the population and replace=False (the default).
flight_sample_n = min(sample_size, len(flight_features))

flight_features_sampled = flight_features.sample(n=flight_sample_n, random_state=42)
flight_target_sampled = flight_target.loc[flight_features_sampled.index]  # labels for the sampled rows

print("\nSampled features shape:", flight_features_sampled.shape)
print("Sampled target distribution:")
print(flight_target_sampled.value_counts(dropna=False))

# 70/30 split, stratified on the target so both partitions keep similar
# class proportions.
flight_features_train, flight_features_test, flight_target_train, flight_target_test = train_test_split(
    flight_features_sampled,
    flight_target_sampled,
    test_size=0.3,
    random_state=42,
    stratify=flight_target_sampled
)

In [None]:
# Flight classifiers, kept under their own names so later cells can inspect
# the fitted estimators.
flight_nb = GaussianNB()
flight_lr = LogisticRegression(max_iter=2000)
flight_knn = KNeighborsClassifier(n_neighbors=5)
flight_dt = DecisionTreeClassifier(random_state=42)
flight_mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, random_state=42)

# (results key, report title, estimator) -- evaluated in this order.
flight_model_specs = [
    ('Naive Bayes', "Naive Bayes (Flights)", flight_nb),
    ('Logistic Regression', "Logistic Regression (Flights)", flight_lr),
    ('KNN', "KNN (Flights, k=5)", flight_knn),
    ('Decision Tree', "Decision Tree (Flights)", flight_dt),
    ('MLP', "MLP (Flights, 50 hidden units)", flight_mlp),
]

# Maps each model's short name to its test metrics and predictions.
flight_results = {}
for flight_key, flight_title, flight_model in flight_model_specs:
    acc, prec, rec, preds = evaluate_model(
        flight_title,
        flight_model,
        flight_features_train,
        flight_target_train,
        flight_features_test,
        flight_target_test,
    )
    flight_results[flight_key] = {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'predictions': preds,
    }

In [None]:
# Performance comparison for Flights dataset:
# grouped bars (accuracy / precision / recall) per model.
models = list(flight_results.keys())
accuracy_scores = [flight_results[m]['accuracy'] for m in models]
precision_scores = [flight_results[m]['precision'] for m in models]
recall_scores = [flight_results[m]['recall'] for m in models]

x = np.arange(len(models))
width = 0.25  # three bars per model, each a quarter of the slot

fig, ax = plt.subplots(figsize=(12, 6))
bars1 = ax.bar(x - width, accuracy_scores, width, label='Accuracy', alpha=0.8)
bars2 = ax.bar(x, precision_scores, width, label='Precision', alpha=0.8)
bars3 = ax.bar(x + width, recall_scores, width, label='Recall', alpha=0.8)

ax.set_xlabel('Models', fontsize=12)
ax.set_ylabel('Scores', fontsize=12)
ax.set_title('Flight Cancellations: Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(models, rotation=45, ha='right')
ax.legend()

# Zoom the y-axis onto the observed scores instead of a hard-coded window, so
# no bar is clipped or hidden when a rerun produces different numbers.
# The axis is deliberately truncated (it does not start at 0).
all_scores = accuracy_scores + precision_scores + recall_scores
if all_scores:
    y_pad = 0.005
    ax.set_ylim(max(0.0, min(all_scores) - y_pad), min(1.0, max(all_scores) + y_pad))
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
# Create the output folder on a fresh checkout; savefig does not create it.
chart_path = Path('charts/flights_performance_comparison.png')
chart_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Confusion Matrices for Flights dataset: one heatmap per model.
n_models = len(flight_results)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols))  # ceiling division; at least one row

# Fix the class order once so every matrix shares the same axes and the ticks
# show the real class values. flight_target_sampled is the full label set the
# train/test split was drawn from, so it covers every true and predicted label.
class_labels = np.unique(flight_target_sampled).tolist()

# squeeze=False always yields a 2-D axes array, so ravel() works for any grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.ravel()

for idx, (model_name, results) in enumerate(flight_results.items()):
    cm = confusion_matrix(flight_target_test, results['predictions'], labels=class_labels)

    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', ax=axes[idx],
                xticklabels=class_labels, yticklabels=class_labels,
                cbar_kws={'label': 'Count'})
    axes[idx].set_title(f'{model_name}\nAccuracy: {results["accuracy"]:.4f}',
                        fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Predicted Label', fontsize=10)
    axes[idx].set_ylabel('True Label', fontsize=10)
    axes[idx].tick_params(labelsize=8)

# Hide whichever grid cells were not used by a model.
for unused_ax in axes[n_models:]:
    unused_ax.axis('off')

plt.suptitle('Flight Cancellations: Confusion Matrices', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
# Create the output folder on a fresh checkout; savefig does not create it.
chart_path = Path('charts/flights_confusion_matrices.png')
chart_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
plt.show()