In [None]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Literal
from numpy import array, ndarray, argsort, arange

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelBinarizer

import sys
sys.path.insert(1, '../../../utils')
from dslabs_functions import CLASS_EVAL_METRICS, DELTA_IMPROVE, plot_bar_chart, plot_multiline_chart, plot_evaluation_results, plot_horizontal_bar_chart, HEIGHT, plot_line_chart

import seaborn as sns

from matplotlib.pyplot import figure, savefig, show, subplots

import pandas as pd


In [None]:
# Load the raw traffic-accident records from the project's data folder.
traf_df = pd.read_csv("../data/raw/traffic_accidents.csv")
# Bare last expression so the notebook renders a preview of the first rows.
traf_df.head()

In [None]:
# Pull the target out first so the numeric-only filter further down cannot discard it.
traf_target = traf_df["crash_type"]

traf_features = traf_df.drop(columns=["crash_type"])

traf_features = traf_features.dropna(axis=1, how="all")  # drop columns that are entirely empty
traf_features = traf_features.dropna(axis=0, how="any")  # drop rows with any missing value

# Keep the target aligned with the rows that survived the row-wise drop.
traf_target = traf_target.loc[traf_features.index]

traf_features = traf_features.select_dtypes(include=["number"])  # keep only numeric columns

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
traf_features.info()
print("Features shape:", traf_features.shape)
print("Target shape:", traf_target.shape)


In [None]:
# Numeric encoding of the two crash outcomes.
binary_map = {
    "NO INJURY / DRIVE AWAY": 0,
    "INJURY AND / OR TOW DUE TO CRASH": 1,
}

# Hold out 30% of the rows for testing. Stratifying on the raw labels keeps the
# class proportions similar in both partitions; the fixed seed makes the split repeatable.
(
    traf_features_train,
    traf_features_test,
    traf_target_train,
    traf_target_test,
) = train_test_split(
    traf_features,
    traf_target,
    test_size=0.3,
    stratify=traf_target,
    random_state=42,
)

# 0/1 versions of both target partitions, used when fitting and scoring the models.
traf_target_train_bin, traf_target_test_bin = (
    labels.map(binary_map) for labels in (traf_target_train, traf_target_test)
)

**NAIVE BAYES ANALYSIS - TRAFFIC DATA**

In [None]:
def naive_Bayes_study(
    trnX: ndarray,
    trnY: array,
    tstX: ndarray,
    tstY: array,
    metric: str = "accuracy"
):
    """Fit three Naive Bayes variants and chart their test-set scores.

    Each estimator is trained on (trnX, trnY) and scored on (tstX, tstY) with
    CLASS_EVAL_METRICS[metric]. The scores are drawn as a bar chart whose bars
    are labelled with four decimals.

    Returns:
        (best_model, best_params): the fitted estimator with the highest score
        and a dict with its name, the metric name, an empty "params" tuple and
        the winning score stored under the metric's own name. best_model stays
        None when no score exceeds DELTA_IMPROVE.
    """
    candidates = {
        "GaussianNB": GaussianNB(),
        "MultinomialNB": MultinomialNB(),
        "BernoulliNB": BernoulliNB(),
    }

    names = []
    scores = []
    best_model = None
    best_params = {"name": "", "metric": metric, "params": ()}
    best_score = 0

    for name, model in candidates.items():
        names.append(name)
        model.fit(trnX, trnY)
        score = CLASS_EVAL_METRICS[metric](tstY, model.predict(tstX))
        scores.append(score)
        # A candidate only replaces the current best when it wins by more than DELTA_IMPROVE.
        if score - best_score > DELTA_IMPROVE:
            best_score = score
            best_model = model
            best_params["name"] = name
            best_params[metric] = score

    chart = plot_bar_chart(
        names,
        scores,
        title=f"Naive Bayes Models ({metric})",
        ylabel=metric,
        percentage=True,
    )

    # Hide the value labels drawn by plot_bar_chart so they can be replaced below.
    for label in chart.texts:
        label.set_visible(False)

    # Re-label every bar with its height to four decimals, centred above the bar.
    for bar in chart.patches:
        top = bar.get_height()
        centre = bar.get_x() + bar.get_width() / 2
        chart.annotate(
            f"{top:.4f}",
            (centre, top),
            ha="center",
            va="bottom",
            fontsize=7,
        )

    return best_model, best_params

# Tags used to build the chart file names (previously assigned twice in a row).
file_tag = "traffic"
eval_metric = "accuracy"

# Naive Bayes study scored on accuracy.
figure()
traf_nb_best_model_acc, traf_nb_params_acc = naive_Bayes_study(
    traf_features_train,
    traf_target_train_bin,
    traf_features_test,
    traf_target_test_bin,
    metric=eval_metric
)
savefig(f"../charts/lab1_baseline/{file_tag}_nb_{eval_metric}_study.png")
show()

# Same study scored on recall.
figure()
traf_nb_best_model_rec, traf_nb_params_rec = naive_Bayes_study(
    traf_features_train,
    traf_target_train_bin,
    traf_features_test,
    traf_target_test_bin,
    metric="recall"
)
savefig("../charts/lab1_baseline/traffic_nb_recall_study.png")
show()

In [None]:
# Predictions of the accuracy-selected Naive Bayes model on both partitions.
prd_trn_nb = traf_nb_best_model_acc.predict(traf_features_train)
prd_tst_nb = traf_nb_best_model_acc.predict(traf_features_test)

# Distinct encoded class values found in the training target, in ascending order.
nb_labels = sorted(np.unique(traf_target_train_bin))

figure()
# dslabs helper: receives the model description, true vs. predicted labels for
# train and test, and the class labels.
plot_evaluation_results(
    traf_nb_params_acc,
    array(traf_target_train_bin),
    array(prd_trn_nb),
    array(traf_target_test_bin),
    array(prd_tst_nb),
    nb_labels
)
# File name is built from the winning model's name and the metric it was selected on.
savefig(f'../charts/lab1_baseline/traffic_{traf_nb_params_acc["name"]}_best_{traf_nb_params_acc["metric"]}_eval.png')
show()

**LOGISTIC REGRESSION ANALYSIS - TRAFFIC DATA**

In [None]:
def logistic_regression_study(
    trnX: ndarray,
    trnY: array,
    tstX: ndarray,
    tstY: array,
    nr_max_iterations: int = 2500,
    lag: int = 500,
    metric: str = "accuracy",
):
    """Compare liblinear logistic regressions over penalty type and max_iter.

    For each penalty ("l1", "l2") and each iteration budget in
    lag, 2*lag, ..., nr_max_iterations, a model is fitted on (trnX, trnY) and
    scored on (tstX, tstY) with CLASS_EVAL_METRICS[metric]. One line per
    penalty is drawn on a multiline chart.

    Returns:
        (best_model, best_params): the best-scoring fitted model and a dict
        whose "params" entry is (penalty, n_iter). When no configuration
        scores above DELTA_IMPROVE, best_model is None and "params" is ().
    """
    nr_iterations = list(range(lag, nr_max_iterations + 1, lag))
    penalty_types = ["l1", "l2"]  # both are supported by solver="liblinear"

    best_model = None
    best_params = {"name": "LR", "metric": metric, "params": ()}
    best_performance = 0.0

    values = {}
    for penalty in penalty_types:
        y_tst_values = []
        for n_iter in nr_iterations:
            clf = LogisticRegression(
                penalty=penalty,
                max_iter=n_iter,
                solver="liblinear",
                verbose=False,
            )
            clf.fit(trnX, trnY)
            prdY = clf.predict(tstX)
            val = CLASS_EVAL_METRICS[metric](tstY, prdY)
            y_tst_values.append(val)

            # Only replace the current best when the gain exceeds DELTA_IMPROVE.
            if val - best_performance > DELTA_IMPROVE:
                best_performance = val
                best_params["params"] = (penalty, n_iter)
                best_model = clf

        values[penalty] = y_tst_values

    plot_multiline_chart(
        nr_iterations,
        values,
        title=f"LR models ({metric})",
        xlabel="nr iterations",
        ylabel=metric,
        percentage=True,
    )

    if best_model is None:
        # Nothing scored above DELTA_IMPROVE (e.g. a recall of 0 everywhere), so
        # best_params["params"] is still empty and must not be indexed.
        print(f"LR: no configuration reached a {metric} above {DELTA_IMPROVE}")
    else:
        print(
            f'LR best for {best_params["params"][1]} iterations '
            f'(penalty={best_params["params"][0]}) with {metric}={best_performance:.6f}'
        )

    return best_model, best_params

# Tags used to build the chart file name.
file_tag = "traffic"
eval_metric = "accuracy"

figure()
# Logistic regression study on the binary-encoded target, scored on eval_metric.
traf_lr_best_model, traf_lr_params = logistic_regression_study(
    traf_features_train,
    traf_target_train_bin,
    traf_features_test,
    traf_target_test_bin,
    nr_max_iterations=2500,
    lag=500,
    metric=eval_metric,
)
savefig(f"../charts/lab1_baseline/{file_tag}_lr_{eval_metric}_study.png")
show()

In [None]:
# Predictions of the selected logistic regression model on both partitions.
prd_trn_lr = traf_lr_best_model.predict(traf_features_train)
prd_tst_lr = traf_lr_best_model.predict(traf_features_test)

# Distinct encoded class values found in the training target, in ascending order.
lr_labels = sorted(np.unique(traf_target_train_bin))

figure()
# dslabs helper: receives the model description, true vs. predicted labels for
# train and test, and the class labels.
plot_evaluation_results(
    traf_lr_params,
    array(traf_target_train_bin),
    array(prd_trn_lr),
    array(traf_target_test_bin),
    array(prd_tst_lr),
    lr_labels,
)
# File name is built from the model name and the metric it was selected on.
savefig(f'../charts/lab1_baseline/traffic_{traf_lr_params["name"]}_best_{traf_lr_params["metric"]}_eval.png')
show()

**KNN ANALYSIS - TRAFFIC DATA**

In [None]:
def knn_study(
    trnX: ndarray,
    trnY: array,
    tstX: ndarray,
    tstY: array,
    k_max: int = 19,
    lag: int = 2,
    metric: str = "accuracy",
):
    """Compare k-nearest-neighbour classifiers over k and distance metric.

    For each distance ("manhattan", "euclidean", "chebyshev") and each k in
    1, 1+lag, ... up to k_max, a classifier is fitted on (trnX, trnY) and
    scored on (tstX, tstY) with CLASS_EVAL_METRICS[metric]. One line per
    distance is drawn on a multiline chart.

    Returns:
        (best_model, best_params): the best-scoring fitted model and a dict
        whose "params" entry is (k, distance). When no configuration scores
        above DELTA_IMPROVE, best_model is None and "params" is ().
    """
    dist: list[Literal["manhattan", "euclidean", "chebyshev"]] = [
        "manhattan",
        "euclidean",
        "chebyshev",
    ]

    kvalues = [i for i in range(1, k_max + 1, lag)]
    best_model: KNeighborsClassifier | None = None
    best_params = {"name": "KNN", "metric": metric, "params": ()}
    best_performance = 0.0

    values: dict[str, list] = {}
    for d in dist:
        y_tst_values = []
        for k in kvalues:
            clf = KNeighborsClassifier(n_neighbors=k, metric=d)
            clf.fit(trnX, trnY)
            prdY = clf.predict(tstX)
            val = CLASS_EVAL_METRICS[metric](tstY, prdY)
            y_tst_values.append(val)
            # Only replace the current best when the gain exceeds DELTA_IMPROVE.
            if val - best_performance > DELTA_IMPROVE:
                best_performance = val
                best_params["params"] = (k, d)
                best_model = clf
        values[d] = y_tst_values

    if best_model is None:
        # Nothing scored above DELTA_IMPROVE, so best_params["params"] is still
        # empty and must not be indexed.
        print(f"KNN: no configuration reached a {metric} above {DELTA_IMPROVE}")
    else:
        print(f'KNN best with k={best_params["params"][0]} and {best_params["params"][1]}')

    plot_multiline_chart(
        kvalues,
        values,
        title=f"KNN Models ({metric})",
        xlabel="k",
        ylabel=metric,
        percentage=True,
    )

    return best_model, best_params

# Tags used to build the chart file name.
file_tag = "traffic"
eval_metric = "accuracy"

figure()
# KNN study on the binary-encoded target: odd k from 1 to 25, scored on eval_metric.
traf_knn_best_model, traf_knn_params = knn_study(
    traf_features_train,
    traf_target_train_bin,
    traf_features_test,
    traf_target_test_bin,
    k_max=25,
    lag=2,
    metric=eval_metric,
)
savefig(f"../charts/lab1_baseline/{file_tag}_knn_{eval_metric}_study.png")
show()

In [None]:
# Predictions of the selected KNN model on both partitions.
prd_trn_knn = traf_knn_best_model.predict(traf_features_train)
prd_tst_knn = traf_knn_best_model.predict(traf_features_test)

# Distinct encoded class values found in the training target, in ascending order.
knn_labels = sorted(np.unique(traf_target_train_bin))

figure()
# dslabs helper: receives the model description, true vs. predicted labels for
# train and test, and the class labels.
plot_evaluation_results(
    traf_knn_params,
    array(traf_target_train_bin),
    array(prd_trn_knn),
    array(traf_target_test_bin),
    array(prd_tst_knn),
    knn_labels,
)
# File name is built from the model name and the metric it was selected on.
savefig(f"../charts/lab1_baseline/traffic_{traf_knn_params['name']}_best_{traf_knn_params['metric']}_eval.png")
show()

In [None]:
distance = traf_knn_params["params"][1]   # best distance from KNN study, e.g. 'euclidean'
K_MAX = 25
# Inclusive upper bound, matching knn_study's range(1, k_max + 1, lag): the old
# range(1, K_MAX, 2) stopped at 23 and could leave the selected k off the chart.
kvalues = [i for i in range(1, K_MAX + 1, 2)]

y_tst_values = []
y_trn_values = []
acc_metric = "accuracy"

# Refit one model per k and record accuracy on both partitions; a widening gap
# between the train and test curves indicates overfitting.
for k in kvalues:
    clf = KNeighborsClassifier(n_neighbors=k, metric=distance)
    clf.fit(traf_features_train, traf_target_train_bin)
    prd_tst_Y = clf.predict(traf_features_test)
    prd_trn_Y = clf.predict(traf_features_train)

    y_tst_values.append(CLASS_EVAL_METRICS[acc_metric](traf_target_test_bin, prd_tst_Y))
    y_trn_values.append(CLASS_EVAL_METRICS[acc_metric](traf_target_train_bin, prd_trn_Y))

figure()
plot_multiline_chart(
    kvalues,
    {"Train": y_trn_values, "Test": y_tst_values},
    title=f"KNN overfitting study for {distance}",
    xlabel="K",
    ylabel=acc_metric,
    percentage=True,
)
savefig("../charts/lab1_baseline/traffic_knn_overfitting.png")
show()

**DECISION TREES ANALYSIS - TRAFFIC DATA**

In [None]:
def trees_study(
    trnX: ndarray,
    trnY: array,
    tstX: ndarray,
    tstY: array,
    d_max: int = 10,
    lag: int = 2,
    metric: str = "accuracy",
):
    """Compare decision trees over split criterion and maximum depth.

    For each criterion ("entropy", "gini") and each depth in
    2, 2+lag, ... up to d_max, a tree (random_state=42) is fitted on
    (trnX, trnY) and scored on (tstX, tstY) with CLASS_EVAL_METRICS[metric].
    One line per criterion is drawn on a multiline chart.

    Returns:
        (best_model, best_params): the best-scoring fitted tree and a dict
        whose "params" entry is (criterion, depth). When no configuration
        scores above DELTA_IMPROVE, best_model is None and "params" is ().
    """
    criteria = ["entropy", "gini"]
    depths = [i for i in range(2, d_max + 1, lag)]

    best_model = None
    best_params = {"name": "DT", "metric": metric, "params": ()}
    best_performance = 0.0

    values = {}
    for c in criteria:
        y_tst_values = []
        for d in depths:
            clf = DecisionTreeClassifier(
                max_depth=d,
                criterion=c,
                min_impurity_decrease=0,
                random_state=42,
            )
            clf.fit(trnX, trnY)
            prdY = clf.predict(tstX)
            val = CLASS_EVAL_METRICS[metric](tstY, prdY)
            y_tst_values.append(val)
            # Only replace the current best when the gain exceeds DELTA_IMPROVE.
            if val - best_performance > DELTA_IMPROVE:
                best_performance = val
                best_params["params"] = (c, d)
                best_model = clf
        values[c] = y_tst_values

    if best_model is None:
        # Nothing scored above DELTA_IMPROVE, so best_params["params"] is still
        # empty and must not be indexed.
        print(f"DT: no configuration reached a {metric} above {DELTA_IMPROVE}")
    else:
        print(f'DT best with {best_params["params"][0]} and d={best_params["params"][1]}')

    plot_multiline_chart(
        depths,
        values,
        title=f"DT Models ({metric})",
        xlabel="d",
        ylabel=metric,
        percentage=True,
    )

    return best_model, best_params

# Tags used to build the chart file name.
file_tag = "traffic"
eval_metric = "accuracy"

figure()
# Decision tree study on the binary-encoded target: even depths from 2 to 24, scored on eval_metric.
traf_dt_best_model, traf_dt_params = trees_study(
    traf_features_train,
    traf_target_train_bin,
    traf_features_test,
    traf_target_test_bin,
    d_max=25,
    lag=2,
    metric=eval_metric,
)
savefig(f"../charts/lab1_baseline/{file_tag}_dt_{eval_metric}_study.png")
show()

In [None]:
# Predictions of the selected decision tree on both partitions.
prd_trn_dt = traf_dt_best_model.predict(traf_features_train)
prd_tst_dt = traf_dt_best_model.predict(traf_features_test)

# Distinct encoded class values found in the training target, in ascending order.
dt_labels = sorted(np.unique(traf_target_train_bin))

figure()
# dslabs helper: receives the model description, true vs. predicted labels for
# train and test, and the class labels.
plot_evaluation_results(
    traf_dt_params,
    array(traf_target_train_bin),
    array(prd_trn_dt),
    array(traf_target_test_bin),
    array(prd_tst_dt),
    dt_labels,
)
# File name is built from the model name and the metric it was selected on.
savefig(f'../charts/lab1_baseline/traffic_{traf_dt_params["name"]}_best_{traf_dt_params["metric"]}_eval.png')
show()

In [None]:
max_depth2show = 3   # adjust if you want deeper/shallower views
dt_feature_names = list(traf_features.columns)

# The tree was trained on the 0/1 encoded target, and plot_tree expects the class
# names in the same order as the model's classes. Sorting the raw labels
# alphabetically put "INJURY ..." first and so attached it to class 0, which is
# actually "NO INJURY / DRIVE AWAY". Translate the model's own classes back to
# their labels through binary_map instead.
label_by_code = {code: label for label, code in binary_map.items()}
dt_class_names = [label_by_code[cls] for cls in traf_dt_best_model.classes_]
tree_filename = f"traffic_dt_tree_depth{max_depth2show}"

figure(figsize=(18, 10))
plot_tree(
    traf_dt_best_model,
    max_depth=max_depth2show,
    feature_names=dt_feature_names,
    class_names=dt_class_names,
    filled=True,
    rounded=True,
    impurity=False,
    precision=2,
)
savefig(f"../charts/lab1_baseline/{tree_filename}.png")
show()

In [None]:
importances = traf_dt_best_model.feature_importances_

# Feature positions ordered from most to least important.
indices = argsort(importances)[::-1]
dt_vars = list(traf_features.columns)

# Names and importance values in ranked order, ready for the bar chart.
elems = [dt_vars[idx] for idx in indices]
imp_values = [importances[idx] for idx in indices]

# Print the ranking as a numbered list.
for rank, (feature_name, feature_imp) in enumerate(zip(elems, imp_values), start=1):
    print(f"{rank}. {feature_name} ({feature_imp})")

figure()
plot_horizontal_bar_chart(
    elems,
    imp_values,
    title="Decision Tree variables importance",
    xlabel="importance",
    ylabel="variables",
    percentage=True,
)
savefig("../charts/lab1_baseline/traffic_dt_importance_ranking.png")
show()

In [None]:
# Criterion chosen by the decision tree study ('entropy' or 'gini').
crit = traf_dt_params["params"][0]
d_max = 12
depths = list(range(2, d_max + 1))

acc_metric = "accuracy"
y_trn_values = []
y_tst_values = []

# Refit one tree per depth and record accuracy on both partitions; a widening
# gap between the train and test curves indicates overfitting.
for d in depths:
    clf = DecisionTreeClassifier(
        criterion=crit,
        max_depth=d,
        min_impurity_decrease=0,
        random_state=42,
    )
    clf.fit(traf_features_train, traf_target_train_bin)
    prd_trn_Y = clf.predict(traf_features_train)
    prd_tst_Y = clf.predict(traf_features_test)

    y_trn_values.append(CLASS_EVAL_METRICS[acc_metric](traf_target_train_bin, prd_trn_Y))
    y_tst_values.append(CLASS_EVAL_METRICS[acc_metric](traf_target_test_bin, prd_tst_Y))

figure()
plot_multiline_chart(
    depths,
    {"Train": y_trn_values, "Test": y_tst_values},
    title=f"DT overfitting study for {crit}",
    xlabel="max_depth",
    ylabel=acc_metric,
    percentage=True,
)
savefig("../charts/lab1_baseline/traffic_dt_accuracy_overfitting.png")
show()

**MLP ANALYSIS - TRAFFIC DATA**

In [None]:
LAG = 100
NR_MAX_ITER = 400

def mlp_study(
    trnX: ndarray,
    trnY: array,
    tstX: ndarray,
    tstY: array,
    nr_max_iterations: int = 400,
    lag: int = 100,
    metric: str = "accuracy",
):
    """Compare SGD-trained MLPs over learning-rate schedule, initial rate and iterations.

    For each schedule ("constant", "invscaling", "adaptive") and each initial
    learning rate, one network is trained incrementally in steps of up to
    `lag` iterations and scored on (tstX, tstY) with CLASS_EVAL_METRICS[metric]
    after every step. One subplot per schedule is drawn, with one line per
    learning rate.

    Returns:
        (best_model, best_params): a copy of the best-scoring network taken at
        the step where it scored best, and a dict whose "params" entry is
        (lr_type, lr, nr_iterations). When no configuration scores above
        DELTA_IMPROVE, best_model is None and "params" is ().
    """
    # Local import: needed to freeze a copy of a model that keeps training.
    from copy import deepcopy

    nr_iterations = [lag] + [i for i in range(2 * lag, nr_max_iterations + 1, lag)]

    lr_types: list[Literal["constant", "invscaling", "adaptive"]] = [
        "constant",
        "invscaling",
        "adaptive",
    ]
    # Four distinct rates. The last one used to repeat 0.005, which trained the
    # same configuration twice and overwrote its own entry in `values`.
    learning_rates = [0.5, 0.05, 0.005, 0.0005]

    best_model: MLPClassifier | None = None
    best_params = {"name": "MLP", "metric": metric, "params": ()}
    best_performance = 0.0

    _, axs = subplots(1, len(lr_types), figsize=(len(lr_types) * HEIGHT, HEIGHT), squeeze=False)

    for i, lr_type in enumerate(lr_types):
        values = {}
        for lr in learning_rates:
            # One network per configuration, created once. With warm_start=True
            # each fit() call resumes from the current weights and runs up to
            # `lag` more iterations, so the chart's x axis is cumulative.
            # (Building a new classifier on every pass, as before, made
            # warm_start a no-op and every point a fresh `lag`-iteration model.)
            clf = MLPClassifier(
                learning_rate=lr_type,
                learning_rate_init=lr,
                max_iter=lag,
                warm_start=True,
                activation="logistic",
                solver="sgd",
                verbose=False,
            )
            y_tst_values = []
            for n_iter in nr_iterations:
                clf.fit(trnX, trnY)
                prdY = clf.predict(tstX)
                val = CLASS_EVAL_METRICS[metric](tstY, prdY)
                y_tst_values.append(val)
                # Only replace the current best when the gain exceeds DELTA_IMPROVE.
                if val - best_performance > DELTA_IMPROVE:
                    best_performance = val
                    best_params["params"] = (lr_type, lr, n_iter)
                    # clf keeps training on later passes; keep a frozen snapshot.
                    best_model = deepcopy(clf)
            values[lr] = y_tst_values

        plot_multiline_chart(
            nr_iterations,
            values,
            ax=axs[0, i],
            title=f"MLP with {lr_type}",
            xlabel="nr iterations",
            ylabel=metric,
            percentage=True,
        )

    if best_model is None:
        # Nothing scored above DELTA_IMPROVE, so best_params["params"] is still
        # empty and must not be indexed.
        print(f"MLP: no configuration reached a {metric} above {DELTA_IMPROVE}")
    else:
        print(
            f'MLP best for {best_params["params"][2]} iterations '
            f'(lr_type={best_params["params"][0]} and lr={best_params["params"][1]}) '
            f'with {metric}={best_performance:.6f}'
        )

    return best_model, best_params

# Tags used to build the chart file name.
file_tag = "traffic"
eval_metric = "accuracy"

# No figure() call here: mlp_study creates its own figure through subplots(),
# so opening one beforehand only left an empty extra figure for show() to render.
traf_mlp_best_model, traf_mlp_params = mlp_study(
    traf_features_train,
    traf_target_train_bin,
    traf_features_test,
    traf_target_test_bin,
    nr_max_iterations=NR_MAX_ITER,
    lag=LAG,
    metric=eval_metric,
)
savefig(f"../charts/lab1_baseline/{file_tag}_mlp_{eval_metric}_study.png")
show()

In [None]:
# Predictions of the selected MLP on both partitions.
prd_trn_mlp = traf_mlp_best_model.predict(traf_features_train)
prd_tst_mlp = traf_mlp_best_model.predict(traf_features_test)

# Distinct encoded class values found in the training target, in ascending order.
mlp_labels = sorted(np.unique(traf_target_train_bin))

figure()
# dslabs helper: receives the model description, true vs. predicted labels for
# train and test, and the class labels.
plot_evaluation_results(
    traf_mlp_params,
    array(traf_target_train_bin),
    array(prd_trn_mlp),
    array(traf_target_test_bin),
    array(prd_tst_mlp),
    mlp_labels,
)
# File name is built from the model name and the metric it was selected on.
savefig(f"../charts/lab1_baseline/traffic_{traf_mlp_params['name']}_best_{traf_mlp_params['metric']}_eval.png")
show()

In [None]:
# Learning-rate schedule and initial rate selected by the MLP study.
lr_type = traf_mlp_params["params"][0]
lr = traf_mlp_params["params"][1]

nr_iterations = [i for i in range(LAG, NR_MAX_ITER + 1, LAG)]

y_tst_values = []
y_trn_values = []
acc_metric = "accuracy"

# Train one network from scratch per iteration budget and record accuracy on
# both partitions; a widening gap between the curves indicates overfitting.
for n in nr_iterations:
    clf = MLPClassifier(
        learning_rate=lr_type,
        learning_rate_init=lr,
        max_iter=n,
        activation="logistic",
        # Same solver as mlp_study. The learning_rate schedule is only used by
        # "sgd"; with "adam" lr_type was ignored, so the chart did not describe
        # the configuration named in its title.
        solver="sgd",
        verbose=False,
        random_state=42,
    )
    clf.fit(traf_features_train, traf_target_train_bin)
    prd_tst_Y = clf.predict(traf_features_test)
    prd_trn_Y = clf.predict(traf_features_train)

    y_tst_values.append(CLASS_EVAL_METRICS[acc_metric](traf_target_test_bin, prd_tst_Y))
    y_trn_values.append(CLASS_EVAL_METRICS[acc_metric](traf_target_train_bin, prd_trn_Y))

figure()
plot_multiline_chart(
    nr_iterations,
    {"Train": y_trn_values, "Test": y_tst_values},
    title=f"MLP overfitting study for lr_type={lr_type} and lr={lr}",
    xlabel="nr_iterations",
    ylabel=acc_metric,
    percentage=True,
)
savefig("../charts/lab1_baseline/traffic_mlp_accuracy_overfitting.png")
show()

In [None]:
figure()
# Plot the selected MLP's loss_curve_ values against their position in the curve.
plot_line_chart(
    arange(len(traf_mlp_best_model.loss_curve_)),
    traf_mlp_best_model.loss_curve_,
    title="Loss curve for MLP best model training",
    xlabel="iterations",
    ylabel="loss",
    percentage=False,
)
# file_tag and eval_metric come from the MLP study cell.
savefig(f"../charts/lab1_baseline/{file_tag}_mlp_{eval_metric}_loss_curve.png")
show()