In [16]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize

In [2]:
# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")

In [3]:
# Read the four source tables from the relative 'source/' directory.
# The files are read in the listed order and bound to one frame each.
characteristics, locations, users, vehicles = map(
    pd.read_csv,
    (
        'source/characteristics.csv',
        'source/locations.csv',
        'source/users.csv',
        'source/vehicles.csv',
    ),
)

In [4]:
# Inner-join the four tables on the accident identifier, one table at a time.
# NOTE(review): if `users` and `vehicles` each hold several rows per AccID,
# joining on AccID alone pairs every user row with every vehicle row of the
# same accident - confirm whether a vehicle-level key belongs in the join.
data = characteristics
for right_table in (locations, users, vehicles):
    data = data.merge(right_table, on='AccID')

In [5]:
# Columns of the merged frame used as model inputs, and the column to predict.
# 'age' and 'time' are later treated as numeric; the rest as categorical codes.
feature_cols = [
    'time',
    'lum',
    'atm_condition',
    'vehicle_category',
    'age',
    'traffic_regime',
    'route_category',
]
target_col = 'gravity'

In [6]:
# Convert the 'time' column from 'HH:MM:SS' text to seconds since midnight.
# The column is parsed a single time and the parsed values are reused for the
# hour / minute / second components (previously it was parsed three times).
# NOTE: 'time' is overwritten in place, so this cell is meant to run once per
# freshly loaded `data`; it no longer holds 'HH:MM:SS' text afterwards.
parsed_time = pd.to_datetime(data['time'], format='%H:%M:%S')
data['time'] = (
    parsed_time.dt.hour * 3600
    + parsed_time.dt.minute * 60
    + parsed_time.dt.second
)

In [7]:
# Handle missing values before splitting the data
imputer = SimpleImputer(strategy='median')
data[feature_cols] = imputer.fit_transform(data[feature_cols])

In [8]:
# Ensure stratified split
X = data[feature_cols]
y = data[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [9]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [10]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones, each through its own single-step pipeline.
numeric_features = ['age', 'time']
categorical_features = ['lum', 'atm_condition', 'vehicle_category', 'traffic_regime', 'route_category']

scaling_steps = [('scaler', StandardScaler())]
# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
encoding_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]

numeric_transformer = Pipeline(steps=scaling_steps)
categorical_transformer = Pipeline(steps=encoding_steps)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [13]:
# Full model: the shared preprocessor followed by a random forest.
# class_weight='balanced' re-weights classes; random_state fixes the seed.
forest = RandomForestClassifier(class_weight='balanced', random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

In [14]:
# Hyper-parameter grid for the 'classifier' step of the pipeline.
# 3 * 4 * 3 * 3 = 108 candidate combinations in total.
param_grid = {}
param_grid['classifier__n_estimators'] = [100, 200, 300]
param_grid['classifier__max_depth'] = [None, 10, 20, 30]
param_grid['classifier__min_samples_split'] = [2, 5, 10]
param_grid['classifier__min_samples_leaf'] = [1, 2, 4]

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation, scored by
# macro-averaged F1 and parallelised over all available cores.
# NOTE(review): the folds are cut from the already-oversampled training set,
# so validation folds contain synthetic rows - the CV score is presumably
# optimistic compared with the held-out test score; confirm when comparing.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train_smote, y_train_smote)

In [None]:
# Report the winning hyper-parameters and their mean cross-validated score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best parameters found: ", best_params)
print("Best score: ", best_score)

In [None]:
# Evaluate the refitted best pipeline on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)              # hard class predictions
y_pred_proba = best_model.predict_proba(X_test)  # per-class probabilities

# zero_division=0: an undefined metric (a class with no predicted or no true
# samples) is reported as 0 rather than a perfect 1, so it cannot inflate the
# per-class figures or the macro / weighted averages.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# Confusion Matrix Heatmap
# Use the actual class labels for both the matrix ordering and the tick labels,
# so the axes show the gravity codes that the legend describes (the default
# ticks are 0-based positions, which did not match the 1-based codes).
class_labels = sorted(set(y_test) | set(y_pred))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10,7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Adding a custom legend: one line per class actually present, aligned with
# its heatmap row and placed to the right of the matrix (x = 5.5 for 4 classes).
gravity_names = {1: 'Unharmed', 2: 'Killed', 3: 'Hospitalized', 4: 'Slightly injured'}
legend_x = conf_matrix.shape[1] + 1.5
for row, label in enumerate(class_labels):
    name = gravity_names.get(label)
    legend_text = f'{label} - {name}' if name is not None else str(label)
    plt.text(legend_x, row + 0.5, legend_text, verticalalignment='center', color='black', fontsize=12)

plt.show()