In [1]:
# Import the necessary packages
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Import the clean data.
# The path uses a forward slash so it resolves on every platform; with a
# backslash the "\d" pair is an unrecognised escape sequence (deprecated by
# Python) and the path only works on Windows.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files produced by a trusted step of this project.
data = pd.read_pickle('source/data.pkl')

# Column names, non-null counts and dtypes of the loaded frame
data.info()

# Build the preprocessed frame as one chain; every step returns a new frame,
# so the raw `data` is never modified.
#   1. drop the columns that are not used as predictors
#   2. cast 'day', 'month' and 'time' to integers
#   3. cyclical encoding: each temporal column becomes a sine/cosine pair
#   4. drop the three original temporal columns
# NOTE(review): 86340000 looks like 23h59 expressed in milliseconds; confirm
# the unit of 'time' and whether a full 24h period was intended.
data_processed = (
    data
    .drop(columns=['AccID', 'birth_year', 'vehicleID', 'num_veh'])
    .astype({'day': int, 'month': int, 'time': int})
    .assign(
        day_sin=lambda frame: np.sin(2 * np.pi * frame['day'] / 31),
        day_cos=lambda frame: np.cos(2 * np.pi * frame['day'] / 31),
        month_sin=lambda frame: np.sin(2 * np.pi * frame['month'] / 12),
        month_cos=lambda frame: np.cos(2 * np.pi * frame['month'] / 12),
        time_sin=lambda frame: np.sin(2 * np.pi * frame['time'] / 86340000),
        time_cos=lambda frame: np.cos(2 * np.pi * frame['time'] / 86340000),
    )
    .drop(columns=['day', 'month', 'time'])
)

# Column to predict
target = 'gravity'

# Categorical predictors: expanded into indicator columns further down
features_dummy = [
    'year',
    'lum',
    'atm_condition',
    'collision_type',
    'route_category',
    'traffic_regime',
    'total_number_lanes',
    'reserved_lane_code',
    'longitudinal_profile',
    'plan',
    'surface_condition',
    'infra',
    'accident_situation',
    'traffic_direction',
    'vehicle_category',
    'fixed_obstacle',
    'mobile_obstacle',
    'initial_impact_point',
    'manv',
    'motor',
    'seat',
    'user_category',
    'gender',
    'reason_travel',
    'safety_equipment1',
]

# Numeric predictors: standardized after the train/test split
features_scaler = [
    'lat',
    'long',
    'upstream_terminal_number',
    'distance_upstream_terminal',
    'maximum_speed',
    'age',
]

# Sine/cosine encodings already lie in [-1, 1], so they need no standardization
features_temporal = [
    'day_sin', 'day_cos',
    'month_sin', 'month_cos',
    'time_sin', 'time_cos',
]

# Predictors are everything except the target; the target is cast to integer labels
X = data_processed.drop(columns=[target])
y = data_processed[target].astype(int)

# One-hot encode the categorical predictors; drop_first removes the first
# level of each variable so the indicator columns are not redundant
X = pd.get_dummies(X, columns=features_dummy, drop_first=True)

# Hold out 20% of the rows. Stratifying on y keeps the class proportions of
# the imbalanced target the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardization: the statistics are learned from the training rows only and
# then applied to both partitions, so nothing from the test set reaches the scaler.
scaler = StandardScaler().fit(X_train[features_scaler])
X_train[features_scaler] = scaler.transform(X_train[features_scaler])
X_test[features_scaler] = scaler.transform(X_test[features_scaler])

# Report the resulting dimensions
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447670 entries, 0 to 447669
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   AccID                       447670 non-null  object 
 1   day                         447670 non-null  object 
 2   month                       447670 non-null  object 
 3   year                        447670 non-null  object 
 4   time                        447670 non-null  object 
 5   lum                         447670 non-null  object 
 6   atm_condition               447670 non-null  object 
 7   collision_type              447670 non-null  object 
 8   lat                         447670 non-null  float64
 9   long                        447670 non-null  float64
 10  route_category              447670 non-null  object 
 11  traffic_regime              447670 non-null  object 
 12  total_number_lanes          447670 non-null  object 
 13  reserved_lane_

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define MODEL
# The forest is seeded so that repeated searches build the same trees and the
# cross-validation scores of different runs are comparable.
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameters to tune
# 3 * 3 * 3 * 3 * 2 = 162 combinations; with cv=3 this is 486 model fits,
# so expect this cell to run for a long time on the full training set.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Setting GridSearchCV
# Candidates are ranked by macro-averaged F1 instead of the default accuracy:
# the target classes are imbalanced, and accuracy would mostly reward the
# majority classes. This is also the criterion used when the selected
# combination is re-fitted elsewhere in this notebook.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='f1_macro', cv=3, n_jobs=-1, verbose=2)

# Model adjustment
grid_search.fit(X_train, y_train)

# Best parameters
print("Best hyperparameters:", grid_search.best_params_)

In [2]:
# Best combination found by the hyperparameter search, wrapped as a
# one-candidate grid (each value in a single-element list) so it can be
# handed to GridSearchCV as-is.
param_grid = {
    name: [value]
    for name, value in (
        ('n_estimators', 500),
        ('max_depth', 30),
        ('min_samples_split', 5),
        ('min_samples_leaf', 1),
        ('max_features', 'sqrt'),
    )
}

Apply ML model v1---->

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Initialize RandomForest
# random_state is fixed so that repeated runs build the same forest.
rf = RandomForestClassifier(random_state=42)

In [4]:
# Setup GridSearchCV
# Candidates from `param_grid` are scored with macro-averaged F1 under 3-fold
# cross-validation; n_jobs=-1 requests all available cores and verbose=2
# prints progress while fitting. `rf` and `param_grid` come from earlier cells.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           scoring='f1_macro', cv=3, verbose=2, n_jobs=-1)

In [5]:
# Fit the model with the grid search
# Runs the cross-validated search on the training partition only; with a
# one-candidate grid and cv=3 this amounts to 3 cross-validation fits.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [6]:
# Best hyperparameters
# best_params_ is the parameter combination with the best cross-validation score.
print("Best Parameters found: ", grid_search.best_params_)

Best Parameters found:  {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}


In [7]:
# Predict on the test set with the best model from GridSearchCV
# y_pred_grid holds one predicted class label per row of X_test.
y_pred_grid = grid_search.best_estimator_.predict(X_test)

In [9]:
from sklearn.metrics import classification_report

# Initial classification report
# Per-class precision, recall and F1 of the tuned model on the held-out test
# set (true labels y_test against predictions y_pred_grid).
print("Initial classification report after hyperparameter tuning:")
print(classification_report(y_test, y_pred_grid))

Initial classification report after hyperparameter tuning:
              precision    recall  f1-score   support

           1       0.72      0.86      0.78     37537
           2       0.54      0.02      0.03      2256
           3       0.55      0.37      0.44     13565
           4       0.67      0.66      0.66     36176

    accuracy                           0.68     89534
   macro avg       0.62      0.48      0.48     89534
weighted avg       0.67      0.68      0.66     89534



In [10]:
# Threshold Tuning for Higher Precision for Fatalities
# Get prediction probabilities
# predict_proba returns one row per test sample and one column per class,
# with the columns ordered as in best_estimator_.classes_.
y_probs = grid_search.best_estimator_.predict_proba(X_test)

In [11]:
# Check how classes are ordered in the model
# classes_ gives the label behind each column of predict_proba.
class_labels = grid_search.best_estimator_.classes_
print("Class labels in the model:", class_labels)

# Verify the index for Class 2 (Fatal)
# np.where returns the positions where the label equals 2; [0][0] takes the
# first (and only expected) match as a scalar column index.
fatal_class_index = np.where(class_labels == 2)[0][0]
print(f"Index for Class 2 (Fatal): {fatal_class_index}")


Class labels in the model: [1 2 3 4]
Index for Class 2 (Fatal): 1


In [12]:
# Use probabilities for Class 2 (Fatal)
# The column is selected through the index looked up from classes_ instead of
# a hard-coded position, so the correct column is used whatever the label order.
fatal_probs = y_probs[:, fatal_class_index]

In [14]:
from sklearn.metrics import precision_recall_curve

# Use precision-recall curve to evaluate different thresholds
# The problem is reduced to binary: `y_test == 2` marks the fatal cases and
# fatal_probs is used as the score. precision and recall have one more entry
# than thresholds (the final point has no associated threshold).
precision, recall, thresholds = precision_recall_curve(y_test == 2, fatal_probs)

In [15]:
# Find the threshold that balances both precision and recall (highest F1).
pr_sum = precision + recall

# Calculate F1-scores for each threshold. Where precision + recall == 0 the
# ratio would be 0/0 = NaN, and np.argmax returns the position of a NaN
# instead of the real maximum; dividing by 1 there gives an F1 of 0 instead.
f1_scores = 2 * (precision * recall) / np.where(pr_sum > 0, pr_sum, 1.0)

# precision/recall hold one more entry than thresholds: the final
# (precision=1, recall=0) point has no threshold. It is left out of the search
# so the selected index is always valid for `thresholds`.
best_threshold_index = np.argmax(f1_scores[:-1])  # Find the index with the highest F1-score
best_threshold = thresholds[best_threshold_index]


In [16]:
# Apply the threshold to classify fatal cases
# Binary prediction: 1 when the fatal-class probability reaches the chosen
# threshold, 0 otherwise.
y_pred_adjusted = (fatal_probs >= best_threshold).astype(int)

In [17]:
# Generate classification report after adjusting the threshold
# This is a binary report (fatal vs. every other class): the reference labels
# are `y_test == 2` and the predictions are the thresholded fatal probabilities.
print(f"Best Threshold: {best_threshold}")
print("Classification report after further threshold tuning:")
print(classification_report(y_test == 2, y_pred_adjusted))

Best Threshold: 0.11821933370405252
Classification report after further threshold tuning:
              precision    recall  f1-score   support

       False       0.98      0.97      0.98     87278
        True       0.26      0.37      0.30      2256

    accuracy                           0.96     89534
   macro avg       0.62      0.67      0.64     89534
weighted avg       0.97      0.96      0.96     89534



Apply ML model v2---->

In [18]:
# Cost-sensitive variant: mistakes on Class 2 (Fatal) are weighted 20 times
# more heavily than mistakes on the other three classes.
class_weights = {label: (20 if label == 2 else 1) for label in (1, 2, 3, 4)}

# Fit a seeded Random Forest that uses those weights on the training partition
rf_model = RandomForestClassifier(
    random_state=42,
    class_weight=class_weights,
).fit(X_train, y_train)

# Class labels predicted for the held-out rows
y_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1 on the test set
print("Cost-Sensitive Random Forest Classification Report:")
print(classification_report(y_test, y_pred))


Cost-Sensitive Random Forest Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.85      0.78     37537
           2       0.41      0.05      0.08      2256
           3       0.54      0.38      0.45     13565
           4       0.67      0.65      0.66     36176

    accuracy                           0.68     89534
   macro avg       0.58      0.48      0.49     89534
weighted avg       0.66      0.68      0.66     89534



In [None]:
# Apply ML model v3 ----> threshold tuning on the cost-sensitive Random Forest

In [19]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Get the prediction probabilities for the test set
y_probs = rf_model.predict_proba(X_test)

# Extract the probabilities for Class 2 (Fatal); the column is located through
# classes_ so it does not depend on the label order.
fatal_probs = y_probs[:, np.where(rf_model.classes_ == 2)[0][0]]

# Use precision-recall curve to evaluate different thresholds
# (binary view: fatal vs. every other class)
precision, recall, thresholds = precision_recall_curve(y_test == 2, fatal_probs)

# Find the best threshold for an optimal balance of precision and recall (based on F1 score).
# Where precision + recall == 0 the F1 ratio would be 0/0 = NaN, and np.argmax
# returns the position of a NaN instead of the real maximum; dividing by 1
# there gives an F1 of 0 instead.
pr_sum = precision + recall
f1_scores = 2 * (precision * recall) / np.where(pr_sum > 0, pr_sum, 1.0)

# precision/recall hold one more entry than thresholds (the final point has no
# threshold), so that last entry is excluded to keep the index valid.
best_threshold_index = np.argmax(f1_scores[:-1])
best_threshold = thresholds[best_threshold_index]

# Apply the best threshold: 1 = predicted fatal, 0 = any other class
y_pred_threshold = (fatal_probs >= best_threshold).astype(int)

# Generate classification report for the adjusted threshold
print(f"Best Threshold: {best_threshold}")
print("Classification report after further threshold adjustment:")
print(classification_report(y_test == 2, y_pred_threshold))


Best Threshold: 0.12
Classification report after further threshold adjustment:
              precision    recall  f1-score   support

       False       0.98      0.96      0.97     87278
        True       0.22      0.39      0.28      2256

    accuracy                           0.95     89534
   macro avg       0.60      0.68      0.63     89534
weighted avg       0.96      0.95      0.96     89534



Apply ML model v4----> Addressing the Class Imbalance

In [25]:
# Check class distribution of the target variable
# Counts the rows per class in the training labels, most frequent class first.
print(y_train.value_counts())


gravity
1    150149
4    144706
3     54260
2      9021
Name: count, dtype: int64


In [26]:
# Seeded Random Forest with class_weight='balanced', fitted on the training partition
rf_model_balanced = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
).fit(X_train, y_train)

# Class labels predicted for the held-out rows.
# NOTE(review): no report for y_pred_balanced is visible in the cells shown
# here; confirm whether it is evaluated elsewhere.
y_pred_balanced = rf_model_balanced.predict(X_test)


In [27]:
from imblearn.over_sampling import SMOTE

# Seeded SMOTE over-sampler
smote = SMOTE(random_state=42)

# Resample the training partition only; the test set keeps its original
# class distribution.
# NOTE(review): X_train contains one-hot columns produced by pd.get_dummies;
# plain SMOTE builds synthetic rows by interpolation, so a variant that is
# aware of categorical columns may be more appropriate. Confirm.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Seeded Random Forest fitted on the resampled training data
rf_model_resampled = RandomForestClassifier(random_state=42).fit(
    X_train_res,
    y_train_res,
)

# Class labels predicted for the untouched test rows
y_pred_resampled = rf_model_resampled.predict(X_test)


In [33]:
from sklearn.metrics import classification_report

# Evaluate the model performance
# Per-class precision, recall and F1 of the SMOTE-trained forest on the
# original (not resampled) test set.
print(classification_report(y_test, y_pred_resampled))


              precision    recall  f1-score   support

           1       0.73      0.84      0.78     37537
           2       0.33      0.16      0.22      2256
           3       0.49      0.50      0.49     13565
           4       0.70      0.61      0.65     36176

    accuracy                           0.68     89534
   macro avg       0.56      0.53      0.53     89534
weighted avg       0.67      0.68      0.67     89534

