In [1]:
# Import the necessary packages
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, precision_recall_curve
import numpy as np
from imblearn.over_sampling import SMOTE

In [2]:
# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="SMOTE")
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
warnings.filterwarnings("ignore", category=FutureWarning, module="GridSearchCV")
warnings.filterwarnings("ignore", category=FutureWarning, module="train_test_split")

In [3]:
# Load the cleaned dataset from disk.
# low_memory=False disables chunked parsing, so each column's dtype is
# inferred from the whole file instead of chunk by chunk.
data = pd.read_csv(
    'source/data.csv',
    low_memory=False,
)

In [4]:
# Work on an independent (deep) copy so the loaded `data` frame stays
# untouched by the feature engineering and preprocessing below.
data_processed = data.copy(deep=True)

In [5]:
# Remove columns that are not used as model inputs.
columns_to_drop = ['AccID', 'birth_year', 'vehicleID', 'num_veh']
data_processed = data_processed.drop(columns=columns_to_drop)

In [6]:
# Cast the calendar/time columns to float64 so that the float64-based column
# selection used for standardization in the next cell includes them.
for temporal_column in ('time', 'day', 'month', 'year'):
    data_processed[temporal_column] = data_processed[temporal_column].astype('float64')

In [7]:
# Standardize every float64 column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later, so test-set statistics inform the
# scaling. Consider fitting on the training split only.
scaler = StandardScaler()
numerical_columns = data_processed.select_dtypes(include=['float64']).columns

scaled_values = scaler.fit_transform(data_processed[numerical_columns])
data_processed[numerical_columns] = scaled_values

In [8]:
# Preview the first five rows after preprocessing.
data_processed.head(n=5)

Unnamed: 0,day,month,year,time,lum,atm_condition,collision_type,lat,long,route_category,...,initial_impact_point,manv,motor,seat,user_category,gravity,gender,reason_travel,safety_equipment1,age
0,1.635754,1.273089,-1.318561,-2.331127,4,1,2,0.805698,-0.063104,1,...,5,23,1,2,2,4,2,0,1,-1.141512
1,1.635754,1.273089,-1.318561,-2.331127,4,1,2,0.805698,-0.063104,1,...,5,23,1,1,1,4,2,5,1,-0.663829
2,1.635754,1.273089,-1.318561,-2.331127,4,1,2,0.805698,-0.063104,1,...,3,11,1,1,1,1,1,0,1,1.140752
3,1.635754,1.273089,-1.318561,-2.084448,3,1,6,0.82129,-0.104276,1,...,1,0,1,1,1,4,2,0,1,-0.716905
4,1.407189,1.273089,-1.318561,0.212747,1,1,4,0.823628,-0.124441,1,...,1,2,1,1,1,1,1,0,1,-0.823057


In [9]:
# Model inputs (column names of data_processed) and the column to predict.
features = [
    # accident circumstances
    'lum', 'atm_condition', 'collision_type',
    # road description
    'route_category', 'traffic_regime', 'reserved_lane_code',
    'longitudinal_profile', 'upstream_terminal_number', 'plan',
    'surface_condition', 'infra', 'accident_situation',
    # vehicle
    'traffic_direction', 'vehicle_category', 'fixed_obstacle',
    'mobile_obstacle', 'initial_impact_point', 'manv', 'motor',
    # user
    'seat', 'user_category', 'gender', 'reason_travel',
    'safety_equipment1',
    # numeric / location
    'maximum_speed', 'age', 'lat', 'long',
    'distance_upstream_terminal', 'total_number_lanes',
    # date and time
    'day', 'time', 'month', 'year',
]
target = 'gravity'

In [10]:
# Label encoder instance intended for categorical features.
# NOTE(review): no fit/transform call on `le` appears in the visible cells,
# so no label encoding is actually applied here — confirm whether this
# object is still needed.
le = LabelEncoder()

In [11]:
# Build the target vector and feature matrix, then hold out 20% of the rows
# as a test set (fixed seed so the split is reproducible).
y = data_processed[target]
X = data_processed[features]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Oversample the training split with SMOTE; the test split is left untouched.
# NOTE(review): several features look like integer category codes, and SMOTE
# creates synthetic rows by interpolating between neighbours — confirm that
# fractional codes are acceptable or consider a categorical-aware sampler.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_res, y_res = resampled

found 0 physical cores < 1
  File "C:\Users\sd10725\AppData\Roaming\Python\Python311\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [13]:
# Hyperparameter search space for the Random Forest grid search.
param_grid = dict(
    # number of trees in the forest
    n_estimators=[100, 200],
    # maximum depth of each tree
    max_depth=[20, 30],
    # minimum number of samples required to split a node
    min_samples_split=[5, 10],
    # minimum number of samples required at a leaf
    min_samples_leaf=[2, 4],
    # number of features considered at each split
    max_features=['sqrt'],
    # single weighting scheme: class 2 (fatalities) weighted 20x
    class_weight=[{1: 1, 2: 20, 3: 1, 4: 1}],
)

## Apply ML model v1: Random Forest with grid-searched hyperparameters, followed by threshold tuning for the fatal class

In [14]:
# Initialize the base RandomForest that is passed to GridSearchCV as its
# estimator; random_state is fixed so the forests are reproducible.
rf = RandomForestClassifier(random_state=42)

In [15]:
# Exhaustive search over param_grid, scored by macro-averaged F1 with 3-fold
# cross-validation. n_jobs=-1 runs fits in parallel on all available cores;
# verbose=2 prints progress for each fit.
# NOTE(review): the search is fitted on the SMOTE-resampled training data, so
# each validation fold contains synthetic samples derived from rows in the
# other folds; the CV scores are therefore likely optimistic.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [16]:
# Run the grid search on the resampled training data
# (16 parameter combinations x 3 folds).
grid_search.fit(X=X_res, y=y_res)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [17]:
# Report the parameter combination with the best mean CV score.
best_params = grid_search.best_params_
print("Best Parameters found: ", best_params)

Best Parameters found:  {'class_weight': {1: 1, 2: 20, 3: 1, 4: 1}, 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


In [18]:
# Score the held-out test set with the refitted best model from the search.
best_rf = grid_search.best_estimator_
y_pred_grid = best_rf.predict(X_test)

In [19]:
# Per-class precision/recall/F1 of the tuned model on the test set.
grid_report = classification_report(y_test, y_pred_grid)
print("Initial classification report after hyperparameter tuning:")
print(grid_report)

Initial classification report after hyperparameter tuning:
              precision    recall  f1-score   support

           1       0.75      0.79      0.77     37371
           2       0.11      0.46      0.18      2335
           3       0.45      0.33      0.38     13737
           4       0.69      0.58      0.63     36091

    accuracy                           0.63     89534
   macro avg       0.50      0.54      0.49     89534
weighted avg       0.66      0.63      0.64     89534



In [20]:
# Threshold tuning for higher precision on fatalities.
# Step 1: get per-class probabilities for the test set from the tuned model.
tuned_model = grid_search.best_estimator_
y_probs = tuned_model.predict_proba(X_test)

In [21]:
# predict_proba columns follow the order of the model's classes_ attribute,
# so look up which column belongs to class 2 (Fatal).
class_labels = grid_search.best_estimator_.classes_
print("Class labels in the model:", class_labels)

# Position of the first (only) entry equal to 2.
fatal_class_index = np.flatnonzero(class_labels == 2)[0]
print(f"Index for Class 2 (Fatal): {fatal_class_index}")


Class labels in the model: [1 2 3 4]
Index for Class 2 (Fatal): 1


In [22]:
# Probabilities for Class 2 (Fatal).
# Use the column index looked up from the model's classes_ in the previous
# cell rather than a hard-coded 1, so this stays correct if the set or order
# of class labels ever changes.
fatal_probs = y_probs[:, fatal_class_index]

In [23]:
# Precision/recall of the binary "fatal vs. not fatal" task at every
# candidate probability threshold.
is_fatal_true = (y_test == 2)
precision, recall, thresholds = precision_recall_curve(is_fatal_true, fatal_probs)

In [24]:
# Find the threshold that best balances precision and recall (maximum F1).
#
# precision and recall have one more entry than thresholds: the final point
# (precision=1, recall=0) has no associated threshold. Drop it so that an
# index into f1_scores is always a valid index into thresholds.
pr_sum = precision[:-1] + recall[:-1]

# Where precision + recall == 0 the F1 score is 0/0. Define it as 0 instead
# of NaN, because np.argmax would otherwise pick the NaN entry as the "best".
f1_scores = np.divide(
    2 * precision[:-1] * recall[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
best_threshold_index = np.argmax(f1_scores)  # index of the highest F1-score
best_threshold = thresholds[best_threshold_index]


In [25]:
# Flag a row as fatal (1) when its fatal-class probability reaches the
# selected threshold, otherwise 0.
is_predicted_fatal = fatal_probs >= best_threshold
y_pred_adjusted = is_predicted_fatal.astype(int)

In [26]:
# Binary (fatal vs. not fatal) report at the selected threshold.
# NOTE(review): the threshold was chosen on this same test set, so these
# figures are optimistic; a separate validation split would give an
# unbiased estimate.
print(f"Best Threshold: {best_threshold}")
print("Classification report after further threshold tuning:")
binary_report = classification_report(y_test == 2, y_pred_adjusted)
print(binary_report)

Best Threshold: 0.5206057499306351
Classification report after further threshold tuning:
              precision    recall  f1-score   support

       False       0.98      0.95      0.96     87199
        True       0.14      0.32      0.20      2335

    accuracy                           0.93     89534
   macro avg       0.56      0.63      0.58     89534
weighted avg       0.96      0.93      0.94     89534



## Apply ML model v2: cost-sensitive Random Forest (class weights, default hyperparameters)

In [27]:
# Cost-sensitive Random Forest: errors on Class 2 (Fatal) are weighted 20x,
# all other classes keep weight 1.
class_weights = {
    1: 1,
    2: 20,
    3: 1,
    4: 1,
}

# Fit on the resampled training data (fit returns the estimator itself).
rf_model = RandomForestClassifier(
    class_weight=class_weights,
    random_state=42,
).fit(X_res, y_res)

# Evaluate on the untouched test set.
y_pred = rf_model.predict(X_test)
cost_sensitive_report = classification_report(y_test, y_pred)
print("Cost-Sensitive Random Forest Classification Report:")
print(cost_sensitive_report)


Cost-Sensitive Random Forest Classification Report:
              precision    recall  f1-score   support

           1       0.74      0.81      0.77     37371
           2       0.26      0.14      0.18      2335
           3       0.45      0.54      0.49     13737
           4       0.68      0.59      0.63     36091

    accuracy                           0.66     89534
   macro avg       0.54      0.52      0.52     89534
weighted avg       0.66      0.66      0.66     89534



In [None]:
# Apply ML model v3: Random Forest retrained with added interaction features

In [28]:
# Create interaction features for more complex relationships.
# NOTE(review): 'time', 'lat' and 'long' were standardized earlier, and 'lum' /
# 'atm_condition' appear to be integer category codes, so these products mix
# z-scores with code values — confirm this is the intended encoding.

# Interaction between lighting conditions and time of accident
# (e.g., night-time accidents may be more fatal)
data_processed['lighting_time_interaction'] = data_processed['lum'] * data_processed['time']

# Interaction between weather conditions and location
# (e.g., certain locations may be more dangerous in bad weather)
data_processed['weather_location_interaction'] = data_processed['atm_condition'] * (data_processed['lat'] + data_processed['long'])

# Update the shared feature list with the new interaction terms.
# Only append names that are not already present: a plain extend() would add
# duplicates each time this cell is re-run, silently duplicating columns in X.
interaction_features = ['lighting_time_interaction', 'weather_location_interaction']
missing_features = [name for name in interaction_features if name not in features]
features.extend(missing_features)

# Rebuild the feature matrix and split it with the same settings as before
X = data_processed[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE for balancing the classes (training split only)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Re-fit the class-weighted Random Forest on the newly engineered features.
# This refits the existing rf_model object, replacing the previous fit.
rf_model.fit(X_res, y_res)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Generate classification report
print("Random Forest Classification Report with Feature Engineering:")
print(classification_report(y_test, y_pred))


Random Forest Classification Report with Feature Engineering:
              precision    recall  f1-score   support

           1       0.74      0.80      0.77     37371
           2       0.25      0.14      0.18      2335
           3       0.45      0.54      0.49     13737
           4       0.68      0.59      0.64     36091

    accuracy                           0.66     89534
   macro avg       0.53      0.52      0.52     89534
weighted avg       0.66      0.66      0.66     89534



In [None]:
# Apply ML model v4: threshold tuning for the fatal class on the v3 model

In [29]:
# Threshold tuning for Class 2 (Fatal) on the current rf_model.
# np and precision_recall_curve are already imported in the first cell, so
# they are not re-imported here.

# Get the prediction probabilities for the test set
y_probs = rf_model.predict_proba(X_test)

# Extract the probabilities for Class 2 (Fatal), locating its column through
# the model's classes_ attribute
fatal_probs = y_probs[:, np.where(rf_model.classes_ == 2)[0][0]]

# Use precision-recall curve to evaluate different thresholds
precision, recall, thresholds = precision_recall_curve(y_test == 2, fatal_probs)

# Find the threshold with the best F1 score.
# precision and recall have one more entry than thresholds: the final point
# (precision=1, recall=0) has no threshold. Drop it so indices stay aligned.
pr_sum = precision[:-1] + recall[:-1]

# Define F1 as 0 where precision + recall == 0; the plain division would give
# NaN there, and np.argmax would select that NaN entry as the maximum.
f1_scores = np.divide(
    2 * precision[:-1] * recall[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
best_threshold_index = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_index]

# Apply the best threshold (1 = predicted fatal, 0 = not fatal)
y_pred_threshold = (fatal_probs >= best_threshold).astype(int)

# Generate classification report for the adjusted threshold.
# NOTE(review): the threshold is selected and evaluated on the same test set,
# so these figures are optimistic.
print(f"Best Threshold: {best_threshold}")
print("Classification report after further threshold adjustment:")
print(classification_report(y_test == 2, y_pred_threshold))


Best Threshold: 0.2
Classification report after further threshold adjustment:
              precision    recall  f1-score   support

       False       0.98      0.95      0.97     87199
        True       0.17      0.37      0.23      2335

    accuracy                           0.94     89534
   macro avg       0.58      0.66      0.60     89534
weighted avg       0.96      0.94      0.95     89534

