In [1]:
# Import the necessary packages
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Import libraries needed to execute the code
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import offsetbox
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
from sklearn.utils import resample
from matplotlib.image import imread
from sklearn.decomposition import PCA
from sklearn.datasets import make_swiss_roll
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.manifold import LocallyLinearEmbedding, Isomap, TSNE

%matplotlib inline


In [2]:
# Load the cleaned dataset from disk.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
data = pd.read_csv(
    filepath_or_buffer='source/data.csv',
    low_memory=False,
)

In [3]:
# Work on an independent (deep) copy so the loaded frame `data` stays untouched
# by the feature engineering and preprocessing steps that follow.
data_processed = data.copy(deep=True)

In [4]:
# Feature engineering: derive time-based columns from the raw date/time fields.
# NOTE(review): a later head() preview of this frame shows an 'hour' value of 54,
# so the 1000000 divisor may not match how 'time' is actually encoded - confirm
# the format of 'time' before relying on 'hour'.
date_parts = data_processed[['year', 'month', 'day']]
data_processed['hour'] = data_processed['time'].floordiv(1000000)
# Day of week as an integer, Monday=0 ... Sunday=6.
data_processed['day_of_week'] = pd.to_datetime(date_parts).dt.weekday

In [5]:
# Drop the raw date/time parts (already used to build 'hour' and 'day_of_week')
# together with the other columns that are not kept for modelling.
columns_to_drop = [
    'year', 'month', 'day',
    'AccID', 'birth_year', 'vehicleID', 'num_veh',
    'time',
]
data_processed = data_processed.drop(columns=columns_to_drop)

In [6]:
# Standardize every float64 column in place (StandardScaler: zero mean, unit variance).
# NOTE(review): the scaler is fit on all rows, before any train/test split, so
# test-set statistics influence the scaling - consider fitting on the training
# portion only.
scaler = StandardScaler()
numerical_columns = data_processed.select_dtypes(include=['float64']).columns

scaled_values = scaler.fit_transform(data_processed[numerical_columns])
data_processed[numerical_columns] = scaled_values

In [7]:
# Print a summary of the preprocessed frame: column names, non-null counts and dtypes.
data_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447670 entries, 0 to 447669
Data columns (total 33 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   lum                         447670 non-null  int64  
 1   atm_condition               447670 non-null  int64  
 2   collision_type              447670 non-null  int64  
 3   lat                         447670 non-null  float64
 4   long                        447670 non-null  float64
 5   route_category              447670 non-null  int64  
 6   traffic_regime              447670 non-null  int64  
 7   total_number_lanes          447670 non-null  int64  
 8   reserved_lane_code          447670 non-null  int64  
 9   longitudinal_profile        447670 non-null  int64  
 10  upstream_terminal_number    447670 non-null  float64
 11  distance_upstream_terminal  447670 non-null  float64
 12  plan                        447670 non-null  int64  
 13  surface_condit

In [8]:
# Preview the first five rows of the preprocessed frame.
data_processed.head(n=5)

Unnamed: 0,lum,atm_condition,collision_type,lat,long,route_category,traffic_regime,total_number_lanes,reserved_lane_code,longitudinal_profile,...,motor,seat,user_category,gravity,gender,reason_travel,safety_equipment1,age,hour,day_of_week
0,4,1,2,0.805698,-0.063104,1,3,10,0,1,...,1,2,2,4,2,0,1,-1.141512,5,5
1,4,1,2,0.805698,-0.063104,1,3,10,0,1,...,1,1,1,4,2,5,1,-0.663829,5,5
2,4,1,2,0.805698,-0.063104,1,3,10,0,1,...,1,1,1,1,1,0,1,1.140752,5,5
3,3,1,6,0.82129,-0.104276,1,1,2,0,4,...,1,1,1,4,2,0,1,-0.716905,10,5
4,1,1,4,0.823628,-0.124441,1,3,8,0,1,...,1,1,1,1,1,0,1,-0.823057,54,3


In [9]:
# Model inputs and the prediction target.
# NOTE(review): the engineered 'hour' and 'day_of_week' columns are not part of
# this list, so they are never fed to the model - confirm this is intended.
features = [
    # integer-coded columns (the same columns that get label-encoded further down)
    'lum',
    'atm_condition',
    'collision_type',
    'route_category',
    'traffic_regime',
    'reserved_lane_code',
    'longitudinal_profile',
    'upstream_terminal_number',
    'plan',
    'surface_condition',
    'infra',
    'accident_situation',
    'traffic_direction',
    'vehicle_category',
    'fixed_obstacle',
    'mobile_obstacle',
    'initial_impact_point',
    'manv',
    'motor',
    'seat',
    'user_category',
    'gender',
    'reason_travel',
    'safety_equipment1',
    # remaining columns, used without label encoding
    'maximum_speed',
    'age',
    'lat',
    'long',
    'distance_upstream_terminal',
    'total_number_lanes',
]
target = 'gravity'

In [10]:
# Label encoder used to map each categorical column's values to integer codes 0..n-1.
# The same instance is re-fit for every column, so after the encoding loop it only
# remembers the classes of the last column it was fit on.
le = LabelEncoder()

In [11]:
# Encoding categorical variables
for col in ['lum','atm_condition','collision_type','route_category','traffic_regime','reserved_lane_code','longitudinal_profile','upstream_terminal_number','plan','surface_condition','infra','accident_situation','traffic_direction','vehicle_category','fixed_obstacle','mobile_obstacle',
'initial_impact_point','manv','motor','seat','user_category','gender','reason_travel','safety_equipment1']:
    data[col] = le.fit_transform(data[col])

In [12]:
# Splitting the data into train and test sets
X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Oversample the minority classes of the training split only; the test split
# is left at its original class distribution.
# NOTE(review): most inputs are integer category codes, and SMOTE builds synthetic
# rows by interpolating between neighbours, so synthetic rows may carry fractional
# codes - consider SMOTENC for the categorical columns.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X=X_train, y=y_train)

  File "C:\Users\sd10725\AppData\Roaming\Python\Python311\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [14]:
# Hyperparameter search space.
# NOTE(review): the keys match a random-forest classifier, but no visible cell
# consumes this grid (the model trained below is a LogisticRegression) - presumably
# left over from, or intended for, a tree-ensemble experiment.
param_grid = {
    'n_estimators': [100, 200, 300],            # Number of trees
    'max_depth': [10, 20, 30],                  # Maximum depth of trees
    'min_samples_split': [2, 5, 10],            # Minimum samples required to split
    'min_samples_leaf': [1, 2, 4],              # Minimum samples in leaf nodes
    # 'auto' was removed from scikit-learn's forests (deprecated 1.1, removed 1.3)
    # and was an alias of 'sqrt' for classifiers, so it only duplicated 'sqrt'.
    'max_features': ['sqrt', 'log2'],           # Number of features to consider at each split
    'bootstrap': [True, False],                 # Whether bootstrap samples are used
    'class_weight': [{1: 1, 2: 20, 3: 1, 4: 1}] # Increased weight for fatalities
}

In [15]:
# Multiclass logistic regression baseline.
# max_iter is raised to 1000 to give the solver more iterations to converge;
# random_state pins any solver randomness.
log_reg_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)

In [16]:
# Fit on the SMOTE-resampled training data.
log_reg_model.fit(X=X_res, y=y_res)

In [17]:
# Predict class labels for the held-out test set (not resampled).
y_pred_log_reg = log_reg_model.predict(X=X_test)

In [18]:
# Score the predictions against the true test labels:
# confusion matrix (rows = true class, columns = predicted class) and the
# per-class precision / recall / F1 text report.
conf_matrix_log_reg = confusion_matrix(y_true=y_test, y_pred=y_pred_log_reg)
class_report_log_reg = classification_report(y_true=y_test, y_pred=y_pred_log_reg)

In [19]:
# Display the confusion matrix computed for the logistic regression predictions.
conf_matrix_log_reg

array([[25335,  3274,  2984,  5778],
       [  359,   910,   628,   438],
       [ 2497,  3328,  3769,  4143],
       [10776,  3946,  5631, 15738]], dtype=int64)

In [20]:
# classification_report returns a pre-formatted string, so print() is used
# to keep its column alignment.
print(class_report_log_reg)

              precision    recall  f1-score   support

           1       0.65      0.68      0.66     37371
           2       0.08      0.39      0.13      2335
           3       0.29      0.27      0.28     13737
           4       0.60      0.44      0.51     36091

    accuracy                           0.51     89534
   macro avg       0.41      0.44      0.40     89534
weighted avg       0.56      0.51      0.53     89534

