In [1]:
# Import the necessary packages
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler




In [2]:
# Load the cleaned dataset from disk.
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, so column dtypes are inferred from the whole column.
data = pd.read_csv(
    filepath_or_buffer='source/data.csv',
    low_memory=False,
)

In [3]:
# Independent (deep) copy of the loaded frame, used as the working frame for
# feature engineering so that `data` itself is not modified by those steps.
data_processed = data.copy(deep=True)

In [4]:
# 1. Feature engineering: derive time-based columns on the working copy.
# NOTE(review): the preview of `data` further down shows `time` values such as
# 54900000, for which this floor division yields 54 rather than a 0-23 hour.
# Confirm the unit/encoding of `time` and the intended divisor.
data_processed['hour'] = data_processed['time'].floordiv(1000000)

# Build a datetime from the separate year/month/day columns and keep only the
# weekday index (pandas convention: Monday=0 ... Sunday=6).
data_processed['day_of_week'] = pd.to_datetime(
    data_processed.loc[:, ['year', 'month', 'day']]
).dt.weekday

In [None]:
# Remove the raw date parts from the engineered copy now that `day_of_week`
# has been derived from them.
# The drop is applied to `data_processed`, not `data`: the feature list defined
# later selects 'day', 'month' and 'year' from `data`, so dropping them there
# makes `data[features]` raise a KeyError on a fresh Restart & Run All.
# errors='ignore' keeps the cell idempotent when it is re-run.
data_processed = data_processed.drop(
    columns=['year', 'month', 'day'],
    errors='ignore',
)

In [5]:
# Preview the first rows of `data` (rendered as the cell output).
data.head()

Unnamed: 0,AccID,day,month,year,time,lum,atm_condition,collision_type,lat,long,...,manv,motor,seat,user_category,gravity,gender,birth_year,reason_travel,safety_equipment1,age
0,201900000001,30,11,2019,5400000,4,1,2,48.89621,2.47012,...,23,1,2,2,4,2,2002.0,0,1,17.0
1,201900000001,30,11,2019,5400000,4,1,2,48.89621,2.47012,...,23,1,1,1,4,2,1993.0,5,1,26.0
2,201900000001,30,11,2019,5400000,4,1,2,48.89621,2.47012,...,11,1,1,1,1,1,1959.0,0,1,60.0
3,201900000002,30,11,2019,10200000,3,1,6,48.9307,2.3688,...,0,1,1,1,4,2,1994.0,0,1,25.0
4,201900000003,28,11,2019,54900000,1,1,4,48.935872,2.319174,...,2,1,1,1,1,1,1996.0,0,1,23.0


In [None]:
# Label column and model input columns.
target = 'gravity'

features = [
    # Columns that are label-encoded by the encoding cell of this notebook.
    'lum',
    'atm_condition',
    'collision_type',
    'route_category',
    'traffic_regime',
    'reserved_lane_code',
    'longitudinal_profile',
    'upstream_terminal_number',
    'plan',
    'surface_condition',
    'infra',
    'accident_situation',
    'traffic_direction',
    'vehicle_category',
    'fixed_obstacle',
    'mobile_obstacle',
    'initial_impact_point',
    'manv',
    'motor',
    'seat',
    'user_category',
    'gender',
    'reason_travel',
    'safety_equipment1',
    # Columns used as-is (not label-encoded in this notebook).
    'maximum_speed',
    'age',
    'lat',
    'long',
    'distance_upstream_terminal',
    'total_number_lanes',
    'day',
    'month',
    'year',
    'time',
]

In [None]:
# Handling categorical features with label encoding.
# A single LabelEncoder instance is created here and reused by the encoding
# loop in the next cell.
le = LabelEncoder()

In [None]:
# Replace each categorical column of `data` with integer codes, in place.
# The shared encoder is refit on every column, so after the loop `le` only
# holds the mapping of the last column processed ('safety_equipment1').
for col in (
    'lum',
    'atm_condition',
    'collision_type',
    'route_category',
    'traffic_regime',
    'reserved_lane_code',
    'longitudinal_profile',
    'upstream_terminal_number',
    'plan',
    'surface_condition',
    'infra',
    'accident_situation',
    'traffic_direction',
    'vehicle_category',
    'fixed_obstacle',
    'mobile_obstacle',
    'initial_impact_point',
    'manv',
    'motor',
    'seat',
    'user_category',
    'gender',
    'reason_travel',
    'safety_equipment1',
):
    data[col] = le.fit(data[col]).transform(data[col])

In [None]:
# Splitting the data into train and test sets
X = data[features]
y = data[target]

# Hold out 20% of the rows for evaluation, with a fixed seed for
# reproducibility. The split is stratified on the target so that the train and
# test sets keep the same class proportions as the full data; the classes are
# imbalanced (the training set is rebalanced with SMOTE afterwards), and an
# unstratified split can leave rare classes under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Oversample with SMOTE (fixed seed) to balance the classes.
# Only the training split is resampled; the test split is left as drawn.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Define the hyperparameters to tune.
# NOTE(review): these keys (n_estimators, max_depth, bootstrap, ...) are
# tree-ensemble hyperparameters and are not accepted by LogisticRegression.
# Nothing in the visible cells passes this grid to a search, so confirm whether
# it is still needed.
param_grid = {
    'n_estimators': [100, 200, 300],           # Number of trees
    'max_depth': [10, 20, 30],                 # Maximum depth of trees
    'min_samples_split': [2, 5, 10],           # Minimum samples required to split
    'min_samples_leaf': [1, 2, 4],             # Minimum samples in leaf nodes
    # 'auto' was removed from scikit-learn's forest estimators and, for
    # classifiers, was the same as 'sqrt', so the old ['auto', 'sqrt'] pair
    # searched one setting twice and now raises on current releases.
    'max_features': ['sqrt', 'log2'],          # Number of features to consider at each split
    'bootstrap': [True, False],                # Whether bootstrap samples are used
    'class_weight': [{1: 1, 2: 20, 3: 1, 4: 1}] # Increased weight for fatalities
}

In [None]:
# Initialize the Logistic Regression model.
# The inputs span very different magnitudes (`time` is in the millions in the
# data preview, while the label-encoded columns are small integers). Fed to
# logistic regression unscaled, such features slow or prevent solver
# convergence and make the L2 penalty act unevenly across columns.
# Standardizing inside a pipeline fits the scaler on the training data only
# and applies the same transform at predict time.
# `log_reg_model` keeps the same .fit(X, y) / .predict(X) usage; the fitted
# LogisticRegression itself is the last pipeline step (log_reg_model[-1]).
log_reg_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=42),
)

In [None]:
# Fit the model on the SMOTE-resampled training data.
# As the last expression of the cell, the fitted estimator is also displayed.
log_reg_model.fit(X=X_res, y=y_res)

In [None]:
# Predict class labels for the held-out test rows (which were not resampled).
y_pred_log_reg = log_reg_model.predict(X=X_test)

In [None]:
# Score the test-set predictions against the true labels:
# a confusion matrix plus the text classification report.
class_report_log_reg = classification_report(
    y_true=y_test,
    y_pred=y_pred_log_reg,
)
conf_matrix_log_reg = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_log_reg,
)

In [None]:
# Display the confusion matrix computed above (scikit-learn convention:
# rows are true classes, columns are predicted classes).
conf_matrix_log_reg

In [None]:
# The report is a preformatted string, so print() is used to render its
# line breaks and column alignment.
print(class_report_log_reg)