In [1]:
# Import the necessary packages
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings

In [2]:
# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="SMOTE")
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
warnings.filterwarnings("ignore", category=FutureWarning, module="GridSearchCV")
warnings.filterwarnings("ignore", category=FutureWarning, module="train_test_split")

In [3]:
# Load the cleaned dataset from disk into a DataFrame
data = pd.read_csv(
    'source/data.csv',
    low_memory=False,
)

In [4]:
# Work on an independent (deep) copy so the frame held in `data` stays untouched
# by the feature engineering and preprocessing steps that follow
data_processed = data.copy(deep=True)

In [5]:
# Feature Engineering: Create new time-based features
#data_processed['hour'] = data_processed['time'] // 1000000
#data_processed['day_of_week'] = pd.to_datetime(data_processed[['year', 'month', 'day']]).dt.dayofweek

In [6]:
#data_processed = data_processed.drop(['year', 'month', 'day','AccID','birth_year','vehicleID','num_veh','time'], axis=1)

In [7]:
# Drop the columns that are not part of the feature list used for modelling
data_processed = data_processed.drop(
    columns=['AccID', 'birth_year', 'vehicleID', 'num_veh']
)

In [8]:
# Cast the date/time parts to float64 so they are selected together with the
# other float columns when the numerical features are standardized
for date_part in ('time', 'day', 'month', 'year'):
    data_processed[date_part] = data_processed[date_part].astype('float64')

In [9]:
# Inspect column names, non-null counts and dtypes after the casts above
data_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447670 entries, 0 to 447669
Data columns (total 35 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   day                         447670 non-null  float64
 1   month                       447670 non-null  float64
 2   year                        447670 non-null  float64
 3   time                        447670 non-null  float64
 4   lum                         447670 non-null  int64  
 5   atm_condition               447670 non-null  int64  
 6   collision_type              447670 non-null  int64  
 7   lat                         447670 non-null  float64
 8   long                        447670 non-null  float64
 9   route_category              447670 non-null  int64  
 10  traffic_regime              447670 non-null  int64  
 11  total_number_lanes          447670 non-null  int64  
 12  reserved_lane_code          447670 non-null  int64  
 13  longitudinal_p

In [10]:
# Standardization: rescale every float64 column with a StandardScaler.
# The scaler is fitted on the full frame and then applied to the same columns.
numerical_columns = data_processed.select_dtypes(include=['float64']).columns

scaler = StandardScaler()
scaler.fit(data_processed[numerical_columns])
data_processed[numerical_columns] = scaler.transform(data_processed[numerical_columns])

In [11]:
# Preview the first rows to check the standardized float columns
data_processed.head()

Unnamed: 0,day,month,year,time,lum,atm_condition,collision_type,lat,long,route_category,...,initial_impact_point,manv,motor,seat,user_category,gravity,gender,reason_travel,safety_equipment1,age
0,1.635754,1.273089,-1.318561,-2.331127,4,1,2,0.805698,-0.063104,1,...,5,23,1,2,2,4,2,0,1,-1.141512
1,1.635754,1.273089,-1.318561,-2.331127,4,1,2,0.805698,-0.063104,1,...,5,23,1,1,1,4,2,5,1,-0.663829
2,1.635754,1.273089,-1.318561,-2.331127,4,1,2,0.805698,-0.063104,1,...,3,11,1,1,1,1,1,0,1,1.140752
3,1.635754,1.273089,-1.318561,-2.084448,3,1,6,0.82129,-0.104276,1,...,1,0,1,1,1,4,2,0,1,-0.716905
4,1.407189,1.273089,-1.318561,0.212747,1,1,4,0.823628,-0.124441,1,...,1,2,1,1,1,1,1,0,1,-0.823057


In [12]:
# Model inputs (order matters for the column layout of X) and the column to predict
features = [
    'lum', 'atm_condition', 'collision_type', 'route_category',
    'traffic_regime', 'reserved_lane_code', 'longitudinal_profile',
    'upstream_terminal_number', 'plan', 'surface_condition', 'infra',
    'accident_situation', 'traffic_direction', 'vehicle_category',
    'fixed_obstacle', 'mobile_obstacle',
    'initial_impact_point', 'manv', 'motor', 'seat', 'user_category',
    'gender', 'reason_travel', 'safety_equipment1', 'maximum_speed',
    'age', 'lat', 'long', 'distance_upstream_terminal',
    'total_number_lanes', 'day', 'time', 'month', 'year',
]
target = 'gravity'

In [13]:
# Label encoder intended for categorical features.
# NOTE(review): `le` is only instantiated here; no visible cell fits or applies
# it, so no encoding actually takes place -- confirm whether it is still needed.
le = LabelEncoder()

In [14]:
# Build the feature matrix and target from the *preprocessed* frame so that the
# dtype casts and standardization applied to `data_processed` are actually used.
# Selecting from the raw `data` frame would silently discard that preprocessing.
X = data_processed[features]
y = data_processed[target]

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in the train and test splits, which matters here
# because the classes are treated as imbalanced (see the resampling step).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [15]:
# Balance the classes with SMOTE; only the training split is resampled,
# the test split is left as it is
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_res, y_res = resampled

found 0 physical cores < 1
  File "C:\Users\sd10725\AppData\Roaming\Python\Python311\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [16]:
# Search space for the random forest hyperparameters
param_grid = dict(
    # Number of trees in the forest
    n_estimators=[100, 200, 300],
    # Maximum depth of each tree
    max_depth=[10, 20, 30],
    # Minimum number of samples required to split a node
    min_samples_split=[2, 5, 10],
    # Minimum number of samples required in a leaf node
    min_samples_leaf=[1, 2, 4],
    # Number of features considered at each split
    max_features=['sqrt', 'log2', None],
    # Whether each tree is trained on a bootstrap sample
    bootstrap=[True, False],
    # Increased weight for class 2 (fatalities)
    class_weight=[{1: 1, 2: 20, 3: 1, 4: 1}],
)

## Apply the ML model

In [17]:
# Base random forest estimator handed to the grid search; the fixed
# random_state keeps repeated runs reproducible
rf = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, scored by
# macro-averaged F1 and parallelized with n_jobs=-1
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the SMOTE-resampled training data.
# NOTE(review): the resampling was done before cross-validation, so the
# validation folds also contain synthetic samples and the CV scores may be
# optimistic -- consider resampling inside each fold (e.g. an imblearn Pipeline).
grid_search.fit(X_res, y_res)

In [None]:
# Show the hyperparameter combination selected by the grid search
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Take the estimator selected by the grid search and predict on the held-out
# test split (which was not resampled)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the predictions on the test split
print(classification_report(y_test, y_pred))