In [None]:
# Import the necessary packages
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, precision_recall_curve
import numpy as np
from imblearn.over_sampling import SMOTE

In [None]:
# Silence FutureWarning noise raised from inside the third-party libraries used below.
# The `module` argument of warnings.filterwarnings is a regular expression that must match
# the START of the fully qualified name of the module the warning is attributed to
# (case-sensitively), so it has to name a package ("sklearn"), not a class or function
# ("GridSearchCV", "SMOTE", "train_test_split").
# The "(\.|$)" suffix keeps e.g. "sklearn" from also matching an unrelated "sklearn_extra".
# NOTE: warnings that a library attributes to the calling code (via `stacklevel`) are
# reported under the caller's module and are therefore not matched by these filters.
FUTURE_WARNING_MODULES = (
    r"imblearn(\.|$)",  # SMOTE
    r"pandas(\.|$)",
    r"sklearn(\.|$)",   # GridSearchCV, train_test_split
)
for module_pattern in FUTURE_WARNING_MODULES:
    warnings.filterwarnings("ignore", category=FutureWarning, module=module_pattern)

In [None]:
# Load the clean dataset from the project-relative CSV file.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
data = pd.read_csv(
    filepath_or_buffer='source/data.csv',
    low_memory=False,
)

In [None]:
# Work on an independent (deep) copy so feature engineering and preprocessing
# never modify the frame that was loaded into `data`.
data_processed = data.copy(deep=True)

In [None]:
# Remove four columns that are not part of the feature list used later in the notebook.
columns_to_drop = ['AccID', 'birth_year', 'vehicleID', 'num_veh']
data_processed = data_processed.drop(columns=columns_to_drop)

In [None]:
# Cast the time/day/month/year columns to float64 in a single astype call.
# As float64 columns they are picked up by the dtype-based selection in the
# standardization cell that follows.
data_processed = data_processed.astype(
    {column: 'float64' for column in ('time', 'day', 'month', 'year')}
)

In [None]:
# Standardize every float64 column of the working frame with StandardScaler.
# NOTE(review): the scaler is fit on the full dataset, i.e. before the train/test split
# performed further down — confirm that this is intended.
scaler = StandardScaler()
numerical_columns = data_processed.select_dtypes(include=['float64']).columns

scaled_values = scaler.fit_transform(data_processed[numerical_columns])
data_processed[numerical_columns] = scaled_values

In [None]:
# Columns used as model inputs (order is preserved when selecting from the frame)
# and the name of the column to predict.
features = [
    "lum",
    "atm_condition",
    "collision_type",
    "route_category",
    "traffic_regime",
    "reserved_lane_code",
    "longitudinal_profile",
    "upstream_terminal_number",
    "plan",
    "surface_condition",
    "infra",
    "accident_situation",
    "traffic_direction",
    "vehicle_category",
    "fixed_obstacle",
    "mobile_obstacle",
    "initial_impact_point",
    "manv",
    "motor",
    "seat",
    "user_category",
    "gender",
    "reason_travel",
    "safety_equipment1",
    "maximum_speed",
    "age",
    "lat",
    "long",
    "distance_upstream_terminal",
    "total_number_lanes",
    "day",
    "time",
    "month",
    "year",
]
target = "gravity"

In [None]:
# Create a LabelEncoder intended for the categorical features.
# NOTE(review): the encoder is only instantiated here; none of the cells visible in this
# part of the notebook fit or apply it — confirm the encoding actually happens elsewhere.
le = LabelEncoder()

In [None]:
# Build the feature matrix and target vector, then hold out 20% of the rows for testing.
# Features must come from `data_processed`: reading them from the raw `data` frame would
# silently discard the float casts and standardization applied in the cells above.
X = data_processed[features]
# The target is read from the raw frame so the class labels stay exactly as loaded,
# whatever dtype the target column has (the scaling cell rewrites every float64 column).
# `data_processed` is a copy of `data` with some columns dropped, so the rows line up.
y = data[target]

# A fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Oversample the training split with SMOTE to balance the classes.
# Only the training data is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X=X_train, y=y_train)

Apply ML model ---->