In [1]:
# Import the necessary packages
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Load the cleaned dataset from disk.
# low_memory=False makes pandas parse the file in a single pass instead of in chunks,
# which avoids mixed-type inference within a column.
data = pd.read_csv(
    filepath_or_buffer='source/data.csv',
    low_memory=False,
)

In [4]:
# Work on an independent (deep) copy so the loaded `data` frame is left untouched
# by the feature engineering and preprocessing steps that follow.
data_processed = data.copy(deep=True)

In [5]:
# Remove the columns that are not needed for the rest of the preprocessing
columns_to_drop = ['AccID', 'birth_year', 'vehicleID', 'num_veh']
data_processed = data_processed.drop(columns=columns_to_drop)

In [6]:
# Cast the date/time component columns to float64, one column at a time.
# Each column is replaced in place, so the column order of the frame is unchanged.
for temporal_column in ('time', 'day', 'month', 'year'):
    data_processed[temporal_column] = data_processed[temporal_column].astype('float64')

In [7]:
# Columns that should be stored with the pandas 'category' dtype
# (note that the target column 'gravity' is part of this list)
categorical_columns = [
    'lum', 'atm_condition', 'collision_type',
    'route_category', 'traffic_regime', 'total_number_lanes',
    'reserved_lane_code', 'longitudinal_profile', 'plan',
    'surface_condition', 'infra', 'accident_situation',
    'maximum_speed', 'traffic_direction', 'vehicle_category',
    'fixed_obstacle', 'mobile_obstacle', 'initial_impact_point',
    'manv', 'motor', 'seat',
    'user_category', 'gravity', 'gender',
    'reason_travel', 'safety_equipment1',
]

# Cast every listed column to 'category'; columns not listed keep their dtype
category_dtypes = dict.fromkeys(categorical_columns, 'category')
data_processed = data_processed.astype(category_dtypes)

# Display the resulting dtypes to verify the conversion
data_processed[categorical_columns].dtypes


lum                     category
atm_condition           category
collision_type          category
route_category          category
traffic_regime          category
total_number_lanes      category
reserved_lane_code      category
longitudinal_profile    category
plan                    category
surface_condition       category
infra                   category
accident_situation      category
maximum_speed           category
traffic_direction       category
vehicle_category        category
fixed_obstacle          category
mobile_obstacle         category
initial_impact_point    category
manv                    category
motor                   category
seat                    category
user_category           category
gravity                 category
gender                  category
reason_travel           category
safety_equipment1       category
dtype: object

In [8]:
# Name of the column to predict
target = 'gravity'

# Predictor columns. The order of this list determines the column order of the
# frame that is later selected with `data_processed[features]`.
features = [
    'lum', 'atm_condition', 'collision_type', 'route_category',
    'traffic_regime', 'reserved_lane_code', 'longitudinal_profile',
    'upstream_terminal_number', 'plan', 'surface_condition', 'infra',
    'accident_situation', 'traffic_direction', 'vehicle_category',
    'fixed_obstacle', 'mobile_obstacle', 'initial_impact_point', 'manv',
    'motor', 'seat', 'user_category', 'gender', 'reason_travel',
    'safety_equipment1', 'maximum_speed', 'age', 'lat', 'long',
    'distance_upstream_terminal', 'total_number_lanes',
    'day', 'time', 'month', 'year',
]

In [9]:
# Print a summary of the processed frame (column names, non-null counts and dtypes)
# to verify the type conversions before building the model inputs.
data_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447670 entries, 0 to 447669
Data columns (total 35 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   day                         447670 non-null  float64 
 1   month                       447670 non-null  float64 
 2   year                        447670 non-null  float64 
 3   time                        447670 non-null  float64 
 4   lum                         447670 non-null  category
 5   atm_condition               447670 non-null  category
 6   collision_type              447670 non-null  category
 7   lat                         447670 non-null  float64 
 8   long                        447670 non-null  float64 
 9   route_category              447670 non-null  category
 10  traffic_regime              447670 non-null  category
 11  total_number_lanes          447670 non-null  category
 12  reserved_lane_code          447670 non-null  category
 13 

In [10]:
# Target vector: the column named by `target`
y = data_processed[target]

# Design matrix: one-hot encode the selected features.
# drop_first=True drops the first level of each encoded feature, so the remaining
# dummy columns of a feature are not perfectly collinear.
feature_frame = data_processed[features]
X = pd.get_dummies(feature_frame, drop_first=True)

In [11]:
# Split the data into train (80%) and test (20%) sets with a fixed seed for reproducibility.
# The target `gravity` is a categorical class label, so the split is stratified on y:
# both partitions keep the class proportions of the full dataset, which a purely
# random split does not guarantee (rare classes could be under-represented in the test set).
# Stratification requires every class to have at least two samples.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [12]:
# Standardization: the scaling parameters are learned from the training rows only and
# the same fitted scaler is then applied to both splits, so no statistics from the
# test set influence the transformation.
# NOTE(review): only float64 columns are selected here; a numeric feature stored with
# any other dtype would be left unscaled — confirm that this is intended.
numerical_columns = X.select_dtypes(include=['float64']).columns
scaler = StandardScaler().fit(X_train[numerical_columns])

for split_frame in (X_train, X_test):
    split_frame[numerical_columns] = scaler.transform(split_frame[numerical_columns])

In [13]:
# Report the (rows, columns) shape of each split; both must have the same column count
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    sep="\n",
)

Shape of X_train: (358136, 230)
Shape of X_test: (89534, 230)


Apply ML model ---->