In [1]:
# Import the necessary packages
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Import the clean data
data = pd.read_csv('source/data.csv', low_memory=False)

# Copy of the original dataset for feature engineering and preprocessing
data_processed = data.copy()

# Drop unnecessary columns
data_processed = data_processed.drop(['AccID', 'birth_year', 'vehicleID', 'num_veh'], axis=1)

# Converting 'time', 'day', 'month', and 'year' to float type
data_processed['time'] = data_processed['time'].astype('float64')
data_processed['day'] = data_processed['day'].astype('float64')
data_processed['month'] = data_processed['month'].astype('float64')
data_processed['year'] = data_processed['year'].astype('float64')

# Selecting features and target variable
features = ['lum', 'atm_condition', 'collision_type', 'route_category', 'traffic_regime', 'reserved_lane_code', 
            'longitudinal_profile', 'upstream_terminal_number', 'plan', 'surface_condition', 'infra', 'accident_situation', 
            'traffic_direction', 'vehicle_category', 'fixed_obstacle', 'mobile_obstacle', 'initial_impact_point', 'manv', 
            'motor', 'seat', 'user_category', 'gender', 'reason_travel', 'safety_equipment1', 'maximum_speed', 'age', 
            'lat', 'long', 'distance_upstream_terminal', 'total_number_lanes', 'day', 'time', 'month', 'year']
target = 'gravity'

# Handling categorical features with One Hot Encoding
X = pd.get_dummies(data_processed[features], drop_first=True)
y = data_processed[target]

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardization: Fit only on the training data, then apply to both train and test
scaler = StandardScaler()
numerical_columns = X.select_dtypes(include=['float64']).columns

X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Check the dimensions
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

Shape of X_train: (358136, 34)
Shape of X_test: (89534, 34)


Apply ML model ---->

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [36]:
# Initialize the Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

In [37]:
# Train the model
log_reg_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
# Predict on the test set
y_pred_log_reg = log_reg_model.predict(X_test)

In [39]:
# Evaluate the model
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
class_report_log_reg = classification_report(y_test, y_pred_log_reg)

In [40]:
print(conf_matrix_log_reg)

[[29039     5   704  7623]
 [  790     4   539  1002]
 [ 4083     4  2142  7508]
 [13119     2  1441 21529]]


In [41]:
print(class_report_log_reg)

              precision    recall  f1-score   support

           1       0.62      0.78      0.69     37371
           2       0.27      0.00      0.00      2335
           3       0.44      0.16      0.23     13737
           4       0.57      0.60      0.58     36091

    accuracy                           0.59     89534
   macro avg       0.47      0.38      0.38     89534
weighted avg       0.56      0.59      0.56     89534

