In [1]:
# Import libraries needed to execute the code
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the prepared dataset from disk (path is relative to the notebook's
# working directory). low_memory=False asks pandas to parse the file in one
# pass instead of in chunks, which avoids mixed-dtype inference per column.
data = pd.read_csv(
    filepath_or_buffer='source/data_balanced.csv',
    low_memory=False,
)

In [3]:
# Print a structural summary of the loaded frame (index range, per-column
# non-null counts and dtypes) as a quick sanity check on the load.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375372 entries, 0 to 375371
Data columns (total 41 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   AccID                       375372 non-null  float64
 1   day                         375372 non-null  float64
 2   month                       375372 non-null  float64
 3   year                        375372 non-null  float64
 4   time                        375372 non-null  float64
 5   lum                         375372 non-null  float64
 6   atm_condition               375372 non-null  float64
 7   collision_type              375372 non-null  float64
 8   lat                         375372 non-null  float64
 9   long                        375372 non-null  float64
 10  route_category              375372 non-null  float64
 11  traffic_regime              375372 non-null  float64
 12  total_number_lanes          375372 non-null  float64
 13  reserved_lane_

In [4]:
# Remove the cap on displayed columns so the preview below shows every column
# instead of eliding the middle ones, then show the first five rows.
pd.options.display.max_columns = None
data.head(5)

Unnamed: 0,AccID,day,month,year,time,lum,atm_condition,collision_type,lat,long,route_category,traffic_regime,total_number_lanes,reserved_lane_code,longitudinal_profile,upstream_terminal_number,distance_upstream_terminal,plan,surface_condition,infra,accident_situation,maximum_speed,vehicleID,num_veh,traffic_direction,vehicle_category,fixed_obstacle,mobile_obstacle,initial_impact_point,manv,motor,seat,user_category,gravity,gender,birth_year,reason_travel,safety_equipment1,age,hour,day_of_week
0,-1.318804,1.635754,1.273089,-1.318561,-2.331127,1.411937,-0.367274,-0.969978,0.805698,-0.063104,-1.905894,1.674506,5.385555,-0.309597,-0.421612,-0.167154,1.946753,1.079913,-0.325791,0.540267,-0.333808,0.509356,123648,0,0.532801,0.331811,-0.026831,-1.346191,0.050306,0.481868,-0.216864,-0.41758,-0.533801,-1.118975,-0.679263,-1.21696,-1.17778,-0.425963,1.140752,-2.328651,5
1,-1.318804,1.407189,1.273089,-1.318561,0.212747,-0.605854,-0.367274,0.109312,0.823628,-0.124441,-1.905894,1.674506,3.954056,-0.309597,-0.421612,-0.120995,0.755404,2.58912,-0.325791,-0.384897,-0.333808,1.4946,123643,0,-0.685619,-0.42316,-0.338378,0.357934,-0.78621,-0.634402,-0.216864,-0.41758,-0.533801,-1.118975,-0.679263,0.741952,-1.17778,-0.425963,-0.823057,0.189353,3
2,-1.318804,1.407189,1.273089,-1.318561,0.212747,-0.605854,-0.367274,0.109312,0.823628,-0.124441,-1.905894,1.674506,3.954056,-0.309597,-0.421612,-0.120995,0.755404,2.58912,-0.325791,-0.384897,-0.333808,1.4946,123645,7,-0.685619,-0.42316,-0.338378,0.357934,0.468565,1.970228,4.584023,-0.41758,-0.533801,-1.118975,-0.679263,-0.846355,-0.810813,-0.425963,0.769221,0.189353,3
3,-1.318804,1.635754,1.273089,-1.318561,1.153209,2.084535,-0.367274,0.109312,0.77004,-0.080159,-1.905894,1.674506,1.806807,-0.309597,-0.421612,-0.213313,0.156751,-0.429295,-0.325791,-0.384897,-0.333808,1.4946,123640,0,0.532801,-0.42316,-0.338378,0.357934,0.468565,1.970228,-0.216864,-0.41758,-0.533801,-1.118975,-0.679263,0.583121,-1.17778,-0.425963,-0.663829,1.165721,5
4,-1.318804,1.635754,1.273089,-1.318561,1.153209,2.084535,-0.367274,0.109312,0.77004,-0.080159,-1.905894,1.674506,1.806807,-0.309597,-0.421612,-0.213313,0.156751,-0.429295,-0.325791,-0.384897,-0.333808,1.4946,123641,4,0.532801,-0.42316,-0.338378,0.357934,0.468565,1.970228,-0.216864,-0.41758,-0.533801,-1.118975,-0.679263,-0.740468,0.657055,-0.425963,0.663069,1.165721,5


In [5]:
# Separate the label from the predictors: 'gravity' is the target and every
# remaining column is used as a feature.
# NOTE(review): identifier-like columns such as AccID and vehicleID stay in X;
# confirm they are intended to be model inputs.
y = data['gravity']
X = data.drop('gravity', axis=1)

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the input file name suggests classes were balanced before this
# split. If that was done by oversampling, copies of the same row could end up
# on both sides of the split - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [7]:
# Cast both label series to integers.
# NOTE(review): astype(int) on floats truncates toward zero. The head() preview
# appears to show 'gravity' as a scaled float (e.g. -1.118975), so this cast is
# presumably what yields the -1 / 0 class labels - confirm the encoding.
y_train, y_test = (labels.astype(int) for labels in (y_train, y_test))

In [8]:
# Remap the class labels to the contiguous range {0, 1} (-1 -> 0, 0 -> 1).
#
# Series.replace leaves any value that is missing from the mapping unchanged,
# so an unexpected label (for example 1) would pass through and silently merge
# with the remapped class 1. Validate the label set first so that situation
# fails loudly instead of corrupting the target.
#
# NOTE(review): scikit-learn classifiers encode arbitrary label values
# internally, so this remap may not be required for GradientBoostingClassifier;
# it is kept because the mapped names are used further down.
label_map = {-1: 0, 0: 1}
unexpected_labels = (set(y_train.unique()) | set(y_test.unique())) - set(label_map)
if unexpected_labels:
    raise ValueError(
        f"Unexpected target labels {sorted(unexpected_labels)}; "
        f"expected only {sorted(label_map)}"
    )

y_train_mapped = y_train.replace(label_map)
y_test_mapped = y_test.replace(label_map)

In [9]:
# --- Prepare inputs -------------------------------------------------------
# Feature frames as plain NumPy arrays.
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()

# Remapped labels as integer arrays.
y_train_np = y_train_mapped.to_numpy(dtype=int)
y_test_np = y_test_mapped.to_numpy(dtype=int)

# --- Fit --------------------------------------------------------------------
# Gradient Boosting with library defaults; the seed makes training repeatable.
# fit() returns the estimator itself, so construction and training are chained.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train_np, y_train_np)

# --- Predict ----------------------------------------------------------------
y_pred_mapped = gb_model.predict(X_test_np)

# Undo the 0/1 remap so predictions use the same -1/0 labels as y_test.
y_pred = pd.Series(y_pred_mapped).replace({0: -1, 1: 0})

# --- Evaluate ---------------------------------------------------------------
# Both metrics compare the original-label test targets with the predictions.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(conf_matrix, class_report, sep='\n')


[[50633  5642]
 [ 4805 51532]]
              precision    recall  f1-score   support

          -1       0.91      0.90      0.91     56275
           0       0.90      0.91      0.91     56337

    accuracy                           0.91    112612
   macro avg       0.91      0.91      0.91    112612
weighted avg       0.91      0.91      0.91    112612

