In [1]:
# Import the necessary packages
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Silence every FutureWarning for the rest of the session (not just selected ones),
# so library deprecation notices do not clutter the cell outputs
warnings.simplefilter(action="ignore", category=FutureWarning)

In [3]:
# Load the accident dataset (described by the author as already cleaned).
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
data = pd.read_csv(
    filepath_or_buffer='source/data.csv',
    low_memory=False,
)

In [4]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched
# while columns are dropped, cast and added below
data_processed = data.copy(deep=True)

In [5]:
# Discard four columns that take no part in the modelling below.
# Gotcha: re-running this cell on the already-trimmed frame raises KeyError,
# because the labels no longer exist.
unused_columns = ['AccID', 'birth_year', 'vehicleID', 'num_veh']
data_processed = data_processed.drop(columns=unused_columns)

In [6]:
# Cast the four date/time fields to float64, one column at a time.
# The scaling step later selects columns by float64 dtype, so this cast is what
# makes these fields part of the standardised set.
for column_name in ('time', 'day', 'month', 'year'):
    data_processed[column_name] = data_processed[column_name].astype('float64')

In [7]:
# Model inputs (34 columns, order preserved) and the label column to predict
features = [
    'lum',
    'atm_condition',
    'collision_type',
    'route_category',
    'traffic_regime',
    'reserved_lane_code',
    'longitudinal_profile',
    'upstream_terminal_number',
    'plan',
    'surface_condition',
    'infra',
    'accident_situation',
    'traffic_direction',
    'vehicle_category',
    'fixed_obstacle',
    'mobile_obstacle',
    'initial_impact_point',
    'manv',
    'motor',
    'seat',
    'user_category',
    'gender',
    'reason_travel',
    'safety_equipment1',
    'maximum_speed',
    'age',
    'lat',
    'long',
    'distance_upstream_terminal',
    'total_number_lanes',
    'day',
    'time',
    'month',
    'year',
]
target = 'gravity'

In [8]:
# Build the model inputs (X) and the label (y).
# With no explicit `columns=` argument, get_dummies only expands object/string/category
# columns; numeric columns pass through unchanged.
# NOTE(review): the recorded shapes below show 34 columns, the same as len(features),
# which suggests nothing was one-hot encoded and the coded categorical fields reach the
# model as plain numbers. Confirm this is intended.
selected_inputs = data_processed[features]
X = pd.get_dummies(selected_inputs, drop_first=True)
y = data_processed[target]

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Standardise the float64 columns. The scaler learns its statistics from the
# training rows only and the same statistics are then applied to the test rows,
# so nothing about the test set leaks into the preprocessing.
numerical_columns = X.select_dtypes(include=['float64']).columns
scaler = StandardScaler()
scaler.fit(X_train[numerical_columns])

X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [11]:
# Sanity check: row/column counts of both splits, one per line
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    sep="\n",
)

Shape of X_train: (358136, 34)
Shape of X_test: (89534, 34)


Apply ML model v1 (baseline Gradient Boosting classifier with default hyperparameters) ---->

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline: default hyperparameters, only the seed is fixed for reproducibility.
# fit() returns the estimator itself, so construction and training are chained.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred = gb_model.predict(X_test)

# Evaluate in a fixed order: overall accuracy, per-class text report, confusion matrix
accuracy, class_report, conf_matrix = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

# Display all three results as the cell output
accuracy, class_report, conf_matrix


(0.6662385239127036,
 '              precision    recall  f1-score   support\n\n           1       0.71      0.84      0.77     37371\n           2       0.46      0.05      0.09      2335\n           3       0.53      0.35      0.42     13737\n           4       0.65      0.65      0.65     36091\n\n    accuracy                           0.67     89534\n   macro avg       0.59      0.47      0.48     89534\nweighted avg       0.65      0.67      0.65     89534\n',
 array([[31380,     8,   735,  5248],
        [  384,   116,  1090,   745],
        [ 2140,   102,  4802,  6693],
        [10286,    27,  2425, 23353]], dtype=int64))

In [13]:
# Print the baseline metrics one after another so the report renders with real line breaks
for baseline_result in (accuracy, class_report, conf_matrix):
    print(baseline_result)

0.6662385239127036
              precision    recall  f1-score   support

           1       0.71      0.84      0.77     37371
           2       0.46      0.05      0.09      2335
           3       0.53      0.35      0.42     13737
           4       0.65      0.65      0.65     36091

    accuracy                           0.67     89534
   macro avg       0.59      0.47      0.48     89534
weighted avg       0.65      0.67      0.65     89534

[[31380     8   735  5248]
 [  384   116  1090   745]
 [ 2140   102  4802  6693]
 [10286    27  2425 23353]]


In [15]:
# Rank the baseline model's inputs by their importance score and show the ten largest
feature_importances = pd.Series(
    data=gb_model.feature_importances_,
    index=X_train.columns,
)
feature_importances_sorted = feature_importances.sort_values(ascending=False)

feature_importances_sorted.iloc[:10]


safety_equipment1    0.301052
mobile_obstacle      0.123860
vehicle_category     0.105107
fixed_obstacle       0.072014
seat                 0.066672
user_category        0.065032
lat                  0.039827
maximum_speed        0.037239
collision_type       0.034812
gender               0.023456
dtype: float64

Apply ML model v2 (hyperparameter search for Gradient Boosting with GridSearchCV) ---->

In [16]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter: 3 x 3 x 3 x 3 = 81 combinations
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search scored by accuracy with 3-fold cross-validation (81 x 3 = 243 fits),
# spread over all available cores. This is by far the most expensive cell in the notebook.
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

# Only the training split is searched; the test split stays unseen
grid_search.fit(X_train, y_train)

# Winning combination and its mean cross-validated accuracy
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)


Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 300}
Best Score: 0.6847845574305756


Apply ML model v3 (Gradient Boosting retrained with the best parameters from the grid search) ---->

In [17]:
# Tuned model: hyperparameters are written out by hand and match the best
# combination printed in the recorded grid-search output; the seed is the same
# as for the baseline.
tuned_params = dict(
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=2,
    n_estimators=300,
    random_state=42,
)
gb_model_tuned = GradientBoostingClassifier(**tuned_params)

# Fit on the training split, then predict the held-out rows
gb_model_tuned.fit(X_train, y_train)
y_pred_tuned = gb_model_tuned.predict(X_test)

# Same three evaluations as for the baseline, so the runs are directly comparable
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
class_report_tuned = classification_report(y_test, y_pred_tuned)
conf_matrix_tuned = confusion_matrix(y_test, y_pred_tuned)

# Display all three results as the cell output
accuracy_tuned, class_report_tuned, conf_matrix_tuned

(0.6862867737395849,
 '              precision    recall  f1-score   support\n\n           1       0.74      0.83      0.78     37371\n           2       0.41      0.08      0.14      2335\n           3       0.53      0.45      0.49     13737\n           4       0.67      0.67      0.67     36091\n\n    accuracy                           0.69     89534\n   macro avg       0.59      0.51      0.52     89534\nweighted avg       0.67      0.69      0.68     89534\n',
 array([[30980,    21,   904,  5466],
        [  281,   197,  1270,   587],
        [ 1591,   197,  6166,  5783],
        [ 8733,    61,  3194, 24103]], dtype=int64))

In [18]:
# Print the tuned model's classification report so the table renders with real
# line breaks (the tuple display of the previous cell shows it as one escaped string)
print(class_report_tuned)

              precision    recall  f1-score   support

           1       0.74      0.83      0.78     37371
           2       0.41      0.08      0.14      2335
           3       0.53      0.45      0.49     13737
           4       0.67      0.67      0.67     36091

    accuracy                           0.69     89534
   macro avg       0.59      0.51      0.52     89534
weighted avg       0.67      0.69      0.68     89534



Apply ML model v4 (binary classification: fatal vs. non-fatal accidents) ---->

In [19]:
# Binary reformulation: classify each record as 'Fatal' (1) or 'Non-Fatal' (0) based on 'gravity'.
# 'gravity' = 2 represents fatal accidents (author's encoding note).
#
# NOTE(review): in the recorded run only 2335 of 89534 test rows are fatal and the
# fatal-class recall is 0.07, so the headline accuracy mostly reflects the majority
# class. Consider `stratify=y_binary` and class weighting, and judge this model by the
# per-class recall/F1 rather than by accuracy.

# Create a binary target variable with a vectorised comparison (no per-row Python lambda)
data_processed['fatal_accident'] = data_processed['gravity'].eq(2).astype('int64')

# Update target variable
target_binary = 'fatal_accident'

# Feature matrix is built from `features` only; neither 'gravity' nor the new
# 'fatal_accident' column is in that list, so the label cannot leak into the inputs
X_binary = pd.get_dummies(data_processed[features], drop_first=True)
y_binary = data_processed[target_binary]

# Same 80/20 split settings and seed as the multi-class experiment
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    X_binary, y_binary, test_size=0.2, random_state=42
)

# Standardization: this experiment owns its scaler and its column list, so it neither
# re-fits the scaler used by the multi-class models nor depends on a column list that
# was derived from a different frame. Fit on the training rows only.
scaler_binary = StandardScaler()
numerical_columns_binary = X_binary.select_dtypes(include=['float64']).columns

X_train_binary[numerical_columns_binary] = scaler_binary.fit_transform(
    X_train_binary[numerical_columns_binary]
)
X_test_binary[numerical_columns_binary] = scaler_binary.transform(
    X_test_binary[numerical_columns_binary]
)

# Apply Gradient Boosting with the same hyperparameters as the tuned multi-class model
gb_model_binary = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=2,
    n_estimators=300,
    random_state=42
)

# Train the model on the binary training data
gb_model_binary.fit(X_train_binary, y_train_binary)

# Predict on the binary test set
y_pred_binary = gb_model_binary.predict(X_test_binary)

# Calculate accuracy for binary classification
accuracy_binary = accuracy_score(y_test_binary, y_pred_binary)

# Generate classification report for binary classification
class_report_binary = classification_report(y_test_binary, y_pred_binary)

# Generate confusion matrix for binary classification
conf_matrix_binary = confusion_matrix(y_test_binary, y_pred_binary)

# Output the binary classification results
accuracy_binary, class_report_binary, conf_matrix_binary


(0.973484933098041,
 '              precision    recall  f1-score   support\n\n           0       0.98      1.00      0.99     87199\n           1       0.44      0.07      0.11      2335\n\n    accuracy                           0.97     89534\n   macro avg       0.71      0.53      0.55     89534\nweighted avg       0.96      0.97      0.96     89534\n',
 array([[87008,   191],
        [ 2183,   152]], dtype=int64))

In [20]:
# Print the binary model's classification report so the table renders with real
# line breaks (the tuple display of the previous cell shows it as one escaped string)
print(class_report_binary)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     87199
           1       0.44      0.07      0.11      2335

    accuracy                           0.97     89534
   macro avg       0.71      0.53      0.55     89534
weighted avg       0.96      0.97      0.96     89534

