In [1]:
# Import the necessary packages
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Import the clean data
data = pd.read_pickle('source\data.pkl')

data.info()

# Copy of the original dataset for feature engineering and preprocessing
data_processed = data.copy()

# Drop unnecessary columns
data_processed = data_processed.drop(['AccID', 'birth_year', 'vehicleID', 'num_veh'], axis=1)

# Convert 'day', 'month', and 'time' to integers
data_processed['day'] = data_processed['day'].astype(int)
data_processed['month'] = data_processed['month'].astype(int)
data_processed['time'] = data_processed['time'].astype(int)

# Cyclical encoding for temporal features
data_processed['day_sin'] = np.sin(2 * np.pi * data_processed['day'] / 31)  
data_processed['day_cos'] = np.cos(2 * np.pi * data_processed['day'] / 31)

data_processed['month_sin'] = np.sin(2 * np.pi * data_processed['month'] / 12)
data_processed['month_cos'] = np.cos(2 * np.pi * data_processed['month'] / 12)

data_processed['time_sin'] = np.sin(2 * np.pi * data_processed['time'] / 86340000) 
data_processed['time_cos'] = np.cos(2 * np.pi * data_processed['time'] / 86340000)

data_processed.drop(columns=['day','month','time'],inplace=True)

# Selecting features and target variable
features_dummy = ['year', 'lum', 'atm_condition', 'collision_type',
       'route_category', 'traffic_regime', 'total_number_lanes',
       'reserved_lane_code', 'longitudinal_profile', 'plan',
       'surface_condition', 'infra', 'accident_situation',
       'traffic_direction', 'vehicle_category', 'fixed_obstacle',
       'mobile_obstacle', 'initial_impact_point', 'manv', 'motor', 'seat',
       'user_category', 'gender', 'reason_travel',
       'safety_equipment1']

# These features will be standardized
features_scaler = ['lat', 'long', 'upstream_terminal_number', 'distance_upstream_terminal', 'maximum_speed', 'age']

# These features are between -1 and 1 and do not need any standardazations. 
features_temporal = ['day_sin', 'day_cos', 'month_sin', 'month_cos', 'time_sin', 'time_cos']
target = 'gravity'

X = data_processed.drop(columns=[target])
y = data_processed[target]
y = y.astype(int)

X = pd.get_dummies(X, columns=features_dummy, drop_first=True)

# stratify will split the dataset according to the distribution of the classes to compensate for imbalanced datasets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardization: Fit only on the training data, then apply to both train and test
scaler = StandardScaler()
X_train[features_scaler] = scaler.fit_transform(X_train[features_scaler])
X_test[features_scaler] = scaler.transform(X_test[features_scaler])

# Check the dimensions
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447670 entries, 0 to 447669
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   AccID                       447670 non-null  object 
 1   day                         447670 non-null  object 
 2   month                       447670 non-null  object 
 3   year                        447670 non-null  object 
 4   time                        447670 non-null  object 
 5   lum                         447670 non-null  object 
 6   atm_condition               447670 non-null  object 
 7   collision_type              447670 non-null  object 
 8   lat                         447670 non-null  float64
 9   long                        447670 non-null  float64
 10  route_category              447670 non-null  object 
 11  traffic_regime              447670 non-null  object 
 12  total_number_lanes          447670 non-null  object 
 13  reserved_lane_

Apply ML v1 -------->

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize AdaBoost Classifier
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=42)

# Fit the model
ada_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = ada_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Generate classification report
class_report = classification_report(y_test, y_pred)

print(accuracy)
print(class_report)


0.6367413496548797
              precision    recall  f1-score   support

           1       0.69      0.80      0.74     37537
           2       0.29      0.03      0.06      2256
           3       0.50      0.36      0.42     13565
           4       0.62      0.61      0.61     36176

    accuracy                           0.64     89534
   macro avg       0.52      0.45      0.46     89534
weighted avg       0.62      0.64      0.62     89534



Apply ML v2 -------->

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Grid search to optimize for f1_weighted 
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'base_estimator__max_depth': [1, 2]  
}

# Initialize a base estimator for AdaBoost
base_estimator = DecisionTreeClassifier(max_depth=1, class_weight='balanced')

# Initialize the AdaBoost classifier with the base estimator
ada_clf = AdaBoostClassifier(base_estimator=base_estimator, random_state=42)

# Perform grid search with cross-validation, optimizing for weighted F1 score
grid_search = GridSearchCV(estimator=ada_clf, param_grid=param_grid, scoring='f1_weighted', cv=3, n_jobs=-1, verbose=1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params_f1 = grid_search.best_params_
best_score_f1 = grid_search.best_score_

print(f"Best parameters: {best_params_f1}")
print(f"Best F1 score: {best_score_f1}")


In [3]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Initialize the base estimator with max_depth=2
base_estimator = DecisionTreeClassifier(max_depth=2)

# Initialize AdaBoostClassifier with the best hyperparameters
ada_clf_best = AdaBoostClassifier(
    base_estimator=base_estimator, 
    n_estimators=200, 
    learning_rate=1, 
    random_state=42
)

# Train the classifier on the training data
ada_clf_best.fit(X_train, y_train)

# Predict on the test set
y_pred_best = ada_clf_best.predict(X_test)

# Alternatively, you can use cross-validation to get predictions
# y_pred_best = cross_val_predict(ada_clf_best, X_train, y_train, cv=5)

# Generate and display the classification report
class_report_best = classification_report(y_test, y_pred_best)
print("Classification Report for AdaBoost with best parameters:")
print(class_report_best)


Classification Report for AdaBoost with best parameters:
              precision    recall  f1-score   support

           1       0.71      0.82      0.76     37537
           2       0.30      0.12      0.17      2256
           3       0.51      0.39      0.44     13565
           4       0.64      0.62      0.63     36176

    accuracy                           0.66     89534
   macro avg       0.54      0.49      0.50     89534
weighted avg       0.64      0.66      0.65     89534



Apply ML OVER SAMPLING - v3 -------->

In [4]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to oversample the minority classes
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train the AdaBoostClassifier with the best hyperparameters on the resampled dataset
ada_clf_best = AdaBoostClassifier(n_estimators=200, learning_rate=1, random_state=42)
ada_clf_best.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_resampled = ada_clf_best.predict(X_test)

# Generate the classification report
class_report_resampled = classification_report(y_test, y_pred_resampled)
print(class_report_resampled)



found 0 physical cores < 1
  File "C:\Users\sd10725\AppData\Roaming\Python\Python311\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


              precision    recall  f1-score   support

           1       0.70      0.75      0.73     37537
           2       0.19      0.16      0.17      2256
           3       0.43      0.44      0.44     13565
           4       0.62      0.58      0.60     36176

    accuracy                           0.62     89534
   macro avg       0.49      0.48      0.48     89534
weighted avg       0.62      0.62      0.62     89534



Apply ML UNDER SAMPLING - v4 -------->

In [5]:
from imblearn.under_sampling import RandomUnderSampler

# Apply RandomUnderSampler to reduce the number of samples in the majority classes
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Train the AdaBoostClassifier with the best hyperparameters on the resampled dataset
ada_clf_best = AdaBoostClassifier(n_estimators=200, learning_rate=1, random_state=42)
ada_clf_best.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_resampled = ada_clf_best.predict(X_test)

# Generate the classification report
class_report_resampled = classification_report(y_test, y_pred_resampled)
print(class_report_resampled)


              precision    recall  f1-score   support

           1       0.71      0.75      0.73     37537
           2       0.12      0.61      0.20      2256
           3       0.38      0.40      0.39     13565
           4       0.67      0.44      0.53     36176

    accuracy                           0.57     89534
   macro avg       0.47      0.55      0.46     89534
weighted avg       0.63      0.57      0.59     89534



Apply ML CLASS WEIGHT - v5 -------->

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Define a base classifier with class weights
base_clf = DecisionTreeClassifier(class_weight='balanced', max_depth=2)

# Use this base classifier in AdaBoost
ada_clf_weighted = AdaBoostClassifier(base_estimator=base_clf, n_estimators=200, learning_rate=1, random_state=42)

# Train on the original dataset
ada_clf_weighted.fit(X_train, y_train)

# Predict on the test set
y_pred_weighted = ada_clf_weighted.predict(X_test)

# Generate the classification report
class_report_weighted = classification_report(y_test, y_pred_weighted)
print(class_report_weighted)


              precision    recall  f1-score   support

           1       0.73      0.79      0.76     37537
           2       0.13      0.62      0.22      2256
           3       0.38      0.42      0.40     13565
           4       0.71      0.46      0.55     36176

    accuracy                           0.59     89534
   macro avg       0.49      0.57      0.48     89534
weighted avg       0.65      0.59      0.61     89534



Apply ML BINARY - Fatal vs Non-fatal v6 -------->

In [7]:
# Grouping classes into two: Fatal (class 2) and Non-fatal (classes 1, 3, and 4)

# Create a new target variable with two classes
y_grouped = y.copy()
y_grouped = y_grouped.replace({1: 0, 3: 0, 4: 0, 2: 1})  # 0 for non-fatal, 1 for fatal

# Split the dataset again with the new target
X_train_grouped, X_test_grouped, y_train_grouped, y_test_grouped = train_test_split(X, y_grouped, test_size=0.2, stratify=y_grouped, random_state=42)

# Initialize AdaBoost classifier with the best hyperparameters from previous tests
ada_clf_grouped = AdaBoostClassifier(n_estimators=200, learning_rate=1, random_state=42)

# Train the classifier on the new grouped target
ada_clf_grouped.fit(X_train_grouped, y_train_grouped)

# Predict on the test set
y_pred_grouped = ada_clf_grouped.predict(X_test_grouped)

# Generate the classification report
from sklearn.metrics import classification_report
class_report_grouped = classification_report(y_test_grouped, y_pred_grouped)

print(class_report_grouped)


              precision    recall  f1-score   support

           0       0.98      1.00      0.99     87279
           1       0.38      0.04      0.08      2255

    accuracy                           0.97     89534
   macro avg       0.68      0.52      0.53     89534
weighted avg       0.96      0.97      0.96     89534



Apply ML BINARY - Fatal vs Non-fatal - UNDER SAMPLING v7 -------->

In [9]:
from imblearn.under_sampling import RandomUnderSampler

# Apply RandomUnderSampler to reduce the number of samples in the majority class
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_grouped, y_train_grouped)

# Train the classifier on the undersampled data
ada_clf_grouped.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_resampled = ada_clf_grouped.predict(X_test_grouped)

# Generate and print the classification report
print(classification_report(y_test_grouped, y_pred_resampled))


              precision    recall  f1-score   support

           0       0.99      0.80      0.89     87279
           1       0.09      0.80      0.17      2255

    accuracy                           0.80     89534
   macro avg       0.54      0.80      0.53     89534
weighted avg       0.97      0.80      0.87     89534



Apply ML BINARY - Fatal vs Non-fatal - OVER SAMPLING v8 -------->

In [10]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_grouped, y_train_grouped)

# Train the classifier on the resampled data
ada_clf_grouped.fit(X_train_resampled, y_train_resampled)

# Predict and generate classification report
y_pred_resampled = ada_clf_grouped.predict(X_test_grouped)
print(classification_report(y_test_grouped, y_pred_resampled))


              precision    recall  f1-score   support

           0       0.98      0.99      0.98     87279
           1       0.18      0.13      0.15      2255

    accuracy                           0.96     89534
   macro avg       0.58      0.56      0.57     89534
weighted avg       0.96      0.96      0.96     89534



Apply ML BINARY - Fatal vs Non-fatal - CLASS WEIGHT v9 -------->

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the base classifier with class weights
base_clf = DecisionTreeClassifier(class_weight={0: 1, 1: 10})

# Initialize AdaBoost with the base classifier
ada_clf_weighted = AdaBoostClassifier(base_estimator=base_clf, n_estimators=200, learning_rate=1, random_state=42)

# Train the classifier with class weights
ada_clf_weighted.fit(X_train_grouped, y_train_grouped)

# Predict and generate classification report
y_pred_weighted = ada_clf_weighted.predict(X_test_grouped)
print(classification_report(y_test_grouped, y_pred_weighted))


              precision    recall  f1-score   support

           0       0.98      0.98      0.98     87279
           1       0.16      0.16      0.16      2255

    accuracy                           0.96     89534
   macro avg       0.57      0.57      0.57     89534
weighted avg       0.96      0.96      0.96     89534



Apply ML BINARY - Slightly Injured, 1 for Severely Injured v10 -------->

In [12]:
# Grouping classes into two: Severely Injured (classes 2 and 3) and Slightly Injured (classes 1 and 4)

# Create a new target variable with two classes
y_grouped_2 = y.copy()
y_grouped_2 = y_grouped_2.replace({1: 0, 4: 0, 2: 1, 3: 1})  # 0 for Slightly Injured, 1 for Severely Injured

# Split the dataset again with the new target
X_train_grouped_2, X_test_grouped_2, y_train_grouped_2, y_test_grouped_2 = train_test_split(
    X, y_grouped_2, test_size=0.2, stratify=y_grouped_2, random_state=42
)

# Initialize AdaBoost classifier with the best hyperparameters from previous tests
ada_clf_grouped_2 = AdaBoostClassifier(n_estimators=200, learning_rate=1, random_state=42)

# Train the classifier on the new grouped target
ada_clf_grouped_2.fit(X_train_grouped_2, y_train_grouped_2)

# Predict on the test set
y_pred_grouped_2 = ada_clf_grouped_2.predict(X_test_grouped_2)

# Generate the classification report
from sklearn.metrics import classification_report
class_report_grouped_2 = classification_report(y_test_grouped_2, y_pred_grouped_2)

print(class_report_grouped_2)


              precision    recall  f1-score   support

           0       0.88      0.95      0.91     73714
           1       0.64      0.38      0.48     15820

    accuracy                           0.85     89534
   macro avg       0.76      0.67      0.70     89534
weighted avg       0.83      0.85      0.84     89534



In [13]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

# Group classes into Slightly Injured (0) and Severely Injured (1)
y_grouped = y.copy()
y_grouped = y_grouped.replace({1: 0, 4: 0, 2: 1, 3: 1})

# Split the dataset
X_train_grouped, X_test_grouped, y_train_grouped, y_test_grouped = train_test_split(
    X, y_grouped, test_size=0.2, stratify=y_grouped, random_state=42
)

# Apply RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_train_undersampled, y_train_undersampled = rus.fit_resample(X_train_grouped, y_train_grouped)

# Train the AdaBoost classifier
ada_clf = AdaBoostClassifier(n_estimators=200, learning_rate=1, random_state=42)
ada_clf.fit(X_train_undersampled, y_train_undersampled)

# Predict and generate classification report
y_pred_undersampled = ada_clf.predict(X_test_grouped)
class_report_undersampled = classification_report(y_test_grouped, y_pred_undersampled)
print("Undersampling Classification Report:\n", class_report_undersampled)


Undersampling Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.77      0.85     73714
           1       0.43      0.80      0.56     15820

    accuracy                           0.78     89534
   macro avg       0.69      0.78      0.70     89534
weighted avg       0.85      0.78      0.80     89534



In [14]:
from imblearn.over_sampling import SMOTE

# Group classes into Slightly Injured (0) and Severely Injured (1)
y_grouped = y.copy()
y_grouped = y_grouped.replace({1: 0, 4: 0, 2: 1, 3: 1})

# Split the dataset
X_train_grouped, X_test_grouped, y_train_grouped, y_test_grouped = train_test_split(
    X, y_grouped, test_size=0.2, stratify=y_grouped, random_state=42
)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train_grouped, y_train_grouped)

# Train the AdaBoost classifier
ada_clf.fit(X_train_oversampled, y_train_oversampled)

# Predict and generate classification report
y_pred_oversampled = ada_clf.predict(X_test_grouped)
class_report_oversampled = classification_report(y_test_grouped, y_pred_oversampled)
print("Oversampling with SMOTE Classification Report:\n", class_report_oversampled)


Oversampling with SMOTE Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.92      0.90     73714
           1       0.54      0.44      0.49     15820

    accuracy                           0.84     89534
   macro avg       0.71      0.68      0.69     89534
weighted avg       0.82      0.84      0.83     89534



In [15]:
from sklearn.tree import DecisionTreeClassifier

# Group classes into Slightly Injured (0) and Severely Injured (1)
y_grouped = y.copy()
y_grouped = y_grouped.replace({1: 0, 4: 0, 2: 1, 3: 1})

# Split the dataset
X_train_grouped, X_test_grouped, y_train_grouped, y_test_grouped = train_test_split(
    X, y_grouped, test_size=0.2, stratify=y_grouped, random_state=42
)

# Initialize the AdaBoost classifier with class weights
base_clf = DecisionTreeClassifier(class_weight={0: 1, 1: 10})  # Adjust class weights
ada_clf_weighted = AdaBoostClassifier(base_estimator=base_clf, n_estimators=200, learning_rate=1, random_state=42)

# Train the classifier
ada_clf_weighted.fit(X_train_grouped, y_train_grouped)

# Predict and generate classification report
y_pred_weighted = ada_clf_weighted.predict(X_test_grouped)
class_report_weighted = classification_report(y_test_grouped, y_pred_weighted)
print("Class Weights Classification Report:\n", class_report_weighted)


Class Weights Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88     73714
           1       0.44      0.46      0.45     15820

    accuracy                           0.80     89534
   macro avg       0.66      0.67      0.66     89534
weighted avg       0.80      0.80      0.80     89534

