In [1]:
# Import the necessary packages
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Import the clean data.
# A forward slash is used as the separator: the previous literal 'source\data.pkl'
# contained the invalid escape sequence '\d' and only resolved as a path on Windows.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
data = pd.read_pickle('source/data.pkl')

data.info()

# Build the working frame for feature engineering; `data` itself is left untouched
# because DataFrame.drop returns a new frame.
data_processed = data.drop(columns=['AccID', 'birth_year', 'vehicleID', 'num_veh'])

# Convert 'day', 'month', and 'time' to integers
data_processed = data_processed.astype({'day': int, 'month': int, 'time': int})

# Cyclical encoding for temporal features: each column is mapped onto the unit circle
# (sin/cos of 2*pi*value/period) so that the end of a cycle is close to its start.
# NOTE(review): 86340000 equals 23 h 59 min expressed in milliseconds, which suggests
# 'time' is stored as milliseconds since midnight - confirm. If so, a full-day period
# would be 86400000; with the current value 00:00 and 23:59 map to the same point.
cycle_periods = {'day': 31, 'month': 12, 'time': 86340000}
for column, period in cycle_periods.items():
    angle = 2 * np.pi * data_processed[column] / period
    data_processed[f'{column}_sin'] = np.sin(angle)
    data_processed[f'{column}_cos'] = np.cos(angle)

# The raw temporal columns are replaced by their sin/cos encodings
data_processed = data_processed.drop(columns=list(cycle_periods))

# Selecting features and target variable
# These categorical features will be one-hot encoded
features_dummy = ['year', 'lum', 'atm_condition', 'collision_type',
       'route_category', 'traffic_regime', 'total_number_lanes',
       'reserved_lane_code', 'longitudinal_profile', 'plan',
       'surface_condition', 'infra', 'accident_situation',
       'traffic_direction', 'vehicle_category', 'fixed_obstacle',
       'mobile_obstacle', 'initial_impact_point', 'manv', 'motor', 'seat',
       'user_category', 'gender', 'reason_travel',
       'safety_equipment1']

# These features will be standardized
features_scaler = ['lat', 'long', 'upstream_terminal_number', 'distance_upstream_terminal', 'maximum_speed', 'age']

# These features are between -1 and 1 and do not need any standardization.
features_temporal = ['day_sin', 'day_cos', 'month_sin', 'month_cos', 'time_sin', 'time_cos']
target = 'gravity'

X = data_processed.drop(columns=[target])
y = data_processed[target].astype(int)

# One-hot encode the categorical features; drop_first removes one level per feature
X = pd.get_dummies(X, columns=features_dummy, drop_first=True)

# stratify will split the dataset according to the distribution of the classes to compensate for imbalanced datasets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Take explicit copies before assigning the scaled columns, so the writes below never
# target a slice of X (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Standardization: Fit only on the training data, then apply to both train and test
scaler = StandardScaler()
X_train[features_scaler] = scaler.fit_transform(X_train[features_scaler])
X_test[features_scaler] = scaler.transform(X_test[features_scaler])

# Check the dimensions
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447670 entries, 0 to 447669
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   AccID                       447670 non-null  object 
 1   day                         447670 non-null  object 
 2   month                       447670 non-null  object 
 3   year                        447670 non-null  object 
 4   time                        447670 non-null  object 
 5   lum                         447670 non-null  object 
 6   atm_condition               447670 non-null  object 
 7   collision_type              447670 non-null  object 
 8   lat                         447670 non-null  float64
 9   long                        447670 non-null  float64
 10  route_category              447670 non-null  object 
 11  traffic_regime              447670 non-null  object 
 12  total_number_lanes          447670 non-null  object 
 13  reserved_lane_

Apply ML v1 -------->

In [2]:
# Baseline: binary XGBoost classifier (Fatal vs. Non-Fatal) on the preprocessed data.
# Assumes the initial preprocessing cell was already executed (X_train, X_test, y).

# Import XGBoost and metrics packages for classification
import xgboost as xgb
from sklearn.metrics import classification_report

# Modify the target variable 'gravity' to have two classes: 1 (Fatal) and 0 (Non-Fatal)
# 2 -> Fatal, every other code -> Non-Fatal
#
# The binary labels are re-derived from the untouched multi-class `y` using the index
# of each split, instead of remapping y_train / y_test in place. The previous
# `replace({2: 1, 1: 0, 3: 0, 4: 0})` was not idempotent: running the cell a second
# time mapped the already-binarized Fatal label 1 to 0, leaving no positive class.
# Relies on `y` having a unique index (the loaded frame reports a RangeIndex).
fatal_code = 2
y_train = (y.loc[X_train.index] == fatal_code).astype(int)
y_test = (y.loc[X_test.index] == fatal_code).astype(int)

# Initialize and train the XGBoost classifier
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Generate the classification report (label 0 -> "Non-Fatal", label 1 -> "Fatal")
classification_report_output = classification_report(y_test, y_pred, target_names=["Non-Fatal", "Fatal"])

print(classification_report_output)




              precision    recall  f1-score   support

   Non-Fatal       0.98      1.00      0.99     87278
       Fatal       0.49      0.06      0.10      2256

    accuracy                           0.97     89534
   macro avg       0.73      0.53      0.55     89534
weighted avg       0.96      0.97      0.96     89534



Apply ML v2 -------->

In [3]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter tuning: randomized search over XGBoost settings with 3-fold
# cross-validation on the training set, followed by a test-set evaluation of the
# best configuration. Requires `xgb`, `classification_report` and the train/test
# splits from the earlier cells.

# Candidate values for each XGBoost hyperparameter
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[3, 4, 5, 6, 7],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.3, 0.4, 0.5],
)

# Base estimator that the search clones for every candidate
xgb_model = xgb.XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

# Search configuration: 50 sampled candidates, 3 folds each, all CPU cores.
# NOTE(review): 'f1_weighted' weights each class' F1 by its support; the test report
# printed by this cell shows the Non-Fatal class vastly outnumbering the Fatal one, so
# the score is dominated by Non-Fatal. Consider a metric focused on the Fatal class.
search_options = {
    'n_iter': 50,
    'scoring': 'f1_weighted',
    'cv': 3,
    'verbose': 2,
    'random_state': 42,
    'n_jobs': -1,
}
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    **search_options,
)

# Run the cross-validated search on the training data only
random_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)

# Score the refitted best estimator on the held-out test set
best_xgb_model = random_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)
tuned_report = classification_report(y_test, y_pred, target_names=["Non-Fatal", "Fatal"])
print(tuned_report)




Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found:  {'subsample': 0.7, 'n_estimators': 400, 'max_depth': 4, 'learning_rate': 0.3, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best score found:  0.9651621121453178
              precision    recall  f1-score   support

   Non-Fatal       0.98      1.00      0.99     87278
       Fatal       0.46      0.07      0.12      2256

    accuracy                           0.97     89534
   macro avg       0.72      0.53      0.56     89534
weighted avg       0.96      0.97      0.97     89534



Apply ML v3-------->

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Cross-validate and evaluate one fixed XGBoost configuration (the values reported as
# best by the previous randomized search). Requires `np`, `xgb`,
# `classification_report` and the train/test splits from the earlier cells.

# Define the parameter grid for XGBoost (a single candidate)
param_grid = {
    'n_estimators': [400],
    'max_depth': [4],
    'learning_rate': [0.3],
    'subsample': [0.7],
    'colsample_bytree': [0.8],
    'gamma': [0.1]
}

# Size of the search space. n_iter is capped at this value: the cell previously asked
# for 50 iterations over a one-candidate space, which scikit-learn can only satisfy by
# reducing the run to the available candidates (the log shows "1 candidates").
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=min(50, n_candidates),
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Perform the search on the training set
random_search.fit(X_train, y_train)

# Output the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)

# Evaluate the tuned model on the test set
best_xgb_model = random_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["Non-Fatal", "Fatal"]))



Fitting 3 folds for each of 1 candidates, totalling 3 fits


Apply ML v4--------> SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import xgboost as xgb

# Oversample the training set with SMOTE, train XGBoost on the resampled data and
# evaluate on the original (non-resampled) test set.
# Requires X_train, y_train, X_test and y_test from the earlier cells.

# Seeded SMOTE instance so the synthetic samples are reproducible
smote = SMOTE(random_state=42)

# Resample the training data only; the test set is left as-is.
# NOTE(review): X_train presumably still contains the one-hot columns produced by
# pd.get_dummies in the preprocessing cell; plain SMOTE interpolates between samples,
# which may yield fractional values in those columns - confirm this is acceptable.
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Hyperparameters taken from the earlier tuning run
smote_xgb_params = {
    'n_estimators': 400,
    'max_depth': 4,
    'learning_rate': 0.3,
    'subsample': 0.7,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}

# Fit the classifier on the resampled training data
xgb_model_smote = xgb.XGBClassifier(**smote_xgb_params)
xgb_model_smote.fit(X_train_smote, y_train_smote)

# Predict on the untouched test set
y_pred_smote = xgb_model_smote.predict(X_test)

# Build and print the per-class report
classification_report_output_smote = classification_report(
    y_test,
    y_pred_smote,
    target_names=["Non-Fatal", "Fatal"],
)
print(classification_report_output_smote)


Apply ML v5-------->

Apply ML v6-------->

Apply ML v7-------->

Apply ML v8-------->

Apply ML v10-------->

Apply ML v11-------->