In [1]:
# Import the necessary packages
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Silence FutureWarning messages raised by the libraries used in this notebook.
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the cleaned dataset.
# A forward slash is used as separator: the former 'source\data.pkl' relied on
# the invalid escape sequence "\d" and only resolved to the intended file on
# Windows. Forward slashes work on Windows, Linux and macOS alike.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
data = pd.read_pickle('source/data.pkl')

# Overview of columns, non-null counts and dtypes.
data.info()

# Build the working frame from the raw data without touching `data` itself;
# the columns listed here are dropped as unnecessary.
data_processed = data.drop(columns=['AccID', 'birth_year', 'vehicleID', 'num_veh'])

# Cyclical (sine/cosine) encoding of the temporal columns: each column is cast
# to int and scaled by the period given in this table.
# NOTE(review): 86340000 equals 23 h 59 min expressed in milliseconds --
# presumably the largest value 'time' can take; confirm the unit of that column.
temporal_periods = {'day': 31, 'month': 12, 'time': 86340000}

for column, period in temporal_periods.items():
    angle = 2 * np.pi * data_processed[column].astype(int) / period
    data_processed[f'{column}_sin'] = np.sin(angle)
    data_processed[f'{column}_cos'] = np.cos(angle)

# The raw temporal columns are replaced by their encoded counterparts.
data_processed = data_processed.drop(columns=list(temporal_periods))

# Column to predict.
target = 'gravity'

# Categorical columns that are one-hot encoded below.
features_dummy = [
    'year', 'lum', 'atm_condition', 'collision_type',
    'route_category', 'traffic_regime', 'total_number_lanes',
    'reserved_lane_code', 'longitudinal_profile', 'plan',
    'surface_condition', 'infra', 'accident_situation',
    'traffic_direction', 'vehicle_category', 'fixed_obstacle',
    'mobile_obstacle', 'initial_impact_point', 'manv', 'motor', 'seat',
    'user_category', 'gender', 'reason_travel',
    'safety_equipment1',
]

# Numeric columns that are standardized after the train/test split.
features_scaler = [
    'lat', 'long', 'upstream_terminal_number',
    'distance_upstream_terminal', 'maximum_speed', 'age',
]

# Sine/cosine encodings already lie between -1 and 1, so they need no standardization.
features_temporal = [
    'day_sin', 'day_cos', 'month_sin', 'month_cos', 'time_sin', 'time_cos',
]

# Target as integers; predictors are every other column, with the categorical
# ones expanded into dummy columns (first level of each dropped).
y = data_processed[target].astype(int)
X = pd.get_dummies(
    data_processed.drop(columns=[target]),
    columns=features_dummy,
    drop_first=True,
)

# Stratified 80/20 split: `stratify=y` keeps the class distribution of the
# target the same in the train and test parts, which matters for imbalanced data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# train_test_split hands back row selections of X. Take explicit copies so the
# column assignments below write into independent frames instead of triggering
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardization: fit on the training data only, then apply the same
# transformation to both parts so no test-set statistics leak into training.
scaler = StandardScaler()
X_train[features_scaler] = scaler.fit_transform(X_train[features_scaler])
X_test[features_scaler] = scaler.transform(X_test[features_scaler])

# Check the dimensions
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447670 entries, 0 to 447669
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   AccID                       447670 non-null  object 
 1   day                         447670 non-null  object 
 2   month                       447670 non-null  object 
 3   year                        447670 non-null  object 
 4   time                        447670 non-null  object 
 5   lum                         447670 non-null  object 
 6   atm_condition               447670 non-null  object 
 7   collision_type              447670 non-null  object 
 8   lat                         447670 non-null  float64
 9   long                        447670 non-null  float64
 10  route_category              447670 non-null  object 
 11  traffic_regime              447670 non-null  object 
 12  total_number_lanes          447670 non-null  object 
 13  reserved_lane_

Apply ML v1 -------->

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline: XGBoost classifier with default hyperparameters.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Shift the labels so that they start at 0.
# The offset is read from the data instead of hard-coding 1: after the first
# run the minimum is already 0, so re-running this cell no longer shifts the
# labels a second time (which used to produce the invalid label -1).
label_offset = y_train.min()
y_train = y_train - label_offset
y_test = y_test - label_offset

# Train on the shifted labels.
xgb_model.fit(X_train, y_train)

# Predict on the test data
y_pred = xgb_model.predict(X_test)

# Evaluate the model's performance. The accuracy is computed once; `accuracy`
# is kept as a second name for the same value.
accuracy_adjusted = accuracy_score(y_test, y_pred)
accuracy = accuracy_adjusted
classification_rep_adjusted = classification_report(y_test, y_pred)

print(accuracy_adjusted)
print(classification_rep_adjusted)

Apply ML v2 -------->

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the randomized hyperparameter search.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5],
}

# Initialize XGBoost classifier ('mlogloss' matches the other cells of this notebook).
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Randomized search: 10 sampled candidates, 3-fold cross-validation.
# The scorer is macro-averaged F1: plain 'f1' is the binary F1 score and cannot
# score a multiclass target, so every candidate would end up with a NaN score
# and the "best" parameters would be arbitrary.
# NOTE(review): assumes the target has more than two classes, as the
# 'mlogloss' metric used throughout this notebook suggests -- confirm.
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                   n_iter=10, scoring='f1_macro', cv=3, verbose=1, random_state=42, n_jobs=-1)


# Fit the model
random_search.fit(X_train, y_train)

# Best parameters from the tuning process
best_params = random_search.best_params_

# Use the best estimator (refit on the whole training set) to predict
best_model = random_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

# Calculate accuracy and classification report for the tuned model
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_classification_report = classification_report(y_test, y_pred_tuned)

print(best_params)
print(tuned_accuracy)
print(tuned_classification_report)


Apply ML v3--------> Best Parameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Single parameter combination: the values selected as best parameters.
param_grid = {
    'n_estimators': [100],
    'max_depth': [5],
    'learning_rate': [0.1],
    'subsample': [0.8],
    'colsample_bytree': [1.0],
    'min_child_weight': [5],
}

# Initialize XGBoost classifier ('mlogloss' matches the other cells of this notebook).
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# The grid holds exactly one candidate, so n_iter is 1 (asking for 10
# iterations only made scikit-learn warn and fall back to 1).
# The scorer is macro-averaged F1: plain 'f1' is the binary F1 score and cannot
# score a multiclass target.
# NOTE(review): assumes the target has more than two classes, as the
# 'mlogloss' metric used throughout this notebook suggests -- confirm.
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                   n_iter=1, scoring='f1_macro', cv=3, verbose=1, random_state=42, n_jobs=-1)


# Fit the model
random_search.fit(X_train, y_train)

# Best parameters from the tuning process
best_params = random_search.best_params_

# Use the best estimator (refit on the whole training set) to predict
best_model = random_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

# Calculate accuracy and classification report for the tuned model
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_classification_report = classification_report(y_test, y_pred_tuned)

print(best_params)
print(tuned_accuracy)
print(tuned_classification_report)

Apply ML v4--------> SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

# SMOTE for oversampling the minority class
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# 'balanced' class weights computed from the original (not resampled) labels.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# XGBoost's `scale_pos_weight` is a single float meant for binary problems and
# cannot take a per-class dict. The class weights are therefore expanded into
# one weight per resampled training row and passed to fit() as `sample_weight`.
# NOTE(review): the SMOTE output is already resampled, so these weights
# emphasise the minority classes a second time -- confirm this is intended.
sample_weight_res = np.array(
    [class_weight_dict[label] for label in np.asarray(y_train_res).tolist()],
    dtype=float,
)

# Initialize the XGBoost classifier
xgb_model_balanced = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Train the model on the resampled data, weighting each row by its class weight
xgb_model_balanced.fit(X_train_res, y_train_res, sample_weight=sample_weight_res)

# Predict on the test set
y_pred_balanced = xgb_model_balanced.predict(X_test)

# Generate a classification report
report_balanced = classification_report(y_test, y_pred_balanced)

report_balanced

In [None]:
# Print the report text so that its line breaks are rendered instead of shown as a raw string.
print(report_balanced)

Apply ML v5--------> 

In [3]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import numpy as np

# Define a function to train and evaluate the model
def train_and_evaluate(X_train_resampled, y_train_resampled, X_test, y_test, class_weight=None):
    """Fit an XGBoost classifier and return its classification report.

    Parameters
    ----------
    X_train_resampled, y_train_resampled
        Training features and labels (possibly already resampled).
    X_test, y_test
        Held-out features and labels used for the report.
    class_weight : dict or float, optional
        * dict mapping class label -> weight: expanded into one weight per
          training row and passed to ``fit`` as ``sample_weight``.
        * a single number: forwarded as ``scale_pos_weight`` (XGBoost's
          binary-classification weighting).
        * ``None`` / falsy: no weighting.

    Returns
    -------
    str
        Text produced by ``classification_report(y_test, y_pred)``.

    Raises
    ------
    KeyError
        If ``class_weight`` is a dict that lacks a label present in
        ``y_train_resampled``.
    """
    # Initialize the XGBoost classifier
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

    sample_weight = None
    if class_weight:
        if isinstance(class_weight, dict):
            # `scale_pos_weight` accepts one float only, so a per-class mapping
            # has to be applied through per-row sample weights instead.
            labels = np.asarray(y_train_resampled).tolist()
            sample_weight = np.array([class_weight[label] for label in labels], dtype=float)
        else:
            xgb_model.set_params(scale_pos_weight=class_weight)

    # Train the model
    xgb_model.fit(X_train_resampled, y_train_resampled, sample_weight=sample_weight)

    # Predict on the test set
    y_pred = xgb_model.predict(X_test)

    # Generate and return the classification report
    return classification_report(y_test, y_pred)

# Recreate the original stratified 80/20 split from the unmodified X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# train_test_split hands back row selections of X. Take explicit copies so the
# column assignments below write into independent frames instead of triggering
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardization: fit on the training data only, then apply to both parts.
scaler = StandardScaler()
X_train[features_scaler] = scaler.fit_transform(X_train[features_scaler])
X_test[features_scaler] = scaler.transform(X_test[features_scaler])

# Shifted copies of the labels (label - 1) under new names, so the original
# y_train / y_test stay untouched and this cell can be re-run safely.
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

--------> RandomUnderSampler

In [None]:
# Random undersampling of the training set only; the test set is left as is.
rus = RandomUnderSampler(random_state=42)
X_train_res_rus, y_train_res_rus = rus.fit_resample(X_train, y_train_adjusted)

# Train on the undersampled data and report on the untouched test set.
report_rus = train_and_evaluate(
    X_train_resampled=X_train_res_rus,
    y_train_resampled=y_train_res_rus,
    X_test=X_test,
    y_test=y_test_adjusted,
)
print("RandomUnderSampler Report:\n", report_rus)

--------> RandomOverSampler - BEST RESULT

In [None]:
# Random oversampling of the training set only; the test set is left as is.
ros = RandomOverSampler(random_state=42)
X_train_res_ros, y_train_res_ros = ros.fit_resample(X_train, y_train_adjusted)

# Train on the oversampled data and report on the untouched test set.
report_ros = train_and_evaluate(
    X_train_resampled=X_train_res_ros,
    y_train_resampled=y_train_res_ros,
    X_test=X_test,
    y_test=y_test_adjusted,
)
print("RandomOverSampler Report:\n", report_ros)



--------> Class Weight 

In [None]:
# 'balanced' weight per class, computed from the (non-resampled) training labels.
train_classes = np.unique(y_train_adjusted)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train_adjusted)
class_weight_dict = dict(zip(train_classes, class_weights))

# Train on the original training data, handing the label -> weight mapping to
# the helper.
# NOTE(review): XGBoost's `scale_pos_weight` takes a single float; verify that
# train_and_evaluate turns this mapping into something XGBoost accepts.
report_class_weight = train_and_evaluate(
    X_train,
    y_train_adjusted,
    X_test,
    y_test_adjusted,
    class_weight=class_weight_dict,
)
print("Class Weight Argument Report:\n", report_class_weight)


##### Apply ML v7--------> SMOTE and Undersampling Combined

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Resampling pipeline: SMOTE oversampling followed by random undersampling.
# NOTE(review): both samplers use their default sampling strategy; after SMOTE
# has levelled the classes the undersampler may have nothing left to remove --
# confirm whether explicit `sampling_strategy` values were intended.
sampling_steps = [
    ('smote', SMOTE(random_state=42)),
    ('undersample', RandomUnderSampler(random_state=42)),
]
sampling_pipeline = Pipeline(sampling_steps)

# Resample the training data only.
X_train_resampled, y_train_resampled = sampling_pipeline.fit_resample(X_train, y_train_adjusted)

# Fit an XGBoost classifier on the resampled data (fit returns the estimator itself).
xgb_model_combined = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Evaluate on the untouched test set.
y_pred_combined = xgb_model_combined.predict(X_test)
report_combined = classification_report(y_test_adjusted, y_pred_combined)
print("SMOTE + Undersampling Report:\n", report_combined)


SMOTE + Tomek Links

In [None]:
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Resample the training data with the combined SMOTE + Tomek links sampler.
smote_tomek = SMOTETomek(random_state=42)
X_train_resampled_tomek, y_train_resampled_tomek = smote_tomek.fit_resample(
    X_train, y_train_adjusted
)

# Fit an XGBoost classifier on the resampled data (fit returns the estimator itself).
xgb_model_tomek = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
).fit(X_train_resampled_tomek, y_train_resampled_tomek)

# Evaluate on the untouched test set.
y_pred_tomek = xgb_model_tomek.predict(X_test)
report_tomek = classification_report(y_test_adjusted, y_pred_tomek)
print("SMOTE + Tomek Links Report:\n", report_tomek)


SMOTE + ENN

In [None]:
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Resample the training data with the combined SMOTE + ENN sampler.
smote_enn = SMOTEENN(random_state=42)
X_train_resampled_enn, y_train_resampled_enn = smote_enn.fit_resample(
    X_train, y_train_adjusted
)

# Fit an XGBoost classifier on the resampled data (fit returns the estimator itself).
xgb_model_enn = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
).fit(X_train_resampled_enn, y_train_resampled_enn)

# Evaluate on the untouched test set.
y_pred_enn = xgb_model_enn.predict(X_test)
report_enn = classification_report(y_test_adjusted, y_pred_enn)
print("SMOTE + ENN Report:\n", report_enn)


SMOTE + ENN with Class Weights

In [None]:
from imblearn.combine import SMOTEENN
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import numpy as np

# Apply SMOTE + ENN for resampling
smote_enn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train_adjusted)

# Calculate class weights based on the original y_train distribution
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_adjusted), y=y_train_adjusted)
class_weight_dict = dict(zip(np.unique(y_train_adjusted), class_weights))

# XGBoost's `scale_pos_weight` is a single float meant for binary problems and
# cannot take a per-class dict. The class weights are therefore expanded into
# one weight per resampled training row and passed to fit() as `sample_weight`.
# NOTE(review): the data is already resampled, so these weights emphasise the
# minority classes a second time -- confirm this is intended.
sample_weight_resampled = np.array(
    [class_weight_dict[label] for label in np.asarray(y_train_resampled).tolist()],
    dtype=float,
)

# Initialize XGBoost classifier
xgb_model_combined_weighted = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Train the model on the resampled data, weighting each row by its class weight
xgb_model_combined_weighted.fit(X_train_resampled, y_train_resampled, sample_weight=sample_weight_resampled)

# Predict on the test set
y_pred_combined_weighted = xgb_model_combined_weighted.predict(X_test)

# Generate a classification report
report_combined_weighted = classification_report(y_test_adjusted, y_pred_combined_weighted)
print("SMOTE + ENN with Class Weights Report:\n", report_combined_weighted)


 LIME explainer

In [None]:
import lime
import lime.lime_tabular
import numpy as np

# Initialize the LIME explainer.
# The training frame mixes dummy columns with float columns; depending on the
# pandas version the dummies are booleans, in which case `.values` would yield
# an object array. Casting to float gives LIME a plain numeric matrix either way.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.to_numpy(dtype=float),
    mode='classification',
    training_labels=y_train_adjusted,
    feature_names=X_train.columns,
    class_names=np.unique(y_train_adjusted).astype(str),
    discretize_continuous=True
)

# Choose an instance to explain (e.g., the first instance in the test set)
instance_index = 0
instance = X_test.iloc[instance_index].to_numpy(dtype=float).reshape(1, -1)

# Generate LIME explanation (explain_instance is given the row as a 1-D vector)
exp = explainer.explain_instance(instance.flatten(), xgb_model_combined_weighted.predict_proba, num_features=10)

# Display the explanation
exp.show_in_notebook(show_table=True)


SHAP explainer

In [None]:
import shap

# Initialize the SHAP explainer (TreeExplainer is optimized for tree-based models like XGBoost)
explainer = shap.TreeExplainer(xgb_model_combined_weighted)

# Calculate SHAP values for the test set
shap_values = explainer.shap_values(X_test)

# Summary Plot - Shows feature importance across all predictions
shap.summary_plot(shap_values, X_test, plot_type="bar")

# Class whose contribution is shown in the force plot.
class_index = 1

# Depending on the SHAP version, multiclass output is either a list with one
# (n_samples, n_features) array per class or a single array of shape
# (n_samples, n_features, n_classes). Select the per-class matrix for both
# layouts instead of assuming the list form.
if isinstance(shap_values, list):
    class_shap_values = shap_values[class_index]
else:
    shap_array = np.asarray(shap_values)
    class_shap_values = shap_array[:, :, class_index] if shap_array.ndim == 3 else shap_array

# expected_value holds one base value per class for multiclass models and a
# single value otherwise.
expected_values = np.ravel(explainer.expected_value)
base_value = expected_values[class_index] if expected_values.size > 1 else expected_values[0]

# Force Plot - Explains a single prediction
shap.initjs()
shap.force_plot(base_value, class_shap_values[instance_index], X_test.iloc[instance_index])


In [None]:
import lime
import lime.lime_tabular

# Initialize the LIME explainer.
# The training frame mixes dummy columns with float columns; depending on the
# pandas version the dummies are booleans, in which case `.values` would yield
# an object array. Casting to float gives LIME a plain numeric matrix either way.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.to_numpy(dtype=float),
    mode='classification',
    training_labels=y_train_adjusted,
    feature_names=X_train.columns,
    class_names=np.unique(y_train_adjusted).astype(str),
    discretize_continuous=True
)

# Explain the same instance used in SHAP (`instance_index` comes from an earlier cell)
exp = lime_explainer.explain_instance(
    X_test.iloc[instance_index].to_numpy(dtype=float),
    xgb_model_combined_weighted.predict_proba,
    num_features=10
)

# Display LIME explanation
exp.show_in_notebook(show_table=True)
