In [None]:
# Import the necessary packages
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Import the clean data.
# A forward slash resolves on every platform; a backslash inside a normal string
# literal ('\d') is an invalid escape sequence and only acts as a separator on Windows.
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
data = pd.read_pickle('source/data.pkl')

data.info()

# Copy of the original dataset for feature engineering and preprocessing,
# so `data` stays untouched and this cell can be re-run safely.
data_processed = data.copy()

# Drop unnecessary columns
data_processed = data_processed.drop(columns=['AccID', 'birth_year', 'vehicleID', 'num_veh'])

# Convert 'day', 'month', and 'time' to integers
data_processed = data_processed.astype({'day': int, 'month': int, 'time': int})

# Cyclical encoding for temporal features: each value is mapped onto the unit
# circle so that the end of a period sits next to its beginning.
# NOTE(review): 86340000 equals 23h59min in milliseconds; if 'time' is really
# milliseconds since midnight the period should presumably be 86400000 (24h) --
# confirm the unit of 'time' before changing it.
for column, period in (('day', 31), ('month', 12), ('time', 86340000)):
    angle = 2 * np.pi * data_processed[column] / period
    data_processed[f'{column}_sin'] = np.sin(angle)
    data_processed[f'{column}_cos'] = np.cos(angle)

# The raw temporal columns are fully replaced by their sin/cos pairs.
data_processed = data_processed.drop(columns=['day', 'month', 'time'])

# Selecting features and target variable
# These categorical features will be one-hot encoded.
features_dummy = ['year', 'lum', 'atm_condition', 'collision_type',
       'route_category', 'traffic_regime', 'total_number_lanes',
       'reserved_lane_code', 'longitudinal_profile', 'plan',
       'surface_condition', 'infra', 'accident_situation',
       'traffic_direction', 'vehicle_category', 'fixed_obstacle',
       'mobile_obstacle', 'initial_impact_point', 'manv', 'motor', 'seat',
       'user_category', 'gender', 'reason_travel',
       'safety_equipment1']

# These features will be standardized
features_scaler = ['lat', 'long', 'upstream_terminal_number', 'distance_upstream_terminal', 'maximum_speed', 'age']

# These features are between -1 and 1 and do not need any standardization.
features_temporal = ['day_sin', 'day_cos', 'month_sin', 'month_cos', 'time_sin', 'time_cos']
target = 'gravity'

X = data_processed.drop(columns=[target])
y = data_processed[target].astype(int)

X = pd.get_dummies(X, columns=features_dummy, drop_first=True)

# Cast the columns to be standardized to float up front: writing scaled
# (float) values back into integer columns is deprecated in pandas.
X = X.astype({column: float for column in features_scaler})

# stratify will split the dataset according to the distribution of the classes to compensate for imbalanced datasets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Take explicit copies so the column assignments below modify independent
# frames rather than slices of X (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Standardization: Fit only on the training data, then apply to both train and test
scaler = StandardScaler()
X_train[features_scaler] = scaler.fit_transform(X_train[features_scaler])
X_test[features_scaler] = scaler.transform(X_test[features_scaler])

# Check the dimensions
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

Apply ML v1 -------->

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize XGBoost classifier with default parameters
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# XGBoost expects class labels 0..n_classes-1. Shift by the smallest training
# label instead of a hard-coded 1: on a re-run the offset is 0, so executing
# this cell more than once no longer pushes the labels below zero.
label_offset = int(y_train.min())
y_train = y_train - label_offset
y_test = y_test - label_offset

# Train the XGBoost model on the zero-based labels
xgb_model.fit(X_train, y_train)

# Predict on the test data
y_pred = xgb_model.predict(X_test)

# Evaluate the model's performance. The accuracy is computed once;
# `accuracy` is kept as an alias of `accuracy_adjusted`.
accuracy_adjusted = accuracy_score(y_test, y_pred)
accuracy = accuracy_adjusted
classification_rep_adjusted = classification_report(y_test, y_pred)

In [None]:
# Show the baseline accuracy followed by the per-class report, one per line.
print(accuracy_adjusted, classification_rep_adjusted, sep="\n")


Apply ML v2 -------->

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5],
}

# Initialize XGBoost classifier.
# 'mlogloss' is the multiclass log-loss (the target has more than two classes),
# and the seed makes the subsample/colsample draws reproducible.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Use RandomizedSearchCV for hyperparameter tuning.
# scoring='f1' is the binary F1 and raises on a multiclass target (every CV
# score becomes NaN, so the "best" parameters are arbitrary). 'f1_macro'
# averages the per-class F1 and gives minority classes equal weight.
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                   n_iter=10, scoring='f1_macro', cv=3, verbose=1,
                                   random_state=42, n_jobs=-1)

# Fit the model
random_search.fit(X_train, y_train)

# Best parameters from the tuning process
best_params = random_search.best_params_

# Use the best estimator (refit on the whole training set) to predict
best_model = random_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

# Calculate accuracy and classification report for the tuned model
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_classification_report = classification_report(y_test, y_pred_tuned)

print(best_params)
print(tuned_accuracy)
print(tuned_classification_report)


Apply ML v3-------->

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

# SMOTE for oversampling the minority classes (training data only)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Calculate 'balanced' class weights from the ORIGINAL class distribution
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# `scale_pos_weight` is a single float that only applies to binary objectives,
# so it cannot take a per-class dict. For a multiclass model the per-class
# weights are passed as one weight per training row instead.
# NOTE: SMOTE already equalises the class counts, so these weights add extra
# emphasis on the originally rare classes on top of the oversampling.
sample_weight_res = pd.Series(y_train_res).map(class_weight_dict).to_numpy()

# Initialize the XGBoost classifier
xgb_model_balanced = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Train the model on the resampled data with the per-row class weights
xgb_model_balanced.fit(X_train_res, y_train_res, sample_weight=sample_weight_res)

# Predict on the (untouched) test set
y_pred_balanced = xgb_model_balanced.predict(X_test)

# Generate a classification report
report_balanced = classification_report(y_test, y_pred_balanced)


In [None]:
# Print the classification report stored in `report_balanced`; print() renders
# the embedded newlines, unlike the bare string repr.
print(report_balanced)

Apply ML v4-------->

In [None]:
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# ---------------------------------------------------------------------------
# Part 1 - Recursive Feature Elimination (RFE) driven by XGBoost
# ---------------------------------------------------------------------------

# Base estimator whose feature importances steer the elimination
xgb_model_rfe = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)

# Keep the 20 highest-ranked features (tune this number as needed)
rfe = RFE(estimator=xgb_model_rfe, n_features_to_select=20)

# Learn the feature mask on the training split, then apply it to both splits
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Fit XGBoost on the reduced feature set and predict the test split
y_pred_rfe = xgb_model_rfe.fit(X_train_rfe, y_train).predict(X_test_rfe)

# Per-class metrics for the RFE + XGBoost model
report_rfe = classification_report(y_test, y_pred_rfe)


# ---------------------------------------------------------------------------
# Part 2 - L1-regularised (Lasso-style) Logistic Regression
# ---------------------------------------------------------------------------

# liblinear is used as the solver for the L1 penalty
log_reg_l1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=42,
)

# Train on the RFE-selected features and predict the test split
y_pred_l1 = log_reg_l1.fit(X_train_rfe, y_train).predict(X_test_rfe)

# Per-class metrics for the L1 logistic model
report_l1 = classification_report(y_test, y_pred_l1)

# Show both reports, XGBoost first
print(report_rfe, report_l1, sep="\n")


Apply ML v5-------->

In [None]:
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# XGBoost estimator that ranks the features during elimination
xgb_model_rfe = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)

# Recursive feature elimination down to the 20 best-ranked features
rfe = RFE(estimator=xgb_model_rfe, n_features_to_select=20)

# Fit the selector on the training split only, then reduce both splits
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Retrain XGBoost on the reduced training matrix
xgb_model_rfe.fit(X_train_rfe, y_train)

# Score the held-out split
y_pred_rfe = xgb_model_rfe.predict(X_test_rfe)

# Report header and per-class metrics
print(
    "RFE with XGBoost Classification Report:",
    classification_report(y_test, y_pred_rfe),
    sep="\n",
)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression with an L1 penalty, solved with liblinear
log_reg_l1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=42,
)

# Train on the RFE-reduced training matrix and predict the reduced test matrix
y_pred_l1 = log_reg_l1.fit(X_train_rfe, y_train).predict(X_test_rfe)

# Report header and per-class metrics for the L1 model
print(
    "L1 Regularization with Logistic Regression Classification Report:",
    classification_report(y_test, y_pred_l1),
    sep="\n",
)


Apply ML v6-------->

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

# Define the parameter grid for Randomized Search
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [50, 100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Make sure the labels start at 0, as XGBoost requires. Subtracting a fixed 1
# here would shift labels that an earlier cell already made zero-based down to
# -1; shifting by the current minimum is a no-op when they already start at 0.
label_offset = int(y_train.min())
y_train = y_train - label_offset
y_test = y_test - label_offset

# Initialize RandomizedSearchCV.
# scoring='f1' is the binary F1 and fails on a multiclass target (all CV
# scores become NaN); 'f1_macro' averages the per-class F1 scores.
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                   n_iter=20, scoring='f1_macro', cv=3, verbose=1,
                                   random_state=42, n_jobs=-1)

# Fit Randomized Search
random_search.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the model with the best found hyperparameters on the test set
best_xgb_model = random_search.best_estimator_
y_pred_best = best_xgb_model.predict(X_test)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_best))


In [None]:
# Apply ML v7-------->

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define a smaller range of hyperparameters for further fine-tuning
param_grid_finetune = {
    'learning_rate': [0.05, 0.07, 0.1],
    'n_estimators': [200, 300, 400],
    'max_depth': [8, 10, 12],
    'subsample': [1.0],
    'colsample_bytree': [1.0]
}

# Initialize the XGBoost classifier
xgb_model_finetune = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Initialize GridSearchCV for fine-tuning.
# The target is multiclass, so the binary 'f1' scorer would fail on every fold
# and yield NaN scores; 'f1_macro' averages the per-class F1 instead.
grid_search_finetune = GridSearchCV(estimator=xgb_model_finetune, param_grid=param_grid_finetune,
                                    scoring='f1_macro', cv=3, verbose=1, n_jobs=-1)

# Fit the fine-tuning search
grid_search_finetune.fit(X_train, y_train)

# Best hyperparameters from fine-tuning
print("Best Fine-Tuned Hyperparameters:", grid_search_finetune.best_params_)

# Predict with the best model (refit on the whole training set)
best_xgb_model_finetune = grid_search_finetune.best_estimator_
y_pred_finetune = best_xgb_model_finetune.predict(X_test)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_finetune))


Apply ML v8-------->

In [None]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Compute 'balanced' class weights from the original (pre-SMOTE) distribution
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# `scale_pos_weight` only applies to binary objectives, so on this multiclass
# model it cannot put extra focus on class 1. The same intent is expressed with
# per-row weights: class 1 rows get the balanced class weight, all others 1.0.
FOCUS_CLASS = 1
sample_weight_smote = np.where(
    np.asarray(y_train_smote) == FOCUS_CLASS,
    class_weight_dict[FOCUS_CLASS],
    1.0,
)

# Initialize the XGBoost classifier
xgb_model_weighted = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)

# Train the XGBoost model on SMOTE-oversampled data with class 1 up-weighted
xgb_model_weighted.fit(X_train_smote, y_train_smote, sample_weight=sample_weight_smote)

# Predict on the test set
y_pred_weighted = xgb_model_weighted.predict(X_test)

# Classification report
print("Classification Report after applying SMOTE and Class Weights:")
print(classification_report(y_test, y_pred_weighted))


Apply ML v9-------->

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import classification_report, precision_recall_curve
import matplotlib.pyplot as plt

# Apply Borderline-SMOTE (or you can choose SMOTE + Tomek Links)
borderline_smote = BorderlineSMOTE(random_state=42)

# Oversample the training data only
X_train_smote, y_train_smote = borderline_smote.fit_resample(X_train, y_train)

# Class that receives extra weight and a lowered decision threshold
FOCUS_CLASS = 1

# 'Balanced' weight of the focus class, n_samples / (n_classes * n_focus),
# computed here from the original training labels so the cell does not depend
# on a `class_weight_dict` left over from another cell.
focus_class_weight = len(y_train) / (y_train.nunique() * int((y_train == FOCUS_CLASS).sum()))

# `scale_pos_weight` only applies to binary objectives; for this multiclass
# model the emphasis on the focus class is passed as per-row weights.
sample_weight_smote = np.where(np.asarray(y_train_smote) == FOCUS_CLASS, focus_class_weight, 1.0)

# Train the XGBoost model after SMOTE
xgb_model_smote = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
xgb_model_smote.fit(X_train_smote, y_train_smote, sample_weight=sample_weight_smote)

# Predict probabilities for threshold tuning (one column per class)
y_pred_proba = xgb_model_smote.predict_proba(X_test)

# Tune the threshold for Class 1
threshold = 0.35  # Adjust this value based on experimentation

# Start from the usual arg-max prediction and override it with the focus class
# wherever its probability reaches the threshold. The predictions therefore
# stay in the same multiclass label space as y_test; a plain 0/1 mask would be
# scored against labels 0..3 and produce a meaningless report.
y_pred_threshold = xgb_model_smote.classes_[y_pred_proba.argmax(axis=1)]
y_pred_threshold[y_pred_proba[:, FOCUS_CLASS] >= threshold] = FOCUS_CLASS

# Get classification report for threshold-tuned predictions
print("Classification Report after SMOTE and Threshold Tuning:")
print(classification_report(y_test, y_pred_threshold))

Apply ML v10-------->

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Convert the original target classes into binary classification (Fatal vs Non-Fatal)
# Assign Fatal (Class 1) to 1, and Non-Fatal (Classes 0, 2, 3) to 0
FATAL_CLASS = 1

# Guard against labels that are not zero-based (e.g. shifted once too often):
# the binary mapping below would then silently pick the wrong class.
assert set(np.unique(y_train)) <= {0, 1, 2, 3}, (
    f"unexpected target labels {sorted(np.unique(y_train))}; expected a subset of 0..3"
)

y_train_binary = (y_train == FATAL_CLASS).astype(int)
y_test_binary = (y_test == FATAL_CLASS).astype(int)

# Define a smaller range of hyperparameters for further fine-tuning
param_grid_finetune = {
    'learning_rate': [0.05, 0.07, 0.1],
    'n_estimators': [200, 300, 400],
    'max_depth': [8, 10, 12],
    'subsample': [1.0],
    'colsample_bytree': [1.0]
}

# Initialize the XGBoost classifier.
# The problem is binary now, so the binary log-loss is the matching metric
# ('mlogloss' is the multiclass variant).
xgb_model_finetune = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Initialize GridSearchCV for fine-tuning; 'f1' scores the positive (fatal) class
grid_search_finetune = GridSearchCV(estimator=xgb_model_finetune, param_grid=param_grid_finetune,
                                    scoring='f1', cv=3, verbose=1, n_jobs=-1)

# Fit the fine-tuning search with binary classification data
grid_search_finetune.fit(X_train, y_train_binary)

# Best hyperparameters from fine-tuning
print("Best Fine-Tuned Hyperparameters:", grid_search_finetune.best_params_)

# Predict with the best model on the binary test data
best_xgb_model_finetune = grid_search_finetune.best_estimator_
y_pred_finetune = best_xgb_model_finetune.predict(X_test)

# Classification report for binary classification (Fatal vs Non-Fatal)
print("Classification Report (Fatal vs Non-Fatal):")
print(classification_report(y_test_binary, y_pred_finetune))


Apply ML v11-------->

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, precision_recall_curve
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Local import: the sampler-aware pipeline from imbalanced-learn
from imblearn.pipeline import Pipeline as ImbPipeline

# Step 1: SMOTE for oversampling the minority class (Fatal).
# The oversampled copy below is kept for inspection only. The grid search does
# NOT train on it: resampling before cross-validation would put synthetic
# points derived from validation rows into the training folds and inflate the
# CV scores. SMOTE is instead applied inside each training fold via a pipeline.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train_binary)

# Step 2: Adjust class weights in XGBoost (scale_pos_weight for Class 1)
n_negative = int((y_train_binary == 0).sum())
n_positive = int((y_train_binary == 1).sum())
if n_positive == 0:
    raise ValueError("y_train_binary contains no positive (fatal) samples")
# NOTE: SMOTE already balances the classes inside the pipeline, so this ratio
# adds further emphasis on the positive class on top of the oversampling.
scale_pos_weight = n_negative / n_positive

# Step 3: Define hyperparameter grid for GridSearchCV
# (the 'clf__' prefix routes each parameter to the classifier step)
param_grid_finetune = {
    'clf__learning_rate': [0.05, 0.07, 0.1],
    'clf__n_estimators': [200, 300, 400],
    'clf__max_depth': [8, 10, 12],
    'clf__subsample': [1.0],
    'clf__colsample_bytree': [1.0],
    'clf__scale_pos_weight': [scale_pos_weight]
}

# Initialize the XGBoost model; binary problem, hence the binary log-loss metric
xgb_model_weighted = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# SMOTE runs only while fitting; it is skipped at prediction time
smote_xgb_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('clf', xgb_model_weighted),
])

# Initialize GridSearchCV for fine-tuning with class weights
grid_search_weighted = GridSearchCV(estimator=smote_xgb_pipeline, param_grid=param_grid_finetune,
                                    scoring='f1', cv=3, verbose=1, n_jobs=-1)

# Fit on the ORIGINAL training data; each fold is oversampled inside the pipeline
grid_search_weighted.fit(X_train, y_train_binary)

# Best hyperparameters from fine-tuning
print("Best Hyperparameters (with Class Weights and SMOTE):", grid_search_weighted.best_params_)

# Predict probabilities for threshold tuning, using the classifier step of the
# refit best pipeline
best_xgb_model_weighted = grid_search_weighted.best_estimator_.named_steps['clf']
y_pred_proba = best_xgb_model_weighted.predict_proba(X_test)

# Step 4: Adjust threshold for Class 1 to prioritize recall
threshold = 0.35  # You can experiment with different thresholds
y_pred_threshold = (y_pred_proba[:, 1] >= threshold).astype(int)

In [None]:
# Binary (fatal vs non-fatal) metrics for the threshold-adjusted predictions
threshold_report = classification_report(y_test_binary, y_pred_threshold)
print(
    "Classification Report (After SMOTE, Class Weights, and Threshold Tuning):",
    threshold_report,
    sep="\n",
)