In [42]:
# 1. Data Manipulation dan Umum
import numpy as np
import pandas as pd
from collections import Counter

# 2. Visualisasi Data
import matplotlib.pyplot as plt
import seaborn as sns

# 3. Preprocessing dan Evaluasi Model
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    
)

# 4. Model Klasifikasi Scikit-learn
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

# 5. Model Gradient Boosting Tambahan
import xgboost as xgb
# import lightgbm as lgb
from xgboost import XGBClassifier

# 6. Penanganan Data Tidak Seimbang (Imbalanced-learn)
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

# 7. Category Encoders
import category_encoders as ce

# Tambahan jika diperlukan
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LassoCV


# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import cross_val_score

# Ordinal Encoding

In [43]:
df_encoded_label = pd.read_csv('../datasets/encoded_label.csv')

In [44]:
df_encoded_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12296 entries, 0 to 12295
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Time                         12296 non-null  int64
 1   Age_band_of_driver           12296 non-null  int64
 2   Sex_of_driver                12296 non-null  int64
 3   Educational_level            12296 non-null  int64
 4   Vehicle_driver_relation      12296 non-null  int64
 5   Driving_experience           12296 non-null  int64
 6   Type_of_vehicle              12296 non-null  int64
 7   Owner_of_vehicle             12296 non-null  int64
 8   Service_year_of_vehicle      12296 non-null  int64
 9   Area_accident_occured        12296 non-null  int64
 10  Lanes_or_Medians             12296 non-null  int64
 11  Road_allignment              12296 non-null  int64
 12  Types_of_Junction            12296 non-null  int64
 13  Road_surface_type            12296 non-null  i

In [45]:
df_encoded_label.head()

Unnamed: 0,Time,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Area_accident_occured,...,Number_of_casualties,Vehicle_movement,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity,Day
0,17,0,1,0,0,0,3,3,3,8,...,2,2,2,4,3,4,5,3,2,1
1,17,0,1,4,0,3,4,3,2,4,...,2,2,2,4,3,4,5,9,2,1
2,17,0,1,4,0,0,0,3,5,7,...,2,2,1,1,2,4,5,11,1,1
3,1,0,1,4,0,2,4,0,5,4,...,2,2,0,0,2,2,5,11,2,3
4,1,0,1,4,0,1,3,3,2,2,...,2,2,2,4,3,4,5,9,2,3


In [46]:
df_encoded_label.isnull().sum().sum()

0

In [47]:
X = df_encoded_label.drop(columns=['Accident_severity'])
y = df_encoded_label['Accident_severity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [48]:
def evaluate_encoding(X_encoded, name):
    score = np.mean(cross_val_score(RandomForestClassifier(), X_encoded, y_train, cv=5))
    print(f"{name}: {score:.4f}")
    return score

print("Cross-Validation Scores:")
# score_ohe = evaluate_encoding(X_train_ohe, "One-Hot Encoding")
score_le = evaluate_encoding(X_train, "Label Encoding")
# score_te = evaluate_encoding(X_train_te, "Target Encoding")
# score_freq = evaluate_encoding(X_train_freq, "Frequency Encoding")

print("\nMutual Information Scores:")
mi_scores = {
    # 'One-Hot': mutual_info_classif(X_train_ohe, y_train).mean(),
    'Label Encoding': mutual_info_classif(X_train, y_train).mean(),
    # 'Target Encoding': mutual_info_classif(X_train_te, y_train).mean(),
    # 'Frequency Encoding': mutual_info_classif(X_train_freq, y_train).mean()
}
for key, value in mi_scores.items():
    print(f"{key}: {value:.4f}")

Cross-Validation Scores:


Label Encoding: 0.8490

Mutual Information Scores:
Label Encoding: 0.0027


In [49]:
model = RandomForestClassifier().fit(X_train, y_train)
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
print(feature_importance.sort_values(ascending=False))

Time                           0.100257
Day                            0.068679
Cause_of_accident              0.063659
Area_accident_occured          0.058243
Driving_experience             0.054751
Lanes_or_Medians               0.049787
Type_of_vehicle                0.045893
Number_of_vehicles_involved    0.045353
Service_year_of_vehicle        0.044005
Types_of_Junction              0.041250
Vehicle_movement               0.040619
Number_of_casualties           0.039306
Educational_level              0.036342
Age_band_of_casualty           0.036164
Age_band_of_driver             0.032454
Type_of_collision              0.031372
Sex_of_casualty                0.024239
Light_conditions               0.021447
Fitness_of_casuality           0.019999
Weather_conditions             0.019641
Casualty_severity              0.018380
Vehicle_driver_relation        0.018229
Road_surface_conditions        0.017824
Road_allignment                0.017544
Pedestrian_movement            0.017107


In [50]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

le_rf_f1_score = f1_score(y_test, rf_pred, average='weighted')
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='f1_weighted')
le_rf_cv = rf_cv_scores.mean()

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

le_xgb_f1_score = f1_score(y_test, xgb_pred, average='weighted')
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='f1_weighted')
le_xgb_cv = xgb_cv_scores.mean()

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

le_knn_f1_score = f1_score(y_test, knn_pred, average='weighted')
knn_cv_scores = cross_val_score(knn_model, X, y, cv=5, scoring='f1_weighted')
le_knn_cv = knn_cv_scores.mean()

# Logistic Regression
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

le_logreg_f1_score = f1_score(y_test, logreg_pred, average='weighted')
logreg_cv_scores = cross_val_score(logreg_model, X, y, cv=5, scoring='f1_weighted')
le_logreg_cv = logreg_cv_scores.mean()

# Extra Trees
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
extra_trees_model.fit(X_train, y_train)
extra_trees_pred = extra_trees_model.predict(X_test)

le_extra_trees_f1_score = f1_score(y_test, extra_trees_pred, average='weighted')
extra_trees_cv_scores = cross_val_score(extra_trees_model, X, y, cv=5, scoring='f1_weighted')
le_extra_trees_cv = extra_trees_cv_scores.mean()

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

le_gb_f1_score = f1_score(y_test, gb_pred, average='weighted')
gb_cv_scores = cross_val_score(gb_model, X, y, cv=5, scoring='f1_weighted')
le_gb_cv = gb_cv_scores.mean()

print("Random Forest\t\t -> F1 Score:", le_rf_f1_score, "Cross-Validation F1 Score:", le_rf_cv)
print("XGBoost\t\t\t -> F1 Score:", le_xgb_f1_score, "Cross-Validation F1 Score:", le_xgb_cv)
print("KNN\t\t\t -> F1 Score:", le_knn_f1_score, "Cross-Validation F1 Score:", le_knn_cv)
print("Logistic Regression\t -> F1 Score:", le_logreg_f1_score, "Cross-Validation F1 Score:", le_logreg_cv)
print("Extra Trees\t\t -> F1 Score:", le_extra_trees_f1_score, "Cross-Validation F1 Score:", le_extra_trees_cv)
print("Gradient Boosting\t -> F1 Score:", le_gb_f1_score, "Cross-Validation F1 Score:", le_gb_cv)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Random Forest		 -> F1 Score: 0.7780535988447094 Cross-Validation F1 Score: 0.7781573758459753
XGBoost			 -> F1 Score: 0.8065751796157861 Cross-Validation F1 Score: 0.7900133469415439
KNN			 -> F1 Score: 0.7698761508322021 Cross-Validation F1 Score: 0.776552859193384
Logistic Regression	 -> F1 Score: 0.7753317112558925 Cross-Validation F1 Score: 0.7752608859289839
Extra Trees		 -> F1 Score: 0.7807216971341048 Cross-Validation F1 Score: 0.7815537044786545
Gradient Boosting	 -> F1 Score: 0.7863801336805892 Cross-Validation F1 Score: 0.7827910728712185


# One-Hot Encoding

In [51]:
df_encoded_ohe = pd.read_csv('../datasets/encoded_ohe.csv')

In [52]:
df_encoded_ohe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12296 entries, 0 to 12295
Columns: 180 entries, Number_of_vehicles_involved to Weather_conditions_Windy
dtypes: int64(180)
memory usage: 16.9 MB


In [53]:
df_encoded_ohe.head()

Unnamed: 0,Number_of_vehicles_involved,Number_of_casualties,Accident_severity,Age_band_of_casualty_18-30,Age_band_of_casualty_31-50,Age_band_of_casualty_Over 51,Age_band_of_casualty_Under 18,Age_band_of_casualty_unknown,Age_band_of_driver_Dewasa,Age_band_of_driver_Lansia,...,Vehicle_movement_Unknown,Weather_conditions_Cloudy,Weather_conditions_Fog or mist,Weather_conditions_Normal,Weather_conditions_Other,Weather_conditions_Raining,Weather_conditions_Raining and Windy,Weather_conditions_Snow,Weather_conditions_Unknown,Weather_conditions_Windy
0,2,2,2,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
1,2,2,2,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
2,2,2,1,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,2,2,2,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,2,2,2,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0


In [54]:
df_encoded_ohe.isna().sum().sum()

0

In [55]:
X = df_encoded_ohe.drop(columns=['Accident_severity'])
y = df_encoded_ohe['Accident_severity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [56]:
def evaluate_encoding(X_encoded, name):
    score = np.mean(cross_val_score(RandomForestClassifier(), X_encoded, y_train, cv=5))
    print(f"{name}: {score:.4f}")
    return score

print("Cross-Validation Scores:")
score_ohe = evaluate_encoding(X_train, "One-Hot Encoding")
# score_le = evaluate_encoding(X_train, "Label Encoding")
# score_te = evaluate_encoding(X_train_te, "Target Encoding")
# score_freq = evaluate_encoding(X_train_freq, "Frequency Encoding")

print("\nMutual Information Scores:")
mi_scores = {
    'One-Hot Encoding': mutual_info_classif(X_train, y_train).mean(),
    # 'Label Encoding': mutual_info_classif(X_train_le, y_train).mean(),
    # 'Target Encoding': mutual_info_classif(X_train_te, y_train).mean(),
    # 'Frequency Encoding': mutual_info_classif(X_train_freq, y_train).mean()
}
for key, value in mi_scores.items():
    print(f"{key}: {value:.4f}")

Cross-Validation Scores:
One-Hot Encoding: 0.8480

Mutual Information Scores:
One-Hot Encoding: 0.0019


In [57]:
model = RandomForestClassifier().fit(X_train, y_train)
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
print(feature_importance.sort_values(ascending=False))  

Number_of_vehicles_involved                                                                                                                                  0.035009
Number_of_casualties                                                                                                                                         0.026278
Type_of_vehicle_Private Vehicle                                                                                                                              0.014039
Lanes_or_Medians_Two-way (divided with broken lines road marking)                                                                                            0.013234
Service_year_of_vehicle_Unknown                                                                                                                              0.013215
                                                                                                                                                               ...   
Weat

In [58]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

le_rf_f1_score = f1_score(y_test, rf_pred, average='weighted')
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='f1_weighted')
le_rf_cv = rf_cv_scores.mean()

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

le_xgb_f1_score = f1_score(y_test, xgb_pred, average='weighted')
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='f1_weighted')
le_xgb_cv = xgb_cv_scores.mean()

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

le_knn_f1_score = f1_score(y_test, knn_pred, average='weighted')
knn_cv_scores = cross_val_score(knn_model, X, y, cv=5, scoring='f1_weighted')
le_knn_cv = knn_cv_scores.mean()

# Logistic Regression
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

le_logreg_f1_score = f1_score(y_test, logreg_pred, average='weighted')
logreg_cv_scores = cross_val_score(logreg_model, X, y, cv=5, scoring='f1_weighted')
le_logreg_cv = logreg_cv_scores.mean()

# Extra Trees
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
extra_trees_model.fit(X_train, y_train)
extra_trees_pred = extra_trees_model.predict(X_test)

le_extra_trees_f1_score = f1_score(y_test, extra_trees_pred, average='weighted')
extra_trees_cv_scores = cross_val_score(extra_trees_model, X, y, cv=5, scoring='f1_weighted')
le_extra_trees_cv = extra_trees_cv_scores.mean()

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

le_gb_f1_score = f1_score(y_test, gb_pred, average='weighted')
gb_cv_scores = cross_val_score(gb_model, X, y, cv=5, scoring='f1_weighted')
le_gb_cv = gb_cv_scores.mean()

print("Random Forest\t\t -> F1 Score:", le_rf_f1_score, "Cross-Validation F1 Score:", le_rf_cv)
print("XGBoost\t\t\t -> F1 Score:", le_xgb_f1_score, "Cross-Validation F1 Score:", le_xgb_cv)
print("KNN\t\t\t -> F1 Score:", le_knn_f1_score, "Cross-Validation F1 Score:", le_knn_cv)
print("Logistic Regression\t -> F1 Score:", le_logreg_f1_score, "Cross-Validation F1 Score:", le_logreg_cv)
print("Extra Trees\t\t -> F1 Score:", le_extra_trees_f1_score, "Cross-Validation F1 Score:", le_extra_trees_cv)
print("Gradient Boosting\t -> F1 Score:", le_gb_f1_score, "Cross-Validation F1 Score:", le_gb_cv)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Random Forest		 -> F1 Score: 0.7763131610656833 Cross-Validation F1 Score: 0.775652551765879
XGBoost			 -> F1 Score: 0.8059528868980814 Cross-Validation F1 Score: 0.7887827684810069
KNN			 -> F1 Score: 0.7853001611869224 Cross-Validation F1 Score: 0.7821442612968064
Logistic Regression	 -> F1 Score: 0.7778447585612219 Cross-Validation F1 Score: 0.7759850670459498
Extra Trees		 -> F1 Score: 0.779019205475845 Cross-Validation F1 Score: 0.7788667598404019
Gradient Boosting	 -> F1 Score: 0.7883860963879212 Cross-Validation F1 Score: 0.7873701269008627


# Target Encoding

In [59]:
df_encoded_target = pd.read_csv('../datasets/encoded_target.csv')

In [60]:
df_encoded_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12296 entries, 0 to 12295
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Time                         12296 non-null  float64
 1   Age_band_of_driver           12296 non-null  float64
 2   Sex_of_driver                12296 non-null  float64
 3   Educational_level            12296 non-null  float64
 4   Vehicle_driver_relation      12296 non-null  float64
 5   Driving_experience           12296 non-null  float64
 6   Type_of_vehicle              12296 non-null  float64
 7   Owner_of_vehicle             12296 non-null  float64
 8   Service_year_of_vehicle      12296 non-null  float64
 9   Area_accident_occured        12296 non-null  float64
 10  Lanes_or_Medians             12296 non-null  float64
 11  Road_allignment              12296 non-null  float64
 12  Types_of_Junction            12296 non-null  float64
 13  Road_surface_typ

In [61]:
df_encoded_target.head()

Unnamed: 0,Time,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Area_accident_occured,...,Number_of_casualties,Vehicle_movement,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity,Day
0,1.857843,1.83563,1.831932,1.830556,1.834806,1.846154,1.8331,1.831243,1.84139,1.806796,...,2,1.834306,1.840812,1.838427,1.840812,1.837209,1.83433,1.811785,2,1.864286
1,1.857843,1.83563,1.831932,1.832195,1.834806,1.83149,1.830928,1.831243,1.825781,1.839757,...,2,1.834306,1.840812,1.838427,1.840812,1.837209,1.83433,1.796687,2,1.864286
2,1.857843,1.83563,1.831932,1.832195,1.834806,1.846154,1.82622,1.831243,1.834045,1.829268,...,2,1.834306,1.823036,1.83551,1.826444,1.837209,1.83433,1.831912,1,1.864286
3,1.813433,1.83563,1.831932,1.832195,1.834806,1.834768,1.830928,1.84342,1.834045,1.839757,...,2,1.834306,1.839893,1.812998,1.826444,1.832103,1.83433,1.831912,2,1.823329
4,1.813433,1.83563,1.831932,1.832195,1.834806,1.819923,1.8331,1.831243,1.825781,1.842105,...,2,1.834306,1.840812,1.838427,1.840812,1.837209,1.83433,1.796687,2,1.823329


In [62]:
df_encoded_target.isna().sum().sum()

0

In [63]:
X = df_encoded_target.drop(columns=['Accident_severity'])
y = df_encoded_target['Accident_severity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [64]:
def evaluate_encoding(X_encoded, name):
    score = np.mean(cross_val_score(RandomForestClassifier(), X_encoded, y_train, cv=5))
    print(f"{name}: {score:.4f}")
    return score

print("Cross-Validation Scores:")
# score_ohe = evaluate_encoding(X_train, "One-Hot Encoding")
# score_le = evaluate_encoding(X_train, "Label Encoding")
score_te = evaluate_encoding(X_train, "Target Encoding")
# score_freq = evaluate_encoding(X_train_freq, "Frequency Encoding")

print("\nMutual Information Scores:")
mi_scores = {
    # 'One-Hot': mutual_info_classif(X_train, y_train).mean(),
    # 'Label Encoding': mutual_info_classif(X_train_le, y_train).mean(),
    'Target Encoding': mutual_info_classif(X_train, y_train).mean(),
    # 'Frequency Encoding': mutual_info_classif(X_train_freq, y_train).mean()
}
for key, value in mi_scores.items():
    print(f"{key}: {value:.4f}")

Cross-Validation Scores:
Target Encoding: 0.8484

Mutual Information Scores:
Target Encoding: 0.0028


In [65]:
model = RandomForestClassifier().fit(X_train, y_train)
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
print(feature_importance.sort_values(ascending=False))

Time                           0.095080
Day                            0.067916
Cause_of_accident              0.063100
Area_accident_occured          0.056832
Driving_experience             0.052925
Lanes_or_Medians               0.050159
Service_year_of_vehicle        0.047677
Type_of_vehicle                0.047011
Types_of_Junction              0.044224
Number_of_vehicles_involved    0.043879
Vehicle_movement               0.038544
Number_of_casualties           0.037694
Age_band_of_casualty           0.037005
Educational_level              0.035785
Type_of_collision              0.033886
Age_band_of_driver             0.033144
Light_conditions               0.024554
Sex_of_casualty                0.024302
Fitness_of_casuality           0.020675
Weather_conditions             0.019684
Vehicle_driver_relation        0.018806
Casualty_severity              0.018301
Road_surface_conditions        0.017757
Road_allignment                0.017185
Pedestrian_movement            0.015544


In [66]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

le_rf_f1_score = f1_score(y_test, rf_pred, average='weighted')
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='f1_weighted')
le_rf_cv = rf_cv_scores.mean()

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

le_xgb_f1_score = f1_score(y_test, xgb_pred, average='weighted')
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='f1_weighted')
le_xgb_cv = xgb_cv_scores.mean()

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

le_knn_f1_score = f1_score(y_test, knn_pred, average='weighted')
knn_cv_scores = cross_val_score(knn_model, X, y, cv=5, scoring='f1_weighted')
le_knn_cv = knn_cv_scores.mean()

# Logistic Regression
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

le_logreg_f1_score = f1_score(y_test, logreg_pred, average='weighted')
logreg_cv_scores = cross_val_score(logreg_model, X, y, cv=5, scoring='f1_weighted')
le_logreg_cv = logreg_cv_scores.mean()

# Extra Trees
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
extra_trees_model.fit(X_train, y_train)
extra_trees_pred = extra_trees_model.predict(X_test)

le_extra_trees_f1_score = f1_score(y_test, extra_trees_pred, average='weighted')
extra_trees_cv_scores = cross_val_score(extra_trees_model, X, y, cv=5, scoring='f1_weighted')
le_extra_trees_cv = extra_trees_cv_scores.mean()

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

le_gb_f1_score = f1_score(y_test, gb_pred, average='weighted')
gb_cv_scores = cross_val_score(gb_model, X, y, cv=5, scoring='f1_weighted')
le_gb_cv = gb_cv_scores.mean()

print("Random Forest\t\t -> F1 Score:", le_rf_f1_score, "Cross-Validation F1 Score:", le_rf_cv)
print("XGBoost\t\t\t -> F1 Score:", le_xgb_f1_score, "Cross-Validation F1 Score:", le_xgb_cv)
print("KNN\t\t\t -> F1 Score:", le_knn_f1_score, "Cross-Validation F1 Score:", le_knn_cv)
print("Logistic Regression\t -> F1 Score:", le_logreg_f1_score, "Cross-Validation F1 Score:", le_logreg_cv)
print("Extra Trees\t\t -> F1 Score:", le_extra_trees_f1_score, "Cross-Validation F1 Score:", le_extra_trees_cv)
print("Gradient Boosting\t -> F1 Score:", le_gb_f1_score, "Cross-Validation F1 Score:", le_gb_cv)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Random Forest		 -> F1 Score: 0.7782623892498449 Cross-Validation F1 Score: 0.7778052676575216
XGBoost			 -> F1 Score: 0.8064923794875875 Cross-Validation F1 Score: 0.7887919111680166
KNN			 -> F1 Score: 0.803496180663444 Cross-Validation F1 Score: 0.7881377857781608
Logistic Regression	 -> F1 Score: 0.7753317112558925 Cross-Validation F1 Score: 0.7752608859289839
Extra Trees		 -> F1 Score: 0.7879738054674976 Cross-Validation F1 Score: 0.7855870197518424
Gradient Boosting	 -> F1 Score: 0.7912643104576311 Cross-Validation F1 Score: 0.7855155405108427


# K-FOLD Encoding

In [67]:
df_encoded_kfold = pd.read_csv('../datasets/encoded_kfold.csv')

In [68]:
df_encoded_kfold.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12296 entries, 0 to 12295
Data columns (total 29 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Number_of_vehicles_involved      12296 non-null  int64  
 1   Number_of_casualties             12296 non-null  int64  
 2   Accident_severity                12296 non-null  int64  
 3   Time_encoded                     12296 non-null  float64
 4   Age_band_of_driver_encoded       12296 non-null  float64
 5   Sex_of_driver_encoded            12296 non-null  float64
 6   Educational_level_encoded        12296 non-null  float64
 7   Vehicle_driver_relation_encoded  12296 non-null  float64
 8   Driving_experience_encoded       12296 non-null  float64
 9   Type_of_vehicle_encoded          12296 non-null  float64
 10  Owner_of_vehicle_encoded         12296 non-null  float64
 11  Service_year_of_vehicle_encoded  12296 non-null  float64
 12  Area_accident_occu

In [69]:
df_encoded_kfold.head()

Unnamed: 0,Number_of_vehicles_involved,Number_of_casualties,Accident_severity,Time_encoded,Age_band_of_driver_encoded,Sex_of_driver_encoded,Educational_level_encoded,Vehicle_driver_relation_encoded,Driving_experience_encoded,Type_of_vehicle_encoded,...,Weather_conditions_encoded,Type_of_collision_encoded,Vehicle_movement_encoded,Sex_of_casualty_encoded,Age_band_of_casualty_encoded,Casualty_severity_encoded,Fitness_of_casuality_encoded,Pedestrian_movement_encoded,Cause_of_accident_encoded,Day_encoded
0,2,2,2,1.858859,1.839339,1.832658,1.816327,1.834763,1.85633,1.832211,...,1.828962,1.818667,1.836234,1.840357,1.837924,1.840357,1.837264,1.835567,1.799342,1.868401
1,2,2,2,1.86345,1.833484,1.828528,1.82972,1.831345,1.826158,1.828852,...,1.824391,1.833895,1.829303,1.835586,1.833778,1.835586,1.829889,1.831704,1.788868,1.857143
2,2,2,1,1.852729,1.834851,1.834373,1.834109,1.83814,1.84446,1.82325,...,1.827177,1.822481,1.835802,1.822914,1.834257,1.825842,1.839441,1.836024,1.831005,1.860759
3,2,2,2,1.837838,1.839339,1.832658,1.833608,1.834763,1.836674,1.834716,...,1.828962,1.841928,1.836234,1.838242,1.816115,1.827679,1.833333,1.835567,1.8358,1.816132
4,2,2,2,1.81982,1.833484,1.828528,1.82972,1.831345,1.818225,1.83039,...,1.824391,1.833895,1.829303,1.835586,1.833778,1.835586,1.829889,1.831704,1.788868,1.817708


In [70]:
df_encoded_kfold.isna().sum().sum()

0

In [71]:
X = df_encoded_kfold.drop(columns=['Accident_severity'])
y = df_encoded_kfold['Accident_severity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [72]:
def evaluate_encoding(X_encoded, name):
    score = np.mean(cross_val_score(RandomForestClassifier(), X_encoded, y_train, cv=5))
    print(f"{name}: {score:.4f}")
    return score

print("Cross-Validation Scores:")
# score_ohe = evaluate_encoding(X_train, "One-Hot Encoding")
# score_le = evaluate_encoding(X_train, "Label Encoding")
# score_te = evaluate_encoding(X_train, "Target Encoding")
score_freq = evaluate_encoding(X_train, "Kfold Target Encoding")

print("\nMutual Information Scores:")
mi_scores = {
    # 'One-Hot': mutual_info_classif(X_train, y_train).mean(),
    # 'Label Encoding': mutual_info_classif(X_train_le, y_train).mean(),
    # 'Target Encoding': mutual_info_classif(X_train, y_train).mean(),
    'Kfold Target Encoding': mutual_info_classif(X_train, y_train).mean()
}
for key, value in mi_scores.items():
    print(f"{key}: {value:.4f}")

Cross-Validation Scores:
Kfold Target Encoding: 0.8472

Mutual Information Scores:
Kfold Target Encoding: 0.0022


In [73]:
model = RandomForestClassifier().fit(X_train, y_train)
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
print(feature_importance.sort_values(ascending=False))

Time_encoded                       0.081201
Day_encoded                        0.061979
Cause_of_accident_encoded          0.060240
Area_accident_occured_encoded      0.054666
Driving_experience_encoded         0.051692
Service_year_of_vehicle_encoded    0.047978
Lanes_or_Medians_encoded           0.047596
Type_of_vehicle_encoded            0.045388
Types_of_Junction_encoded          0.044576
Number_of_vehicles_involved        0.040000
Age_band_of_casualty_encoded       0.038525
Educational_level_encoded          0.037381
Type_of_collision_encoded          0.036513
Vehicle_movement_encoded           0.036220
Age_band_of_driver_encoded         0.034280
Number_of_casualties               0.029485
Sex_of_casualty_encoded            0.027922
Light_conditions_encoded           0.026812
Fitness_of_casuality_encoded       0.024473
Vehicle_driver_relation_encoded    0.023001
Casualty_severity_encoded          0.022979
Weather_conditions_encoded         0.022123
Road_surface_conditions_encoded 

In [74]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

le_rf_f1_score = f1_score(y_test, rf_pred, average='weighted')
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='f1_weighted')
le_rf_cv = rf_cv_scores.mean()

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

le_xgb_f1_score = f1_score(y_test, xgb_pred, average='weighted')
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='f1_weighted')
le_xgb_cv = xgb_cv_scores.mean()

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

le_knn_f1_score = f1_score(y_test, knn_pred, average='weighted')
knn_cv_scores = cross_val_score(knn_model, X, y, cv=5, scoring='f1_weighted')
le_knn_cv = knn_cv_scores.mean()

# Logistic Regression
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

le_logreg_f1_score = f1_score(y_test, logreg_pred, average='weighted')
logreg_cv_scores = cross_val_score(logreg_model, X, y, cv=5, scoring='f1_weighted')
le_logreg_cv = logreg_cv_scores.mean()

# Extra Trees
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
extra_trees_model.fit(X_train, y_train)
extra_trees_pred = extra_trees_model.predict(X_test)

le_extra_trees_f1_score = f1_score(y_test, extra_trees_pred, average='weighted')
extra_trees_cv_scores = cross_val_score(extra_trees_model, X, y, cv=5, scoring='f1_weighted')
le_extra_trees_cv = extra_trees_cv_scores.mean()

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

le_gb_f1_score = f1_score(y_test, gb_pred, average='weighted')
gb_cv_scores = cross_val_score(gb_model, X, y, cv=5, scoring='f1_weighted')
le_gb_cv = gb_cv_scores.mean()

print("Random Forest\t\t -> F1 Score:", le_rf_f1_score, "Cross-Validation F1 Score:", le_rf_cv)
print("XGBoost\t\t\t -> F1 Score:", le_xgb_f1_score, "Cross-Validation F1 Score:", le_xgb_cv)
print("KNN\t\t\t -> F1 Score:", le_knn_f1_score, "Cross-Validation F1 Score:", le_knn_cv)
print("Logistic Regression\t -> F1 Score:", le_logreg_f1_score, "Cross-Validation F1 Score:", le_logreg_cv)
print("Extra Trees\t\t -> F1 Score:", le_extra_trees_f1_score, "Cross-Validation F1 Score:", le_extra_trees_cv)
print("Gradient Boosting\t -> F1 Score:", le_gb_f1_score, "Cross-Validation F1 Score:", le_gb_cv)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Random Forest		 -> F1 Score: 0.7761089507087952 Cross-Validation F1 Score: 0.7756450243843476
XGBoost			 -> F1 Score: 0.7905432522928929 Cross-Validation F1 Score: 0.7868941830613099
KNN			 -> F1 Score: 0.8044633479139843 Cross-Validation F1 Score: 0.7869022011130556
Logistic Regression	 -> F1 Score: 0.7753317112558925 Cross-Validation F1 Score: 0.7752608859289839
Extra Trees		 -> F1 Score: 0.7818277285165518 Cross-Validation F1 Score: 0.7817027378964375
Gradient Boosting	 -> F1 Score: 0.783622835384223 Cross-Validation F1 Score: 0.7816384999198606


# MIX Encoding

In [75]:
df_encoded_mix = pd.read_csv('../datasets/encoded.csv')

In [76]:
df_encoded_mix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12296 entries, 0 to 12295
Data columns (total 58 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Age_band_of_driver                                  12296 non-null  int64  
 1   Educational_level                                   12296 non-null  int64  
 2   Driving_experience                                  12296 non-null  int64  
 3   Service_year_of_vehicle                             12296 non-null  int64  
 4   Number_of_vehicles_involved                         12296 non-null  int64  
 5   Number_of_casualties                                12296 non-null  int64  
 6   Age_band_of_casualty                                12296 non-null  int64  
 7   Casualty_severity                                   12296 non-null  int64  
 8   Accident_severity                                   12296 non-null  int64  


In [77]:
df_encoded_mix.isna().sum().sum()

0

In [78]:
X = df_encoded_mix.drop(columns=['Accident_severity'])
y = df_encoded_mix['Accident_severity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [79]:
def evaluate_encoding(X_encoded, name):
    score = np.mean(cross_val_score(RandomForestClassifier(), X_encoded, y_train, cv=5))
    print(f"{name}: {score:.4f}")
    return score

print("Cross-Validation Scores:")
# score_ohe = evaluate_encoding(X_train, "One-Hot Encoding")
# score_le = evaluate_encoding(X_train, "Label Encoding")
# score_te = evaluate_encoding(X_train, "Target Encoding")
score_freq = evaluate_encoding(X_train, "Mix Encoding")

print("\nMutual Information Scores:")
mi_scores = {
    # 'One-Hot': mutual_info_classif(X_train, y_train).mean(),
    # 'Label Encoding': mutual_info_classif(X_train_le, y_train).mean(),
    # 'Target Encoding': mutual_info_classif(X_train, y_train).mean(),
    'MixEncoding': mutual_info_classif(X_train, y_train).mean()
}
for key, value in mi_scores.items():
    print(f"{key}: {value:.4f}")

Cross-Validation Scores:
Mix Encoding: 0.8473

Mutual Information Scores:
MixEncoding: 0.0018


In [80]:
model = RandomForestClassifier().fit(X_train, y_train)
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
print(feature_importance.sort_values(ascending=False))  

Time_encoded                                          7.976736e-02
Cause_of_accident_encoded                             6.298439e-02
Area_accident_occured_encoded                         5.678956e-02
Lanes_or_Medians_encoded                              5.260212e-02
Type_of_vehicle_encoded                               5.023462e-02
Types_of_Junction_encoded                             4.902804e-02
Vehicle_movement_encoded                              4.373435e-02
Type_of_collision_encoded                             4.205882e-02
Driving_experience                                    4.067594e-02
Number_of_vehicles_involved                           3.944885e-02
Service_year_of_vehicle                               3.165442e-02
Road_allignment_encoded                               3.071757e-02
Weather_conditions_encoded                            3.051938e-02
Number_of_casualties                                  2.982907e-02
Pedestrian_movement_encoded                           2.838449

In [None]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

le_rf_f1_score = f1_score(y_test, rf_pred, average='weighted')
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='f1_weighted')
le_rf_cv = rf_cv_scores.mean()

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

le_xgb_f1_score = f1_score(y_test, xgb_pred, average='weighted')
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='f1_weighted')
le_xgb_cv = xgb_cv_scores.mean()

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

le_knn_f1_score = f1_score(y_test, knn_pred, average='weighted')
knn_cv_scores = cross_val_score(knn_model, X, y, cv=5, scoring='f1_weighted')
le_knn_cv = knn_cv_scores.mean()

# Logistic Regression
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

le_logreg_f1_score = f1_score(y_test, logreg_pred, average='weighted')
logreg_cv_scores = cross_val_score(logreg_model, X, y, cv=5, scoring='f1_weighted')
le_logreg_cv = logreg_cv_scores.mean()

# Extra Trees
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
extra_trees_model.fit(X_train, y_train)
extra_trees_pred = extra_trees_model.predict(X_test)

le_extra_trees_f1_score = f1_score(y_test, extra_trees_pred, average='weighted')
extra_trees_cv_scores = cross_val_score(extra_trees_model, X, y, cv=5, scoring='f1_weighted')
le_extra_trees_cv = extra_trees_cv_scores.mean()

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

le_gb_f1_score = f1_score(y_test, gb_pred, average='weighted')
gb_cv_scores = cross_val_score(gb_model, X, y, cv=5, scoring='f1_weighted')
le_gb_cv = gb_cv_scores.mean()

print("Random Forest\t\t -> F1 Score:", le_rf_f1_score, "Cross-Validation F1 Score:", le_rf_cv)
print("XGBoost\t\t\t -> F1 Score:", le_xgb_f1_score, "Cross-Validation F1 Score:", le_xgb_cv)
print("KNN\t\t\t -> F1 Score:", le_knn_f1_score, "Cross-Validation F1 Score:", le_knn_cv)
print("Logistic Regression\t -> F1 Score:", le_logreg_f1_score, "Cross-Validation F1 Score:", le_logreg_cv)
print("Extra Trees\t\t -> F1 Score:", le_extra_trees_f1_score, "Cross-Validation F1 Score:", le_extra_trees_cv)
print("Gradient Boosting\t -> F1 Score:", le_gb_f1_score, "Cross-Validation F1 Score:", le_gb_cv)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Random Forest		 -> F1 Score: 0.7753317112558925 Cross-Validation F1 Score: 0.7752608859289839
XGBoost			 -> F1 Score: 0.7929266651310093 Cross-Validation F1 Score: 0.7855573990370417
KNN			 -> F1 Score: 0.7767167701754903 Cross-Validation F1 Score: 0.7722822958769145
Logistic Regression	 -> F1 Score: 0.7753317112558925 Cross-Validation F1 Score: 0.7751800835104665
Extra Trees		 -> F1 Score: 0.784049340224471 Cross-Validation F1 Score: 0.7811897598869213
Gradient Boosting	 -> F1 Score: 0.7836446395103212 Cross-Validation F1 Score: 0.7821075994300345


In [None]:
encoding_methods = ['Label Encoder', 'One Hot Encoder', 'Target Encoder', 'KFold Target Encoder', 'Catboost Encoder']
accuracy_scores = [le_rf_accuracy, ohe_rf_accuracy, target_rf_accuracy, kfold_rf_accuracy, catboost_rf_accuracy]
cv_scores = [le_rf_cv, ohe_rf_cv, target_rf_cv, kfold_rf_cv, catboost_rf_accuracy]


x = np.arange(len(encoding_methods))
width = 0.3

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x - width/2, accuracy_scores, width, label='Accuracy', color='skyblue')
ax.bar(x + width/2, cv_scores, width, label='Cross-validation', color='orange')

ax.set_xlabel('Encoding Methods')
ax.set_ylabel('Score')
ax.set_title('Comparison of Encoding Methods on Model Performance')
ax.set_xticks(x)
ax.set_xticklabels(encoding_methods, rotation=20)
ax.legend()

plt.show()