In [1]:
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. Load the dataset (the path is resolved relative to the current working directory)
DATASET_PATH = "../datasets/cleanedfix.csv"
df = pd.read_csv(DATASET_PATH)

# 2. Handle missing values: impute the "unknown" placeholder with the column's most frequent known value
def fill_unknown(df, col, placeholder="unknown"):
    """Replace the placeholder in ``df[col]`` with the most frequent real value.

    The mode is computed over the values that are *not* the placeholder, so a
    column dominated by the placeholder is still imputed with the most common
    known category instead of being left unchanged.

    Args:
        df: DataFrame to update. The column is overwritten on this object.
        col: Name of the column to impute.
        placeholder: Marker that stands for a missing value. Defaults to
            "unknown".

    Returns:
        The same DataFrame object, for call sites that reassign the result.
        If the column holds no non-null value other than the placeholder, it
        is returned unchanged because there is nothing to impute from.
    """
    column = df[col]
    # NaN != placeholder evaluates to True, so NaNs stay here; mode() ignores them.
    known_values = column[column != placeholder]
    modes = known_values.mode()
    if modes.empty:
        return df
    # On ties mode() returns the candidates sorted, so the first one is deterministic.
    df[col] = column.replace(placeholder, modes.iloc[0])
    return df

# Run the imputation on every object-dtype column; all other columns are skipped.
for col in df.columns:
    if df[col].dtype != "object":
        continue
    df = fill_unknown(df, col)

# 3. Encode the categorical features
# Ordinal features and the target are label-encoded; every other object-dtype
# column is one-hot encoded with the first level dropped.
# NOTE(review): LabelEncoder is expected to number labels in sorted order, which
# may not match the natural ranking of the ordinal categories — confirm.
ordinal_features = ["Driving_experience", "Service_year_of_vehicle"]
categorical_features = df.select_dtypes(include=["object"]).columns.tolist()

# Columns that must not be one-hot encoded.
label_encoded_columns = ordinal_features + ["Accident_severity"]
nominal_features = [
    name for name in categorical_features if name not in label_encoded_columns
]

# A single encoder instance is refit per column; after this cell it holds the
# fit for "Accident_severity", the last column it was applied to.
le = LabelEncoder()
for col in ordinal_features:
    df[col] = le.fit_transform(df[col])

df = pd.get_dummies(df, columns=nominal_features, drop_first=True)
df["Accident_severity"] = le.fit_transform(df["Accident_severity"])

# 4. Separate the target from the features
y = df["Accident_severity"]
X = df.drop(columns=["Accident_severity"])

# 5. Train/test split: 20% held out, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 6. Handle class imbalance: build two resampled versions of the training split
# (the test split is not resampled)
smote = SMOTE(random_state=42)
smote_tomek = SMOTETomek(random_state=42)

X_smote, y_smote = smote.fit_resample(X_train, y_train)
X_smote_tomek, y_smote_tomek = smote_tomek.fit_resample(X_train, y_train)



In [None]:
# Compare three ways of handling class imbalance with a Random Forest, scored
# by macro-averaged F1 on the held-out test split.
#
# This cell used to be fully commented out while its output was still saved,
# so a fresh run could not reproduce the scores. It is live again and reuses
# X_train / X_test / y_train / y_test and the SMOTE / SMOTE-Tomek samples built
# in the preprocessing cell instead of recomputing them with the same seeds.


def macro_f1_random_forest(X_fit, y_fit, X_eval, y_eval, **rf_params):
    """Fit a RandomForestClassifier and return its macro F1 on the eval set.

    Args:
        X_fit: Training features.
        y_fit: Training target.
        X_eval: Evaluation features.
        y_eval: Evaluation target.
        **rf_params: Extra keyword arguments forwarded to
            RandomForestClassifier (for example ``class_weight="balanced"``).

    Returns:
        The macro-averaged F1 score of the fitted model on the eval set.
    """
    model = RandomForestClassifier(random_state=42, **rf_params)
    model.fit(X_fit, y_fit)
    return f1_score(y_eval, model.predict(X_eval), average="macro")


# label -> (training features, training target, extra RandomForest parameters)
imbalance_strategies = {
    "SMOTE": (X_smote, y_smote, {}),
    "SMOTE-Tomek": (X_smote_tomek, y_smote_tomek, {}),
    "Class Weight": (X_train, y_train, {"class_weight": "balanced"}),
}

# Every strategy is evaluated on the same test split.
f1_macro_scores = {
    label: macro_f1_random_forest(X_fit, y_fit, X_test, y_test, **rf_params)
    for label, (X_fit, y_fit, rf_params) in imbalance_strategies.items()
}

# Show the results
print("F1-score Macro:")
for label, score in f1_macro_scores.items():
    print(f"{label}: {score:.4f}")


F1-score Macro:
SMOTE: 0.3447
SMOTE-Tomek: 0.3485
Class Weight: 0.3075
