In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Load the training data from disk.
df = pd.read_csv("training_dataset.csv")

# Features are every column except the target and the customer identifier;
# the target is the "berlangganan_deposito" column.
X = df.drop(columns=["berlangganan_deposito", "customer_number"])
y = df["berlangganan_deposito"]

# ===== HANDLING THE 999 VALUE =====
# 999 in "hari_sejak_kontak_sebelumnya" is treated as a sentinel rather than a
# real day count (presumably "never contacted before" -- inferred from the
# flag name). Record it as a boolean flag first, then blank the sentinel out
# as NaN so it can be imputed downstream.
days_since_contact = X["hari_sejak_kontak_sebelumnya"]
X["pernah_dihubungi"] = days_since_contact.ne(999)
X["hari_sejak_kontak_sebelumnya"] = days_since_contact.replace(999, np.nan)

# Add a boolean flag per column marking rows whose value is the literal
# string "unknown". Mapping is flag column -> source column.
unknown_flags = {
    "is_gagal_bayar_unknown": "gagal_bayar_sebelumnya",
    "is_pinjaman_rumah_unknown": "pinjaman_rumah",
    "is_pinjaman_pribadi_unknown": "pinjaman_pribadi",
}
for flag_name, source_column in unknown_flags.items():
    X[flag_name] = X[source_column].eq("unknown")

# Stratified 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ===== FEATURES =====
# Categorical columns that get a matching "is_*_unknown" flag elsewhere in
# this script; they are listed separately from the other categoricals.
onehot_with_unknown = [
    'gagal_bayar_sebelumnya',
    'pinjaman_rumah',
    'pinjaman_pribadi',
]

# Remaining categorical columns.
onehot_default = [
    'pekerjaan',
    'status_perkawinan',
    'pendidikan',
    'jenis_kontak',
    'bulan_kontak_terakhir',
    'hari_kontak_terakhir',
    'hasil_kampanye_sebelumnya',
]

# Columns routed through the numeric branch: the raw numeric columns first,
# followed by the boolean indicator columns engineered earlier in the script.
numerical_features = [
    'usia',
    'jumlah_kontak_kampanye_ini',
    'hari_sejak_kontak_sebelumnya',
    'jumlah_kontak_sebelumnya',
    'tingkat_variasi_pekerjaan',
    'indeks_harga_konsumen',
    'indeks_kepercayaan_konsumen',
    'suku_bunga_euribor_3bln',
    'jumlah_pekerja',
] + [
    'pernah_dihubungi',
    'is_gagal_bayar_unknown',
    'is_pinjaman_rumah_unknown',
    'is_pinjaman_pribadi_unknown',
]

# ===== PREPROCESSING PIPELINE =====
# Numeric branch: fill missing values with the column median, then standardize.
numeric_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Default categorical branch: fill missing values with the most frequent
# value, then one-hot encode. Categories unseen during fit are ignored at
# transform time instead of raising.
categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Branch for the columns in `onehot_with_unknown`: one-hot encoded directly,
# with no imputation step in front, so every value is encoded as-is.
unknown_as_category_encoder = OneHotEncoder(
    handle_unknown='ignore', sparse_output=False
)

# Route each column group through its branch. Columns not listed in any of
# the three groups are not passed to a transformer here.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_branch, numerical_features),
    ('cat_known', categorical_branch, onehot_default),
    ('cat_unknown_as_category', unknown_as_category_encoder, onehot_with_unknown),
])

# ===== MODEL PIPELINES =====
# Random Forest: preprocessing -> SMOTE oversampling -> classifier.
# SMOTE is a pipeline step, so during the grid search it is refit together
# with the rest of the pipeline on each training fold.
rf_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
]
pipeline_rf = ImbPipeline(steps=rf_steps)

# Hyperparameter grid (2 x 2 x 2 = 8 candidates); the "classifier__" prefix
# targets the pipeline's final step.
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, None],
    'classifier__min_samples_split': [2, 5],
}

# Exhaustive 3-fold search scored on F1, using all available cores.
grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Best pipeline found by the search.
best_rf = grid_rf.best_estimator_

# Logistic Regression: same layout as the Random Forest pipeline
# (preprocessing -> SMOTE -> classifier), using the liblinear solver.
lr_classifier = LogisticRegression(solver='liblinear', random_state=42)
pipeline_lr = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', lr_classifier),
])

# Only the inverse regularization strength C is tuned (3 candidates).
param_grid_lr = {'classifier__C': [0.1, 1.0, 10]}

# 3-fold grid search scored on F1, using all available cores.
grid_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_lr.fit(X_train, y_train)

# Best pipeline found by the search.
best_lr = grid_lr.best_estimator_

# XGBoost pipeline: preprocessing -> SMOTE oversampling -> classifier.
# NOTE: `use_label_encoder=False` was dropped on purpose. The argument is
# deprecated and ignored by current XGBoost releases, which only emit a
# "parameter not used" warning on every fit (once per CV fit here).
pipeline_xgb = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(eval_metric='logloss', random_state=42))
])

# Hyperparameter grid: 2^5 = 32 candidates; the "classifier__" prefix targets
# the pipeline's final step.
param_grid_xgb = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [3, 5],
    'classifier__subsample': [0.8, 1],
    'classifier__colsample_bytree': [0.8, 1]
}

# 3-fold grid search scored on F1, using all cores; verbose=1 prints the
# "Fitting 3 folds for each of ..." progress line.
grid_xgb = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=3, scoring='f1', n_jobs=-1, verbose=1)
grid_xgb.fit(X_train, y_train)

# Best pipeline found by the search.
best_xgb = grid_xgb.best_estimator_

# CatBoost pipeline: preprocessing -> SMOTE oversampling -> classifier.
# verbose=0 silences CatBoost's per-iteration training log.
catboost_model = CatBoostClassifier(verbose=0, random_state=42)
pipeline_cat = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', catboost_model),
])

# Hyperparameter grid (2 x 2 x 2 = 8 candidates).
param_grid_cat = {
    'classifier__iterations': [100, 200],
    'classifier__depth': [4, 6],
    'classifier__learning_rate': [0.05, 0.1],
}

# 3-fold grid search scored on F1, using all available cores.
grid_cat = GridSearchCV(
    estimator=pipeline_cat,
    param_grid=param_grid_cat,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_cat.fit(X_train, y_train)

# Best pipeline found by the search.
best_cat = grid_cat.best_estimator_

# LightGBM pipeline: preprocessing -> SMOTE oversampling -> classifier.
lgb_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LGBMClassifier(random_state=42)),
]
pipeline_lgb = ImbPipeline(steps=lgb_steps)

# Hyperparameter grid (2^4 = 16 candidates). max_depth=-1 is passed through
# to LightGBM unchanged.
param_grid_lgb = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [6, -1],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__num_leaves': [31, 50],
}

# 3-fold grid search scored on F1, using all available cores.
grid_lgb = GridSearchCV(
    estimator=pipeline_lgb,
    param_grid=param_grid_lgb,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_lgb.fit(X_train, y_train)

# Best pipeline found by the search.
best_lgb = grid_lgb.best_estimator_

# ===== VOTING CLASSIFIER =====
# Each tuned pipeline already bundles its own preprocessor and SMOTE step.
# Nesting those pipelines inside the ensemble would repeat preprocessing and
# resampling once per member, so only the tuned classifier of each pipeline
# is extracted and a single shared preprocessor + SMOTE is placed in front of
# the ensemble.
#
# VotingClassifier clones and refits its estimators on `fit`, so the
# extracted objects contribute their tuned hyperparameters, not their fitted
# state.
voting_estimators = [
    ('rf', best_rf.named_steps['classifier']),
    ('lr', best_lr.named_steps['classifier']),
    ('xgb', best_xgb.named_steps['classifier']),
    ('cat', best_cat.named_steps['classifier']),
    ('lgb', best_lgb.named_steps['classifier'])
]

# FIX: this used to be a plain sklearn Pipeline without SMOTE, which refit
# every member on the original imbalanced data even though the
# hyperparameters were selected with SMOTE applied. An imblearn pipeline
# with one SMOTE step keeps training consistent with tuning; the sampler is
# only applied while fitting, never when predicting.
voting_clf = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('voting', VotingClassifier(
        estimators=voting_estimators,
        voting='soft',  # average the members' predicted probabilities
        n_jobs=-1
    ))
])

voting_clf.fit(X_train, y_train)

# ===== EVALUATION =====
# Hard class predictions for the held-out test split.
y_pred = voting_clf.predict(X_test)

# Class probabilities; column 1 is kept as the score for ROC AUC
# (the second class in the classifier's class order -- presumably the
# positive class; verify against the label encoding).
test_probabilities = voting_clf.predict_proba(X_test)
y_proba = test_probabilities[:, 1]

# Per-class precision / recall / F1 summary.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

# Threshold-independent ranking quality.
auc_score = roc_auc_score(y_test, y_proba)
print(f"\nROC AUC Score: {auc_score:.4f}")


Best RF AUC: 0.7857937071987869
Best LR AUC: 0.7864147118742563
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best XGB AUC: 0.7845484489024894
Best CatBoost AUC: 0.7910267422083067
[LightGBM] [Info] Number of positive: 16241, number of negative: 16241
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009731 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13440
[LightGBM] [Info] Number of data points in the train set: 32482, number of used features: 65
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best LightGBM AUC: 0.7864708617830417


NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.