In [80]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

In [81]:
df = pd.read_csv('training_dataset.csv')
num_rows = df.shape[0]
num_rows

22916

In [82]:
# Target and features
y = df["berlangganan_deposito"]
X = df.drop(columns=["berlangganan_deposito", "customer_number"])

In [83]:
# categorical_columns = [
#     'pekerjaan', 'status_perkawinan', 'pendidikan', 'gagal_bayar_sebelumnya',
#     'pinjaman_rumah', 'pinjaman_pribadi', 'jenis_kontak', 'bulan_kontak_terakhir',
#     'hari_kontak_terakhir', 'hasil_kampanye_sebelumnya'
# ]

# numerical_features = [
#     'usia', 'jumlah_kontak_kampanye_ini', 'hari_sejak_kontak_sebelumnya',
#     'jumlah_kontak_sebelumnya', 'tingkat_variasi_pekerjaan',
#     'indeks_harga_konsumen', 'indeks_kepercayaan_konsumen',
#     'suku_bunga_euribor_3bln', 'jumlah_pekerja'
# ]

categorical_columns = X.select_dtypes(include="object").columns.tolist()
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [84]:
# for col in categorical_columns:
#     X[col] = X[col].astype(str)

In [85]:
# Split data before preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [86]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [87]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

In [88]:
# XGBoost classifier
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42
)

In [89]:
# Full pipeline with SMOTE
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', xgb_clf)
])

In [90]:
# Hyperparameter grid
param_grid = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__max_depth': [3, 5, 6, 7, 9],
    'classifier__learning_rate': [0.01, 0.1, 0,2, 0.3],
    'classifier__subsample': [0.6, 0.7, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 1.0]
}

In [91]:
# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [92]:
# Grid Search
grid = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    scoring='roc_auc',
    n_iter=10,
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

In [93]:
# Fit model
grid.fit(X_train, y_train)
best_model = grid.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [94]:
# Best params
print("Best parameters:", grid.best_params_)

Best parameters: {'classifier__subsample': 0.6, 'classifier__n_estimators': 200, 'classifier__max_depth': 6, 'classifier__learning_rate': 0.01, 'classifier__colsample_bytree': 0.8}


In [95]:
y_val_pred_proba = best_model.predict_proba(X_test)[:, 1]
val_auc = roc_auc_score(y_test, y_val_pred_proba)
print("Validation AUC:", val_auc)

Validation AUC: 0.7837062238718058


In [96]:
# Load your new dataset
# new_data = pd.read_csv('validation_set.csv')

In [97]:
# # Predict class
# new_preds = grid.predict(new_data)

# # Predict probabilities (optional, for e.g., log loss or AUC)
# new_pred_probs = grid.predict_proba(new_data)[:, 1]  # probability of class 1

# # Add predictions as new columns
# new_data['predicted_class'] = new_preds
# new_data['predicted_proba'] = new_pred_probs

# # Save to new CSV
# new_data.to_csv('new_data_with_predictions.csv', index=False)

In [98]:
# data_w_pred = pd.read_csv('new_data_with_predictions.csv')
# data_w_pred