In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [82]:
# Load training and test data
df_train = pd.read_csv("training_dataset.csv")
# df_test = pd.read_csv("validation_set.csv")

In [83]:
# Define features and target
X = df_train.drop(columns=["berlangganan_deposito", "customer_number"])
X = X.replace('unknown', np.nan)
y = df_train["berlangganan_deposito"]

# Identify categorical and numerical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Preprocessing pipeline
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("cat", categorical_transformer, cat_cols),
    ("num", numeric_transformer, num_cols)
])

In [84]:
# XGBoost pipeline
xgb_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(use_label_encoder=False, eval_metric="auc"))
])

# Hyperparameter search space
param_distributions = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__max_depth': [3, 5, 6, 7, 9],
    'classifier__learning_rate': [0.01, 0.1, 0,2, 0.3],
    'classifier__subsample': [0.6, 0.7, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 1.0]
}

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized search
search = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions=param_distributions,
    scoring="roc_auc",
    n_iter=10,
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)
best_model = search.best_estimator_


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [85]:
y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, y_val_pred_proba)
print("Validation AUC:", val_auc)
print("Best Params:", search.best_params_)


Validation AUC: 0.7967402466120157
Best Params: {'classifier__subsample': 0.6, 'classifier__n_estimators': 200, 'classifier__max_depth': 6, 'classifier__learning_rate': 0.01, 'classifier__colsample_bytree': 0.8}


In [86]:
# X_test = df_test.drop(columns=["customer_number"])
# test_predictions = best_model.predict(X_test)
# test_probabilities = best_model.predict_proba(X_test)[:, 1]

# # Save predictions if needed
# df_test["prediction"] = test_predictions
# df_test["prediction_proba"] = test_probabilities
# df_test.to_csv("predicted_validation_set.csv", index=False)
