In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold


In [7]:
# Load training and test data
df_train = pd.read_csv("training_dataset.csv")
# df_test = pd.read_csv("validation_set.csv")

In [46]:
# Define features and target
X = df_train.drop(columns=["berlangganan_deposito", "customer_number"])
X = X.replace(['unknown', r'^\s*$'], np.nan, regex=True)
y = df_train["berlangganan_deposito"]
# X_val = df_test.drop(columns=["customer_number"])
# X_val = X_val.replace('unknown', np.nan)
# val_customer_number = df_test["customer_number"]

# pendidikan handle 
pendidikan_order = {
    "TIDAK SEKOLAH": 0, "Tidak Tamat SD": 1, "SD": 2, 
    "SMP": 3, "SMA": 4, "Diploma": 5, "Pendidikan Tinggi": 6}
X["pendidikan"] = X["pendidikan"].map(pendidikan_order)


# # month handle
# bulan_dict = {
#     "jan": 1, "feb": 2, "mar": 3, "apr": 4,
#     "mei": 5, "jun": 6, "jul": 7, "aug": 8,
#     "sep": 9, "oct": 10, "nov": 11, "dec": 12
# }
# X["bulan_kontak_terakhir"] = X["bulan_kontak_terakhir"].map(bulan_dict)

# # day handle
# hari_dict = {
#     "mon": 0, "tue": 1, "wed": 2, "thu": 3,
#     "fri": 4, "sat": 5, "sun": 6
# }
# X["hari_kontak_terakhir"] = X["hari_kontak_terakhir"].map(hari_dict)


# 999 handle
X['pernah_dihubungi_sebelumnya'] = (X['hari_sejak_kontak_sebelumnya'] != 999).astype(int)
median_hari = X.loc[X['hari_sejak_kontak_sebelumnya'] != 999, 'hari_sejak_kontak_sebelumnya'].median()
X['hari_sejak_kontak_sebelumnya'] = X['hari_sejak_kontak_sebelumnya'].replace(999, median_hari)


# Identify categorical and numerical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

num_cols


['usia',
 'pendidikan',
 'jumlah_kontak_kampanye_ini',
 'hari_sejak_kontak_sebelumnya',
 'jumlah_kontak_sebelumnya',
 'tingkat_variasi_pekerjaan',
 'indeks_harga_konsumen',
 'indeks_kepercayaan_konsumen',
 'suku_bunga_euribor_3bln',
 'jumlah_pekerja']

In [47]:
# Preprocessing pipeline
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("cat", categorical_transformer, cat_cols),
    ("num", numeric_transformer, num_cols)
])

In [48]:
# LightGBM pipeline
lgb_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LGBMClassifier(objective="binary", metric="auc", random_state=42))
])

# Hyperparameter search space
param_distributions_lgb = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__max_depth': [3, 5, 6, 7, 9],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'classifier__subsample': [0.6, 0.7, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 1.0],
    'classifier__num_leaves': [15, 31, 50, 70],
    'classifier__reg_lambda': [0, 0.1, 1, 10],
    'classifier__reg_alpha': [0, 0.1, 1, 10]
}

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized search
search_lgb = RandomizedSearchCV(
    lgb_pipeline,
    param_distributions=param_distributions_lgb,
    scoring="roc_auc",
    n_iter=25,  # Increase for better tuning
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit model
search_lgb.fit(X_train, y_train)

# Best model
best_lgb_model = search_lgb.best_estimator_




Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [22]:
# XGBoost pipeline
xgb_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(use_label_encoder=False, eval_metric="auc"))
])

# Hyperparameter search space
param_distributions = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__max_depth': [3, 5, 6, 7, 9],
    'classifier__learning_rate': [0.01, 0.1, 0,2, 0.3],
    'classifier__subsample': [0.6, 0.7, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 1.0]
}

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized search
search = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions=param_distributions,
    scoring="roc_auc",
    n_iter=10,
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)
best_model = search.best_estimator_


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Parameters: { "use_label_encoder" } are not used.



In [49]:
y_val_pred_proba = best_lgb_model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, y_val_pred_proba)
print("Validation AUC:", val_auc)
print("Best Params:", search.best_params_)


Validation AUC: 0.797824100253166
Best Params: {'classifier__subsample': 0.6, 'classifier__n_estimators': 200, 'classifier__max_depth': 6, 'classifier__learning_rate': 0.01, 'classifier__colsample_bytree': 0.8}


In [23]:
output = pd.DataFrame({
    "customer_number": val_customer_number,
    "berlangganan_deposito": y_val_pred_proba
})

In [26]:
output_sorted = output.sort_values(by="berlangganan_deposito", ascending=False)
output_sorted

Unnamed: 0,customer_number,berlangganan_deposito
1856,971758,0.776863
2954,803353,0.775658
4912,758564,0.772665
3609,271052,0.771808
4691,166739,0.765629
...,...,...
5503,585835,0.048430
2505,597789,0.048314
3077,910035,0.048247
3425,913860,0.048069


In [25]:
# X_test = df_test.drop(columns=["customer_number"])
# test_predictions = best_model.predict(X_test)
# test_probabilities = best_model.predict_proba(X_test)[:, 1]

# # Save predictions if needed
# df_test["prediction"] = test_predictions
# df_test["prediction_proba"] = test_probabilities
output.to_csv("predicted_validation_set.csv", index=False)
