In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
# Load the training data and show how many rows it contains.
df = pd.read_csv('training_dataset.csv')
num_rows = len(df)
num_rows

22916

In [3]:
# Predictors: every column except the label and the customer identifier.
X = df.drop(["berlangganan_deposito", "customer_number"], axis="columns")
# Label column the model is trained to predict.
y = df.loc[:, "berlangganan_deposito"]

In [4]:
# Columns handled as categories (cast to str, imputed, one-hot encoded).
categorical_columns = [
    'pekerjaan',
    'status_perkawinan',
    'pendidikan',
    'gagal_bayar_sebelumnya',
    'pinjaman_rumah',
    'pinjaman_pribadi',
    'jenis_kontak',
    'bulan_kontak_terakhir',
    'hari_kontak_terakhir',
    'hasil_kampanye_sebelumnya',
]

# Columns handled as numbers (imputed and scaled).
numerical_features = [
    'usia',
    'jumlah_kontak_kampanye_ini',
    'hari_sejak_kontak_sebelumnya',
    'jumlah_kontak_sebelumnya',
    'tingkat_variasi_pekerjaan',
    'indeks_harga_konsumen',
    'indeks_kepercayaan_konsumen',
    'suku_bunga_euribor_3bln',
    'jumlah_pekerja',
]

In [5]:
# Cast the categorical columns to strings while keeping missing values as NaN.
# A plain astype(str) would turn NaN into the literal text "nan"; the
# SimpleImputer(strategy='most_frequent') step of the categorical transformer
# would then never see a missing value and "nan" would be one-hot encoded as
# if it were a real category.
for col in categorical_columns:
    # .where() keeps the string where the original value was present and
    # puts NaN back everywhere else.
    X[col] = X[col].astype(str).where(X[col].notna())

In [6]:
# Hold out 20% of the rows for testing before any preprocessing is fitted.
# The split is stratified on y and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Numeric branch: median imputation followed by standard scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

In [8]:
# Route each group of columns through its matching sub-pipeline:
# numerical_features -> numeric_transformer,
# categorical_columns -> categorical_transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_columns),
])

In [9]:
# Define classifier.
# The 'saga' solver is stochastic, so random_state is fixed to make the fitted
# coefficients (and therefore the grid-search scores) repeatable across runs,
# matching the seed already used for the split, SMOTE and the CV folds.
log_reg = LogisticRegression(
    solver='saga',
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)

In [10]:
# Assemble the full model as an imbalanced-learn pipeline:
# preprocessing, then SMOTE oversampling, then the logistic-regression classifier.
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', log_reg),
])

In [11]:
# Parameter grid.
# Split into two sub-grids so that l1_ratio is only varied together with the
# 'elasticnet' penalty. In a single dict the l1_ratio values would also be
# crossed with 'l1' and 'l2', where the setting is unused, so the same model
# would be cross-validated three times per (C, penalty) pair.
param_grid = [
    {
        'classifier__C': [0.001, 0.01, 0.1, 1, 10],
        'classifier__penalty': ['l1', 'l2'],
    },
    {
        'classifier__C': [0.001, 0.01, 0.1, 1, 10],
        'classifier__penalty': ['elasticnet'],
        'classifier__l1_ratio': [0.0, 0.5, 1.0],
    },
]

In [12]:
# Cross-validation scheme: five stratified folds, shuffled with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [13]:
# Search param_grid with the cv splitter, scoring each candidate by negative
# log loss and using all available cores (n_jobs=-1), then fit on the
# training split.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [14]:
# Report the winning hyper-parameters and their cross-validated log loss.
# The search was scored with 'neg_log_loss', so the sign is flipped back
# to show a conventional (positive) log loss.
best_params = grid_search.best_params_
best_cv_log_loss = -grid_search.best_score_
print("Best parameters:", best_params)
print("Best log loss on validation set:", best_cv_log_loss)

Best parameters: {'classifier__C': 0.1, 'classifier__l1_ratio': 0.5, 'classifier__penalty': 'elasticnet'}
Best log loss on validation set: 0.5209776200317228


In [15]:
# Score the tuned grid-search model on the held-out test split:
# hard labels feed the accuracy, class probabilities feed the log loss.
y_pred_class = grid_search.predict(X_test)
y_pred_proba = grid_search.predict_proba(X_test)

test_log_loss = log_loss(y_test, y_pred_proba)
test_accuracy = accuracy_score(y_test, y_pred_class)

print("Test set log loss:", test_log_loss)
print("Test set accuracy:", test_accuracy)

Test set log loss: 0.5164704212529022
Test set accuracy: 0.8069371727748691


In [16]:
# Recompute the class probabilities and keep only the column at index 1,
# i.e. the predicted probability of the positive class (1).
proba_matrix = grid_search.predict_proba(X_test)
y_pred_proba = proba_matrix[:, 1]

# ROC AUC of the positive-class probabilities against the true test labels.
auc = roc_auc_score(y_test, y_pred_proba)

print("Test set AUC score:", auc)

Test set AUC score: 0.7877652604662265
