In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [3]:
df = pd.read_csv('training_dataset.csv')
num_rows = df.shape[0]
num_rows

22916

In [4]:
df

Unnamed: 0,customer_number,usia,pekerjaan,status_perkawinan,pendidikan,gagal_bayar_sebelumnya,pinjaman_rumah,pinjaman_pribadi,jenis_kontak,bulan_kontak_terakhir,...,hari_sejak_kontak_sebelumnya,jumlah_kontak_sebelumnya,hasil_kampanye_sebelumnya,tingkat_variasi_pekerjaan,indeks_harga_konsumen,indeks_kepercayaan_konsumen,suku_bunga_euribor_3bln,jumlah_pekerja,pulau,berlangganan_deposito
0,531036,63,sosial media specialis,menikah,Pendidikan Tinggi,no,yes,no,cellular,jul,...,999,0,nonexistent,-1.7,94.215,-40.3,0.885,4991.6,Papua,1
1,999241,43,teknisi,menikah,Pendidikan Tinggi,no,yes,no,cellular,nov,...,999,0,nonexistent,-0.1,93.200,-42.0,4.021,5195.8,Sulawesi,0
2,995002,29,sosial media specialis,lajang,Pendidikan Tinggi,no,yes,yes,cellular,jul,...,999,0,nonexistent,1.4,93.918,-42.7,4.958,5228.1,Papua,0
3,932750,40,pekerja kasar,menikah,SMA,no,no,no,telephone,may,...,999,0,nonexistent,1.1,93.994,-36.4,4.859,5191.0,Sumatera,1
4,684699,40,sosial media specialis,lajang,Pendidikan Tinggi,no,no,no,cellular,aug,...,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,Bali,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22911,680377,32,teknisi,menikah,Diploma,no,yes,no,cellular,aug,...,999,0,nonexistent,1.4,93.444,-36.1,4.967,5228.1,Kalimantan,0
22912,505429,30,mahasiswa,lajang,Diploma,no,no,no,telephone,sep,...,999,0,nonexistent,-1.1,94.199,-37.5,0.880,4963.6,Bali,0
22913,308952,29,manajer,lajang,SMA,no,yes,no,cellular,jul,...,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,NTT,0
22914,865813,52,entrepreneur,menikah,Tidak Tamat SD,unknown,yes,no,telephone,jun,...,999,0,nonexistent,1.4,94.465,-41.8,4.960,5228.1,NTB,0


In [5]:
df2 = pd.read_csv('validation_set.csv')
num_rows = df2.shape[0]
num_rows

5729

In [6]:
df2

Unnamed: 0,customer_number,usia,pekerjaan,status_perkawinan,pendidikan,gagal_bayar_sebelumnya,pinjaman_rumah,pinjaman_pribadi,jenis_kontak,bulan_kontak_terakhir,...,jumlah_kontak_kampanye_ini,hari_sejak_kontak_sebelumnya,jumlah_kontak_sebelumnya,hasil_kampanye_sebelumnya,tingkat_variasi_pekerjaan,indeks_harga_konsumen,indeks_kepercayaan_konsumen,suku_bunga_euribor_3bln,jumlah_pekerja,pulau
0,445420,35,penyedia jasa,menikah,SMA,no,yes,yes,cellular,jul,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.960,5228.1,Jawa
1,585604,52,teknisi,lajang,Diploma,unknown,no,no,telephone,may,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.860,5191.0,Papua
2,888824,37,pekerja kasar,menikah,SMP,unknown,yes,no,telephone,may,...,3,999,0,nonexistent,1.1,93.994,-36.4,4.856,5191.0,Bali
3,816820,51,pengangguran,menikah,Diploma,no,no,no,telephone,may,...,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,Sumatera
4,542716,45,teknisi,cerai,SMA,no,yes,no,cellular,may,...,1,999,1,failure,-1.8,92.893,-46.2,1.327,5099.1,Sumatera
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5724,782072,51,pekerja kasar,menikah,Tidak Tamat SD,no,yes,no,telephone,jun,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1,Kalimantan
5725,116371,30,pekerja kasar,menikah,SMP,no,yes,yes,telephone,may,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,Sulawesi
5726,773759,45,pekerja kasar,menikah,SMP,unknown,yes,no,telephone,may,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,NTB
5727,612330,60,teknisi,menikah,Diploma,unknown,no,no,telephone,jul,...,15,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,Papua


In [65]:
categorical_columns = [
    'pekerjaan', 'status_perkawinan', 'pendidikan', 'gagal_bayar_sebelumnya',
    'pinjaman_rumah', 'pinjaman_pribadi', 'jenis_kontak', 'bulan_kontak_terakhir',
    'hari_kontak_terakhir', 'hasil_kampanye_sebelumnya'
]

# Step 1: Do NOT drop rows with 'unknown' but keep 'unknown' as a category
# Just ensure categorical columns are strings to prevent errors
for col in categorical_columns:
    df[col] = df[col].astype(str)

In [66]:
# Target and features
y = df["berlangganan_deposito"]
X = df.drop(columns=["berlangganan_deposito", "customer_number"])

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [68]:
numerical_features = [
    'usia', 'jumlah_kontak_kampanye_ini', 'hari_sejak_kontak_sebelumnya',
    'jumlah_kontak_sebelumnya', 'tingkat_variasi_pekerjaan',
    'indeks_harga_konsumen', 'indeks_kepercayaan_konsumen',
    'suku_bunga_euribor_3bln', 'jumlah_pekerja'
]

# Step 2: Split before imputation/scaling to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [69]:
# Step 3: Impute numerical features with median (fit only on train)
num_imputer = SimpleImputer(strategy='median')
X_train_num = num_imputer.fit_transform(X_train[numerical_features])
X_test_num = num_imputer.transform(X_test[numerical_features])

In [70]:
# Optional: Simple outlier treatment (clip to 1st and 99th percentile in training set)
lower_bounds = np.percentile(X_train_num, 1, axis=0)
upper_bounds = np.percentile(X_train_num, 99, axis=0)
X_train_num = np.clip(X_train_num, lower_bounds, upper_bounds)
X_test_num = np.clip(X_test_num, lower_bounds, upper_bounds)

In [71]:
# Step 4: Scale numerical features
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

In [72]:
# Step 5: Impute categorical features - treat 'unknown' as valid category (no imputation needed if 'unknown' present)
# But if missing values exist, impute with 'Missing'
cat_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
X_train_cat_imputed = cat_imputer.fit_transform(X_train[categorical_columns])
X_test_cat_imputed = cat_imputer.transform(X_test[categorical_columns])

In [73]:
# Step 6: One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = encoder.fit_transform(X_train_cat_imputed)
X_test_cat = encoder.transform(X_test_cat_imputed)

In [74]:
# Step 7: Combine numerical and categorical features
X_train_final = np.hstack([X_train_num_scaled, X_train_cat])
X_test_final = np.hstack([X_test_num_scaled, X_test_cat])

In [75]:
# Step 8: Balance training data with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_final, y_train)

In [76]:
# Now X_train_res, y_train_res are ready for training your model,
# and X_test_final, y_test for evaluation

In [77]:
# Example model with class weighting to handle imbalance
model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

In [78]:
# Train on your balanced or original training data (here, without SMOTE for example)
model.fit(X_train_final, y_train)

In [79]:
# Predict probabilities on test set (for log loss, use predict_proba)
y_pred_proba = model.predict_proba(X_test_final)[:, 1]  # probability of positive class

In [80]:
# Calculate log loss on test set
loss = log_loss(y_test, y_pred_proba)
print(f"Log Loss: {loss:.4f}")

Log Loss: 0.5176


In [81]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],        # Regularization strength
    'penalty': ['l1', 'l2', 'elasticnet'],      # Different penalty terms
    'solver': ['liblinear', 'saga']             # solvers compatible with l1/elasticnet
}

In [82]:
# Set up Stratified K-Fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    param_grid,
    scoring='neg_log_loss',   # we want to minimize log loss
    cv=cv,
    verbose=1,
    n_jobs=-1
)

In [83]:
grid_search.fit(X_train_final, y_train)

print(f"Best params: {grid_search.best_params_}")
print(f"Best log loss (negative): {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits


60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Geovanka\Documents\GitHub\DataQuest_Deposito\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Geovanka\Documents\GitHub\DataQuest_Deposito\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Geovanka\Documents\GitHub\DataQuest_Deposito\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _

Best params: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Best log loss (negative): -0.5249


In [84]:
# Evaluate best model on test set
best_model = grid_search.best_estimator_
y_test_pred_proba = best_model.predict_proba(X_test_final)[:, 1]
test_loss = log_loss(y_test, y_test_pred_proba)
print(f"Test Log Loss with best params: {test_loss:.4f}")

Test Log Loss with best params: 0.5173


In [85]:
from sklearn.metrics import accuracy_score

# Predict labels (not probabilities) on the test set
y_pred = best_model.predict(X_test_final)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred)

print(f"Accuracy on test set: {acc:.4f}")

Accuracy on test set: 0.8113


In [86]:
# Get feature names from preprocessing
# Combine numerical + one-hot encoded feature names
feature_names_num = numerical_features
feature_names_cat = encoder.get_feature_names_out(categorical_columns)
all_feature_names = list(feature_names_num) + list(feature_names_cat)

# Coefficients
coefficients = model.coef_[0]  # since it's binary classification
intercept = model.intercept_[0]

# Print model equation
print(f"Model equation:")
print(f"Logit(P) = {intercept:.4f}", end=" ")
for name, coef in zip(all_feature_names, coefficients):
    print(f"+ ({coef:.4f}) * {name}", end=" ")


Model equation:
Logit(P) = -0.0618 + (0.0229) * usia + (-0.1197) * jumlah_kontak_kampanye_ini + (-0.2482) * hari_sejak_kontak_sebelumnya + (-0.1303) * jumlah_kontak_sebelumnya + (-1.8276) * tingkat_variasi_pekerjaan + (0.8840) * indeks_harga_konsumen + (0.0442) * indeks_kepercayaan_konsumen + (0.3689) * suku_bunga_euribor_3bln + (0.1085) * jumlah_pekerja + (-0.2861) * pekerjaan_asisten rumah tangga + (-0.0210) * pekerjaan_entrepreneur + (0.2746) * pekerjaan_mahasiswa + (-0.0223) * pekerjaan_manajer + (-0.0874) * pekerjaan_pekerja kasar + (0.1437) * pekerjaan_pemilik bisnis + (0.0609) * pekerjaan_pengangguran + (0.2621) * pekerjaan_pensiunan + (-0.1333) * pekerjaan_penyedia jasa + (0.0158) * pekerjaan_sosial media specialis + (0.0465) * pekerjaan_teknisi + (-0.3001) * pekerjaan_unknown + (-0.0903) * status_perkawinan_cerai + (-0.0248) * status_perkawinan_lajang + (-0.0923) * status_perkawinan_menikah + (0.1609) * status_perkawinan_unknown + (-0.1090) * pendidikan_Diploma + (0.0136) * pe

In [87]:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Example for first row of test set
z = intercept + np.dot(X_test_final[0], coefficients)
prob = sigmoid(z)
print(f"Predicted probability: {prob:.4f}")


Predicted probability: 0.6432


In [88]:
from sklearn.metrics import log_loss

# Predict probabilities for log loss
y_pred_proba = best_model.predict_proba(X_test_final)

# Log loss
ll = log_loss(y_test, y_pred_proba)

print(f"Log Loss on test set: {ll:.4f}")
print(f"Accuracy on test set: {acc:.4f}")

Log Loss on test set: 0.5173
Accuracy on test set: 0.8113
