## IMPORT LIBRARIES

In [16]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## LOAD DATASET

In [17]:
# Read the labelled training rows and the rows to be scored, in that order.
df, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ("training_dataset.csv", "validation_set.csv")
)

## DEFINE CATEGORICAL COLUMNS

In [18]:
# Columns routed through the categorical branch of the preprocessor
# ("unknown" -> NaN, most-frequent imputation, one-hot encoding).
categorical_columns = [
    'pekerjaan',
    'status_perkawinan',
    'pendidikan',
    'gagal_bayar_sebelumnya',
    'pinjaman_rumah',
    'pinjaman_pribadi',
    'jenis_kontak',
    'bulan_kontak_terakhir',
    'hari_kontak_terakhir',
    'hasil_kampanye_sebelumnya',
]

## DEFINE FEATURES AND TARGET

In [19]:
# Training side: the label column is the target; the label and the customer
# identifier are both removed from the feature frame.
X = df.drop(["berlangganan_deposito", "customer_number"], axis=1)
y = df.loc[:, "berlangganan_deposito"]

# Validation side: keep the identifier separately so predictions can be
# matched back to customers, and drop it from the features.
val_customer_number = df_val.loc[:, "customer_number"]
X_val = df_val.drop("customer_number", axis=1)

## DEFINE NUMERICAL FEATURES

In [20]:
# Columns routed through the numeric branch of the preprocessor
# (median imputation followed by standard scaling).
numerical_features = [
    'usia',
    'jumlah_kontak_kampanye_ini',
    'hari_sejak_kontak_sebelumnya',
    'jumlah_kontak_sebelumnya',
    'tingkat_variasi_pekerjaan',
    'indeks_harga_konsumen',
    'indeks_kepercayaan_konsumen',
    'suku_bunga_euribor_3bln',
    'jumlah_pekerja',
]

## CUSTOM TRANSFORMER TO REPLACE "unknown"

In [21]:
class ReplaceUnknown(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns the literal string "unknown" into NaN.

    Intended to sit in front of an imputer so that "unknown" entries are
    treated as missing values instead of as a category of their own.
    """

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer protocol.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every "unknown" cell replaced by NaN.

        Parameters
        ----------
        X : pandas DataFrame/Series or array-like
            Non-pandas inputs (e.g. a NumPy array handed over by an earlier
            pipeline step) are wrapped in a DataFrame first, because
            ``.replace`` only exists on pandas objects.

        Returns
        -------
        pandas DataFrame or Series
            Same shape as the input; the input itself is not modified.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            frame = X
        else:
            frame = pd.DataFrame(X)
        return frame.replace("unknown", np.nan)

## PREPROCESSING PIPELINE

In [22]:
# Two parallel branches, selected by column name:
#   * 'num': fill gaps with the column median, then standardise.
#   * 'cat': mark "unknown" as missing, fill with the most frequent value,
#            then one-hot encode (categories unseen at fit time are ignored
#            at transform time; output is dense).
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ]),
            numerical_features,
        ),
        (
            'cat',
            Pipeline(steps=[
                ('replace_unknown', ReplaceUnknown()),
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
            ]),
            categorical_columns,
        ),
    ]
)

## APPLY PREPROCESSING

In [23]:
# Fit the preprocessing statistics (medians, modes, scaling, categories) on the
# training features only, then reuse the fitted transformer on the validation
# features so no information from the validation file influences the fit.
X_train_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_val)

## HANDLE CLASS IMBALANCE WITH SMOTE

In [24]:
# Oversample the preprocessed training data with SMOTE (seeded for
# reproducibility). The resampled arrays are what the models are fitted on
# downstream; the validation features are never resampled.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_processed, y)

## MODELS HYPERPARAMETER TUNING (RF AND LR)

In [25]:
def _tune_with_smote(estimator, param_grid, X_train, y_train):
    """Grid-search ``estimator`` with SMOTE applied inside each CV training fold.

    Running the search on data that was oversampled up front lets synthetic
    points interpolated from held-out rows appear in the training folds, which
    inflates the F1 score used to choose hyperparameters. Wrapping SMOTE and
    the classifier in an imbalanced-learn pipeline resamples only the training
    part of every split; the held-out part is scored untouched.

    Parameters
    ----------
    estimator : classifier
        Unfitted classifier to tune.
    param_grid : dict
        Grid keyed by the classifier's own parameter names; the ``clf__``
        prefix required by the pipeline is added here.
    X_train, y_train : array-like
        Preprocessed, *not yet resampled* training features and labels.

    Returns
    -------
    GridSearchCV
        Fitted search. ``best_estimator_`` is the pipeline refitted on all of
        ``X_train``; its ``'clf'`` step is the tuned classifier, trained on the
        SMOTE-resampled full training set.
    """
    pipeline = ImbPipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('clf', estimator),
    ])
    prefixed_grid = {f'clf__{name}': values for name, values in param_grid.items()}
    search = GridSearchCV(pipeline, prefixed_grid, cv=3, scoring='f1', n_jobs=-1)
    search.fit(X_train, y_train)
    return search


# Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, None],
    'min_samples_split': [2, 5]
}
grid_rf = _tune_with_smote(
    RandomForestClassifier(random_state=42), param_grid_rf, X_train_processed, y
)
# Unwrap the classifier so downstream cells still receive a plain estimator.
best_rf = grid_rf.best_estimator_.named_steps['clf']

# Logistic Regression
param_grid_lr = {
    'C': [0.1, 1.0, 10],
    'solver': ['liblinear']
}
grid_lr = _tune_with_smote(
    LogisticRegression(random_state=42), param_grid_lr, X_train_processed, y
)
best_lr = grid_lr.best_estimator_.named_steps['clf']

## VOTING CLASSIFIER

In [26]:
# Soft-voting ensemble: averages the predicted class probabilities of the
# tuned random forest and logistic regression. Fitting the ensemble trains
# fresh clones of both members on the resampled training data.
voting_clf = VotingClassifier(
    estimators=[('rf', best_rf), ('lr', best_lr)],
    voting='soft',
)
voting_clf.fit(X_train_res, y_train_res)

## PREDICT

In [27]:
# Hard class predictions for the validation rows.
y_pred = voting_clf.predict(X_test_processed)
# Probability of the class at index 1 of `voting_clf.classes_`
# (presumably the positive / "subscribed" label — confirm against the target encoding).
y_proba = voting_clf.predict_proba(X_test_processed)[:, 1]

## CREATE AND SORT OUTPUT

In [28]:
# Pair every validation customer with its predicted probability. Note that the
# column named after the target holds a probability, not a hard label.
output = pd.DataFrame(
    data=dict(
        customer_number=val_customer_number,
        berlangganan_deposito=y_proba,
    )
)

In [29]:
# Rank customers from highest to lowest predicted probability and display the result.
output_sorted = output.sort_values("berlangganan_deposito", ascending=False)
output_sorted

Unnamed: 0,customer_number,berlangganan_deposito
418,639476,0.979474
1034,964667,0.976843
2954,803353,0.971832
4504,234831,0.968771
4593,649468,0.966729
...,...,...
786,285703,0.040232
4273,638436,0.034568
4140,618417,0.021394
2786,700197,0.014656


## SAVE TO CSV

In [30]:
# Write the ranked predictions to disk; the DataFrame index is omitted from the file.
output_sorted.to_csv(path_or_buf='prediksi_validasi_2.csv', index=False)