In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

In [None]:
# Read both CSV splits from the working directory in a single pass:
# the first file becomes df_train, the second df_val.
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ("training_dataset.csv", "validation_set.csv")
)

In [None]:
# Columns that are scanned for the 'unknown' placeholder and later one-hot encoded.
# Order is preserved because it determines the encoded column order.
categorical_columns = [
    # client profile
    'pekerjaan',
    'status_perkawinan',
    'pendidikan',
    # existing credit flags
    'gagal_bayar_sebelumnya',
    'pinjaman_rumah',
    'pinjaman_pribadi',
    # most recent contact
    'jenis_kontak',
    'bulan_kontak_terakhir',
    'hari_kontak_terakhir',
    # previous campaign result
    'hasil_kampanye_sebelumnya',
]

In [None]:
def count_unknown(df, categorical_cols, unknown_token='unknown'):
    """Count the rows of ``df`` holding the placeholder token in any listed column.

    Args:
        df: DataFrame to inspect.
        categorical_cols: Column labels to check; each label must exist in ``df``.
        unknown_token: Value treated as the "missing" placeholder. Defaults to
            ``'unknown'``, the token this notebook filters on.

    Returns:
        int: Number of rows in which at least one of ``categorical_cols``
        equals ``unknown_token``.

    Raises:
        KeyError: If a label in ``categorical_cols`` is not a column of ``df``.
    """
    # Row-wise flag: True when any of the selected columns equals the token.
    mask = df[categorical_cols].isin([unknown_token]).any(axis=1)
    # Cast so callers get a plain Python int rather than a NumPy scalar.
    return int(mask.sum())

# Before removal: count the rows carrying 'unknown' in each split.
unknown_train_before, unknown_val_before = (
    count_unknown(frame, categorical_columns) for frame in (df_train, df_val)
)
print(f"Jumlah baris dengan 'unknown' di train sebelum dihapus: {unknown_train_before}")
print(f"Jumlah baris dengan 'unknown' di val sebelum dihapus: {unknown_val_before}")


Jumlah baris dengan 'unknown' di train sebelum dihapus: 5968
Jumlah baris dengan 'unknown' di val sebelum dihapus: 1476


In [None]:
# Flag every row where at least one categorical column holds 'unknown'.
train_has_unknown = df_train[categorical_columns].isin(['unknown']).any(axis=1)
val_has_unknown = df_val[categorical_columns].isin(['unknown']).any(axis=1)

# Keep only the unflagged rows and renumber the index from 0.
df_train_clean = df_train.loc[~train_has_unknown].reset_index(drop=True)
df_val_clean = df_val.loc[~val_has_unknown].reset_index(drop=True)

In [None]:
# After removal: both counts are expected to be zero.
unknown_train_after, unknown_val_after = (
    count_unknown(frame, categorical_columns)
    for frame in (df_train_clean, df_val_clean)
)
print(f"Jumlah baris dengan 'unknown' di train setelah dihapus: {unknown_train_after}")
print(f"Jumlah baris dengan 'unknown' di val setelah dihapus: {unknown_val_after}")

Jumlah baris dengan 'unknown' di train setelah dihapus: 0
Jumlah baris dengan 'unknown' di val setelah dihapus: 0


In [None]:
# Display the training frame after rows containing 'unknown' were filtered out.
df_train_clean


Unnamed: 0,customer_number,usia,pekerjaan,status_perkawinan,pendidikan,gagal_bayar_sebelumnya,pinjaman_rumah,pinjaman_pribadi,jenis_kontak,bulan_kontak_terakhir,...,hari_sejak_kontak_sebelumnya,jumlah_kontak_sebelumnya,hasil_kampanye_sebelumnya,tingkat_variasi_pekerjaan,indeks_harga_konsumen,indeks_kepercayaan_konsumen,suku_bunga_euribor_3bln,jumlah_pekerja,pulau,berlangganan_deposito
0,531036,63,sosial media specialis,menikah,Pendidikan Tinggi,no,yes,no,cellular,jul,...,999,0,nonexistent,-1.7,94.215,-40.3,0.885,4991.6,Papua,1
1,999241,43,teknisi,menikah,Pendidikan Tinggi,no,yes,no,cellular,nov,...,999,0,nonexistent,-0.1,93.200,-42.0,4.021,5195.8,Sulawesi,0
2,995002,29,sosial media specialis,lajang,Pendidikan Tinggi,no,yes,yes,cellular,jul,...,999,0,nonexistent,1.4,93.918,-42.7,4.958,5228.1,Papua,0
3,932750,40,pekerja kasar,menikah,SMA,no,no,no,telephone,may,...,999,0,nonexistent,1.1,93.994,-36.4,4.859,5191.0,Sumatera,1
4,684699,40,sosial media specialis,lajang,Pendidikan Tinggi,no,no,no,cellular,aug,...,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,Bali,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16943,137521,29,sosial media specialis,lajang,Pendidikan Tinggi,no,yes,yes,cellular,aug,...,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,NTT,0
16944,133325,30,mahasiswa,lajang,Diploma,no,no,no,cellular,sep,...,9,2,failure,-1.1,94.199,-37.5,0.879,4963.6,Bali,0
16945,680377,32,teknisi,menikah,Diploma,no,yes,no,cellular,aug,...,999,0,nonexistent,1.4,93.444,-36.1,4.967,5228.1,Kalimantan,0
16946,505429,30,mahasiswa,lajang,Diploma,no,no,no,telephone,sep,...,999,0,nonexistent,-1.1,94.199,-37.5,0.880,4963.6,Bali,0


In [None]:
# Display the validation frame after rows containing 'unknown' were filtered out.
df_val_clean

Unnamed: 0,customer_number,usia,pekerjaan,status_perkawinan,pendidikan,gagal_bayar_sebelumnya,pinjaman_rumah,pinjaman_pribadi,jenis_kontak,bulan_kontak_terakhir,...,jumlah_kontak_kampanye_ini,hari_sejak_kontak_sebelumnya,jumlah_kontak_sebelumnya,hasil_kampanye_sebelumnya,tingkat_variasi_pekerjaan,indeks_harga_konsumen,indeks_kepercayaan_konsumen,suku_bunga_euribor_3bln,jumlah_pekerja,pulau
0,445420,35,penyedia jasa,menikah,SMA,no,yes,yes,cellular,jul,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.960,5228.1,Jawa
1,816820,51,pengangguran,menikah,Diploma,no,no,no,telephone,may,...,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,Sumatera
2,542716,45,teknisi,cerai,SMA,no,yes,no,cellular,may,...,1,999,1,failure,-1.8,92.893,-46.2,1.327,5099.1,Sumatera
3,434084,32,sosial media specialis,lajang,SMA,no,no,no,telephone,jun,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.864,5228.1,Papua
4,159684,61,pensiunan,cerai,Pendidikan Tinggi,no,yes,no,cellular,apr,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.384,5099.1,Kalimantan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4248,214452,27,sosial media specialis,lajang,Pendidikan Tinggi,no,yes,no,cellular,apr,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.392,5099.1,Sulawesi
4249,588935,38,pemilik bisnis,cerai,SMP,no,yes,no,telephone,may,...,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,Papua
4250,782072,51,pekerja kasar,menikah,Tidak Tamat SD,no,yes,no,telephone,jun,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1,Kalimantan
4251,116371,30,pekerja kasar,menikah,SMP,no,yes,yes,telephone,may,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,Sulawesi


In [None]:
# NOTE(review): df_train_clean / df_val_clean are built earlier in the notebook,
# but the unfiltered df_train / df_val are what feed the model here, so rows
# containing 'unknown' are still present. Confirm this is intentional.

# Validation split: keep the customer ids aside for the output table and
# remove them from the feature frame.
val_customer_number = df_val["customer_number"]
X_val = df_val.drop(columns=["customer_number"])

# Training split: separate the target and drop both the id and the target
# from the feature frame.
y_train = df_train["berlangganan_deposito"]
X_train = df_train.drop(columns=["customer_number", "berlangganan_deposito"])

In [None]:
# Alias of categorical_columns (the same list object, not a copy); the encoder
# step selects columns through this name.
categorical_features = categorical_columns
# Numeric columns that go through imputation and scaling.
# Order is preserved because it fixes the column order of the numeric matrix.
numerical_features = [
    # client age
    'usia',
    # contact history counters
    'jumlah_kontak_kampanye_ini',
    'hari_sejak_kontak_sebelumnya',
    'jumlah_kontak_sebelumnya',
    # macro-economic indicators
    'tingkat_variasi_pekerjaan',
    'indeks_harga_konsumen',
    'indeks_kepercayaan_konsumen',
    'suku_bunga_euribor_3bln',
    'jumlah_pekerja',
]

In [None]:
# Re-binds the alias already set in the feature-list cell; redundant when the
# cells run top to bottom, kept so this cell does not depend on that one.
categorical_features = categorical_columns

# Median imputation for the numeric columns (in case any values are missing).
# The imputer is fitted on the training split only, then applied to both splits.
num_imputer = SimpleImputer(strategy='median').fit(X_train[numerical_features])
X_train_num, X_val_num = (
    num_imputer.transform(split[numerical_features]) for split in (X_train, X_val)
)

In [None]:
# Scale the numeric features: fit the scaler on the imputed training matrix,
# then apply that same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train_num)
X_train_num_scaled = scaler.transform(X_train_num)
X_val_num_scaled = scaler.transform(X_val_num)

In [None]:
# Categorical columns are one-hot encoded directly, with no imputer in front.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train[categorical_features])
X_train_cat_enc = encoder.transform(X_train[categorical_features])
X_val_cat_enc = encoder.transform(X_val[categorical_features])

In [None]:
# Join the scaled numeric block and the one-hot block column-wise
# (numeric columns first) to form the final design matrices.
X_train_final = np.concatenate((X_train_num_scaled, X_train_cat_enc), axis=1)
X_val_final = np.concatenate((X_val_num_scaled, X_val_cat_enc), axis=1)

In [None]:
# Resample the training matrix with SMOTE (seeded for reproducibility);
# the validation matrix is not resampled.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train_final, y_train)
X_train_res, y_train_res = resampled_pair

In [44]:
def _grid_search_f1(estimator, param_grid):
    """Fit a 3-fold, F1-scored GridSearchCV for ``estimator`` on the resampled data.

    Reads ``X_train_res`` / ``y_train_res`` from the notebook namespace at call
    time and uses every available core (``n_jobs=-1``).

    Args:
        estimator: Unfitted scikit-learn classifier to tune.
        param_grid: Mapping of hyper-parameter name to the candidate values.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(estimator, param_grid, cv=3, scoring='f1', n_jobs=-1)
    search.fit(X_train_res, y_train_res)
    return search


# NOTE(review): X_train_res was oversampled before these CV splits are made, so
# synthetic rows can end up in the scoring folds and the F1 values may be
# optimistic. Consider resampling inside each fold instead - TODO confirm.

# Random Forest
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[10, None],
    min_samples_split=[2, 5],
)
grid_rf = _grid_search_f1(RandomForestClassifier(random_state=42), param_grid_rf)
best_rf = grid_rf.best_estimator_

# Gradient Boosting
param_grid_gb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)
grid_gb = _grid_search_f1(GradientBoostingClassifier(random_state=42), param_grid_gb)
best_gb = grid_gb.best_estimator_

# Logistic Regression
param_grid_lr = dict(
    C=[0.1, 1.0, 10],
    solver=['liblinear'],
)
grid_lr = _grid_search_f1(LogisticRegression(random_state=42), param_grid_lr)
best_lr = grid_lr.best_estimator_

In [46]:

# Report the selected hyper-parameters in the order RF, GB, LR.
for fitted_search in (grid_rf, grid_gb, grid_lr):
    print("\n Best Parameters:", fitted_search.best_params_)


 Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}

 Best Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}

 Best Parameters: {'C': 0.1, 'solver': 'liblinear'}


In [48]:
# Voting Classifier: soft-voting ensemble of the three tuned models,
# fitted on the resampled training data.
tuned_members = [('rf', best_rf), ('gb', best_gb), ('lr', best_lr)]
voting_clf = VotingClassifier(estimators=tuned_members, voting='soft')
voting_clf.fit(X_train_res, y_train_res)

In [45]:
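# Disabled code: a standalone Random Forest grid search (5-fold CV, F1 scoring).
# Every line in this cell is commented out, so nothing here executes.
# clf = RandomForestClassifier(random_state=42)
# param_grid = {
#     'n_estimators': [100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5]
# }
# grid = GridSearchCV(clf, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
# grid.fit(X_train_res, y_train_res)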

In [49]:
# Hard class labels for the validation matrix (not referenced by the cells
# visible below).
y_pred = voting_clf.predict(X_val_final)

# Per-class probabilities; column index 1 is kept as the score.
# NOTE(review): presumably index 1 is the positive label (1) of a 0/1 target -
# verify against voting_clf.classes_.
val_class_probabilities = voting_clf.predict_proba(X_val_final)
y_val_pred = val_class_probabilities[:, 1]

In [50]:
# Pair each validation customer id with its predicted probability.
# The frame keeps the index carried by val_customer_number.
output = (
    val_customer_number
    .to_frame(name="customer_number")
    .assign(berlangganan_deposito=y_val_pred)
)

In [51]:
# Rank customers from the highest to the lowest predicted probability and display.
ranking_column = "berlangganan_deposito"
output_sorted = output.sort_values(ranking_column, ascending=False)
output_sorted

Unnamed: 0,customer_number,berlangganan_deposito
3609,271052,0.958109
4420,854992,0.950358
418,639476,0.941460
4013,851658,0.938144
1034,964667,0.937064
...,...,...
4789,588128,0.031762
1729,656734,0.030576
2786,700197,0.029046
4140,618417,0.028663
