In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# --- 1. Initial configuration ---
# Relax the pandas display limits so wide frames print without truncation.

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', None)


print("--- Library berhasil di-import ---")


# --- 2. Connect to Google Drive ---
# A mount failure is only printed, not re-raised: execution continues to the
# file loads below, which have their own error handling.
try:
    drive.mount('/content/drive')
    print("--- Google Drive berhasil terhubung ---")
except Exception as e:
    print(f"--- Gagal menghubungkan Google Drive: {e} ---")

# Folder on the mounted drive from which the project CSV files are read.
BASE_PATH = "/content/drive/MyDrive/Kuliah/HomeCreditProject/"

train_path = BASE_PATH + "application_train.csv"
desc_path = BASE_PATH + "HomeCredit_columns_description.csv"

print(f"Path data train: {train_path}")
print(f"Path kamus data: {desc_path}")

# Load both CSV files. On any failure the error is printed and the affected
# variable(s) stay undefined; later cells check for 'df_train' before using it.
try:
    # Load the training data
    df_train = pd.read_csv(train_path)
    print(f"\n--- Data 'application_train.csv' berhasil dimuat ---")
    print(f"Bentuk data (baris, kolom): {df_train.shape}")

    # Load the data dictionary (read with ISO-8859-1 encoding)
    df_desc = pd.read_csv(desc_path, encoding='ISO-8859-1')
    print(f"\n--- Kamus Data 'HomeCredit_columns_description.csv' berhasil dimuat ---")

except FileNotFoundError as e:
    print(f"\n--- ERROR: File tidak ditemukan ---")
    print(e)
    print("\nPastikan Anda sudah mengunggah file ke Google Drive dan path di 'BASE_PATH' sudah benar.")
except Exception as e:
    print(f"\n--- ERROR: Terjadi kesalahan saat memuat data: {e} ---")

--- Library berhasil di-import ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- Google Drive berhasil terhubung ---
Path data train: /content/drive/MyDrive/Kuliah/HomeCreditProject/application_train.csv
Path kamus data: /content/drive/MyDrive/Kuliah/HomeCreditProject/HomeCredit_columns_description.csv

--- Data 'application_train.csv' berhasil dimuat ---
Bentuk data (baris, kolom): (307511, 122)

--- Kamus Data 'HomeCredit_columns_description.csv' berhasil dimuat ---


In [None]:
# --- TARGET variable analysis ---
# Reports the distinct TARGET labels, the count and percentage of each class,
# and draws a count plot of the class distribution.

if 'df_train' not in locals():
    print("\n--- Variabel 'df_train' tidak ditemukan. Pastikan sel kode pertama berhasil dijalankan. ---")

else:
    print("--- Menganalisis Kolom TARGET ---")

    # 1. Distinct labels present in the column
    print(f"Nilai unik di TARGET: {df_train['TARGET'].unique()}")

    # 2. Absolute and relative class frequencies
    target_counts = df_train['TARGET'].value_counts()
    target_perc = df_train['TARGET'].value_counts(normalize=True) * 100

    # Line prefixes per class; the padding keeps the two lines aligned.
    _class_prefix = {
        0: "TARGET 0 (Lancar):     ",
        1: "TARGET 1 (Gagal Bayar): ",
    }

    print("\nJumlah Klien:")
    for _cls, _prefix in _class_prefix.items():
        print(f"{_prefix}{target_counts.get(_cls, 0)} klien")

    print("\nPersentase Klien:")
    for _cls, _prefix in _class_prefix.items():
        print(f"{_prefix}{target_perc.get(_cls, 0):.2f}%")

    # 3. Visualisation: one bar per class
    plt.figure(figsize=(8, 5))
    _ax = sns.countplot(x='TARGET', data=df_train)
    _ax.set_title('Distribusi Variabel TARGET', fontsize=16)
    _ax.set_xlabel('TARGET (0 = Lancar, 1 = Gagal Bayar)')
    _ax.set_ylabel('Jumlah Klien')
    plt.show()

--- Menganalisis Kolom TARGET ---
Nilai unik di TARGET: [1 0]

Jumlah Klien:
TARGET 0 (Lancar):     282686 klien
TARGET 1 (Gagal Bayar): 24825 klien

Persentase Klien:
TARGET 0 (Lancar):     91.93%
TARGET 1 (Gagal Bayar): 8.07%


In [None]:
# --- Missing-value analysis ---
# Builds a per-column table of missing counts and percentages for df_train and
# prints the columns that have at least one missing value, worst first.

if 'df_train' not in locals():
    print("\n--- Variabel 'df_train' tidak ditemukan. ---")

else:
    # 1. Missing cells per column
    missing_values = df_train.isna().sum()

    # 2. The same counts expressed as a percentage of all rows
    missing_percentage = (missing_values / len(df_train)) * 100

    missing_data = pd.concat(
        [
            missing_values.rename('Total Missing'),
            missing_percentage.rename('Percentage (%)'),
        ],
        axis=1,
    )

    # 3. Keep only the affected columns, sorted from most to least missing
    _has_missing = missing_data['Total Missing'] > 0
    missing_data_sorted = missing_data.loc[_has_missing].sort_values(
        by='Percentage (%)', ascending=False
    )

    print(f"--- Menganalisis Missing Values di {len(df_train.columns)} Kolom ---")
    print(f"Total kolom yang memiliki missing values: {len(missing_data_sorted)}")
    print("\n--- Kolom dengan Missing Values Terbanyak (di atas 0%) ---")
    print(missing_data_sorted)

In [None]:
# --- 8. Load the test data ---
# Reads application_test.csv, keeps the target and the ID columns aside, and
# stacks the training features (without TARGET) on top of the test rows so
# both parts can be cleaned together.

BASE_PATH = "/content/drive/MyDrive/Kuliah/HomeCreditProject/"
test_path = f"{BASE_PATH}application_test.csv"

try:
    df_test = pd.read_csv(test_path)
    print("--- Data 'application_test.csv' berhasil dimuat ---")
    print(f"Bentuk data (baris, kolom): {df_test.shape}")

    # Keep TARGET and the SK_ID_CURR columns for later cells
    train_target = df_train['TARGET']
    train_id = df_train['SK_ID_CURR']
    test_id = df_test['SK_ID_CURR']

    # Temporary combined frame: training rows first, then test rows
    df_full = pd.concat(
        [df_train.drop(columns=['TARGET']), df_test],
        ignore_index=True,
    )

    print(f"\nBentuk data gabungan (untuk cleaning): {df_full.shape}")

except FileNotFoundError:
    print("\n--- ERROR: 'application_test.csv' tidak ditemukan di path Anda ---")
    print("Pastikan file tersebut ada di folder yang sama.")
except Exception as e:
    print(f"\n--- ERROR: {e} ---")

In [None]:
# --- 9. Data cleaning (column dropping & imputation) ---
# Operates on df_full (training features stacked on top of the test rows):
#   1. drops every column that is more than 50% missing, then
#   2. fills the remaining gaps: mode for object columns, median for numeric ones.
# The fill values are learned from the TRAINING rows only, so no test-set
# statistic leaks into the features. The learned values are then applied to
# both the training and the test rows.

if 'df_full' in locals():
    print(f"Bentuk data sebelum cleaning: {df_full.shape}")

    # 1. DROP COLUMNS WITH > 50% MISSING
    missing_perc = (df_full.isnull().sum() / len(df_full)) * 100

    cols_to_drop = missing_perc[missing_perc > 50].index

    print(f"\nMenghapus {len(cols_to_drop)} kolom (missing > 50%)...")
    df_full = df_full.drop(columns=cols_to_drop)

    print(f"Bentuk data setelah menghapus kolom: {df_full.shape}")

    # 2. FILL THE REMAINING COLUMNS

    print("\nMengisi sisa missing values...")
    remaining_missing_cols = df_full.columns[df_full.isnull().any()].tolist()

    categorical_cols = df_full[remaining_missing_cols].select_dtypes(include=['object']).columns

    numerical_cols = df_full[remaining_missing_cols].select_dtypes(include=['number']).columns

    # df_full was built as concat([train, test]) with a fresh 0-based index,
    # so its first len(df_train) rows are exactly the training rows.
    train_rows = df_full.iloc[:len(df_train)]

    # Column name -> value used to fill that column's missing cells.
    fill_values = {}

    # 3a. Categorical columns: most frequent training value (mode)
    print(f"Mengisi {len(categorical_cols)} kolom kategorikal dengan MODUS...")
    for col in categorical_cols:
        modes = train_rows[col].mode()
        if modes.empty:
            # No observed training value at all: fall back to the full frame
            modes = df_full[col].mode()
        if not modes.empty:
            fill_values[col] = modes[0]

    # 3b. Numeric columns: training median
    print(f"Mengisi {len(numerical_cols)} kolom numerik dengan MEDIAN...")
    for col in numerical_cols:
        median_value = train_rows[col].median()
        if pd.isna(median_value):
            # No observed training value at all: fall back to the full frame
            median_value = df_full[col].median()
        fill_values[col] = median_value

    # Apply all fills in one pass; columns absent from the dict are untouched.
    df_full = df_full.fillna(value=fill_values)

    print("\n--- PROSES CLEANING SELESAI ---")

    total_missing_after = df_full.isnull().sum().sum()
    print(f"Total missing values di data gabungan sekarang: {total_missing_after}")

else:
    print("\n--- Variabel 'df_full' tidak ditemukan. Jalankan sel kode 8 terlebih dahulu. ---")

In [None]:
# --- 10. Data processing (one-hot encoding) ---
# One-hot encodes every object-typed column of df_full and rewrites the
# resulting column names so they contain only alphanumerics and underscores.

if 'df_full' not in locals():
    print("\n--- Variabel 'df_full' tidak ditemukan. ---")

else:
    print(f"Bentuk data sebelum encoding: {df_full.shape}")

    # 1. Categorical columns are the ones stored with dtype 'object'
    categorical_cols = df_full.select_dtypes(include=['object']).columns

    print(f"Ditemukan {len(categorical_cols)} kolom kategorikal yang akan di-encode.")

    # 2. One-hot encode them
    df_full_processed = pd.get_dummies(df_full, columns=categorical_cols)

    print(f"Bentuk data setelah encoding: {df_full_processed.shape}")

    # 3. Normalise the column names
    def _to_safe_name(label):
        """Return str(label) with each non-alphanumeric character replaced by '_'."""
        return "".join(ch if ch.isalnum() else '_' for ch in str(label))

    df_full_processed.columns = [_to_safe_name(label) for label in df_full_processed.columns]

    print("Nama kolom telah dibersihkan dan disiapkan untuk model.")

In [None]:
# --- 11. Split the combined frame back into train and test ---
# The first len(df_train) rows of df_full_processed become X, the rest become
# X_test; SK_ID_CURR is removed from both. The target and IDs saved earlier
# are re-exposed as y, train_ids and test_ids.

if not ('df_full_processed' in locals() and 'train_target' in locals()):
    print("\n--- Gagal memisahkan data. Pastikan Sel 10 (Encoding) berhasil dijalankan. ---")

else:
    # Boundary between the training rows and the test rows
    num_train_rows = len(df_train)

    _id_column = ['SK_ID_CURR']

    # Training features
    X = df_full_processed.iloc[:num_train_rows].drop(columns=_id_column)

    # Test features
    X_test = df_full_processed.iloc[num_train_rows:].drop(columns=_id_column)

    # Training target
    y = train_target

    # IDs kept for reference (e.g. for a submission file)
    train_ids, test_ids = train_id, test_id

    for _message in (
        f"Bentuk data latih (X): {X.shape}",
        f"Bentuk data target (y): {y.shape}",
        f"Bentuk data uji (X_test): {X_test.shape}",
    ):
        print(_message)

    print("\n--- Data siap untuk pemodelan! ---")
    print("Variabel Anda sekarang adalah: X, y, X_test")

In [None]:
from sklearn.preprocessing import StandardScaler

# --- 12. Feature scaling ---
# Fits a StandardScaler on X and applies it to both X and X_test, wrapping the
# results in DataFrames that carry the original column names.

if 'X' not in locals() or 'X_test' not in locals():
    print("\n--- Variabel 'X' dan 'X_test' tidak ditemukan. ---")
    print("Pastikan Sel 11 (Pemisahan Final) sudah dijalankan.")

else:
    print("--- Memulai Feature Scaling ---")

    # Learn the scaling parameters from the training features only
    scaler = StandardScaler()
    scaler.fit(X)

    # Apply the fitted scaler to both feature sets and restore column names
    X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    print("--- Feature Scaling Selesai ---")
    print(f"Bentuk X_scaled (data latih): {X_scaled.shape}")
    print(f"Bentuk X_test_scaled (data uji): {X_test_scaled.shape}")
    print("\nData Anda sekarang 100% siap untuk pemodelan!")
    print("Variabel Anda adalah: X_scaled, y, X_test_scaled")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# --- 13. Split the training data into train and validation parts ---
# 80/20 split with a fixed seed; stratifying on y keeps the class ratio of the
# target the same in both parts.

if 'X_scaled' not in locals() or 'y' not in locals():
    print("\n--- Variabel 'X_scaled' dan 'y' tidak ditemukan. ---")
    print("Pastikan Sel 12 (Feature Scaling) sudah dijalankan.")

else:
    _split_options = {
        'test_size': 0.2,      # 20% of the rows go to validation
        'random_state': 42,    # fixed seed so the split is reproducible
        'stratify': y,         # preserve the class ratio in both parts
    }
    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, **_split_options)

    print("--- Data berhasil dibagi ---")
    for _tag, _part in (
        ("X_train:", X_train),
        ("y_train:", y_train),
        ("X_val:  ", X_val),
        ("y_val:  ", y_val),
    ):
        print(f"Bentuk {_tag} {_part.shape}")

    # Class percentages in each part, to confirm the stratification
    for _heading, _labels in (
        ("\nDistribusi TARGET di y_train (Latih):", y_train),
        ("\nDistribusi TARGET di y_val (Validasi):", y_val),
    ):
        print(_heading)
        print(_labels.value_counts(normalize=True) * 100)

In [None]:
# --- 14. Step 8 (Modelling) & 9 (Evaluation): Logistic Regression ---
# Trains a class-weighted logistic regression on the training split as a
# baseline, then reports accuracy, AUC-ROC, recall, precision, F1 and a
# confusion-matrix heatmap on the validation split.

if 'X_train' in locals():
    print("--- 1. Melatih Model Regresi Logistik ---")

    # class_weight='balanced' is used to compensate for the imbalanced target.
    # NOTE(review): the original note described liblinear as a good solver for
    # datasets under 1M rows; confirm against the scikit-learn solver guidance.
    model_lr = LogisticRegression(
        random_state=42,
        class_weight='balanced',
        solver='liblinear'
    )

    # Train the model
    model_lr.fit(X_train, y_train)

    print("--- Model berhasil dilatih ---")

    # --- 2. Predict & evaluate ---
    print("\n--- 2. Mengevaluasi Model pada Data Validasi ---")

    # Hard class predictions on the validation features (X_val)
    y_pred_lr = model_lr.predict(X_val)

    # Predicted probabilities (needed for AUC-ROC);
    # [:, 1] selects the probability of class '1'
    y_proba_lr = model_lr.predict_proba(X_val)[:, 1]

    # --- 3. Report the evaluation metrics ---
    print("\n--- Hasil Evaluasi (Baseline Model) ---")

    # Accuracy
    acc = accuracy_score(y_val, y_pred_lr)
    print(f"Akurasi: {acc * 100:.2f}%")
    print(f"(Catatan: Akurasi '91.93%' bisa didapat model bodoh. Jadi, ini bukan metrik utama kita.)")

    # Main metrics for imbalanced data
    print("\n--- Metrik Kinerja Bisnis ---")

    # AUC-ROC, computed from the class-1 probabilities
    auc = roc_auc_score(y_val, y_proba_lr)
    print(f"SKOR AUC-ROC: {auc:.4f}")

    # Recall: share of actual class-1 rows that were found (TP / (TP + FN))
    rec = recall_score(y_val, y_pred_lr)
    print(f"Recall:       {rec:.4f} (Model ini menemukan {(rec * 100):.2f}% dari semua klien Gagal Bayar)")

    # Precision: share of predicted class-1 rows that are correct (TP / (TP + FP))
    prec = precision_score(y_val, y_pred_lr)
    print(f"Precision:    {prec:.4f} (Dari yang diprediksi Gagal Bayar, {(prec * 100):.2f}% benar)")

    # F1-score: harmonic mean of precision and recall
    f1 = f1_score(y_val, y_pred_lr)
    print(f"F1-Score:     {f1:.4f}")

    # Confusion matrix (rows = actual class, columns = predicted class)
    print("\n--- Confusion Matrix ---")
    cm = confusion_matrix(y_val, y_pred_lr)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Prediksi Lancar (0)', 'Prediksi Gagal Bayar (1)'],
                yticklabels=['Aktual Lancar (0)', 'Aktual Gagal Bayar (1)'])
    plt.ylabel('Aktual')
    plt.xlabel('Prediksi')
    plt.title('Confusion Matrix - Regresi Logistik', fontsize=16)
    plt.show()

else:
    print("\n--- Variabel 'X_train' tidak ditemukan. ---")
    print("Pastikan Sel 13 (Split Data) sudah dijalankan.")

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, confusion_matrix, accuracy_score

# --- 15. Step 8 (another method): LightGBM ---
# Trains a class-weighted LightGBM classifier on the same training split and
# reports the same validation metrics as the logistic-regression cell.
# NOTE(review): the "Baseline LR" figures in the messages below are hard-coded
# literals, not read from the variables computed in the logistic-regression
# cell; they go stale whenever that cell's results change.

if 'X_train' in locals():
    print("--- 1. Melatih Model LightGBM (LGBM) ---")

    model_lgbm = lgb.LGBMClassifier(
        random_state=42,
        class_weight='balanced',
        n_estimators=200,  # number of trees to build
        learning_rate=0.05,
        num_leaves=31
    )

    # Train the model
    # using the same X_train and y_train as before
    model_lgbm.fit(X_train, y_train)

    print("--- Model LGBM berhasil dilatih ---")

    # --- 2. Evaluate the LGBM model ---
    print("\n--- 2. Mengevaluasi Model LGBM pada Data Validasi ---")

    # Hard predictions and class-1 probabilities on the validation features
    y_pred_lgbm = model_lgbm.predict(X_val)
    y_proba_lgbm = model_lgbm.predict_proba(X_val)[:, 1]

    # --- 3. Report the evaluation metrics ---
    print("\n--- Hasil Evaluasi (LightGBM) ---")

    acc_lgbm = accuracy_score(y_val, y_pred_lgbm)
    print(f"Akurasi: {acc_lgbm * 100:.2f}%")

    print("\n--- Metrik Kinerja Bisnis (LightGBM) ---")

    auc_lgbm = roc_auc_score(y_val, y_proba_lgbm)
    print(f"SKOR AUC-ROC: {auc_lgbm:.4f}  (Baseline LR: 0.7454)")

    rec_lgbm = recall_score(y_val, y_pred_lgbm)
    print(f"Recall:       {rec_lgbm:.4f}  (Baseline LR: 0.6749)")

    prec_lgbm = precision_score(y_val, y_pred_lgbm)
    print(f"Precision:    {prec_lgbm:.4f}  (Baseline LR: 0.1603)")

    f1_lgbm = f1_score(y_val, y_pred_lgbm)
    print(f"F1-Score:     {f1_lgbm:.4f}  (Baseline LR: 0.2590)")

    # Confusion matrix (rows = actual class, columns = predicted class)
    print("\n--- Confusion Matrix (LightGBM) ---")
    cm_lgbm = confusion_matrix(y_val, y_pred_lgbm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_lgbm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Prediksi Lancar (0)', 'Prediksi Gagal Bayar (1)'],
                yticklabels=['Aktual Lancar (0)', 'Aktual Gagal Bayar (1)'])
    plt.ylabel('Aktual')
    plt.xlabel('Prediksi')
    plt.title('Confusion Matrix - LightGBM', fontsize=16)
    plt.show()

else:
    print("\n--- Variabel 'X_train' tidak ditemukan. ---")
    print("Pastikan Sel 13 (Split Data) sudah dijalankan.")

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# --- 16. Step 8 (Hyperparameter tuning): GPU version ---
# Runs a randomized search (10 candidates, 3-fold CV, scored by ROC AUC) over
# LightGBM hyperparameters, keeps the refitted best estimator as
# `best_lgbm_tuned`, and evaluates it on the validation split.
# NOTE(review): the "Baseline LGBM" figures in the messages below are
# hard-coded literals rather than values read from the baseline cell.

if 'X_train' in locals():
    print("--- 1. Memulai Hyperparameter Tuning untuk LGBM (VERSI GPU) ---")

    # 1. Base model.
    # subsample_freq=1 is required for the `subsample` values searched below to
    # have any effect: LightGBM only performs row bagging when the bagging
    # frequency is non-zero, and the default is 0 (bagging disabled).
    lgbm_tuner = lgb.LGBMClassifier(
        random_state=42,
        class_weight='balanced',
        device='gpu',
        subsample_freq=1
    )

    # 2. Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale].
    param_dist = {
        'n_estimators': randint(100, 400),
        'learning_rate': uniform(0.01, 0.1),
        'num_leaves': randint(20, 50),
        'colsample_bytree': uniform(0.6, 0.4),
        'subsample': uniform(0.6, 0.4)
    }

    # 3. RandomizedSearchCV setup
    # n_iter=10: try 10 random combinations.
    # cv=3: 3-fold cross-validation.
    # scoring='roc_auc': main metric.
    # n_jobs=-1: run the CV fits in parallel worker processes.
    # NOTE(review): with device='gpu' the parallel workers all target the same
    # GPU; consider n_jobs=1 if fits fail or exhaust GPU memory.

    random_search = RandomizedSearchCV(
        lgbm_tuner,
        param_distributions=param_dist,
        n_iter=10,
        cv=3,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    print("Melatih 10 model kandidat di GPU... (Ini akan jauh lebih cepat)")

    # 4. Run the search
    random_search.fit(X_train, y_train)

    print("\n--- Tuning Selesai ---")

    # 5. Show the best hyperparameters
    print("Hyperparameter terbaik yang ditemukan:")
    print(random_search.best_params_)
    print(f"\nSkor AUC-ROC terbaik (dari CV): {random_search.best_score_:.4f}")

    # 6. Keep the best model (already refitted on all of X_train by the search)
    best_lgbm_tuned = random_search.best_estimator_

    # --- 7. Evaluate the tuned model ---
    print("\n--- Mengevaluasi Model Tuned pada Data Validasi ---")

    y_pred_tuned = best_lgbm_tuned.predict(X_val)
    y_proba_tuned = best_lgbm_tuned.predict_proba(X_val)[:, 1]

    print("\n--- Hasil Evaluasi (LGBM Tuned) ---")

    auc_tuned = roc_auc_score(y_val, y_proba_tuned)
    print(f"SKOR AUC-ROC: {auc_tuned:.4f}  (Baseline LGBM: 0.7547)")

    rec_tuned = recall_score(y_val, y_pred_tuned)
    print(f"Recall:       {rec_tuned:.4f}  (Baseline LGBM: 0.6743)")

    prec_tuned = precision_score(y_val, y_pred_tuned)
    print(f"Precision:    {prec_tuned:.4f}  (Baseline LGBM: 0.1669)")

    f1_tuned = f1_score(y_val, y_pred_tuned)
    print(f"F1-Score:     {f1_tuned:.4f}  (Baseline LGBM: 0.2676)")

    # Confusion matrix of the TUNED model. The heading and title say so
    # explicitly, so the plot cannot be mistaken for the untuned baseline's.
    # The variable name cm_lgbm is kept as in the original cell.
    print("\n--- Confusion Matrix (LGBM Tuned) ---")
    cm_lgbm = confusion_matrix(y_val, y_pred_tuned)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_lgbm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Prediksi Lancar (0)', 'Prediksi Gagal Bayar (1)'],
                yticklabels=['Aktual Lancar (0)', 'Aktual Gagal Bayar (1)'])
    plt.ylabel('Aktual')
    plt.xlabel('Prediksi')
    plt.title('Confusion Matrix - LGBM Tuned', fontsize=16)
    plt.show()

else:
    print("\n--- Variabel 'X_train' tidak ditemukan. ---")
    print("Pastikan Sel 13 (Split Data) sudah dijalankan.")

**Feature Engineering**

In [None]:
import warnings
# Suppresses all Python warnings from this point on.
warnings.filterwarnings('ignore')

# --- 17. Step 7 (Feature Engineering): previous_application.csv ---
# Aggregates previous_application.csv to one row per SK_ID_CURR and left-joins
# those aggregates onto copies of X_scaled / X_test_scaled, producing
# X_scaled_new / X_test_scaled_new (indexed by SK_ID_CURR).

if 'X_scaled' in locals():
    print("--- 1. Memuat Data Pinjaman Sebelumnya ---")

    BASE_PATH = "/content/drive/MyDrive/Kuliah/HomeCreditProject/"
    prev_app_path = BASE_PATH + "previous_application.csv"

    try:
        df_prev_app = pd.read_csv(prev_app_path)
        print(f"Bentuk data previous_application: {df_prev_app.shape}")

        # --- 2. Binary flag (not a full one-hot encoding) ---
        # 1 when NAME_CONTRACT_STATUS equals 'Refused', otherwise 0.
        df_prev_app['prev_app_refused'] = (df_prev_app['NAME_CONTRACT_STATUS'] == 'Refused').astype(int)

        # --- 3. Aggregation ---
        print("\n--- 3. Melakukan Agregasi (groupby SK_ID_CURR) ---")

        # Column -> list of aggregations computed per SK_ID_CURR
        agg_functions = {
            'SK_ID_PREV': ['count'],
            'AMT_ANNUITY': ['mean', 'max'],
            'AMT_APPLICATION': ['mean', 'max'],
            'AMT_CREDIT': ['mean', 'max'],
            'AMT_GOODS_PRICE': ['mean', 'max'],
            'DAYS_DECISION': ['mean', 'min'],
            'prev_app_refused': ['mean', 'sum']
        }

        # Aggregate; df_prev_agg ends up with SK_ID_CURR as its index
        df_prev_agg = df_prev_app.groupby('SK_ID_CURR').agg(agg_functions)

        # Flatten the (column, aggregation) labels into 'prev_app_<column>_<aggregation>'
        df_prev_agg.columns = ['prev_app_' + '_'.join(col).strip() for col in df_prev_agg.columns.values]

        print(f"Bentuk data agregat: {df_prev_agg.shape}")

        # --- 4. Merge into the train & test features ---
        print("\n--- 4. Menggabungkan Fitur Baru ke X_scaled dan X_test_scaled ---")

        # X_scaled has a 0-based index while train_id holds SK_ID_CURR,
        # so the index is switched to SK_ID_CURR to make the join possible.

        # Work on copies so X_scaled / X_test_scaled stay unchanged
        X_scaled_new = X_scaled.copy()
        X_test_scaled_new = X_test_scaled.copy()

        # Re-index by SK_ID_CURR using 'train_id' & 'test_id'.
        # The assignment is positional, so it relies on the row order of the
        # scaled frames still matching train_id / test_id.
        X_scaled_new.index = train_id # from the test-data loading cell
        X_test_scaled_new.index = test_id # from the test-data loading cell

        # Join on the INDEX
        # (df_prev_agg is also indexed by SK_ID_CURR)
        X_scaled_new = X_scaled_new.join(df_prev_agg, how='left')
        X_test_scaled_new = X_test_scaled_new.join(df_prev_agg, how='left')

        print(f"Bentuk X_scaled_new (latih): {X_scaled_new.shape}")
        print(f"Bentuk X_test_scaled_new (uji): {X_test_scaled_new.shape}")

        # --- 5. Final cleaning (fillna) ---
        # Clients without any row in previous_application get NaN from the left join
        print("\n--- 5. Mengisi NaN setelah join ---")
        X_scaled_new = X_scaled_new.fillna(0)
        X_test_scaled_new = X_test_scaled_new.fillna(0)

        # Give the test frame the same columns, in the same order, as the train frame
        X_test_scaled_new = X_test_scaled_new[X_scaled_new.columns]

        print("\n--- Feature Engineering Selesai ---")
        print(f"Jumlah fitur baru: {len(df_prev_agg.columns)}")

    except FileNotFoundError:
        print(f"\n--- ERROR: 'previous_application.csv' tidak ditemukan. ---")
    except Exception as e:
        print(f"\n--- ERROR: {e} ---")
else:
    print("\n--- Variabel 'X_scaled' tidak ditemukan. Jalankan Sel 12. ---")

In [None]:
# --- 18. Re-split the data & retrain the model ---
# Splits X_scaled_new with the same split parameters as before, refits
# best_lgbm_tuned on the new training part and reports validation metrics.
# NOTE(review): .fit() refits the existing `best_lgbm_tuned` object in place,
# so the model fitted by the tuning cell is replaced.
# NOTE(review): the "Sebelumnya" figures in the messages are hard-coded literals.
if 'X_scaled_new' in locals():
    print("--- 1. Melakukan Split Ulang (dengan fitur baru) ---")

    # Use EXACTLY the same split parameters (random_state=42, stratify=y)
    X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
        X_scaled_new,
        y, # 'y' (target) is unchanged
        test_size=0.2,
        random_state=42,
        stratify=y
    )

    print("--- 2. Melatih Ulang Model LGBM Terbaik (best_lgbm_tuned) ---")

    # Make sure 'best_lgbm_tuned' exists
    if 'best_lgbm_tuned' in locals():
        # Retrain the model on the enriched feature set
        best_lgbm_tuned.fit(X_train_new, y_train_new)

        print("--- Model berhasil dilatih ulang ---")

        # --- 3. Evaluate the model with the new features ---
        print("\n--- Mengevaluasi Model (dgn Fitur Baru) pada Data Validasi ---")

        # Hard predictions and class-1 probabilities on the new validation part
        y_pred_new = best_lgbm_tuned.predict(X_val_new)
        y_proba_new = best_lgbm_tuned.predict_proba(X_val_new)[:, 1]

        print("\n--- Hasil Evaluasi (LGBM + Fitur Baru) ---")

        auc_new = roc_auc_score(y_val_new, y_proba_new)
        print(f"SKOR AUC-ROC BARU: {auc_new:.4f}  (Sebelumnya: 0.7553)")

        rec_new = recall_score(y_val_new, y_pred_new)
        print(f"Recall BARU:       {rec_new:.4f}  (Sebelumnya: 0.6802)")

        prec_new = precision_score(y_val_new, y_pred_new)
        print(f"Precision BARU:    {prec_new:.4f}  (Sebelumnya: 0.1663)")

        # Show the new confusion matrix (rows = actual, columns = predicted)
        cm_new = confusion_matrix(y_val_new, y_pred_new)
        plt.figure(figsize=(8, 6))

        sns.heatmap(cm_new, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Prediksi Lancar (0)', 'Prediksi Gagal Bayar (1)'],
                    yticklabels=['Aktual Lancar (0)', 'Aktual Gagal Bayar (1)']) # stray 'S' typo already removed
        # ---------------------------------

        plt.ylabel('Aktual')
        plt.xlabel('Prediksi')
        plt.title('Confusion Matrix - LGBM + Fitur Baru', fontsize=16)
        plt.show()

    else:
        print("\n--- ERROR: 'best_lgbm_tuned' tidak ditemukan. Jalankan Sel 16 (Tuning) terlebih dahulu. ---")

else:
    print("\n--- ERROR: 'X_scaled_new' tidak ditemukan. Jalankan Sel 17 (Feature Engineering) terlebih dahulu. ---")

In [None]:
# --- 19. Step 7 (Feature Engineering): bureau_balance.csv ---
# Loads bureau_balance.csv, one-hot encodes STATUS (including a NaN indicator)
# and aggregates the monthly rows to one row per SK_ID_BUREAU.

print("--- 1. Memuat bureau_balance.csv ---")
bureau_balance_path = f"{BASE_PATH}bureau_balance.csv"

try:
    df_bureau_balance = pd.read_csv(bureau_balance_path)

    print(f"Bentuk data bureau_balance: {df_bureau_balance.shape}")

    # --- 2. Simple one-hot encoding ---
    # Turn the STATUS codes into indicator columns
    df_bureau_balance = pd.get_dummies(df_bureau_balance, columns=['STATUS'], dummy_na=True)

    # --- 3. Stage-1 aggregation (per SK_ID_BUREAU) ---
    print("--- 3. Melakukan Agregasi (groupby SK_ID_BUREAU) ---")

    # All indicator columns produced by the encoding above
    status_cols = [name for name in df_bureau_balance.columns if name.startswith('STATUS_')]

    # MONTHS_BALANCE: min / max / number of rows;
    # each status indicator: mean and sum
    agg_bureau_balance = {
        'MONTHS_BALANCE': ['min', 'max', 'count'],
        **{name: ['mean', 'sum'] for name in status_cols},
    }

    # Aggregate
    df_bureau_balance_agg = df_bureau_balance.groupby('SK_ID_BUREAU').agg(agg_bureau_balance)

    # Flatten the (column, aggregation) labels into 'bureau_bal_<column>_<aggregation>'
    df_bureau_balance_agg.columns = [
        f"bureau_bal_{'_'.join(parts).strip()}" for parts in df_bureau_balance_agg.columns.values
    ]

    print(f"Bentuk data agregat bureau_balance: {df_bureau_balance_agg.shape}")

except FileNotFoundError:
    print("\n--- ERROR: 'bureau_balance.csv' tidak ditemukan. ---")
except Exception as e:
    print(f"\n--- ERROR: {e} ---")

In [None]:
# --- 20. Step 7 (Feature Engineering): bureau.csv (2) ---
# Loads bureau.csv, attaches the per-loan bureau_balance aggregates, and
# aggregates everything to one row per SK_ID_CURR (df_bureau_agg).

if 'df_bureau_balance_agg' in locals():
    print("--- 1. Memuat bureau.csv ---")
    bureau_path = BASE_PATH + "bureau.csv"

    try:
        df_bureau = pd.read_csv(bureau_path)
        print(f"Bentuk data bureau: {df_bureau.shape}")

        # --- 2. Join with bureau_balance_agg ---
        df_bureau_full = df_bureau.join(df_bureau_balance_agg, how='left', on='SK_ID_BUREAU')

        # Loans without monthly history get NaN in the joined aggregate columns;
        # only those columns are zero-filled. The original bureau.csv columns are
        # deliberately left alone: replacing a genuinely missing amount with 0
        # would pull the per-client means below towards zero, whereas
        # groupby mean/sum simply skip NaN.
        balance_agg_cols = list(df_bureau_balance_agg.columns)
        df_bureau_full[balance_agg_cols] = df_bureau_full[balance_agg_cols].fillna(0)

        # --- 3. Stage-2 aggregation (per SK_ID_CURR) ---
        print("--- 3. Melakukan Agregasi (groupby SK_ID_CURR) ---")

        # Indicator columns for CREDIT_ACTIVE == 'Active' / 'Closed'
        df_bureau_full['bureau_credit_active'] = (df_bureau_full['CREDIT_ACTIVE'] == 'Active').astype(int)
        df_bureau_full['bureau_credit_closed'] = (df_bureau_full['CREDIT_ACTIVE'] == 'Closed').astype(int)

        # Column -> list of aggregations computed per SK_ID_CURR
        bureau_agg_functions = {
            'SK_ID_BUREAU': ['count'],
            'DAYS_CREDIT': ['mean', 'max', 'min'],
            'CREDIT_DAY_OVERDUE': ['mean', 'max'],
            'AMT_CREDIT_SUM': ['mean', 'sum'],
            'AMT_CREDIT_SUM_DEBT': ['mean', 'sum'],
            'AMT_CREDIT_SUM_OVERDUE': ['mean', 'sum'],
            'bureau_credit_active': ['mean', 'sum'],
            'bureau_credit_closed': ['mean', 'sum'],
        }

        # Every bureau_balance aggregate is averaged over the client's loans
        for col in balance_agg_cols:
            bureau_agg_functions[col] = ['mean']

        # Aggregate
        df_bureau_agg = df_bureau_full.groupby('SK_ID_CURR').agg(bureau_agg_functions)

        # Flatten the (column, aggregation) labels into 'bureau_<column>_<aggregation>'
        df_bureau_agg.columns = ['bureau_' + '_'.join(col).strip() for col in df_bureau_agg.columns.values]

        print(f"Bentuk data agregat bureau: {df_bureau_agg.shape}")

    except FileNotFoundError:
        print(f"\n--- ERROR: 'bureau.csv' tidak ditemukan. ---")
    except Exception as e:
        print(f"\n--- ERROR: {e} ---")
else:
    print("\n--- ERROR: 'df_bureau_balance_agg' tidak ditemukan. Jalankan Sel 19. ---")

In [None]:
# --- 21. Merge again & retrain the model ---
# Left-joins the bureau aggregates onto X_scaled_new / X_test_scaled_new,
# re-splits with the same parameters, refits best_lgbm_tuned and reports
# validation metrics.
# NOTE(review): .fit() refits the existing `best_lgbm_tuned` object in place.
# NOTE(review): the "Sebelumnya" figures in the messages are hard-coded literals.

if 'X_scaled_new' in locals() and 'df_bureau_agg' in locals():

    # --- 1. Merge the new features ---
    print("--- 1. Menggabungkan Fitur Bureau ke Data Utama ---")

    # Join onto X_scaled_new (which already carries the prev_app features)
    # using the index (SK_ID_CURR)
    X_scaled_final = X_scaled_new.join(df_bureau_agg, how='left')
    X_test_scaled_final = X_test_scaled_new.join(df_bureau_agg, how='left')

    print(f"Bentuk data latih final: {X_scaled_final.shape}")
    print(f"Bentuk data uji final: {X_test_scaled_final.shape}")

    # --- 2. Final cleaning (fillna) ---
    print("--- 2. Mengisi NaN setelah join ---")
    # Clients without any row in bureau.csv get NaN from the left join
    X_scaled_final = X_scaled_final.fillna(0)
    X_test_scaled_final = X_test_scaled_final.fillna(0)

    # Give the test frame the same columns, in the same order, as the train frame
    X_test_scaled_final = X_test_scaled_final[X_scaled_final.columns]

    # --- 3. Re-split & retrain ---
    print("\n--- 3. Melakukan Split Ulang (dengan fitur baru) ---")

    X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
        X_scaled_final, y, test_size=0.2, random_state=42, stratify=y
    )

    print("--- 4. Melatih Ulang Model LGBM Terbaik (best_lgbm_tuned) ---")

    if 'best_lgbm_tuned' in locals():
        best_lgbm_tuned.fit(X_train_final, y_train_final)

        print("--- Model berhasil dilatih ulang ---")

        # --- 5. Evaluate the final model ---
        print("\n--- Mengevaluasi Model (dgn Fitur Bureau) pada Data Validasi ---")

        # Hard predictions and class-1 probabilities on the final validation part
        y_pred_final = best_lgbm_tuned.predict(X_val_final)
        y_proba_final = best_lgbm_tuned.predict_proba(X_val_final)[:, 1]

        print("\n--- Hasil Evaluasi (LGBM + Fitur PrevApp + Fitur Bureau) ---")

        auc_final = roc_auc_score(y_val_final, y_proba_final)
        print(f"SKOR AUC-ROC FINAL: {auc_final:.4f}  (Sebelumnya: 0.7609)")

        rec_final = recall_score(y_val_final, y_pred_final)
        print(f"Recall FINAL:       {rec_final:.4f}  (Sebelumnya: 0.6872)")

        prec_final = precision_score(y_val_final, y_pred_final)
        print(f"Precision FINAL:    {prec_final:.4f}  (Sebelumnya: 0.1697)")

        # Show the new confusion matrix (rows = actual, columns = predicted)
        cm_final = confusion_matrix(y_val_final, y_pred_final)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm_final, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Prediksi Lancar (0)', 'Prediksi Gagal Bayar (1)'],
                    yticklabels=['Aktual Lancar (0)', 'Aktual Gagal Bayar (1)'])
        plt.ylabel('Aktual')
        plt.xlabel('Prediksi')
        plt.title('Confusion Matrix - LGBM + Semua Fitur', fontsize=16)
        plt.show()

    else:
        print("\n--- ERROR: 'best_lgbm_tuned' tidak ditemukan. Jalankan Sel 16 (Tuning) terlebih dahulu. ---")

else:
    print("\n--- ERROR: Data yang dibutuhkan tidak ditemukan. Jalankan Sel 17, 19, 20. ---")

In [None]:
# --- 22. Step 7 (Feature Engineering): installments_payments.csv ---
# Loads the instalment payment history, derives two per-row difference
# columns and aggregates them to one row per SK_ID_PREV.

print("--- 1. Memuat installments_payments.csv ---")
installments_path = f"{BASE_PATH}installments_payments.csv"

try:
    df_installments = pd.read_csv(installments_path)
    print(f"Bentuk data installments_payments: {df_installments.shape}")

    # --- 2. Behavioural features ---
    # payment_diff = DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT
    df_installments['payment_diff'] = df_installments['DAYS_ENTRY_PAYMENT'].sub(
        df_installments['DAYS_INSTALMENT']
    )

    # payment_amt_diff = AMT_PAYMENT - AMT_INSTALMENT
    df_installments['payment_amt_diff'] = df_installments['AMT_PAYMENT'].sub(
        df_installments['AMT_INSTALMENT']
    )

    # --- 3. Stage-1 aggregation (per SK_ID_PREV) ---
    print("--- 3. Melakukan Agregasi (groupby SK_ID_PREV) ---")

    # Column -> list of aggregations computed per SK_ID_PREV
    installments_agg_functions = dict(
        NUM_INSTALMENT_NUMBER=['count'],           # number of instalment rows
        payment_diff=['mean', 'max', 'sum'],       # mean, max and total day difference
        payment_amt_diff=['mean', 'max', 'sum'],   # mean, max and total amount difference
        AMT_PAYMENT=['sum'],
        AMT_INSTALMENT=['sum'],
    )

    df_installments_agg = df_installments.groupby('SK_ID_PREV').agg(installments_agg_functions)

    # Flatten the (column, aggregation) labels into 'install_<column>_<aggregation>'
    df_installments_agg.columns = [
        f"install_{'_'.join(parts).strip()}" for parts in df_installments_agg.columns.values
    ]

    print(f"Bentuk data agregat installments: {df_installments_agg.shape}")

except FileNotFoundError:
    print("\n--- ERROR: 'installments_payments.csv' tidak ditemukan. ---")
except Exception as e:
    print(f"\n--- ERROR: {e} ---")

In [None]:
# --- 23. Step 7 (Feature Engineering): stage-2 aggregation (per SK_ID_CURR) ---
# Rolls the per-loan installment aggregates (indexed by SK_ID_PREV) up to one
# row per client (SK_ID_CURR), using previous_application as the ID mapping.


def aggregate_installments_per_client(prev_app, installments_agg, stats=('mean', 'max', 'sum')):
    """Roll per-loan installment aggregates up to one row per client.

    Parameters
    ----------
    prev_app : pandas.DataFrame
        Frame containing the columns 'SK_ID_PREV' and 'SK_ID_CURR'. Only these
        two columns are read; they provide the loan -> client mapping.
    installments_agg : pandas.DataFrame
        Per-loan aggregates whose index holds SK_ID_PREV values. Every column
        whose name starts with 'install_' is aggregated again per client.
    stats : sequence of str, optional
        Aggregations applied to each 'install_' column. Defaults to
        ('mean', 'max', 'sum'); note that 'min' is NOT part of the default.

    Returns
    -------
    (with_curr, per_client) : tuple of pandas.DataFrame
        `with_curr` is the ID mapping left-joined with `installments_agg`
        (one row per row of `prev_app`). `per_client` is indexed by SK_ID_CURR
        with columns named '<install_column>_<stat>'.
    """
    id_map = prev_app[['SK_ID_PREV', 'SK_ID_CURR']].copy()

    # Left join: loans without any installment rows stay in the frame with NaN
    # features. For such rows 'mean'/'max' remain NaN while pandas' groupby
    # 'sum' yields 0.
    with_curr = id_map.join(installments_agg, how='left', on='SK_ID_PREV')

    feature_cols = [name for name in with_curr.columns if name.startswith('install_')]
    agg_spec = {name: list(stats) for name in feature_cols}

    per_client = with_curr.groupby('SK_ID_CURR').agg(agg_spec)

    # Flatten the (column, statistic) MultiIndex into '<column>_<statistic>'.
    per_client.columns = ['_'.join(pair).strip() for pair in per_client.columns.values]
    return with_curr, per_client


if 'df_prev_app' in globals() and 'df_installments_agg' in globals():
    print("--- 1. Mengambil mapping ID dari previous_application ---")

    # SK_ID_PREV -> SK_ID_CURR mapping
    prev_app_ids = df_prev_app[['SK_ID_PREV', 'SK_ID_CURR']].copy()

    print("--- 3. Melakukan Agregasi (groupby SK_ID_CURR) ---")

    df_installments_with_curr, df_installments_agg_final = aggregate_installments_per_client(
        prev_app_ids, df_installments_agg
    )

    print(f"Bentuk data agregat final installments: {df_installments_agg_final.shape}")

else:
    print("\n--- ERROR: Data 'df_prev_app' atau 'df_installments_agg' tidak ditemukan. ---")

In [None]:
# --- 24. Re-join & retrain the model (final) ---
# Adds the per-client installment features to the already-enriched feature
# matrices, re-splits train/validation, refits `best_lgbm_tuned`, and reports
# AUC / recall / precision plus a confusion matrix on the validation split.

# Every upstream name this cell reads. All of them are checked so that a
# missing one produces the friendly message below instead of a NameError.
required_inputs = ('X_scaled_final', 'X_test_scaled_final', 'y', 'df_installments_agg_final')

if all(name in globals() for name in required_inputs):

    # --- 1. Join the new features ---
    print("--- 1. Menggabungkan Fitur Installments ke Data Utama ---")

    # Index-based left join onto the matrices that already carry the prev_app
    # and bureau features.
    # NOTE(review): df_installments_agg_final is indexed by SK_ID_CURR; this
    # join is only meaningful if X_scaled_final / X_test_scaled_final are
    # indexed by SK_ID_CURR as well -- confirm in the cell that builds them.
    X_scaled_ultra = X_scaled_final.join(df_installments_agg_final, how='left')
    X_test_scaled_ultra = X_test_scaled_final.join(df_installments_agg_final, how='left')

    print(f"Bentuk data latih ultra: {X_scaled_ultra.shape}")
    print(f"Bentuk data uji ultra: {X_test_scaled_ultra.shape}")

    # --- 2. Final cleaning (fillna) ---
    # Rows without a match in the installments aggregate get 0 for every new
    # feature.
    print("--- 2. Mengisi NaN setelah join ---")
    X_scaled_ultra = X_scaled_ultra.fillna(0)
    X_test_scaled_ultra = X_test_scaled_ultra.fillna(0)

    # Keep the test columns in exactly the same order as the training columns.
    X_test_scaled_ultra = X_test_scaled_ultra[X_scaled_ultra.columns]

    # --- 3. Re-split and retrain ---
    print("\n--- 3. Melakukan Split Ulang (dengan fitur baru) ---")

    X_train_ultra, X_val_ultra, y_train_ultra, y_val_ultra = train_test_split(
        X_scaled_ultra, y, test_size=0.2, random_state=42, stratify=y
    )

    print("--- 4. Melatih Ulang Model LGBM Terbaik (best_lgbm_tuned) ---")

    if 'best_lgbm_tuned' in globals():
        # Refits the existing estimator object in place; after this line
        # `best_lgbm_tuned` expects the "ultra" feature set.
        best_lgbm_tuned.fit(X_train_ultra, y_train_ultra)

        print("--- Model berhasil dilatih ulang ---")

        # --- 5. Evaluate the retrained model ---
        print("\n--- Mengevaluasi Model (dgn Fitur Installments) pada Data Validasi ---")

        y_pred_ultra = best_lgbm_tuned.predict(X_val_ultra)
        y_proba_ultra = best_lgbm_tuned.predict_proba(X_val_ultra)[:, 1]

        print("\n--- Hasil Evaluasi (LGBM + Semua Fitur) ---")

        # Baselines for the "Sebelumnya" (previous) column: prefer the live
        # metrics computed by the preceding evaluation cell (auc_final,
        # rec_final, prec_final) so the comparison cannot go stale; fall back
        # to the values recorded from an earlier run when they are absent.
        prev_auc = globals().get('auc_final', 0.7651)
        prev_rec = globals().get('rec_final', 0.6896)
        prev_prec = globals().get('prec_final', 0.1715)

        auc_ultra = roc_auc_score(y_val_ultra, y_proba_ultra)
        print(f"SKOR AUC-ROC ULTRA: {auc_ultra:.4f}  (Sebelumnya: {prev_auc:.4f})")

        rec_ultra = recall_score(y_val_ultra, y_pred_ultra)
        print(f"Recall ULTRA:       {rec_ultra:.4f}  (Sebelumnya: {prev_rec:.4f})")

        prec_ultra = precision_score(y_val_ultra, y_pred_ultra)
        print(f"Precision ULTRA:    {prec_ultra:.4f}  (Sebelumnya: {prev_prec:.4f})")

        # Confusion matrix for the retrained model
        cm_ultra = confusion_matrix(y_val_ultra, y_pred_ultra)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm_ultra, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Prediksi Lancar (0)', 'Prediksi Gagal Bayar (1)'],
                    yticklabels=['Aktual Lancar (0)', 'Aktual Gagal Bayar (1)'])
        plt.ylabel('Aktual')
        plt.xlabel('Prediksi')
        plt.title('Confusion Matrix - LGBM + Semua Fitur', fontsize=16)
        plt.show()

    else:
        print("\n--- ERROR: 'best_lgbm_tuned' tidak ditemukan. Jalankan Sel 16 (Tuning) terlebih dahulu. ---")

else:
    print("\n--- ERROR: Data yang dibutuhkan tidak ditemukan. Jalankan Sel 21 & 23. ---")