# Import Library

In [None]:
from collections import Counter

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Load Data + Info / Deskripsi Data

In [None]:
# Read the semicolon-delimited CSV into the working frame used by the rest of the notebook.
df = pd.read_csv('data/german.csv', delimiter=';')

# Print a caption, then show the first five rows with the notebook's rich display.
print("5 Baris Pertama")
display(df.head(5))


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line to the output.
print("\nInfo Data")
df.info()

# EDA

In [None]:

# Number of rows per value of the target column.
target_counts = df['Creditability'].value_counts()
print("\nJumlah Kelas Target (Creditability)")
print(target_counts)

# Same distribution as a count plot.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='Creditability')
plt.title('Distribusi Creditability (0=Bad, 1=Good)')
plt.show()

In [None]:
# Total number of missing cells across the whole frame (sum over columns, then over the result).
total_missing = df.isna().sum().sum()
print("\n Missing Value:")
print(total_missing)

In [None]:
# Pairwise correlation matrix of all columns in df.
correlation = df.corr()

# Annotated heatmap of that matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(
    data=correlation,
    annot=True,
    linewidths=0.5,
    cmap='coolwarm',
)
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Correlation of every column with the target, sorted from highest to lowest.
# (This cell draws no figure, so no plt.show() is needed.)
target_corr = df.corr()['Creditability'].sort_values(ascending=False)
print("Korelasi Fitur terhadap Creditability")
print(target_corr)

In [None]:
# One boxplot per numeric feature, split by the target class, side by side.
fig, ax = plt.subplots(1, 3, figsize=(18, 5))

boxplot_panels = [
    ('Credit_Amount', 'Credit Amount vs Risk'),
    ('Duration_of_Credit_monthly', 'Duration vs Risk'),
    ('Age_years', 'Age vs Risk'),
]

for panel_ax, (feature, panel_title) in zip(ax, boxplot_panels):
    sns.boxplot(x='Creditability', y=feature, data=df, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Render explicitly so the cell does not echo the repr of the last set_title() call.
plt.show()

In [None]:
def cek_risiko_kategori(kolom):
    """Print and plot the per-category percentage of each Creditability class.

    Reads the notebook-level ``df``. Builds a crosstab of ``df[kolom]`` against
    ``df['Creditability']`` normalised per row, scaled to percent, prints it
    rounded to one decimal and draws it as a stacked bar chart with a dashed
    reference line at 50.

    Args:
        kolom: Name of the column in ``df`` to break the target down by.
    """
    risk_pct = pd.crosstab(df[kolom], df['Creditability'], normalize='index').mul(100)
    # Columns are relabelled by position: first -> 'Bad (%)', second -> 'Good (%)'.
    risk_pct.columns = ['Bad (%)', 'Good (%)']

    print(f"\nAnalisis Risiko Berdasarkan: {kolom}")
    print(risk_pct.round(1))

    chart = risk_pct.plot(kind='bar', stacked=True, color=['red', 'green'], figsize=(8, 4))
    chart.set_title(f'Risiko Berdasarkan {kolom}')
    chart.set_ylabel('Persentase')
    chart.axhline(y=50, color='black', linestyle='--')
    plt.show()


In [None]:
# Class-percentage table and stacked bar chart per code of the previous-credit payment status column.
cek_risiko_kategori('Payment_Status_of_Previous_Credit')


In [None]:
# Class-percentage table and stacked bar chart per code of the Purpose column.
cek_risiko_kategori('Purpose')

In [None]:
# Class-percentage table and stacked bar chart per code of the current-employment length column.
cek_risiko_kategori('Length_of_current_employment')

In [None]:
# Work on a copy so the numeric codes in df stay available for modelling.
df_eda = df.copy()

# Code -> display label dictionaries, one per column that gets relabelled below.
map_sex_status = {
    1: 'Laki-laki (Cerai/Pisah)',
    2: 'Perempuan (Cerai/Nikah/Janda)',
    3: 'Laki-laki (Single)',
    4: 'Laki-laki (Menikah/Duda)',
    5: 'Perempuan (Single)',
}

map_history = {
    0: 'Macets/Tertunda (Bad)',
    1: 'Kritis/Akun Lain Bermasalah (Bad)',
    2: 'Lancar (Good)',
    3: 'Lancar (Existing Credit)',
    4: 'Lunas/Sempurna (Very Good)',
}

map_purpose = {
    0: 'Mobil Baru',
    1: 'Mobil Bekas',
    2: 'Perabotan',
    3: 'Radio/TV',
    4: 'Elektronik Rumah',
    5: 'Perbaikan/Renovasi',
    6: 'Pendidikan',
    7: 'Liburan',
    8: 'Pelatihan/Retraining',
    9: 'Bisnis',
    10: 'Lainnya',
}

map_work = {
    1: 'Pengangguran',
    2: '< 1 Tahun',
    3: '1 - 4 Tahun',
    4: '4 - 7 Tahun',
    5: '> 7 Tahun',
}

map_target = {
    1: 'Good (Lunas)',
    0: 'Bad (Gagal Bayar)',
}

# Column -> dictionary pairs, applied in order. Series.map leaves NaN for any
# code that is not a key of its dictionary.
label_maps = {
    'Sex_Marital_Status': map_sex_status,
    'Payment_Status_of_Previous_Credit': map_history,
    'Purpose': map_purpose,
    'Length_of_current_employment': map_work,
    'Creditability': map_target,
}
for kolom_eda, code_to_label in label_maps.items():
    df_eda[kolom_eda] = df_eda[kolom_eda].map(code_to_label)



In [None]:
# Preview the relabelled columns to confirm the codes were replaced by text.
labelled_cols = [
    'Purpose',
    'Payment_Status_of_Previous_Credit',
    'Length_of_current_employment',
    'Creditability',
]
print("Data Setelah Diberi Label: ")
print(df_eda[labelled_cols].head())

In [None]:
def cek_risiko_label(kolom):
    """Plot the per-category percentage of each labelled Creditability class.

    Reads the notebook-level ``df_eda``. Builds a row-normalised crosstab of
    ``df_eda[kolom]`` against ``df_eda['Creditability']`` in percent and draws
    it as a stacked bar chart with a dashed reference line at 50.

    Args:
        kolom: Name of the column in ``df_eda`` to break the target down by.
    """
    share = pd.crosstab(df_eda[kolom], df_eda['Creditability'], normalize='index').mul(100)

    ax = share.plot(kind='bar', stacked=True, color=['red', 'green'], figsize=(10, 5))
    ax.set_title(f'Risiko Gagal Bayar vs {kolom}')
    ax.set_ylabel('Persentase')
    ax.set_xlabel(kolom)

    ax.axhline(y=50, color='black', linestyle='--')
    ax.legend(title='Status Kredit', loc='upper right')
    # Rotate the category labels so long text does not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
# Draw one labelled risk chart per relabelled column, in this order.
for kolom_label in (
    'Purpose',
    'Payment_Status_of_Previous_Credit',
    'Length_of_current_employment',
    'Sex_Marital_Status',
):
    cek_risiko_label(kolom_label)

# Preprocessing Data

- Split Dataset

In [None]:
# Features are every column except the target.
X = df.drop('Creditability', axis=1)
y = df['Creditability']

# Hold out 20% for testing. Stratifying on the target keeps the class ratio of
# the full data set in both splits, so the test metrics are not skewed by an
# unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Kondisi Awal Data Training (Sebelum SMOTE)")
print(f"Total Baris: {len(X_train)}")
print(f"Perbandingan Kelas: {Counter(y_train)}")


- SMOTE

In [None]:
# Resample the training split only; X_test / y_test are not passed to SMOTE.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Report the size and class counts after resampling.
print(
    "Kondisi Akhir Data Training (Setelah SMOTE)",
    f"Total Baris: {len(X_train_balanced)}",
    f"Perbandingan Kelas: {Counter(y_train_balanced)}",
    sep="\n",
)


- Standard Scaling

In [None]:
# Learn the scaling statistics from the resampled training data only...
scaler = StandardScaler().fit(X_train_balanced)

# ...then apply the same fitted transform to both the training and the test features.
X_train_scaled = scaler.transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)


# Modelling

In [None]:
# Baseline candidates as (display name, unfitted estimator) pairs, in evaluation order.
models = list({
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
}.items())

In [None]:
# One metrics record per baseline model, collected for the leaderboard.
results = []

for name, model in models:
    # Train on the resampled + scaled training data, score on the scaled test split.
    model.fit(X_train_scaled, y_train_balanced)

    y_pred = model.predict(X_test_scaled)
    # Second predict_proba column, used as the score for ROC AUC.
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Recall_Bad': recall_score(y_test, y_pred, pos_label=0),
        'Recall_Good': recall_score(y_test, y_pred, pos_label=1),
        'F1_Score': f1_score(y_test, y_pred),
        'ROC_AUC': roc_auc_score(y_test, y_prob),
    })
    print(f"{name} selesai.")

# Evaluasi

In [None]:
# Rank the baseline models by ROC AUC, breaking ties by recall on class 0.
df_results = (
    pd.DataFrame(results)
    .sort_values(by=['ROC_AUC', 'Recall_Bad'], ascending=False)
)

print("\nLEADERBOARD")
print(df_results.round(3))

# Horizontal bar chart of ROC AUC per model; the x axis is limited to 0.5-1.0.
plt.figure(figsize=(10, 5))
sns.barplot(data=df_results, x='ROC_AUC', y='Model', palette='magma')
plt.title('ROC-AUC Score')
plt.xlim(0.5, 1.0)
plt.show()

# Hyperparameter Tuning

In [None]:
# Hyperparameter grids, one per model family that gets tuned.
rf_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
svm_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf'],
}
xgb_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.7, 1.0],
}

# Display name -> base estimator ('model') and the grid to search over ('params').
tuning_params = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': rf_grid,
    },
    'SVM': {
        'model': SVC(probability=True, random_state=42),
        'params': svm_grid,
    },
    'XGBoost': {
        'model': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        'params': xgb_grid,
    },
}

In [None]:
# Tuned estimator per model family, keyed as '<name>_Tuned'.
best_models = {}

for name, config in tuning_params.items():
    print(f"Memulai Tuning untuk {name}")

    # Resampling and scaling run inside the pipeline, so each CV fold fits them on
    # its own training part only and is validated on real, un-resampled rows.
    # Searching on data that was SMOTE-resampled beforehand puts synthetic rows
    # (interpolated from training neighbours) into the validation folds and
    # inflates the cross-validated score.
    tuning_pipeline = ImbPipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('scaler', StandardScaler()),
        ('model', config['model']),
    ])
    # Route each grid entry to the final 'model' step of the pipeline.
    pipeline_grid = {
        f"model__{param}": values for param, values in config['params'].items()
    }

    grid_search = GridSearchCV(
        estimator=tuning_pipeline,
        param_grid=pipeline_grid,
        scoring='roc_auc',
        cv=10,
        n_jobs=-1,
        verbose=2,
    )

    # Fit on the raw training split; the pipeline resamples and scales per fold.
    grid_search.fit(X_train, y_train)

    # Strip the 'model__' routing prefix so the printed names match the estimator's own.
    best_params = {
        key.split('__', 1)[1]: value for key, value in grid_search.best_params_.items()
    }
    print(f"\nParameter terbaik untuk {name}:")
    print(best_params)
    print(f"Skor ROC AUC terbaik (cross-validated): {grid_search.best_score_:.4f}")

    # Keep only the fitted classifier: downstream cells feed it already-scaled features.
    best_models[name + '_Tuned'] = grid_search.best_estimator_.named_steps['model']

In [None]:
# Untuned reference models for the final comparison, as (display name, estimator) pairs.
models_final = list({
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest_Default': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM_Default': SVC(kernel='rbf', probability=True, random_state=42),
    'XGBoost_Default': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
}.items())

In [None]:
# Add the tuned models to the comparison list. The list is rebuilt instead of
# appended to, so re-running this cell never duplicates the tuned entries and
# always picks up the latest estimators stored in best_models.
models_final = [
    (name, model) for name, model in models_final if name not in best_models
] + list(best_models.items())

# One metrics record per model (default and tuned), collected for the final leaderboard.
results_final = []

for name, model in models_final:
    # Train on the resampled + scaled training data, score on the scaled test split.
    model.fit(X_train_scaled, y_train_balanced)

    y_pred = model.predict(X_test_scaled)
    # Second predict_proba column, used as the score for ROC AUC.
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    results_final.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Recall_Bad': recall_score(y_test, y_pred, pos_label=0),
        'Recall_Good': recall_score(y_test, y_pred, pos_label=1),
        'F1_Score': f1_score(y_test, y_pred),
        'ROC_AUC': roc_auc_score(y_test, y_prob),
    })
    print(f"{name} selesai.")


# Evaluasi (Tuning)

In [None]:
# Rank default and tuned models together by ROC AUC, breaking ties by recall on class 0.
df_results_final = (
    pd.DataFrame(results_final)
    .sort_values(by=['ROC_AUC', 'Recall_Bad'], ascending=False)
)

print("\nLEADERBOARD FINAL (Default vs Tuned)")
print(df_results_final.round(3))

In [None]:
# Keep only the leaderboard rows whose model name contains one of the tuned families.
models_to_compare = ['Random Forest', 'SVM', 'XGBoost']
family_pattern = '|'.join(models_to_compare)
in_comparison = df_results_final['Model'].str.contains(family_pattern)
df_filtered = df_results_final[in_comparison].copy()

# 'Status' flags tuned vs default rows; 'Base Model' is the name without either suffix.
is_tuned = df_filtered['Model'].str.contains('_Tuned')
df_filtered['Status'] = np.where(is_tuned, 'Tuned', 'Default')
df_filtered['Base Model'] = (
    df_filtered['Model']
    .str.replace('_Default', '')
    .str.replace('_Tuned', '')
)

# The plotting cell reads the same frame under this name.
df_plot = df_filtered

In [None]:
# Grouped horizontal bars: one group per base model, one bar per status (default / tuned).
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 7))

status_colors = {'Default': '#B0B0B0', 'Tuned': '#4CAF50'}
sns.barplot(
    data=df_plot,
    x='ROC_AUC',
    y='Base Model',
    hue='Status',
    palette=status_colors,
    ax=ax,
)

# Write the score next to every bar.
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%.3f', padding=5, fontsize=10, color='black')

ax.set_title('Peningkatan Skor ROC AUC Setelah Hyperparameter Tuning', fontsize=16, pad=20, weight='bold')
ax.set_xlabel('ROC AUC Score on Test Data', fontsize=12)
ax.set_ylabel('Model', fontsize=12)
ax.legend(title='Status Model', loc='lower right')

# Zoom the x axis to the observed score range, with a 0.02 margin on each side.
auc_scores = df_plot['ROC_AUC']
ax.set_xlim(left=auc_scores.min() - 0.02, right=auc_scores.max() + 0.02)

plt.tight_layout()
plt.show()


-------

In [None]:
# Persist the final leaderboard as CSV.
df_results_final.to_csv('leaderboard_final.csv', index=False)

# The tuned Random Forest is the model that gets saved; the choice is fixed here,
# not derived from the leaderboard.
final_model = best_models['Random Forest_Tuned']
model_columns = list(X.columns)

# Everything needed to score new rows: the classifier, the fitted scaler and the
# feature column order the model was trained with.
artifacts = {
    'credit_risk_model.pkl': final_model,
    'scaler.pkl': scaler,
    'model_columns.pkl': model_columns,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("File berhasil disimpan!")