<a href="https://colab.research.google.com/github/annisafitribas/ft_credit_home/blob/main/ft_credit_home.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PERSIAPAN**

## import library dan workdir

In [1]:
import os, sys, time
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

WORKDIR = '/content/home_credit_task'
os.makedirs(WORKDIR, exist_ok=True)
print("WORKDIR:", WORKDIR)

WORKDIR: /content/home_credit_task


Bertujuan untuk menyiapkan folder kerja dan library yang akan digunakan dalam proses pengolahan data

## install dependencies if missing (Colab-friendly)

In [2]:
try:
    import gdown
except Exception:
    !pip install -q gdown
    import gdown

try:
    import lightgbm as lgb
except Exception:
    !pip install -q lightgbm
    import lightgbm as lgb

try:
    from pptx import Presentation
    from pptx.util import Inches, Pt
except Exception:
    !pip install -q python-pptx
    from pptx import Presentation
    from pptx.util import Inches, Pt

try:
    import matplotlib
    import matplotlib.pyplot as plt
except Exception:
    !pip install -q matplotlib
    import matplotlib.pyplot as plt

## sklearn / joblib

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.impute import SimpleImputer
import joblib
import pandas as pd
import numpy as np
from pathlib import Path

# **1. Download dataset file**

In [4]:
files = {
    'application_train.csv': '1q059QolR6CNxB0PWESAjkEWIprNutajA',
    'application_test.csv' : '1QD7ehk_hzXze0vHQuYa5qyqfDcfI8Sex',
    'bureau.csv'           : '1hndizX1t5ab0DTnKMTedqVJ1ZxLVclhF',
    'bureau_balance.csv'   : '1OXEQb_L6S_mZALJi4--C6RyFI6yOsq4x',
    'credit_card_balance.csv': '1t6Hhsmj0vSCCKUlNXht_xDQ6Z6l4M0Vu',
    'installments_payments.csv': '126xrKCW5EQrxkQoDwmN-yb00ILBKnhR8',
    'POS_CASH_balance.csv' : '1dODAmBQLaylpM2JcCHfc4KNbbtKY7xhA',
    'previous_application.csv': '1D4O7xf-lF_3oBeu6XMwzhpXtSvhcgoBU',
    'HomeCredit_columns_description.csv': '1v2iGGOJjlUGSTsQz-bsjtjtyM5IQp7uW',
    'sample_submission.csv': '1JongVA9fWMYml5XKVnbhm8TUlR5Efs0n'
}

for fname, fid in files.items():
    dest = os.path.join(WORKDIR, fname)
    if not os.path.exists(dest):
        print("Downloading", fname)
        url = f"https://drive.google.com/uc?export=download&id={fid}"
        gdown.download(url, dest, quiet=False)
    else:
        print("Exists:", fname)

Exists: application_train.csv
Exists: application_test.csv
Exists: bureau.csv
Exists: bureau_balance.csv
Exists: credit_card_balance.csv
Exists: installments_payments.csv
Exists: POS_CASH_balance.csv
Exists: previous_application.csv
Exists: HomeCredit_columns_description.csv
Exists: sample_submission.csv


# **2. Load csv**

In [5]:
print("\nLoading CSVs ...")
train = pd.read_csv(os.path.join(WORKDIR, 'application_train.csv'), low_memory=False)
test  = pd.read_csv(os.path.join(WORKDIR, 'application_test.csv'), low_memory=False)
bureau = pd.read_csv(os.path.join(WORKDIR, 'bureau.csv'), low_memory=False)
bureau_balance = pd.read_csv(os.path.join(WORKDIR, 'bureau_balance.csv'), low_memory=False)
credit_card_balance = pd.read_csv(os.path.join(WORKDIR, 'credit_card_balance.csv'), low_memory=False)
installments = pd.read_csv(os.path.join(WORKDIR, 'installments_payments.csv'), low_memory=False)
pos_cash = pd.read_csv(os.path.join(WORKDIR, 'POS_CASH_balance.csv'), low_memory=False)
prev_app = pd.read_csv(os.path.join(WORKDIR, 'previous_application.csv'), low_memory=False)
sample_sub = pd.read_csv(os.path.join(WORKDIR, 'sample_submission.csv'), low_memory=False)

print("Shapes:")
print("train", train.shape, "test", test.shape)
print("bureau", bureau.shape, "bureau_balance", bureau_balance.shape)
print("credit_card_balance", credit_card_balance.shape, "installments", installments.shape)
print("pos_cash", pos_cash.shape, "previous_application", prev_app.shape)


Loading CSVs ...
Shapes:
train (307511, 122) test (48744, 121)
bureau (1716428, 17) bureau_balance (27299925, 3)
credit_card_balance (3840312, 23) installments (13605401, 8)
pos_cash (10001358, 8) previous_application (1670214, 37)


# **3. Quick EDA & visuals**

In [6]:
print("\nTARGET distribution")
print(train['TARGET'].value_counts(normalize=True))

# helper for saving plots
def savefig(fig, filename):
    path = os.path.join(WORKDIR, filename)
    fig.savefig(path, bbox_inches='tight')
    print('Saved plot:', path)


TARGET distribution
TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64


## 3.1 target distribution plot

In [7]:
fig = plt.figure(figsize=(6,4))
ax = fig.add_subplot(111)
counts = train['TARGET'].value_counts().sort_index()
ax.bar(counts.index.astype(str), counts.values)
ax.set_title('Target distribution (counts)')
ax.set_xlabel('TARGET')
ax.set_ylabel('Count')
savefig(fig, 'target_distribution.png')
plt.close(fig)

Saved plot: /content/home_credit_task/target_distribution.png


## 3.2 top missing features (bar)

In [8]:
missing = train.isna().mean().sort_values(ascending=False).head(30)
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
ax.barh(missing.index[::-1], missing.values[::-1])
ax.set_title('Top missing percentage (train)')
ax.set_xlabel('Fraction missing')
savefig(fig, 'missing_pct_top.png')
plt.close(fig)

Saved plot: /content/home_credit_task/missing_pct_top.png


## 3.3 correlation heatmap of numeric features (sampled for speed)

In [9]:
num = train.select_dtypes(include=[np.number]).drop(['SK_ID_CURR','TARGET'], axis=1, errors='ignore')
# sample columns to avoid huge matrix
num_small = num.sample(n=min(30, num.shape[1]), axis=1)
corr = num_small.corr()
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
cax = ax.imshow(corr.values, interpolation='nearest')
ax.set_xticks(np.arange(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=90, fontsize=8)
ax.set_yticks(np.arange(len(corr.columns)))
ax.set_yticklabels(corr.columns, fontsize=8)
ax.set_title('Correlation matrix (subset)')
fig.colorbar(cax, ax=ax)
savefig(fig, 'corr_matrix_subset.png')
plt.close(fig)

Saved plot: /content/home_credit_task/corr_matrix_subset.png


# **4. Feature engineering (same as baseline but kept clear)**

## 4.1 Aggregations dari bureau (per SK_ID_CURR)

In [10]:
b_agg = bureau.groupby('SK_ID_CURR').agg(
    bureau_loans_count = ('SK_ID_BUREAU', 'count'),
    bureau_credit_sum_mean = ('AMT_CREDIT_SUM', 'mean'),
    bureau_credit_sum_max = ('AMT_CREDIT_SUM', 'max'),
    bureau_active_cnt = ('CREDIT_ACTIVE', lambda x: (x=='Active').sum())
).reset_index()

## 4.2 bureau_balance -> bad rate per bureau id then agg

In [11]:
bb_bad = bureau_balance[bureau_balance['STATUS'].isin(['2','3','4','5'])].groupby('SK_ID_BUREAU').size().rename('bad_months')
bb_tot = bureau_balance.groupby('SK_ID_BUREAU').size().rename('total_months')
bb = pd.concat([bb_bad, bb_tot], axis=1).fillna(0)
bb['bad_rate'] = bb['bad_months'] / bb['total_months']
bureau2 = bureau.merge(bb.reset_index(), on='SK_ID_BUREAU', how='left')
b2_agg = bureau2.groupby('SK_ID_CURR').agg(
    bureau_prev_bad_rate_mean = ('bad_rate','mean'),
    bureau_prev_months_mean = ('total_months','mean')
).reset_index()

b_agg = b_agg.merge(b2_agg, on='SK_ID_CURR', how='left')

## 4.3 previous_application aggregates

In [12]:
prev_agg = prev_app.groupby('SK_ID_CURR').agg(
    prev_count = ('SK_ID_PREV','count'),
    prev_amt_app_mean = ('AMT_APPLICATION','mean'),
    prev_amt_credit_mean = ('AMT_CREDIT','mean'),
    prev_approved = ('NAME_CONTRACT_STATUS', lambda x: (x=='Approved').sum())
).reset_index()

## 4.4 installments

In [13]:
inst_agg = installments.groupby('SK_ID_CURR').agg(
    inst_count = ('NUM_INSTALMENT_VERSION','count'),
    inst_amt_sum = ('AMT_PAYMENT','sum'),
    inst_delay_mean = ('DAYS_ENTRY_PAYMENT', lambda x: np.nanmean(x - installments.loc[x.index,'DAYS_INSTALMENT']))
).reset_index()

## 4.5 credit_card & pos

In [14]:
cc_agg = credit_card_balance.groupby('SK_ID_CURR').agg(
    cc_count = ('SK_ID_PREV','count'),
    cc_bal_mean = ('AMT_BALANCE','mean'),
    cc_limit_mean = ('AMT_CREDIT_LIMIT_ACTUAL','mean')
).reset_index()

pos_agg = pos_cash.groupby('SK_ID_CURR').agg(
    pos_count = ('SK_ID_PREV','count'),
    pos_dpd_mean = ('SK_DPD','mean')
).reset_index()

## 4.6 application-level features

In [15]:
def make_app_features(df):
    df = df.copy()
    df['DAYS_BIRTH_YEARS'] = (-df['DAYS_BIRTH']) / 365.25
    df['DAYS_EMPLOYED_YEARS'] = df['DAYS_EMPLOYED'].replace(365243, np.nan) / -365.25
    df['INCOME_CREDIT_RATIO'] = df['AMT_INCOME_TOTAL'] / (df['AMT_CREDIT'] + 1)
    df['CREDIT_GOODS_RATIO'] = df['AMT_CREDIT'] / (df['AMT_GOODS_PRICE'] + 1)
    return df[['SK_ID_CURR','DAYS_BIRTH_YEARS','DAYS_EMPLOYED_YEARS','AMT_INCOME_TOTAL','AMT_CREDIT','INCOME_CREDIT_RATIO','CREDIT_GOODS_RATIO']]

app_train_feats = make_app_features(train)
app_test_feats = make_app_features(test)

# **5. Merge features**

In [16]:
train_base = train[['SK_ID_CURR','TARGET']].merge(b_agg, on='SK_ID_CURR', how='left') \
                               .merge(prev_agg, on='SK_ID_CURR', how='left') \
                               .merge(inst_agg, on='SK_ID_CURR', how='left') \
                               .merge(cc_agg, on='SK_ID_CURR', how='left') \
                               .merge(pos_agg, on='SK_ID_CURR', how='left') \
                               .merge(app_train_feats, on='SK_ID_CURR', how='left')

test_base = test[['SK_ID_CURR']].merge(b_agg, on='SK_ID_CURR', how='left') \
                               .merge(prev_agg, on='SK_ID_CURR', how='left') \
                               .merge(inst_agg, on='SK_ID_CURR', how='left') \
                               .merge(cc_agg, on='SK_ID_CURR', how='left') \
                               .merge(pos_agg, on='SK_ID_CURR', how='left') \
                               .merge(app_test_feats, on='SK_ID_CURR', how='left')

# **6. Prepare X, y; preprocessing**

In [17]:
Y = train_base['TARGET']
X = train_base.drop(['SK_ID_CURR','TARGET'], axis=1)
X_test = test_base.drop(['SK_ID_CURR'], axis=1)

# numeric / categorical separation
num_cols = [c for c in X.columns if X[c].dtype.kind in 'biufc']
cat_cols = [c for c in X.columns if c not in num_cols]

# Impute numeric with median (computed from train)
num_imputer = SimpleImputer(strategy='median')
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

# For categorical (if any), fill and label-encode simple
for c in cat_cols:
    X[c] = X[c].fillna('MISSING').astype(str)
    X_test[c] = X_test[c].fillna('MISSING').astype(str)
for c in cat_cols:
    X[c], _ = pd.factorize(X[c])
    X_test[c], _ = pd.factorize(X_test[c])

# Ensure X_test has all columns (reindex)
X_test = X_test.reindex(columns=X.columns, fill_value=0)

print("\nNumber of features:", X.shape[1])


Number of features: 24


# **7. Train/validation split and scaling**

In [18]:
X_tr, X_val, y_tr, y_val = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# **8. Logistic Regression**

In [19]:
print("\n=== Logistic Regression ===")
lr = LogisticRegression(max_iter=2000, class_weight='balanced', n_jobs=-1)
# 5-fold CV
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lr_cv_scores = []
for tr_idx, vc_idx in kf.split(X, Y):
    lr.fit(scaler.fit_transform(X.iloc[tr_idx]), Y.iloc[tr_idx])
    p = lr.predict_proba(scaler.transform(X.iloc[vc_idx]))[:,1]
    lr_cv_scores.append(roc_auc_score(Y.iloc[vc_idx], p))
print("LR 5-fold AUC: %.5f ± %.5f" % (np.mean(lr_cv_scores), np.std(lr_cv_scores)))

# fit on training partition and validate
lr.fit(X_tr_scaled, y_tr)
proba_lr_val = lr.predict_proba(X_val_scaled)[:,1]
print("LR holdout AUC:", roc_auc_score(y_val, proba_lr_val))
print("LR classification report (holdout, threshold=0.5):")
print(classification_report(y_val, (proba_lr_val>0.5).astype(int)))

# save logistic model & scaler
joblib.dump(lr, os.path.join(WORKDIR, 'model_logistic.pkl'))
joblib.dump(scaler, os.path.join(WORKDIR, 'scaler_logistic.pkl'))

# ROC curve plot (LR)
fpr, tpr, _ = roc_curve(y_val, proba_lr_val)
roc_auc_lr = auc(fpr, tpr)
fig = plt.figure(figsize=(6,5))
ax = fig.add_subplot(111)
ax.plot(fpr, tpr)
ax.plot([0,1],[0,1], linestyle='--')
ax.set_title(f'Logistic ROC (AUC={roc_auc_lr:.4f})')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
savefig(fig, 'roc_logistic.png')
plt.close(fig)



=== Logistic Regression ===
LR 5-fold AUC: 0.67660 ± 0.00183
LR holdout AUC: 0.6779583940318442
LR classification report (holdout, threshold=0.5):
              precision    recall  f1-score   support

           0       0.95      0.64      0.76     56538
           1       0.13      0.62      0.22      4965

    accuracy                           0.63     61503
   macro avg       0.54      0.63      0.49     61503
weighted avg       0.88      0.63      0.72     61503

Saved plot: /content/home_credit_task/roc_logistic.png


# **9. LightGBM**

In [23]:
print("\nLightGBM")

best_lgb = lgb.LGBMClassifier(
    objective='binary',
    random_state=42,
    learning_rate=0.05,
    n_estimators=400,
    num_leaves=63,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1
)

best_lgb.fit(X_tr, y_tr)
proba_lgb_val = best_lgb.predict_proba(X_val)[:,1]

print("LightGBM FAST holdout AUC:", roc_auc_score(y_val, proba_lgb_val))
print("LGB classification report (holdout):")
print(classification_report(y_val, (proba_lgb_val > 0.5).astype(int)))

# Save model
joblib.dump(best_lgb, os.path.join(WORKDIR, 'model_lgb.pkl'))



LightGBM
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.194371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5000
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
LightGBM FAST holdout AUC: 0.6991342168535724
LGB classification report (holdout):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56538
           1       0.42      0.00      0.01      4965

    accuracy                           0.92     61503
   macro avg       0.67      0.50      0.48     61503
weighted avg       0.88      0.92      0.88     61503



['/content/home_credit_task/model_lgb.pkl']

# **10. Feature importance (LGB) and plot**

In [None]:
fi = pd.Series(best_lgb.feature_importances_, index=X.columns).sort_values(ascending=False)
print('\nTop 20 features (LightGBM):')
print(fi.head(20))

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
topn = fi.head(20)[::-1]
ax.barh(topn.index, topn.values)
ax.set_title('Top 20 feature importances (LightGBM)')
ax.set_xlabel('Importance')
savefig(fig, 'feature_importance_top20.png')
plt.close(fig)


# **11. Create submissions**

In [None]:
print('\nCreating submissions ...')
preds_lg_test = best_lgb.predict_proba(X_test)[:,1]
preds_lr_test = lr.predict_proba(X_test_scaled)[:,1]

sub_lgb = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'], 'TARGET': preds_lg_test})
sub_lr  = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'], 'TARGET': preds_lr_test})

sub_lgb.to_csv(os.path.join(WORKDIR, 'submission_lgb.csv'), index=False)
sub_lr.to_csv(os.path.join(WORKDIR, 'submission_logistic.csv'), index=False)
print('Saved:', os.path.join(WORKDIR, 'submission_lgb.csv'))
print('Saved:', os.path.join(WORKDIR, 'submission_logistic.csv'))

# **12. Save report CSV**

In [None]:
report = pd.DataFrame({
    'model': ['LogisticRegression', 'LightGBM'],
    'holdout_auc': [roc_auc_score(y_val, proba_lr_val), roc_auc_score(y_val, proba_lgb_val)]
})
report.to_csv(os.path.join(WORKDIR, 'model_report.csv'), index=False)

# **13. Visualisasi**

In [None]:
from IPython.display import Image, display
import os

print("\nSemua visualisasi sudah dibuat dan disimpan di folder:")
print(WORKDIR)

visuals = [
    'target_distribution.png',
    'missing_pct_top.png',
    'corr_matrix_subset.png',
    'roc_lr.png',
    'roc_lgbm.png',
    'feature_importance_lgbm.png'
]

for v in visuals:
    img_path = os.path.join(WORKDIR, v)
    if os.path.exists(img_path):
        print(f"\n▶ {v}")
        display(Image(img_path))
    else:
        print(f"\n⚠️ File tidak ditemukan: {v}")

print("\nSemua plot siap dipakai untuk PPT atau laporan.")