<a href="https://colab.research.google.com/github/Ysagar-hub/Ysagar-hub/blob/main/Homecreditipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# COMPLETE END-TO-END HOME CREDIT PIPELINE (MASTER SCRIPT)
# ======================================================================================

import os
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
import lightgbm as lgb
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')

# 1. SETUP & DATA LOADING
print(">>> Step 1: Mounting Drive and Loading Data...")
drive.mount('/content/drive/')
data_dir = '/content/drive/MyDrive/mlp'
if not os.path.exists(data_dir): os.makedirs(data_dir)
os.chdir(data_dir)

# Extract if needed
if not os.path.exists('application_train.csv'):
    zip_path = os.path.join(data_dir, 'home-credit-default-risk.zip')
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)

# Load Main Data
train = pd.read_csv('application_train.csv')
test = pd.read_csv('application_test.csv')
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# ======================================================================================
# 2. FEATURE ENGINEERING (ALL FILES)
# ======================================================================================

# --- A. BUREAU DATA (Advanced Active/Closed Logic) ---
print("\n>>> Step 2A: Processing Bureau Data...")
if os.path.exists('bureau.csv'):
    bureau = pd.read_csv('bureau.csv')

    # 1. Basic Stats
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({
        'DAYS_CREDIT': 'mean',
        'SK_ID_BUREAU': 'count'
    }).reset_index()
    bureau_agg.columns = ['SK_ID_CURR', 'BUREAU_DAYS_CREDIT_MEAN', 'BUREAU_LOAN_COUNT']

    # 2. Active Loans
    active = bureau[bureau['CREDIT_ACTIVE'] == 'Active']
    active_agg = active.groupby('SK_ID_CURR').agg({'SK_ID_BUREAU': 'count'}).reset_index()
    active_agg.columns = ['SK_ID_CURR', 'ACTIVE_LOANS_COUNT']

    # 3. Closed Loans
    closed = bureau[bureau['CREDIT_ACTIVE'] == 'Closed']
    closed_agg = closed.groupby('SK_ID_CURR').agg({'SK_ID_BUREAU': 'count'}).reset_index()
    closed_agg.columns = ['SK_ID_CURR', 'CLOSED_LOANS_COUNT']

    # Merge
    train = train.merge(bureau_agg, on='SK_ID_CURR', how='left')
    train = train.merge(active_agg, on='SK_ID_CURR', how='left')
    train = train.merge(closed_agg, on='SK_ID_CURR', how='left')

    test = test.merge(bureau_agg, on='SK_ID_CURR', how='left')
    test = test.merge(active_agg, on='SK_ID_CURR', how='left')
    test = test.merge(closed_agg, on='SK_ID_CURR', how='left')

    # Fill NaNs
    for col in ['BUREAU_LOAN_COUNT', 'ACTIVE_LOANS_COUNT', 'CLOSED_LOANS_COUNT', 'BUREAU_DAYS_CREDIT_MEAN']:
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

# --- B. PREVIOUS APPLICATIONS ---
print(">>> Step 2B: Processing Previous Applications...")
if os.path.exists('previous_application.csv'):
    prev = pd.read_csv('previous_application.csv')
    prev_agg = prev.groupby('SK_ID_CURR').agg({
        'AMT_APPLICATION': 'mean',
        'SK_ID_PREV': 'count',
        'NAME_CONTRACT_STATUS': lambda x: (x == 'Approved').sum()
    }).reset_index()
    prev_agg.columns = ['SK_ID_CURR', 'PREV_AMT_APPLICATION_MEAN', 'PREV_TOTAL_APPS', 'PREV_APPROVED_COUNT']

    train = train.merge(prev_agg, on='SK_ID_CURR', how='left')
    test = test.merge(prev_agg, on='SK_ID_CURR', how='left')

    for col in ['PREV_AMT_APPLICATION_MEAN', 'PREV_TOTAL_APPS', 'PREV_APPROVED_COUNT']:
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

# --- C. POS CASH & INSTALLMENTS ---
print(">>> Step 2C: Processing POS & Installments...")
if os.path.exists('POS_CASH_balance.csv'):
    pos = pd.read_csv('POS_CASH_balance.csv')
    pos_agg = pos.groupby('SK_ID_CURR').agg({'SK_ID_PREV': 'count'}).reset_index()
    pos_agg.columns = ['SK_ID_CURR', 'POS_COUNT']
    train = train.merge(pos_agg, on='SK_ID_CURR', how='left')
    test = test.merge(pos_agg, on='SK_ID_CURR', how='left')
    train['POS_COUNT'] = train['POS_COUNT'].fillna(0)
    test['POS_COUNT'] = test['POS_COUNT'].fillna(0)

if os.path.exists('installments_payments.csv'):
    install = pd.read_csv('installments_payments.csv')
    install_agg = install.groupby('SK_ID_CURR').agg({'AMT_PAYMENT': 'sum'}).reset_index()
    install_agg.columns = ['SK_ID_CURR', 'INSTAL_AMT_PAYMENT_SUM']
    train = train.merge(install_agg, on='SK_ID_CURR', how='left')
    test = test.merge(install_agg, on='SK_ID_CURR', how='left')
    train['INSTAL_AMT_PAYMENT_SUM'] = train['INSTAL_AMT_PAYMENT_SUM'].fillna(0)
    test['INSTAL_AMT_PAYMENT_SUM'] = test['INSTAL_AMT_PAYMENT_SUM'].fillna(0)

# --- D. CREDIT CARD BALANCE ---
print(">>> Step 2D: Processing Credit Card Balance...")
if os.path.exists('credit_card_balance.csv'):
    cc = pd.read_csv('credit_card_balance.csv')
    cc_agg = cc.groupby('SK_ID_CURR').agg({'AMT_DRAWINGS_ATM_CURRENT': 'sum'}).reset_index()
    cc_agg.columns = ['SK_ID_CURR', 'CC_AMT_DRAWINGS_SUM']
    train = train.merge(cc_agg, on='SK_ID_CURR', how='left')
    test = test.merge(cc_agg, on='SK_ID_CURR', how='left')
    train['CC_AMT_DRAWINGS_SUM'] = train['CC_AMT_DRAWINGS_SUM'].fillna(0)
    test['CC_AMT_DRAWINGS_SUM'] = test['CC_AMT_DRAWINGS_SUM'].fillna(0)

# --- E. DOMAIN FEATURES & FIXES ---
print(">>> Step 2E: Creating Domain Features & Fixes...")
for df in [train, test]:
    # Fix DAYS_EMPLOYED anomaly
    df['DAYS_EMPLOYED_ANOM'] = df["DAYS_EMPLOYED"] == 365243
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # Ratios
    df['CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']

    # External Sources Interaction
    df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)

# ======================================================================================
# 3. PREPROCESSING
# ======================================================================================
print("\n>>> Step 3: Preprocessing...")
y = train['TARGET']
X = train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
X_test = test.drop(['SK_ID_CURR'], axis=1)

# Encode Categoricals
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
for col in categorical_columns:
    le = LabelEncoder()
    # Convert to string to handle NaNs in categories
    combined = pd.concat([X[col].astype(str), X_test[col].astype(str)])
    le.fit(combined)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# Fill remaining NaNs with mean
train_mean = X.mean()
X = X.fillna(train_mean).astype(np.float32)
X_test = X_test.fillna(train_mean).astype(np.float32)

# ======================================================================================
# 4. K-FOLD MODEL TRAINING
# ======================================================================================
print("\n>>> Step 4: Starting K-Fold Training (This will take a few minutes)...")
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros(X_test.shape[0])
oom_preds = np.zeros(X.shape[0])

lgbm_params = {
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'num_leaves': 34,
    'colsample_bytree': 0.85,
    'subsample': 0.85,
    'max_depth': 8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_child_weight': 40,
    'random_state': 42,
    'verbosity': -1
}

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"  -> Training Fold {fold_ + 1}...")
    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    clf = LGBMClassifier(**lgbm_params)
    clf.fit(
        X_trn, y_trn,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
    )

    oom_preds[val_idx] = clf.predict_proba(X_val)[:, 1]
    test_preds += clf.predict_proba(X_test)[:, 1] / folds.n_splits

print("\n" + "="*40)
print(f"FINAL CV SCORE: {roc_auc_score(y, oom_preds):.5f}")
print("="*40)

# ======================================================================================
# 5. SUBMISSION
# ======================================================================================
print("\n>>> Step 5: Saving Submission...")
submission = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'], 'TARGET': test_preds})

# Saving with the name you requested
submission.to_csv('submission_sagar.csv', index=False)
print("Success! File 'submission_sagar.csv' is ready.")
print(submission.head())

>>> Step 1: Mounting Drive and Loading Data...
Mounted at /content/drive/
Train shape: (307511, 122), Test shape: (48744, 121)

>>> Step 2A: Processing Bureau Data...
>>> Step 2B: Processing Previous Applications...
>>> Step 2C: Processing POS & Installments...
>>> Step 2D: Processing Credit Card Balance...
>>> Step 2E: Creating Domain Features & Fixes...

>>> Step 3: Preprocessing...

>>> Step 4: Starting K-Fold Training (This will take a few minutes)...
  -> Training Fold 1...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's auc: 0.765251	valid_0's binary_logloss: 0.243869
  -> Training Fold 2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1798]	valid_0's auc: 0.773573	valid_0's binary_logloss: 0.241142
  -> Training Fold 3...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1795]	valid_0's auc: 0.765826	v

In [3]:
# ======================================================================================
# ULTIMATE HOME CREDIT ENSEMBLE PIPELINE (LGBM + XGBOOST)
# ======================================================================================

import os
import zipfile
import re
import gc
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from google.colab import drive

warnings.filterwarnings('ignore')

# 1. SETUP & DATA LOADING
print(">>> Step 1: Mounting Drive and Loading Data...")
drive.mount('/content/drive/')
data_dir = '/content/drive/MyDrive/mlp'
if not os.path.exists(data_dir): os.makedirs(data_dir)
os.chdir(data_dir)

if not os.path.exists('application_train.csv'):
    with zipfile.ZipFile('home-credit-default-risk.zip', 'r') as zip_ref:
        zip_ref.extractall(data_dir)

train = pd.read_csv('application_train.csv')
test = pd.read_csv('application_test.csv')
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# ======================================================================================
# 2. FEATURE ENGINEERING (ADVANCED)
# ======================================================================================
print("\n>>> Step 2: Feature Engineering...")

# --- A. BUREAU ---
if os.path.exists('bureau.csv'):
    bureau = pd.read_csv('bureau.csv')
    # Basic
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({'DAYS_CREDIT': 'mean', 'SK_ID_BUREAU': 'count'}).reset_index()
    bureau_agg.columns = ['SK_ID_CURR', 'BUREAU_DAYS_CREDIT_MEAN', 'BUREAU_LOAN_COUNT']
    # Active
    active = bureau[bureau['CREDIT_ACTIVE'] == 'Active']
    active_agg = active.groupby('SK_ID_CURR').agg({'SK_ID_BUREAU': 'count'}).reset_index()
    active_agg.columns = ['SK_ID_CURR', 'ACTIVE_LOANS_COUNT']
    # Closed
    closed = bureau[bureau['CREDIT_ACTIVE'] == 'Closed']
    closed_agg = closed.groupby('SK_ID_CURR').agg({'SK_ID_BUREAU': 'count'}).reset_index()
    closed_agg.columns = ['SK_ID_CURR', 'CLOSED_LOANS_COUNT']

    for df in [train, test]:
        df = df.merge(bureau_agg, on='SK_ID_CURR', how='left')
        df = df.merge(active_agg, on='SK_ID_CURR', how='left')
        df = df.merge(closed_agg, on='SK_ID_CURR', how='left')
        for col in ['BUREAU_LOAN_COUNT', 'ACTIVE_LOANS_COUNT', 'CLOSED_LOANS_COUNT']:
            df[col] = df[col].fillna(0)

# --- B. PREVIOUS APPS ---
if os.path.exists('previous_application.csv'):
    prev = pd.read_csv('previous_application.csv')
    prev_agg = prev.groupby('SK_ID_CURR').agg({
        'AMT_APPLICATION': 'mean', 'SK_ID_PREV': 'count',
        'NAME_CONTRACT_STATUS': lambda x: (x == 'Approved').sum()
    }).reset_index()
    prev_agg.columns = ['SK_ID_CURR', 'PREV_AMT_APPLICATION_MEAN', 'PREV_TOTAL_APPS', 'PREV_APPROVED_COUNT']

    train = train.merge(prev_agg, on='SK_ID_CURR', how='left')
    test = test.merge(prev_agg, on='SK_ID_CURR', how='left')

    for col in ['PREV_AMT_APPLICATION_MEAN', 'PREV_TOTAL_APPS', 'PREV_APPROVED_COUNT']:
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

# --- C. POS & INSTALLMENTS & CC ---
if os.path.exists('POS_CASH_balance.csv'):
    pos = pd.read_csv('POS_CASH_balance.csv')
    pos_agg = pos.groupby('SK_ID_CURR').agg({'SK_ID_PREV': 'count'}).reset_index()
    pos_agg.columns = ['SK_ID_CURR', 'POS_COUNT']
    train = train.merge(pos_agg, on='SK_ID_CURR', how='left')
    test = test.merge(pos_agg, on='SK_ID_CURR', how='left')
    train['POS_COUNT'] = train['POS_COUNT'].fillna(0); test['POS_COUNT'] = test['POS_COUNT'].fillna(0)

if os.path.exists('installments_payments.csv'):
    install = pd.read_csv('installments_payments.csv')
    install_agg = install.groupby('SK_ID_CURR').agg({'AMT_PAYMENT': 'sum'}).reset_index()
    install_agg.columns = ['SK_ID_CURR', 'INSTAL_AMT_PAYMENT_SUM']
    train = train.merge(install_agg, on='SK_ID_CURR', how='left')
    test = test.merge(install_agg, on='SK_ID_CURR', how='left')
    train['INSTAL_AMT_PAYMENT_SUM'] = train['INSTAL_AMT_PAYMENT_SUM'].fillna(0); test['INSTAL_AMT_PAYMENT_SUM'] = test['INSTAL_AMT_PAYMENT_SUM'].fillna(0)

if os.path.exists('credit_card_balance.csv'):
    cc = pd.read_csv('credit_card_balance.csv')
    cc_agg = cc.groupby('SK_ID_CURR').agg({'AMT_DRAWINGS_ATM_CURRENT': 'sum'}).reset_index()
    cc_agg.columns = ['SK_ID_CURR', 'CC_AMT_DRAWINGS_SUM']
    train = train.merge(cc_agg, on='SK_ID_CURR', how='left')
    test = test.merge(cc_agg, on='SK_ID_CURR', how='left')
    train['CC_AMT_DRAWINGS_SUM'] = train['CC_AMT_DRAWINGS_SUM'].fillna(0); test['CC_AMT_DRAWINGS_SUM'] = test['CC_AMT_DRAWINGS_SUM'].fillna(0)

# --- D. DOMAIN FEATURES ---
print(">>> Step 2D: Domain Features...")
for df in [train, test]:
    df['DAYS_EMPLOYED_ANOM'] = df["DAYS_EMPLOYED"] == 365243
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)
    df['CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)

# ======================================================================================
# 3. PREPROCESSING
# ======================================================================================
print("\n>>> Step 3: Preprocessing...")
y = train['TARGET']
X = train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
X_test = test.drop(['SK_ID_CURR'], axis=1)

# Categorical Encoding
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    combined = pd.concat([X[col].astype(str), X_test[col].astype(str)])
    le.fit(combined)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# XGBoost Requirement: Rename columns to remove spaces or special chars
X.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X.columns]
X_test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test.columns]

# Fill NaNs
train_mean = X.mean()
X = X.fillna(train_mean).astype(np.float32)
X_test = X_test.fillna(train_mean).astype(np.float32)

# ======================================================================================
# 4. ENSEMBLE TRAINING (LGBM + XGBOOST)
# ======================================================================================
print("\n>>> Step 4: Starting Ensemble Training...")
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- MODEL 1: LIGHTGBM ---
print(">>> Training LightGBM (5 Folds)...")
lgb_preds = np.zeros(X_test.shape[0])
lgbm_params = {
    'n_estimators': 2000, 'learning_rate': 0.01, 'num_leaves': 34, 'colsample_bytree': 0.85,
    'subsample': 0.85, 'max_depth': 8, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'min_child_weight': 40,
    'random_state': 42, 'verbosity': -1
}

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    clf = LGBMClassifier(**lgbm_params)
    clf.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], eval_metric='auc',
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
    lgb_preds += clf.predict_proba(X_test)[:, 1] / 5

# --- MODEL 2: XGBOOST ---
print(">>> Training XGBoost (5 Folds)...")
xgb_preds = np.zeros(X_test.shape[0])
xgb_params = {
    'n_estimators': 2000, 'learning_rate': 0.01, 'max_depth': 8, 'subsample': 0.8,
    'colsample_bytree': 0.8, 'tree_method': 'hist', 'random_state': 42, 'eval_metric': 'auc',
    'early_stopping_rounds': 100
}

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    clf = XGBClassifier(**xgb_params)
    clf.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], verbose=0)
    xgb_preds += clf.predict_proba(X_test)[:, 1] / 5

# --- BLENDING ---
print("\n>>> Blending Predictions (50% LGBM + 50% XGBoost)...")
final_preds = (0.5 * lgb_preds) + (0.5 * xgb_preds)

# ======================================================================================
# 5. SUBMISSION
# ======================================================================================
submission = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'], 'TARGET': final_preds})
submission.to_csv('submission_sagar_ensemble.csv', index=False)
print("\nSUCCESS! Final file 'submission_sagar_ensemble.csv' created.")
print(submission.head())

>>> Step 1: Mounting Drive and Loading Data...
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Train shape: (307511, 122), Test shape: (48744, 121)

>>> Step 2: Feature Engineering...
>>> Step 2D: Domain Features...

>>> Step 3: Preprocessing...

>>> Step 4: Starting Ensemble Training...
>>> Training LightGBM (5 Folds)...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's auc: 0.763101	valid_0's binary_logloss: 0.244519
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1995]	valid_0's auc: 0.773305	valid_0's binary_logloss: 0.241305
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1966]	valid_0's auc: 0.763444	valid_0's binary_logloss: 0.244882
Training until validation scores don't improve for 100 rounds
Did