# Credit Risk Model (Home Loan) — Data Gathering (Step 1)

## 1. Gather data (Approved and Rejected Applications)

- Real bank data is confidential, so a **synthetic dataset** was generated to closely replicate industry credit-risk data.
- Large datasets were **split into multiple CSV files** to manage data volume constraints

### 1A. Application-level data

- Created **application-level data** for **approved** home-loan applications, including:
  - Demographics, income, LTV, DTI, cash reserves, employment details
  - Credit score (CIBIL-style) and application score
  - Approval status, with rejections driven by low score, high LTV/DTI, low income, or high loan amount

In [1]:
import pandas as pd
from faker import Faker
import random
import uuid

# ✅ Set seeds so output is repeatable
SEED = 42
random.seed(SEED)
Faker.seed(SEED)
fake = Faker()

data = []

employment_types = [
    "Salaried", "Self-employed", "Contractor", "Hourly",
    "Part-time", "Unemployed", "Retired", "Student"
]

for _ in range(100000):
    income = round(random.uniform(30000, 200000), 2)

    # Generate LTV with more spread (more realistic feature)
    ltv_ratio = round(random.uniform(0.50, 0.95), 2)

    # Loan amount
    loan_amount = round(random.uniform(50000, 500000), 2)

    # Derive property value from LTV
    property_value = round(loan_amount / ltv_ratio, 2)

    # Left-skewed credit score (more high scores)
    credit_score = int(300 + (1 - random.random()**10) * 550)

    # Employment Type
    employment_type = random.choice(employment_types)

    # Cash reserves
    cash_reserves_months = round((1 - (random.random()**3)) * 12, 1)
    cash_reserves_amount = round((income / 12.0) * cash_reserves_months, 2)

    # ✅ Make DTI tied to loan + income (simple EMI proxy)
    # Assume monthly payment proxy = loan_amount * factor
    # factor roughly captures interest+tenure; you can tune range
    emi_factor = random.uniform(0.006, 0.015)  # 0.6% to 1.5% of principal per month
    monthly_payment = loan_amount * emi_factor
    monthly_income = income / 12.0
    dti_ratio = round(monthly_payment / monthly_income, 2)
    dti_ratio = max(0.0, min(dti_ratio, 0.99))

    record = {
        "Customer_ID": str(uuid.uuid4()),
        "Loan_ID": str(uuid.uuid4()),
        "Loan_Amount": loan_amount,
        "Property_Value": property_value,
        "Income": income,
        "Credit_Score": credit_score,
        "Debt_to_Income_Ratio": dti_ratio,
        "LTV_Ratio": ltv_ratio,
        "Employment_Type": employment_type,
        "Cash_Reserves_Months": cash_reserves_months,
        "Cash_Reserves_Amount": cash_reserves_amount,
        "Marital_Status": random.choice(["Single", "Married"]),
        "Property_Occupancy": random.choice(["Owner-occupied", "Rented"]),
        "Application_Score": random.randint(100, 900),
        "Gender": random.choice(["Male", "Female"]),
        "Age": random.randint(21, 70),
        "Current_State": fake.state_abbr(),
        "Current_Property_Type": random.choice(["Own", "Rented"]),
    }
    data.append(record)

df = pd.DataFrame(data)

out_path = "/Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/home_loan_applications.csv"
df.to_csv(out_path, index=False)
print("Saved:", out_path, "Rows:", len(df))


Saved: /Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/home_loan_applications.csv Rows: 100000


### 1B. Bureau tradeline data

- Generated **bureau tradeline data (as of application)** at the tradeline level:
  - Loan type, original amount, balance, tenure, account age
  - Delinquency indicators (30+/60+/90+ in last 3/6/12 months)
  - US vs OFF-US (competitor) lender indicator
  - Tradeline risk patterns aligned with applicant credit quality

In [2]:
import pandas as pd
import random
import uuid
import os

# ---------------------------
# INPUT
# ---------------------------
# df must exist and include at least:
# Customer_ID, Credit_Score, Income, Debt_to_Income_Ratio, Cash_Reserves_Months
CREDIT_SCORE_COL = "Credit_Score"

OUT_DIR = "/Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation"
OUT_PREFIX = "bureau_tradelines_part"
MAX_ROWS_PER_FILE = 1_000_000  # split if > 1M rows per csv

os.makedirs(OUT_DIR, exist_ok=True)

# ---------------------------
# HELPERS
# ---------------------------
def clamp(x, lo, hi):
    return max(lo, min(hi, x))

def score_to_risk(credit_score: int) -> float:
    """
    Score -> risk in [0,1]
    850 => 0 (low risk), 300 => 1 (high risk)
    """
    r = (850 - credit_score) / (850 - 300)
    return clamp(r, 0.0, 1.0)

def applicant_risk(credit_score: int, income: float, dti: float, cash_m: float) -> float:
    """
    Composite applicant risk in [0,1], driven by:
    - credit_score (main)
    - income (higher income => lower risk)
    - dti (higher dti => higher risk)
    - cash_reserves_months (higher => lower risk)

    ✅ LTV effect removed
    """
    r = score_to_risk(credit_score)

    # Income effect
    income_norm = clamp((income - 30_000) / (200_000 - 30_000), 0.0, 1.0)
    r = r + 0.12 * (1 - income_norm)

    # DTI effect
    dti_norm = clamp(dti / 0.90, 0.0, 1.0)
    r = r + 0.35 * dti_norm

    # Cash reserves effect
    cash_norm = clamp(cash_m / 12.0, 0.0, 1.0)
    r = r - 0.25 * cash_norm

    return clamp(r, 0.0, 1.0)

def gen_delinquency_flags_by_risk(risk: float):
    """
    Create nested delinquency indicators based on composite risk.
    """
    # Any delinquency (nested windows)
    p12 = clamp(0.03 + 0.65 * risk, 0.0, 1.0)
    p6  = clamp(0.02 + 0.55 * risk, 0.0, 1.0)
    p3  = clamp(0.01 + 0.45 * risk, 0.0, 1.0)

    any12 = 1 if random.random() < p12 else 0
    any6  = 1 if (any12 == 1 or random.random() < p6) else 0
    any3  = 1 if (any6  == 1 or random.random() < p3) else 0

    # Severity odds
    p60 = clamp(0.10 + 0.55 * risk, 0.0, 1.0)
    p90 = clamp(0.03 + 0.35 * risk, 0.0, 1.0)

    def sev(any_flag: int):
        if any_flag == 0:
            return 0, 0, 0
        dpd30 = 1
        dpd90 = 1 if random.random() < p90 else 0
        dpd60 = 1 if (dpd90 == 1 or random.random() < p60) else 0
        return dpd30, dpd60, dpd90

    d30_12, d60_12, d90_12 = sev(any12)
    d30_6,  d60_6,  d90_6  = sev(any6)
    d30_3,  d60_3,  d90_3  = sev(any3)

    # enforce nesting (12 -> 6 -> 3)
    if d90_12: d90_6 = d90_3 = 1
    if d90_6:  d90_3 = 1
    if d60_12: d60_6 = d60_3 = 1
    if d60_6:  d60_3 = 1
    if d30_12: d30_6 = d30_3 = 1
    if d30_6:  d30_3 = 1

    # enforce severity nesting (90 -> 60 -> 30)
    if d90_12: d60_12 = d30_12 = 1
    if d60_12: d30_12 = 1
    if d90_6:  d60_6  = d30_6  = 1
    if d60_6:  d30_6  = 1
    if d90_3:  d60_3  = d30_3  = 1
    if d60_3:  d30_3  = 1

    return {
        "DPD30p_L3M": d30_3,    "DPD60p_L3M": d60_3,    "DPD90p_L3M": d90_3,
        "DPD30p_L6M": d30_6,    "DPD60p_L6M": d60_6,    "DPD90p_L6M": d90_6,
        "DPD30p_L12M": d30_12,  "DPD60p_L12M": d60_12,  "DPD90p_L12M": d90_12,
    }

def gen_tradeline_amounts_and_tenure_by_risk(tradeline_type: str, risk: float):
    """
    Generate amounts/tenure that respond to risk.
    Higher risk -> higher utilization for cards, etc.
    """
    if tradeline_type in ["Home Loan", "Property Loan"]:
        tenure_months = random.randint(120, 360)
        balance = round(random.uniform(80_000, 800_000), 2)
        low = 0.20 + 0.30 * (1 - risk)
        original_loan_amount = round(balance / random.uniform(low, 0.95), 2)
        original_loan_amount = max(original_loan_amount, balance + 1.0)
        account_age_months = random.randint(1, min(180, tenure_months))
        limit_amt = None

    elif tradeline_type in ["Auto", "Auto Loan"]:
        tenure_months = random.randint(24, 84)
        balance = round(random.uniform(2_000, 60_000), 2)
        original_loan_amount = round(balance / random.uniform(0.20 + 0.25*(1-risk), 0.95), 2)
        original_loan_amount = max(original_loan_amount, balance + 1.0)
        account_age_months = random.randint(1, tenure_months)
        limit_amt = None

    elif tradeline_type == "Personal Loan":
        tenure_months = random.randint(12, 60)
        balance = round(random.uniform(500, 60_000), 2)
        original_loan_amount = round(balance / random.uniform(0.20 + 0.25*(1-risk), 0.95), 2)
        original_loan_amount = max(original_loan_amount, balance + 1.0)
        account_age_months = random.randint(1, tenure_months)
        limit_amt = None

    else:  # Credit Card
        tenure_months = random.randint(12, 180)
        account_age_months = random.randint(1, tenure_months)
        limit_amt = round(random.uniform(1_000, 50_000), 2)

        util_low  = 0.02 + 0.40 * risk
        util_high = 0.25 + 0.75 * risk
        util = clamp(random.uniform(util_low, util_high), 0.01, 1.00)

        balance = round(limit_amt * util, 2)
        original_loan_amount = round(max(balance + 1.0, limit_amt * random.uniform(0.8, 1.5)), 2)

    return balance, limit_amt, original_loan_amount, tenure_months, account_age_months

# ---------------------------
# WRITE IN CHUNKS
# ---------------------------
def write_chunk(rows: list, part_idx: int):
    df_chunk = pd.DataFrame(rows)
    file_name = os.path.join(OUT_DIR, f"{OUT_PREFIX}_{part_idx:03d}.csv")
    df_chunk.to_csv(file_name, index=False)
    print(f"Saved: {file_name}  (rows={len(df_chunk):,})")

# ---------------------------
# MAIN GENERATION
# ---------------------------
part_idx = 1
buffer = []
buffer_rows = 0

tradeline_choices = ["Home Loan", "Property Loan", "Auto Loan", "Credit Card", "Personal Loan"]

for _, loan in df.iterrows():
    Customer_ID = loan["Customer_ID"]

    credit_score = int(loan[CREDIT_SCORE_COL])
    income = float(loan["Income"])
    dti = float(loan["Debt_to_Income_Ratio"])
    cash_m = float(loan["Cash_Reserves_Months"])

    # ✅ Composite risk (NO LTV)
    risk = applicant_risk(credit_score, income, dti, cash_m)

    # Higher risk -> slightly more tradelines
    max_tl = 6 if risk > 0.6 else 5
    num_tradelines = random.randint(2, max_tl)

    # OFFUS higher for risky profiles (still allowed without LTV)
    p_offus = clamp(0.35 + 0.35 * risk, 0.0, 1.0)

    for _ in range(num_tradelines):
        tradeline_type = random.choice(tradeline_choices)

        balance, limit_amt, original_amt, tenure_m, age_m = gen_tradeline_amounts_and_tenure_by_risk(
            tradeline_type, risk
        )
        delinq_flags = gen_delinquency_flags_by_risk(risk)

        lender_flag = "OFF_US" if random.random() < p_offus else "ON_US"

        row = {
            "Customer_ID": Customer_ID,
            "Tradeline_ID": str(uuid.uuid4()),
            "Tradeline_Type": tradeline_type,
            "Credit_Score_At_App": credit_score,
            "Original_Loan_Amount": original_amt,
            "Balance": balance,
            "Limit": None if limit_amt is None else limit_amt,
            "Tenure_Months": tenure_m,
            "Account_Age_Months": age_m,
            "Lender_Flag_US_vs_OFFUS": lender_flag,
            **delinq_flags
        }

        buffer.append(row)
        buffer_rows += 1

        # ✅ flush when reaching max rows
        if buffer_rows >= MAX_ROWS_PER_FILE:
            write_chunk(buffer, part_idx)
            part_idx += 1
            buffer = []
            buffer_rows = 0

# ✅ final flush
if buffer_rows > 0:
    write_chunk(buffer, part_idx)


Saved: /Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/bureau_tradelines_part_001.csv  (rows=352,007)


### 1C. Bureau enquiry data
- Generated **bureau enquiry data at the loan level**:
  - Enquiry loan type and requested amount
  - Enquiry recency flags (30D, 2M, 3M, 6M, 12M) with logical nesting rules

In [3]:
import pandas as pd
import random
import os

# -----------------------------
# CONFIG
# -----------------------------
# df must exist and include at least:
# Customer_ID, Credit_Score, Income, Debt_to_Income_Ratio, Cash_Reserves_Months
CREDIT_SCORE_COL = "Credit_Score"

OUT_DIR = "/Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation"
OUT_PREFIX = os.path.join(OUT_DIR, "bureau_inquiry_loan_level_part")
MAX_ROWS_PER_FILE = 1_000_000
MAX_ENQUIRIES_PER_LOAN = 5

os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# HELPERS
# -----------------------------
def clamp(x, lo, hi):
    return max(lo, min(hi, x))

def score_to_risk(score: int) -> float:
    r = (850 - score) / (850 - 300)
    return clamp(r, 0.0, 1.0)

def applicant_risk(credit_score: int, income: float, dti: float, cash_m: float) -> float:
    """
    Composite applicant risk in [0,1], driven by:
    - credit_score (main)
    - income (higher income => lower risk)
    - dti (higher dti => higher risk)
    - cash_reserves_months (higher => lower risk)

    ✅ LTV effect removed (same as tradelines code)
    """
    r = score_to_risk(credit_score)

    income_norm = clamp((income - 30_000) / (200_000 - 30_000), 0.0, 1.0)
    r = r + 0.12 * (1 - income_norm)

    dti_norm = clamp(dti / 0.90, 0.0, 1.0)
    r = r + 0.35 * dti_norm

    cash_norm = clamp(cash_m / 12.0, 0.0, 1.0)
    r = r - 0.25 * cash_norm

    return clamp(r, 0.0, 1.0)

def requested_amount_by_type(loan_type: str) -> float:
    if loan_type == "Home Loan":
        return round(random.uniform(80_000, 900_000), 2)
    if loan_type == "Property Loan":
        return round(random.uniform(100_000, 1_200_000), 2)
    if loan_type == "Auto Loan":
        return round(random.uniform(5_000, 80_000), 2)
    if loan_type == "Personal Loan":
        return round(random.uniform(1_000, 100_000), 2)
    if loan_type == "Credit Card":
        return round(random.uniform(500, 50_000), 2)
    return round(random.uniform(1_000, 100_000), 2)

def choose_inquiry_type_by_risk(risk: float) -> str:
    """
    Low risk -> more Home/Property
    High risk -> more Auto/PL/CC
    """
    types = ["Home Loan", "Property Loan", "Auto Loan", "Personal Loan", "Credit Card"]
    w_home = 30 * (1 - risk) + 5
    w_prop = 20 * (1 - risk) + 5
    w_auto = 10 * risk + 8
    w_pl   = 18 * risk + 8
    w_cc   = 22 * risk + 8
    return random.choices(types, weights=[w_home, w_prop, w_auto, w_pl, w_cc], k=1)[0]

def choose_highest_window_by_risk(risk: float) -> int:
    """
    Higher risk -> more likely to have recent inquiries (30D/2M/3M)
    """
    w0 = 35 * (1 - risk) + 5
    w1 = 14 * risk + 6
    w2 = 14 * risk + 7
    w3 = 12 * risk + 8
    w4 = 10 * (1 - risk) + 10
    w5 = 8  * (1 - risk) + 8
    return random.choices([0, 1, 2, 3, 4, 5], weights=[w0, w1, w2, w3, w4, w5], k=1)[0]

def choose_num_enquiries(risk: float) -> int:
    """
    Returns 0..5 enquiries per loan.
    Higher risk => more enquiries.
    """
    w0 = 25 * (1 - risk) + 5
    w1 = 18
    w2 = 16
    w3 = 14 * risk + 8
    w4 = 18 * risk + 6
    w5 = 22 * risk + 5
    return random.choices([0, 1, 2, 3, 4, 5], weights=[w0, w1, w2, w3, w4, w5], k=1)[0]

def flush_to_csv(rows, part_idx):
    out_name = f"{OUT_PREFIX}_{part_idx:03d}.csv"
    pd.DataFrame(rows).to_csv(out_name, index=False)
    print(f"Saved: {out_name}  (rows={len(rows):,})")

# -----------------------------
# GENERATE + SPLIT
# -----------------------------
buffer = []
buffer_rows = 0
part_idx = 1

for _, loan in df.iterrows():
    Customer_ID = loan["Customer_ID"]

    credit_score = int(loan[CREDIT_SCORE_COL])
    income = float(loan["Income"])
    dti = float(loan["Debt_to_Income_Ratio"])
    cash_m = float(loan["Cash_Reserves_Months"])

    # ✅ composite risk (same concept as tradelines)
    risk = applicant_risk(credit_score, income, dti, cash_m)

    # ✅ OFFUS higher for risky profiles (same pattern as tradelines)
    p_offus = clamp(0.35 + 0.35 * risk, 0.0, 1.0)

    # ✅ generate 0..5 enquiries per loan
    n_enq = choose_num_enquiries(risk)

    for enq_idx in range(1, n_enq + 1):
        inq_loan_type = choose_inquiry_type_by_risk(risk)

        base_amt = requested_amount_by_type(inq_loan_type)

        # ✅ requested amount adjusted by risk:
        # higher risk -> slightly smaller requests on avg
        score_multiplier = clamp(random.uniform(0.80, 1.05) * (1.05 - 0.25 * risk), 0.60, 1.20)
        inq_amount = round(base_amt * score_multiplier, 2)

        # ✅ inquiry recency window depends on risk
        highest = choose_highest_window_by_risk(risk)
        flags = {
            "Enq_L30D": 1 if highest >= 1 else 0,
            "Enq_L2M":  1 if highest >= 2 else 0,
            "Enq_L3M":  1 if highest >= 3 else 0,
            "Enq_L6M":  1 if highest >= 4 else 0,
            "Enq_L12M": 1 if highest >= 5 else 0,
        }

        lender_flag = "OFF_US" if random.random() < p_offus else "ON_US"

        row = {
            "Customer_ID": Customer_ID,
            "Credit_Score_At_App": credit_score,
            "Inquiry_Seq": enq_idx,
            "Inquiry_Loan_Type": inq_loan_type,
            "Inquiry_Requested_Amount": inq_amount,
            "Lender_Flag_US_vs_OFFUS": lender_flag,
            **flags
        }

        buffer.append(row)
        buffer_rows += 1

        if buffer_rows >= MAX_ROWS_PER_FILE:
            flush_to_csv(buffer, part_idx)
            part_idx += 1
            buffer = []
            buffer_rows = 0

# ✅ final flush
if buffer_rows > 0:
    flush_to_csv(buffer, part_idx)


Saved: /Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/bureau_inquiry_loan_level_part_001.csv  (rows=175,273)


### 1D. Post-approval loan performance data

- Generated **post-approval loan performance data** for approved loans:
  - Month-on-month performance up to **60 months on book (MOB)**
  - Amount paid, outstanding balance, and current DPD
  - Delinquency dynamics modeled as a function of **application-time credit score, LTV, income, and cash-reserve strength**

In [4]:
import pandas as pd
import random
import csv
from pathlib import Path

# ------------------------------------------------------------
# MONTHLY PERFORMANCE (Approved loans only) till 60 MOB
# Now DPD + Payment behavior depends on:
#   - CIBIL score (Credit_Score)
#   - LTV_Ratio
#   - Income
#   - Cash_Reserves_Months
#   - Cash_Reserves_Amount
#   - DTI_Ratio   <-- ADDED
# ------------------------------------------------------------

# -----------------------------
# CONFIG
# -----------------------------
OUT_DIR = Path("/Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation")
OUT_PREFIX = "loan_performance_part"
MOB_MAX = 60
MAX_ROWS_PER_FILE = 1000000

# Column names (change if yours differ)
CIBIL_COL = "Credit_Score"               # CIBIL-like score
LTV_COL = "LTV_Ratio"                    # e.g., 0.70 .. 0.95
INCOME_COL = "Income"                    # annual income
DTI_COL = "DTI_Ratio"                    # <-- ADDED (e.g., 0.20 .. 0.55)
RES_M_COL = "Cash_Reserves_Months"       # months of reserves
RES_A_COL = "Cash_Reserves_Amount"       # dollar amount of reserves
START_BAL_COL = "Loan_Amount"            # starting balance / principal

# -----------------------------
# HELPERS
# -----------------------------
def clamp(x, lo, hi):
    return max(lo, min(hi, x))

def cibil_to_risk(cibil: float) -> float:
    """
    300 -> high risk (1.0), 900 -> low risk (0.0)
    """
    cibil = float(cibil)
    r = (850 - cibil) / (850 - 300)
    return clamp(r, 0.0, 1.0)

def dti_to_risk(dti: float) -> float:
    """
    Map DTI to risk in [0,1].
    Typical mortgage DTI bands:
      <= 0.20 very good (low risk)
      0.43 common underwriting cutoff
      >= 0.55 very stretched (high risk)
    """
    dti = float(dti)
    # 0.20 (good) -> 0 risk add, 0.65 (bad) -> 1
    return clamp((dti - 0.00) / (1 - 0.00), 0.0, 1.0)

def risk_from_inputs(cibil, ltv, income, dti, res_months, res_amount, loan_amt):
    """
    Combine multiple drivers into a single risk score in [0,1].
    Higher = worse.
    """
    # Base risk from CIBIL
    r = 0.60 * cibil_to_risk(cibil)   # slightly reduced to make room for DTI

    # LTV: higher LTV => higher risk
    ltv = float(ltv)
    ltv_r = clamp((ltv - 0.10) / (0.95 - 0.10), 0.0, 1.0)
    r += 0.08 * ltv_r

    # Income relative to loan (affordability proxy)
    income = float(income)
    loan_amt = float(loan_amt)
    itl = income / max(loan_amt, 1.0)
    itl_r = 1.0 - clamp((itl - 0.05) / (0.80 - 0.05), 0.0, 1.0)
    r += 0.10 * itl_r

    # DTI: higher DTI => higher risk  <-- ADDED
    dti_r = dti_to_risk(dti)
    r += 0.12 * dti_r

    # Cash reserves months: higher is better
    res_months = float(res_months)
    rm_r = 1.0 - clamp(res_months / 12.0, 0.0, 1.0)
    r += 0.06 * rm_r

    # Cash reserves amount relative to monthly scheduled payment (strong cure ability)
    res_amount = float(res_amount)
    est_sched = loan_amt / MOB_MAX
    cover_months = res_amount / max(est_sched, 1.0)
    ra_r = 1.0 - clamp(cover_months / 12.0, 0.0, 1.0)
    r += 0.04 * ra_r

    return clamp(r, 0.0, 1.0)

def next_dpd(prev_dpd: int, mob: int, risk: float) -> int:
    """
    Transition rule: dpd ∈ {0, prev-30, prev+30} (bounded by max_dpd)
    Probabilities depend on risk.
    """
    max_dpd = (mob - 1) * 30
    candidates = [0, max(prev_dpd - 30, 0), prev_dpd + 30]
    candidates = [d for d in dict.fromkeys(candidates) if 0 <= d <= max_dpd]
    if not candidates:
        return 0

    if prev_dpd == 0:
        p_to30 = 0.03 + 0.25 * risk
        p_to0  = 1.0 - p_to30
        weights = []
        for d in candidates:
            if d == 0:
                weights.append(p_to0)
            elif d == 30:
                weights.append(p_to30)
            else:
                weights.append(0.0)
        s = sum(weights)
        weights = [w / s for w in weights]
        return random.choices(candidates, weights=weights, k=1)[0]

    p_cure = 0.10 + 0.06 * (1 - risk)
    p_back = 0.20 + 0.4 * (1 - risk)
    p_fwd  = 1.0 - (p_cure + p_back)
    p_fwd = clamp(p_fwd, 0.05, 0.80)

    weights = []
    for d in candidates:
        if d == 0:
            weights.append(p_cure)
        elif d == max(prev_dpd - 30, 0):
            weights.append(p_back)
        elif d == prev_dpd + 30:
            weights.append(p_fwd)
        else:
            weights.append(0.01)
    s = sum(weights)
    weights = [w / s for w in weights]
    return random.choices(candidates, weights=weights, k=1)[0]

def amount_paid(balance: float, scheduled: float, dpd: int, risk: float, reserves_amt: float) -> float:
    """
    Payment depends on dpd and risk. Higher risk => more underpay/skip.
    Also, higher reserves => higher chance to pay/cure.
    """
    if balance <= 0:
        return 0.0

    reserves_amt = float(reserves_amt)
    buffer_strength = clamp((reserves_amt / max(scheduled, 1.0)) / 12.0, 0.0, 1.0)

    if dpd == 0:
        base_low = 0.85 + 0.10 * buffer_strength
        base_high = 1.30 + 0.20 * buffer_strength
        mult = random.uniform(base_low, base_high) * (1.05 - 0.20 * risk)
        pay = scheduled * mult
    else:
        underpay_low = 0.00 + 0.10 * buffer_strength
        underpay_high = 0.55 + 0.30 * buffer_strength
        underpay_high *= (1.00 - 0.35 * risk)
        pay = scheduled * random.uniform(underpay_low, max(underpay_low, underpay_high))

    return round(min(pay, balance), 2)

# -----------------------------
# CSV WRITING
# -----------------------------
def open_new_writer(part_idx: int):
    out_path = OUT_DIR / f"{OUT_PREFIX}_{part_idx:03d}.csv"
    f = open(out_path, "w", newline="")
    writer = csv.DictWriter(
        f,
        fieldnames=[
            "Loan_ID", "MOB", "Amount_Paid", "Balance", "Current_DPD",
            "Risk_Score"
        ],
    )
    writer.writeheader()
    return f, writer, out_path

# -----------------------------
# MAIN
# -----------------------------
approved = df.copy()

part_idx = 1
f, writer, out_path = open_new_writer(part_idx)
rows_written = 0

for _, row in approved.iterrows():
    Loan_ID = row["Loan_ID"]
    start_balance = float(row[START_BAL_COL])

    # pull risk drivers (if any missing, set reasonable defaults)
    cibil = float(row.get(CIBIL_COL, 750))
    ltv = float(row.get(LTV_COL, 0.80))
    income = float(row.get(INCOME_COL, 90_000))
    dti = float(row.get(DTI_COL, 0.35))              # <-- ADDED default DTI
    res_m = float(row.get(RES_M_COL, 3))
    res_a = float(row.get(RES_A_COL, 10_000))

    risk = risk_from_inputs(cibil, ltv, income, dti, res_m, res_a, start_balance)

    balance = start_balance
    scheduled = start_balance / MOB_MAX
    dpd = 0

    for mob in range(1, MOB_MAX + 1):
        dpd = next_dpd(dpd, mob, risk)

        paid = amount_paid(balance, scheduled, dpd, risk, res_a)
        balance = round(max(balance - paid, 0.0), 2)

        if balance <= 0:
            dpd = 0

        if rows_written >= MAX_ROWS_PER_FILE:
            f.close()
            print(f"Saved: {out_path}  (rows={rows_written:,})")
            part_idx += 1
            f, writer, out_path = open_new_writer(part_idx)
            rows_written = 0

        writer.writerow({
            "Loan_ID": Loan_ID,
            "MOB": mob,
            "Amount_Paid": paid,
            "Balance": balance,
            "Current_DPD": dpd,
            "Risk_Score": round(risk, 4)
        })
        rows_written += 1

f.close()
print(f"Saved: {out_path}  (rows={rows_written:,})")

Saved: /Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/loan_performance_part_001.csv  (rows=1,000,000)
Saved: /Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/loan_performance_part_002.csv  (rows=1,000,000)
Saved: /Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/loan_performance_part_003.csv  (rows=1,000,000)
Saved: /Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/loan_performance_part_004.csv  (rows=1,000,000)
Saved: /Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/loan_performance_part_005.csv  (rows=1,000,000)
Saved: /Users/abhijit/Desktop/Courses/Application Score Card - Home Loan new/Data/1. Data Generation/loan_performance_part_006.csv  (rows=1,000,000)
