In [1]:
import pandas as pd
# Load the raw credit data set.
# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this exact location exists.
raw_csv_path = r"E:\Projects Data Scientist\credit-risk-project\data\credit_data.csv"
df = pd.read_csv(raw_csv_path)

In [2]:
# Missing Income Handling
# Work on a copy so the raw frame `df` is left untouched.
df_clean = df.copy()
income_col = df_clean["MonthlyIncome"]

# Missing income flag: 1 where income was absent, 0 otherwise.
# Derived from the un-imputed column, so the information survives imputation.
df_clean["IncomeMissingFlag"] = income_col.isna().astype(int)

# Median imputation: fill the gaps with the median of the observed incomes.
median_income = income_col.median()
df_clean["MonthlyIncome"] = income_col.fillna(median_income)

In [4]:
# Cap Utilization & Debt Ratio
import numpy as np

# Upper limits: the 99th percentile of each column, taken before any capping.
util_cap, debt_cap = (
    df_clean[col].quantile(0.99)
    for col in ("RevolvingUtilizationOfUnsecuredLines", "DebtRatio")
)

# Cap utilization: values above the limit are replaced by the limit itself.
df_clean["RevolvingUtilizationOfUnsecuredLines"] = np.minimum(
    df_clean["RevolvingUtilizationOfUnsecuredLines"], util_cap
)

# Cap debt ratio the same way.
df_clean["DebtRatio"] = np.minimum(df_clean["DebtRatio"], debt_cap)

In [5]:
# Risk-Meaningful Feature Engineering

# Total Delinquency Count: row-wise sum of the three past-due counters.
# `Series.add` is the method form of `+`, so a missing value in any of the
# three columns still makes the total missing for that row.
# NOTE(review): if these counters carry sentinel codes rather than real
# counts, the total will be inflated -- confirm against the data dictionary.
df_clean["TotalDelinquencyCount"] = (
    df_clean["NumberOfTime30-59DaysPastDueNotWorse"]
    .add(df_clean["NumberOfTime60-89DaysPastDueNotWorse"])
    .add(df_clean["NumberOfTimes90DaysLate"])
)

In [6]:
# High Utilization Flag
# 1 when revolving utilization is strictly above 0.8, otherwise 0
# (a missing utilization compares False and therefore maps to 0).
df_clean["HighUtilizationFlag"] = (
    df_clean["RevolvingUtilizationOfUnsecuredLines"].gt(0.8).astype(int)
)

In [7]:
# Age Bucket
# Bin `age` into five labelled bands. `right=False` makes every interval
# closed on the left and open on the right -- [0, 25), [25, 35), [35, 50),
# [50, 65), [65, inf) -- so the boundaries agree with the labels: a
# 25-year-old lands in "25-35" (not "<25") and a 65-year-old in "65+".
# It also keeps age == 0 inside the first band; the default right-closed
# (0, 25] interval would have turned it into NaN.
# Negative or missing ages still yield NaN.
df_clean["AgeBucket"] = pd.cut(
    df_clean["age"],
    bins=[0, 25, 35, 50, 65, np.inf],
    labels=["<25", "25-35", "35-50", "50-65", "65+"],
    right=False,
)

In [9]:
# Save cleaned dataset for modeling
# The row index is not written to the file.
# NOTE(review): absolute, machine-specific path; the target folder must
# already exist or the write fails.
clean_csv_path = r"E:\Projects Data Scientist\credit-risk-project\data\processed\credit_risk_clean.csv"
df_clean.to_csv(clean_csv_path, index=False)