In [None]:
"""
02_feature_engineering.py

This script generates new features for the credit risk model, including loan-to-income ratios (LTI),
debt-to-income ratios (DTI), and macroeconomic rolling averages. It merges loan and macroeconomic data
and prepares the dataset for modeling.
"""


In [11]:
import pandas as pd

# Read the cleaned inputs; the listed date columns are parsed to datetimes up front
# because the later cells use the .dt accessor and date arithmetic on them.
loan_df = pd.read_csv(
    '../data/cleaned_loan_data.csv',
    parse_dates=['OriginationDate', 'DefaultDate'],
)
macro_df = pd.read_csv(
    '../data/cleaned_macro_data.csv',
    parse_dates=['Date'],
)


In [12]:
# Loan-level ratio features, an age bucket, and time-to-default.

# Loan amount relative to income, computed once and reused for both ratio columns.
loan_to_income = loan_df['LoanAmount'] / loan_df['Income']

# Loan-to-Income (LTI)
loan_df['LTI'] = loan_to_income

# Debt-to-Income (DTI) ratio
# NOTE(review): DTI is currently computed with exactly the same formula as LTI, so the
# two columns are identical. A true DTI presumably needs a debt / debt-payment column;
# none is referenced in this notebook -- TODO confirm which column to use.
loan_df['DTI'] = loan_to_income

# Age bucket.
# right=False makes the bins left-closed: [20, 30), [30, 40), ... so that age 30 falls in
# '30s' (with the default right=True it was labelled '20s' and age 20 got no bucket at all).
# Ages below 20 or at/above 70 fall outside the bins and become NaN.
loan_df['AgeGroup'] = pd.cut(
    loan_df['CustomerAge'],
    bins=[20, 30, 40, 50, 60, 70],
    labels=['20s', '30s', '40s', '50s', '60s'],
    right=False,
)

# Time to Default, in 30-day "months", rounded to the nearest whole number.
# Rows where the date difference is missing (presumably loans that never defaulted, i.e.
# DefaultDate is NaT -- TODO confirm) are filled with 0, which makes them indistinguishable
# from loans that defaulted within about two weeks of origination.
# NOTE(review): this column is derived from the default outcome; check it is not fed to the
# model as a predictor.
loan_df['TimeToDefaultMonths'] = (
    (loan_df['DefaultDate'] - loan_df['OriginationDate']) / pd.Timedelta(days=30)
).round().fillna(0)


In [13]:
# Attach the macro indicators to each loan, matching on calendar month.

def _month_start(dates):
    """Truncate a datetime Series to the timestamp at the start of its month."""
    return dates.dt.to_period('M').dt.to_timestamp()


# Month-level join keys on both sides
loan_df['OriginationMonth'] = _month_start(loan_df['OriginationDate'])
macro_df['Month'] = _month_start(macro_df['Date'])

# Rename the raw macro date so it is easy to identify and remove after the join
macro_for_join = macro_df.rename(columns={"Date": "MacroDate"})

# Left join keeps every loan row, including loans with no macro data for their month;
# the join-only columns are dropped afterwards.
loan_df = (
    loan_df
    .merge(
        macro_for_join,
        how='left',
        left_on='OriginationMonth',
        right_on='Month',
    )
    .drop(columns=['Month', 'MacroDate'])
)


In [None]:


# 3-row rolling averages of the macro indicators, joined onto each loan by origination month.

# Index a copy by month instead of mutating macro_df in place: with inplace=True the
# 'Month' column disappears from macro_df, so re-running this cell raised a KeyError.
# Sort chronologically because rolling() follows row order, not the dates themselves;
# a stable sort keeps the existing order of any rows that share a month.
macro_by_month = macro_df.set_index('Month').sort_index(kind='mergesort')

# Select numeric columns (the datetime 'Date' column is excluded by this filter)
numeric_cols = macro_by_month.select_dtypes(include='number').columns

# Trailing mean over the current row and the two rows before it; the first two rows
# have no full window and are NaN.
# NOTE(review): "3MAvg" assumes macro_df holds exactly one row per month -- TODO confirm.
rolling_macro_numeric = macro_by_month[numeric_cols].rolling(window=3).mean()

# Reset index to bring 'Month' back as a column
rolling_macro = rolling_macro_numeric.reset_index()

# Rename columns for clarity.
# NOTE(review): names are assigned by position, so this relies on the macro file having
# exactly three numeric columns in the order GDP growth, unemployment, inflation. A
# different count raises a length-mismatch error; a different order would silently
# mislabel the columns -- verify against the macro file.
rolling_macro.columns = ['Month', 'GDP_Growth_3MAvg', 'Unemployment_3MAvg', 'Inflation_3MAvg']

# Merge with loan_df on OriginationMonth and Month, then drop the join-only column
loan_df = loan_df.merge(
    rolling_macro,
    left_on='OriginationMonth',
    right_on='Month',
    how='left'
).drop(columns=['Month'])


Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)


In [15]:
# Persist the engineered dataset; index=False keeps the row index out of the CSV.
loan_df.to_csv(path_or_buf='../data/engineered_loan_data.csv', index=False)
