In [2]:
# PreMerge_EDA.ipynb

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#–– 1. Setup output folders
os.makedirs('../eda/origination', exist_ok=True)
os.makedirs('../eda/performance', exist_ok=True)

#–– 2. Load all origination & performance Parquets (1999–2024)
orig_list = []
perf_list = []
for year in range(1999, 2025):
    o_path = f"../data/interim/origination_{year}.parquet"
    p_path = f"../data/interim/performance_{year}.parquet"
    if os.path.exists(o_path):
        df_o = pd.read_parquet(o_path)
        df_o['orig_year'] = year
        orig_list.append(df_o)
    if os.path.exists(p_path):
        df_p = pd.read_parquet(p_path)
        df_p['perf_year'] = year
        perf_list.append(df_p)


orig_all = pd.concat(orig_list, ignore_index=True)
perf_all = pd.concat(perf_list, ignore_index=True)





In [3]:
#–– 3. Origination: type conversions & cleaning
if 'First Payment Date' in orig_all.columns:
    orig_all['First Payment Date'] = pd.to_datetime(
        orig_all['First Payment Date'], format='%Y%m', errors='coerce'
    )
numeric_cols = [
    'Credit Score',
    'Original Combined Loan-to-Value (CLTV)',
    'Original Debt-to-Income (DTI) Ratio'
]
for col in numeric_cols:
    if col in orig_all.columns:
        orig_all[col] = pd.to_numeric(orig_all[col], errors='coerce')

# Rename for ease
orig_all.rename(columns={
    'Credit Score': 'credit_score',
    'Original Combined Loan-to-Value (CLTV)': 'cltv',
    'Original Debt-to-Income (DTI) Ratio': 'dti'
}, inplace=True)




In [3]:
#–– 4. Performance: type conversions & cleaning
if 'Monthly Reporting Period' in perf_all.columns:
    perf_all['Monthly Reporting Period'] = pd.to_datetime(
        perf_all['Monthly Reporting Period'], format='%Y%m', errors='coerce'
    )
perf_all['loan_age'] = pd.to_numeric(perf_all.get('Loan Age'), errors='coerce')
perf_all['delinq']   = pd.to_numeric(perf_all.get('Current Loan Delinquency Status'), errors='coerce')
perf_all['upb']      = pd.to_numeric(perf_all.get('Current Actual UPB'), errors='coerce')
perf_all['zbc']      = pd.to_numeric(perf_all.get('Zero Balance Code'), errors='coerce')

In [5]:
#–– 5. Origination EDA

# 5.1 Yearly summary stats
orig_summary = (
    orig_all
    .groupby('orig_year')[['credit_score','cltv','dti']]
    .agg(['mean','median','count','std','min','max'])
)
orig_summary.to_csv('../eda/origination/orig_summary_stats_by_year.csv')

# 5.2 Trends of means
for feat in ['credit_score','cltv','dti']:
    plt.figure(figsize=(8,4))
    sns.lineplot(
        x=orig_summary.index,
        y=orig_summary[feat]['mean'],
        marker='o'
    )
    plt.title(f'Mean {feat.upper()} by Origination Year')
    plt.xlabel('Year'); plt.ylabel(feat.upper())
    plt.tight_layout()
    plt.savefig(f'../eda/origination/mean_{feat}_by_year.png')
    plt.close()

# 5.3 Missingness heatmap
miss_year = orig_all.isna().groupby(orig_all['orig_year']).mean().T
plt.figure(figsize=(10,6))
sns.heatmap(miss_year, cmap='viridis', cbar=True)
plt.title('% Missing in Origination Features by Year')
plt.xlabel('Year'); plt.ylabel('Feature')
plt.tight_layout()
plt.savefig('../eda/origination/missingness_by_year.png')
plt.close()

# 5.4 Yearly distributions for CLTV & DTI
for feat in ['cltv','dti']:
    g = sns.FacetGrid(orig_all, col='orig_year', col_wrap=6, height=2, sharex=False, sharey=False)
    g.map_dataframe(sns.histplot, feat, bins=20)
    g.figure.suptitle(f'Distribution of {feat.upper()} by Year', y=1.02)
    g.figure.tight_layout()
    g.savefig(f'../eda/origination/{feat}_dist_by_year.png')
    plt.close()



In [6]:
# 5.5 Categorical variable frequencies & visualizations
cats = ['Occupancy Status','First Time Homebuyer Flag','Loan Purpose']
for cat in cats:
    if cat in orig_all.columns:
        # compute yearly proportions
        freq = (orig_all
                .groupby('orig_year')[cat]
                .value_counts(normalize=True)
                .unstack(fill_value=0))
        freq.to_csv(f'../eda/origination/{cat.replace(" ","_")}_freq_by_year.csv')
        # plot top categories over time (area chart)
        top_n = freq.columns[:5]  # top 5 levels by alphabetical (or adjust)
        plt.figure(figsize=(10,4))
        freq[top_n].plot.area(alpha=0.6)
        plt.title(f'{cat} Proportions by Year')
        plt.xlabel('Origination Year'); plt.ylabel('Proportion')
        plt.legend(title=cat, bbox_to_anchor=(1.02,1), loc='upper left')
        plt.tight_layout()
        plt.savefig(f'../eda/origination/{cat.replace(" ","_")}_by_year.png')
        plt.close()


<Figure size 1000x400 with 0 Axes>

<Figure size 1000x400 with 0 Axes>

<Figure size 1000x400 with 0 Axes>


Loan Purpose Proportions by Year


Purchase (“P”) loans dominate by share in almost every vintage—hovering around 50 % in the early 2000’s, dipping in the mid‑2000’s refinance boom, then climbing back to nearly 100 % of originations by 2024 as rates rose.

The “N” category (non‑purchase, e.g. cash‑out refinance) surges pre‑crisis (peaking ~60 % around ’08–’12) then fades.

The low and shrinking “C” and “9” slices reflect a retreat of certain niche programs or “not‑available” codes.

First‑Time Homebuyer Flag by Year


The share of first‑time buyers (“Y”) was modest (10–15 %) from 1999 through ~2021, dipped further post‑crisis, then spiked sharply to ~40 % in 2023–24—likely driven by down‑payment assistance and low rate lock‑ins for new buyers.

Occupancy Status by Year


Primary residence (“P”) consistently makes up ~90–95 % of originations.

Investor (“I”) and second‑home (“S”) shares are both quite small (each <10 %) and show only minor year‑to‑year wiggles.

Borrower Risk Metrics Over Time


Debt‑to‑Income (DTI) Distributions

Pre‑crisis (’99–’07) DTI centers around 30–35 % with roughly symmetric spreads.

2008–12 shows a tightening: mean DTI falls back toward ~32 %.

Post‑2020 distributions steadily shift right—by 2024 the bulk of originations cluster between 35–50 %, indicating looser underwriting on borrower leverage.


Loan‑to‑Value (CLTV) Distributions

Early CLTV vintages (~’99–’04) concentrate around 75–90 % with a long tail above 100 %.

Around 2008–10 CLTV drops sharply (tighter LTV caps), then from 2011–15 it loosens again, with a peak mean CLTV ~79 % in 2012.

Recent years (2021–24) settle back in the mid‑70’s but show a modest upward drift.

Mean Credit Score	Rises from ~720 (2000) into the 760 + zone in 2010–12, dips into the 745–750 range thereafter, with a slight rebound to ~752 in 2024.


Missingness Patterns
Most origination fields (FICO score, LTV, DTI, occupancy, loan purpose, etc.) are nearly 100% populated by design. Some fields like “Pre‑HARP indicator” only appear for late‑vintage loans, and servicer/MSA codes have occasional blanks. We’ll drop or impute these very sparse fields, but overall data quality is excellent.



In [7]:
#–– 6. Performance EDA

# 6.1 Yearly summary stats
perf_summary = (
    perf_all
    .groupby('perf_year')
    .agg(total_records=('loan_age','size'),
         pct_delinquent=('delinq', lambda x: (x>0).mean()),
         mean_upb=('upb','mean'))
)
perf_summary.to_csv('../eda/performance/perf_summary_stats_by_year.csv')

# 6.2 Delinquency & UPB trends
fig, ax1 = plt.subplots(figsize=(8,4))
ax1.plot(perf_summary.index, perf_summary['pct_delinquent']*100, 'r-o')
ax1.set_ylabel('% Delinquent', color='r')
ax2 = ax1.twinx()
ax2.plot(perf_summary.index, perf_summary['mean_upb'], 'b-o')
ax2.set_ylabel('Mean UPB', color='b')
plt.title('Delinquency Rate & Mean UPB by Performance Year')
plt.xlabel('Year')
plt.tight_layout()
plt.savefig('../eda/performance/delinq_upb_trends.png')
plt.close()

# 6.3 Loan-age overall distribution
plt.figure(figsize=(6,4))
sns.histplot(perf_all['loan_age'].dropna(), bins=50)
plt.title('Overall Loan Age Distribution (1999–2024)')
plt.xlabel('Loan Age (months)'); plt.ylabel('Count')
plt.tight_layout()
plt.savefig('../eda/performance/loan_age_overall.png')
plt.close()




In [4]:
# 6.4 Performance missingness heatmap

miss_perf = perf_all.isna().groupby(perf_all['perf_year']).mean().T
plt.figure(figsize=(10,6))
sns.heatmap(miss_perf, cmap='magma', cbar=True)
plt.title('% Missing in Performance Features by Year')
plt.xlabel('Perf Year'); plt.ylabel('Feature')
plt.tight_layout()
plt.savefig('../eda/performance/missingness_by_year.png')
plt.close()

: 

KeyboardInterrupt: 

Delinquency Status
Across the full 1999–2024 period, the vast majority of loan‑months are current (0 months delinquent). Early in a loan’s life we see almost zero serious delinquencies, and even for older vintages the share of 30+ day delinquencies never exceeds a few percent. This long “tail” of current loans and very small fraction of high‑severity delinquencies underscores the extreme class imbalance we’ll face when building our PD models.

Loan Age Distribution
The overall loan‑age histogram confirms the familiar “seasoning” pattern of a 30‑year mortgage: lots of records in the first few months, gradually tapering off as loans prepay or mature around month 360. This tells us that features tied to loan age (e.g. seasoning effects) will be important in dynamic models, but in a static snapshot we may need to cap or bin age to avoid sparsity at high ages.

Delinquency Rate & UPB Trends
When we aggregate by calendar year, we observe the late‑2000s housing crisis visibly as a spike in the percentage of delinquent loans and a corresponding dip in average UPB (as many loans defaulted or were extinguished). More recent years show delinquency back near historic lows, while balances continue to climb. These cyclical swings reinforce the need to capture macro or vintage effects in our static PD model, or to drive stress scenarios in a dynamic framework.

In [6]:
#–– 7. Zero Balance Code Analysis
# Only final record per loan has ZBC; we'll look at the frequency of termination codes.
zbc_counts = perf_all.groupby('perf_year')['zbc'].value_counts(dropna=False).unstack().fillna(0)
zbc_counts.to_csv('../eda/performance/zbc_freq_by_year.csv')

# Plot overall proportions
zbc_props = zbc_counts.div(zbc_counts.sum(axis=1), axis=0)
plt.figure(figsize=(8,5))
zbc_props[[1, 2,3,9,96]].plot(kind='bar', stacked=True)
plt.title('Termination Code Proportions by Performance Year')
plt.xlabel('Year'); plt.ylabel('Proportion')
plt.tight_layout()
plt.savefig('../eda/performance/zbc_proportions_by_year.png')
plt.close()

#

<Figure size 800x500 with 0 Axes>

In [None]:
# –– 8. Correlation & PCA for origination numeric features
# numeric_feats = ['credit_score','cltv','dti','Original UPB','Original Loan Term']
# corr = orig_all[numeric_feats].corr()
# plt.figure(figsize=(6,5))
# sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
# plt.title("Correlation Matrix of Origination Numerical Features")
# plt.tight_layout()
# plt.savefig('eda/origination/corr_matrix_numeric.png')
# plt.close()
#
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# X = orig_all[numeric_feats].dropna().fillna(0)
# X_scaled = StandardScaler().fit_transform(X)
# pca = PCA(n_components=5).fit(X_scaled)
# print("Explained variance by first 5 components:", pca.explained_variance_ratio_)

