## Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Process data

We want to define :
- Definition of  default
- Time step (12 months, 24 months)

If the borrower defaults more than 24 months after the first payment date, the default is not taken into account.


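We define a default as the loan reaching 90 days of non-payment, or having a zero-balance code of 02, 03 or 09 (cf. data details), with the time between the "FIRST PAYMENT DATE" and the "MONTHLY REPORTING PERIOD" being under 12 months. Note: the labelling code below currently uses a 24-month window and zero-balance codes 03, 09, 15, 16 and 96, so the definition stated here and the code still need to be reconciled.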

In [None]:
##### make_labels.py initial code

# Paths to origination and performance datasets (2022Q1 files only)
# NOTE(review): the file naming looks like the Freddie Mac Single-Family
# Loan-Level Dataset — confirm the layouts below against the matching user guide.
path_orig = "../data/raw/mortgage_data/historical_data_2022Q1/historical_data_2022Q1.txt"
path_perf = "../data/raw/mortgage_data/historical_data_2022Q1/historical_data_time_2022Q1.txt"

# Column names for the Origination dataset.
# The raw file is loaded with header=None, so these names are assigned by
# position: the order of this list must match the file's column order exactly.
colnames_origination = [
    "credit_score",                         # Borrower credit score
    "first_payment_date",                   # First scheduled payment date (YYYYMM)
    "first_time_homebuyer_flag",            # First-time homebuyer flag
    "maturity_date",                        # Loan maturity date (YYYYMM)
    "msa_md",                               # MSA / Metropolitan Division
    "mi_percent",                           # Mortgage Insurance percentage
    "number_of_units",                      # Number of units in the property
    "occupancy_status",                     # Occupancy status (owner/second/investment)
    "original_cltv",                        # Original Combined Loan-to-Value
    "original_dti",                         # Original Debt-to-Income ratio
    "original_upb",                         # Original Unpaid Principal Balance
    "original_ltv",                         # Original Loan-to-Value
    "original_interest_rate",               # Original Interest Rate
    "channel",                              # Origination channel (Retail, Broker, etc.)
    "ppm_flag",                             # Prepayment penalty flag
    "amortization_type",                    # Amortization type (FRM/ARM)
    "property_state",                       # Property state (2-letter code)
    "property_type",                        # Property type
    "postal_code",                          # Postal code (last 2 digits = 00)
    "loan_sequence_number",                 # Unique loan identifier (primary key, join key with the performance file)
    "loan_purpose",                         # Loan purpose (Purchase, Refinance)
    "original_loan_term",                   # Original loan term (months)
    "number_of_borrowers",                  # Number of borrowers
    "seller_name",                          # Seller name
    "servicer_name",                        # Servicer name
    "super_conforming_flag",                # Super conforming flag
    "pre_relief_refi_loan_seq_number",      # Pre-relief refinance loan sequence number
    "special_eligibility_program",          # Special eligibility program
    "relief_refinance_indicator",           # Relief refinance indicator
    "property_valuation_method",            # Property valuation method
    "interest_only_indicator",              # Interest-only indicator
    "mi_cancellation_indicator"             # MI cancellation indicator
]

# Column names for the Performance dataset (one row per loan per reporting month).
# Also assigned by position (header=None), so the order matters here too.
colnames_performance = [
    "loan_sequence_number",                 # Loan identifier (join key with the origination file)
    "monthly_reporting_period",             # Reporting period (YYYYMM)
    "current_actual_upb",                   # Current actual Unpaid Principal Balance
    "current_loan_delinquency_status",      # Loan delinquency status (0,1,2,..., RA)
    "loan_age",                             # Loan age in months
    "remaining_months_to_legal_maturity",   # Remaining months until legal maturity
    "defect_settlement_date",               # Defect settlement date (YYYYMM)
    "modification_flag",                    # Loan modification flag
    "zero_balance_code",                    # Zero balance code (01,02,03,09, etc.)
    "zero_balance_effective_date",          # Zero balance effective date (YYYYMM)
    "current_interest_rate",                # Current interest rate
    "current_non_interest_bearing_upb",     # Current non-interest-bearing UPB
    "ddlpi",                                # Due Date of Last Paid Installment (YYYYMM)
    "mi_recoveries",                        # Mortgage insurance recoveries
    "net_sale_proceeds",                    # Net sale proceeds
    "non_mi_recoveries",                    # Non-MI recoveries
    "total_expenses",                       # Total expenses
    "legal_costs",                          # Legal costs
    "maintenance_and_preservation_costs",   # Maintenance & preservation costs
    "taxes_and_insurance",                  # Taxes and insurance
    "miscellaneous_expenses",               # Miscellaneous expenses
    "actual_loss_calculation",              # Actual loss calculation
    "cumulative_modification_cost",         # Cumulative modification cost
    "step_modification_flag",               # Step modification flag
    "payment_deferral",                     # Payment deferral indicator
    "estimated_ltv",                        # Estimated Loan-to-Value (ELTV)
    "zero_balance_removal_upb",             # Zero balance removal UPB
    "delinquent_accrued_interest",          # Delinquent accrued interest
    "delinquency_due_to_disaster",          # Delinquency due to disaster
    "borrower_assistance_status_code",      # Borrower assistance status code
    "current_month_modification_cost",      # Current month modification cost
    "interest_bearing_upb"                  # Interest-bearing UPB
]

# Load both datasets (pipe-delimited, no header; names are assigned by position).
# The origination frame must be bound to `df_orig`: every statement below uses that name.
df_orig = pd.read_csv(path_orig, sep="|", header=None, names=colnames_origination)

# The two code columns are forced to string. Left to type inference,
# zero_balance_code is parsed as a number ("03" -> 3 / 3.0), so comparing its
# string form against "03", "09", ... can never match.
df_perf = pd.read_csv(
    path_perf,
    sep="|",
    header=None,
    names=colnames_performance,
    dtype={
        "current_loan_delinquency_status": "string",
        "zero_balance_code": "string",
    },
)

# Shape data
print(f"Shape data orig : {df_orig.shape}")
print(f"Shape data perf: {df_perf.shape}")

# Convert YYYYMM to a monthly Period (YYYY-MM)
df_perf["monthly_reporting_period"] = pd.to_datetime(
    df_perf["monthly_reporting_period"].astype(str),
    format="%Y%m"
).dt.to_period("M")
df_orig["first_payment_date"] = pd.to_datetime(
    df_orig["first_payment_date"].astype(str),
    format="%Y%m"
).dt.to_period("M")

# Bring the first payment date of each loan onto its performance rows
df_perf = pd.merge(
    df_perf,
    df_orig[["loan_sequence_number", "first_payment_date"]],
    on="loan_sequence_number",
    how="left"
)

# Months elapsed between the first payment date and the reporting period.
# Computed with vectorised year/month arithmetic on timestamps: unlike
# `.apply(lambda x: x.n)` on a Period difference, this does not crash when a
# performance row has no matching origination row (NaT -> NaN).
reporting_ts = df_perf["monthly_reporting_period"].dt.to_timestamp()
first_payment_ts = df_perf["first_payment_date"].dt.to_timestamp()
df_perf["months_since_orig"] = (
    (reporting_ts.dt.year - first_payment_ts.dt.year) * 12
    + (reporting_ts.dt.month - first_payment_ts.dt.month)
)

# Flag observations within the first 24 months (NaN compares as False, so
# rows without a first payment date are excluded from the window)
DEFAULT_WINDOW_MONTHS = 24
df_perf["within_24m"] = df_perf["months_since_orig"] <= DEFAULT_WINDOW_MONTHS

# Filter first, then copy: avoids duplicating the whole performance frame
df_perf_within = df_perf.loc[df_perf["within_24m"]].copy()

# Define default:
# - delinquency status >= 3 (90+ days delinquent) or "RA"
# - OR zero_balance_code in {03,09,15,16,96} (dispositions / foreclosures)
# A missing, blank or unrecognised delinquency status is NOT counted as a
# default (previously NaN became the string "nan" and was flagged as one).
# NOTE(review): any non-numeric status other than "RA" is now treated as
# non-default — confirm against the data dictionary.
delinquency_status = (
    df_perf_within["current_loan_delinquency_status"].str.strip().str.upper()
)
months_delinquent = pd.to_numeric(delinquency_status, errors="coerce")
is_delinquent_90_plus = (months_delinquent >= 3) | delinquency_status.eq("RA")

# Normalise to two digits so "3" and "03" are treated alike
zero_balance_code = df_perf_within["zero_balance_code"].str.strip().str.zfill(2)
has_default_disposition = zero_balance_code.isin(["03", "09", "15", "16", "96"])

is_default = (is_delinquent_90_plus | has_default_disposition).fillna(False).astype(bool)
df_perf_within["default"] = is_default.astype(int)

# Aggregate at loan level: max(default) = 1 if the loan ever defaulted in the window
loan_level = (
    df_perf_within.groupby("loan_sequence_number")["default"]
    .max()  # 0 if never in default, 1 if ever in default
    .reset_index()
)

# Left merge: loans without any performance row inside the window keep a
# missing `default` value rather than being silently labelled 0.
df_orig = pd.merge(df_orig, loan_level, on="loan_sequence_number", how="left")

# Feature engineering


When building a credit scoring model, it’s essential to respect the time dimension: the model should be trained on past vintages of loans and evaluated on future vintages. For example, if we have data for 2021Q4, 2022Q1, and 2022Q2, you would train on the first two quarters and keep 2022Q2 strictly for testing. This “out-of-time” validation mimics the real-world situation where a model is always used to predict the future, and it ensures that performance and calibration are not artificially inflated by information leakage across time.

In [1]:
from pathlib import Path
import re
import pandas as pd

# Folder holding one label file per origination quarter
PATH = Path("../data/processed/default_labels")
# Captures the quarter tag (e.g. "2021Q3") from a label file name, case-insensitively
PATTERN = re.compile(r"default_labels_24m_(\d{4}Q[1-4])\.csv$", re.I)

# Optional: faster reads and lower RAM usage (pandas >= 2.0)
READ_KW = {"engine": "pyarrow", "dtype_backend": "pyarrow"}
# Optional: only load the useful columns
# READ_KW["usecols"] = ["loan_sequence_number", "default_24m", "vintage", ...]

# Keep only files whose name matches PATTERN, ordered by their quarter tag
_label_files = [p for p in PATH.glob("default_labels_24m_*.csv") if PATTERN.match(p.name)]
files = sorted(_label_files, key=lambda p: PATTERN.match(p.name).group(1))

# One list of per-quarter frames per split, filled by the loading loop
buckets = {split_name: [] for split_name in ("train", "validation", "test")}
def year_to_split(y: int):
    """Map an origination year to its dataset split.

    Returns "train" for 2020-2022, "validation" for 2023, "test" for 2024,
    and None for any other year so the caller can skip or report it.
    """
    if y == 2024:
        return "test"
    if y == 2023:
        return "validation"
    return "train" if 2020 <= y <= 2022 else None

# Read each quarterly label file and route it to its split by vintage year
for p in files:
    qstr = PATTERN.match(p.name).group(1)       # e.g. "2021Q3"
    q = pd.Period(qstr, freq="Q")
    split = year_to_split(q.year)
    if split is None:
        print(f"Ignoré: {p.name}")
        continue
    df = pd.read_csv(p, **READ_KW)
    df["vintage"] = q                            # handy for groupby / sorting
    buckets[split].append(df)

# Fail early with an explicit message: pd.concat on an empty list only says
# "No objects to concatenate", which does not tell which split has no file.
missing_splits = [name for name, frames in buckets.items() if not frames]
if missing_splits:
    raise ValueError(
        f"No label file found in {PATH} for split(s): {', '.join(missing_splits)}"
    )

# Concatenate once per split (faster than repeated concatenation)
df_train       = pd.concat(buckets["train"], ignore_index=True)
df_validation  = pd.concat(buckets["validation"], ignore_index=True)
df_test        = pd.concat(buckets["test"], ignore_index=True)

# (Optional) de-duplicate if needed
# for d in (df_train, df_validation, df_test):
#     d.drop_duplicates(subset="loan_sequence_number", inplace=True)

# Row counts and share of each split
sizes = {"train": len(df_train), "validation": len(df_validation), "test": len(df_test)}
total = sum(sizes.values())
for k, n in sizes.items():
    share = n / total if total else 0.0          # guard: all files present but empty
    print(f"{k.capitalize():<12}: {n:>10,} rows  ({share:.2%})")

Ignoré: default_labels_24m_2014Q1.csv
Ignoré: default_labels_24m_2014Q2.csv
Ignoré: default_labels_24m_2014Q3.csv
Ignoré: default_labels_24m_2014Q4.csv
Ignoré: default_labels_24m_2015Q1.csv
Ignoré: default_labels_24m_2015Q2.csv
Ignoré: default_labels_24m_2015Q3.csv
Ignoré: default_labels_24m_2015Q4.csv
Ignoré: default_labels_24m_2016Q1.csv
Ignoré: default_labels_24m_2016Q2.csv
Ignoré: default_labels_24m_2016Q3.csv
Ignoré: default_labels_24m_2016Q4.csv
Ignoré: default_labels_24m_2017Q1.csv
Ignoré: default_labels_24m_2017Q2.csv
Ignoré: default_labels_24m_2017Q3.csv
Ignoré: default_labels_24m_2017Q4.csv
Ignoré: default_labels_24m_2018Q1.csv
Ignoré: default_labels_24m_2018Q2.csv
Ignoré: default_labels_24m_2018Q3.csv
Ignoré: default_labels_24m_2018Q4.csv
Ignoré: default_labels_24m_2019Q1.csv
Ignoré: default_labels_24m_2019Q2.csv
Ignoré: default_labels_24m_2019Q3.csv
Ignoré: default_labels_24m_2019Q4.csv
Train       :  9,590,892 rows  (82.90%)
Validation  :    931,745 rows  (8.05%)
Test     

In [None]:
# Inspect the column dtypes of the training split as loaded, before any cleaning
df_train.dtypes

In [2]:
import pandas as pd
from pandas.api.types import CategoricalDtype

# Work on a copy so the loaded training split (df_train) stays untouched
df_train_clean = df_train.copy()

# --- Helpers ---
def yn_space_to_bool(s, na_vals=('9',)):
    """Convert a Y/N/blank indicator column to a nullable boolean Series.

    Values are stripped and upper-cased first. 'Y' maps to True; 'N', blank
    and missing values map to False. Any code listed in ``na_vals``, and any
    value outside the mapping, ends up as <NA>.
    """
    codes = (
        s.astype('string')
         .str.strip()
         .str.upper()
         .fillna('')                 # missing is treated like a blank
         .replace({' ': ''})
    )
    flags = codes.map({'Y': True, 'N': False, '': False})
    is_na_code = codes.isin(na_vals)
    flags[is_na_code] = pd.NA
    return flags.astype('boolean')

def to_periodM(col):
    """Parse a YYYYMM column into a monthly PeriodIndex.

    Values are cast to string and stripped; anything that is not exactly six
    digits becomes missing (NaT) in the result.
    """
    text = col.astype('string').str.strip()
    is_yyyymm = text.str.fullmatch(r'\d{6}')
    return pd.PeriodIndex(text.where(is_yyyymm, pd.NA), freq='M')

def to_periodQ(col):
    """Parse a quarter-label column (format like '2016Q4') into a quarterly PeriodIndex.

    Values are cast to string, stripped and upper-cased; anything not matching
    four digits + 'Q' + 1-4 becomes missing (NaT) in the result.
    """
    text = col.astype('string').str.strip().str.upper()
    is_quarter_label = text.str.fullmatch(r'\d{4}Q[1-4]')
    return pd.PeriodIndex(text.where(is_quarter_label, pd.NA), freq='Q')

def clean_origination_frame(frame):
    """Apply the typing / cleaning rules to ``frame`` in place and return it.

    The rules used to be written as one-off statements on ``df_train_clean``.
    Wrapping them in a function lets the validation and test splits receive
    exactly the same treatment, e.g.
    ``clean_origination_frame(df_validation.copy())``.

    ``frame`` must contain the origination columns plus ``default_24m`` and
    ``vintage``. Pass a copy if the input must stay untouched.
    """
    # --- Dates / periods ---
    frame['first_payment_date'] = pd.to_datetime(
        frame['first_payment_date'], errors="coerce"
    ).dt.to_period("M")
    frame['maturity_date'] = to_periodM(frame['maturity_date'])
    frame['vintage'] = to_periodQ(frame['vintage'])

    # --- Booleans: (column, codes that mean "not available") ---
    bool_rules = (
        ('ppm_flag', ()),
        ('interest_only_indicator', ()),
        ('super_conforming_flag', ()),
        ('first_time_homebuyer_flag', ('9',)),
        ('relief_refinance_indicator', ('9',)),   # Y / (blank)
    )
    for col, na_vals in bool_rules:
        frame[col] = yn_space_to_bool(frame[col], na_vals=na_vals)

    # Target: only remapped when it is stored as 0/1 integers or plain booleans
    target = frame['default_24m']
    if pd.api.types.is_integer_dtype(target) or pd.api.types.is_bool_dtype(target):
        frame['default_24m'] = target.map({1: True, 0: False}).astype('boolean')

    # --- Sentinels -> NA, then nullable integer cast: column -> (sentinel, dtype) ---
    sentinel_rules = {
        'credit_score':        (9999, 'Int16'),
        'mi_percent':          (999,  'Int16'),
        'number_of_units':     (99,   'Int8'),
        'original_cltv':       (999,  'Int16'),
        'original_dti':        (999,  'Int16'),
        'original_ltv':        (999,  'Int16'),
        'number_of_borrowers': (99,   'Int8'),
    }
    for col, (sentinel, dtype) in sentinel_rules.items():
        frame[col] = frame[col].replace({sentinel: pd.NA}).astype(dtype)
    frame['original_loan_term'] = frame['original_loan_term'].astype('Int16')
    frame['original_upb'] = frame['original_upb'].astype('Int64')
    # Best effort: errors='ignore' leaves the column as-is if the cast fails
    frame['msa_md'] = frame['msa_md'].astype('Int32', errors='ignore')

    # --- Identifiers ---
    for col in ('loan_sequence_number', 'pre_relief_refi_loan_seq_number'):
        frame[col] = frame[col].astype('string')

    # Postal code: string, left-padded with zeros to 5 characters
    frame['postal_code'] = (
        frame['postal_code']
        .astype('Int64')        # in case it was read as numeric
        .astype('string')
        .str.strip().str.upper()
        .str.zfill(5)
    )

    # --- Categoricals with a fixed code list: column -> (NA code, categories) ---
    # Values outside the listed categories become NaN when cast.
    category_rules = {
        'occupancy_status':            ('9',  ['P', 'S', 'I']),
        'channel':                     ('9',  ['R', 'B', 'C', 'T']),
        'property_type':               ('99', ['SF', 'CO', 'PU', 'CP', 'MH']),
        'loan_purpose':                ('9',  ['P', 'C', 'N', 'R']),
        'special_eligibility_program': ('9',  ['H', 'F', 'R']),
    }
    for col, (na_code, categories) in category_rules.items():
        frame[col] = (
            frame[col].astype('string').str.strip().str.upper()
            .replace({na_code: pd.NA})
            .astype(CategoricalDtype(categories=categories, ordered=False))
        )

    frame['amortization_type'] = (
        frame['amortization_type'].astype('string').str.strip().str.upper()
        .astype(CategoricalDtype(categories=['FRM', 'ARM'], ordered=False))
    )

    # Categories are inferred from the data here, so they can differ between splits
    frame['property_state'] = (
        frame['property_state'].astype('string').str.strip().str.upper().astype('category')
    )

    # Integer codes with 9 -> NA, kept as an unordered category
    frame['property_valuation_method'] = (
        frame['property_valuation_method']
        .replace({9: pd.NA})
        .astype('Int8')
        .astype('category')
    )
    return frame


df_train_clean = clean_origination_frame(df_train_clean)

In [3]:
# Display the cleaned training frame (the rendering is truncated to the first and last rows)
df_train_clean

Unnamed: 0,credit_score,first_payment_date,first_time_homebuyer_flag,maturity_date,msa_md,mi_percent,number_of_units,occupancy_status,original_cltv,original_dti,...,servicer_name,super_conforming_flag,pre_relief_refi_loan_seq_number,special_eligibility_program,relief_refinance_indicator,property_valuation_method,interest_only_indicator,mi_cancellation_indicator,default_24m,vintage
0,661,2020-06,False,2035-05,41540,0,1,P,36,19,...,Other servicers,False,,,False,2,False,7,False,2020Q1
1,681,2020-03,False,2050-02,45820,30,1,P,95,13,...,"ROCKET MORTGAGE, LLC",False,,,False,2,False,N,False,2020Q1
2,775,2020-04,False,2050-03,,25,1,P,87,29,...,PHH MORTGAGE CORPORATION,False,,,False,2,False,N,False,2020Q1
3,770,2020-03,False,2035-02,41180,0,2,I,65,14,...,Other servicers,False,,,False,2,False,7,False,2020Q1
4,791,2020-04,False,2050-03,10580,0,1,P,80,33,...,Other servicers,False,,,False,2,False,7,False,2020Q1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9590887,791,2024-08,False,2054-07,19804,0,1,P,69,29,...,Other servicers,False,,,False,2,False,7,False,2022Q4
9590888,750,2024-05,False,2054-04,48424,25,1,P,90,50,...,Other servicers,False,,,False,2,False,N,True,2022Q4
9590889,773,2024-07,False,2054-06,27260,0,1,P,38,35,...,Other servicers,False,,,False,2,False,7,False,2022Q4
9590890,817,2024-08,False,2054-07,45104,0,1,P,55,35,...,Other servicers,False,,,False,2,False,7,False,2022Q4


In [88]:
# just to have a clear view of what to do with the correlation

from collections import defaultdict

# Statistical type of every column; drives which association measure is used per pair
dict_types = {
 'credit_score': 'continuous',
 'first_payment_date': 'date',
 'first_time_homebuyer_flag': 'bool',            # Y/N, 9 -> NA
 'maturity_date': 'date',
 'msa_md': 'category',
 'mi_percent': 'continuous',                     # not categorical
 'number_of_units': 'category',                  # 1-4, 99 -> NA (category is fine)
 'occupancy_status': 'category',
 'original_cltv': 'continuous',
 'original_dti': 'continuous',
 'original_upb': 'continuous',
 'original_ltv': 'continuous',
 'original_interest_rate': 'continuous',
 'channel': 'category',
 'ppm_flag': 'bool',                             # Y/N
 'amortization_type': 'category',                # FRM/ARM
 'property_state': 'category',
 'property_type': 'category',
 'postal_code': 'category',
 'loan_sequence_number': 'id',
 'loan_purpose': 'category',
 'original_loan_term': 'integer',                # number of months, not a category
 'number_of_borrowers': 'integer',               # small integer
 'seller_name': 'category',
 'servicer_name': 'category',
 'super_conforming_flag': 'bool',                # Y / blank
 'pre_relief_refi_loan_seq_number': 'id_ref',    # reference key, not a category
 'special_eligibility_program': 'category',      # H/F/R/9
 'relief_refinance_indicator': 'bool',           # Y / blank
 'property_valuation_method': 'category',        # codes 1-4, 9 -> NA
 'interest_only_indicator': 'bool',              # Y/N
 'mi_cancellation_indicator': 'category',        # Y/N/7/9
 'default_24m': 'bool',
 'vintage': 'date'                               # e.g. YYYY-Q derived from FIRST_PAYMENT_DATE
}

# Invert the mapping: type -> list of columns (types in first-seen order,
# columns in declaration order)
_dict_types_col = defaultdict(list)
for kind in dict.fromkeys(dict_types.values()):
    _dict_types_col[kind] = [name for name, declared in dict_types.items() if declared == kind]

dict_types_col = {kind: names for kind, names in _dict_types_col.items()}  # plain dict

cont_cols = dict_types_col.get('continuous', [])
ord_cols  = dict_types_col.get('integer', [])      # original_loan_term, number_of_borrowers (treated as ordinal)
cat_cols  = dict_types_col.get('category', []) + dict_types_col.get('bool', [])

cont_cols

['credit_score',
 'mi_percent',
 'original_cltv',
 'original_dti',
 'original_upb',
 'original_ltv',
 'original_interest_rate']

In [None]:
from scipy.stats import spearmanr, pointbiserialr, pearsonr, kendalltau
from scipy.stats import chi2_contingency
from itertools import combinations

def _pairwise_stats(frame, pairs, stat_fn):
    """Return {(left, right): (statistic, p-value)} for each column pair.

    Rows with a missing value in either column are dropped per pair; pairs
    with no remaining rows are skipped.
    """
    results = {}
    for left, right in pairs:
        sample = frame[[left, right]].dropna()
        if sample.empty:
            continue
        stat, pval = stat_fn(sample[left], sample[right])
        results[(left, right)] = (stat, pval)
    return results


# Use a single clean DataFrame
D = df_train_clean

# Every (ordinal, continuous) combination, ordinal column first
ord_cont_pairs = [(o, c) for o in ord_cols for c in cont_cols]

# 1) Spearman: ordinal <-> continuous (monotonic relationship)
spearman_pairs = _pairwise_stats(D, ord_cont_pairs, spearmanr)

# 2) Pearson: continuous <-> continuous (no duplicates, no diagonal)
pearson_pairs = _pairwise_stats(D, combinations(cont_cols, 2), pearsonr)

# 3) Point-biserial: continuous <-> binary boolean
bin_cols = [c for c in dict_types_col.get('bool', [])
            if set(D[c].dropna().unique()).issubset({0,1,True,False})]
pb_pairs = _pairwise_stats(
    D,
    [(c, flag) for c in cont_cols for flag in bin_cols],
    lambda values, flags: pointbiserialr(values, flags.astype(int)),
)

# 4) Kendall tau: ordinal <-> ordinal first, then (optional) ordinal <-> continuous
def _kendall(left_values, right_values):
    return kendalltau(left_values, right_values, method="auto")

kendall_pairs = _pairwise_stats(D, combinations(ord_cols, 2), _kendall)
kendall_pairs.update(_pairwise_stats(D, ord_cont_pairs, _kendall))

# Filled by the Cramér's V loop
cramer_pairs = {}
# 5) Bias-corrected Cramér's V: nominal/ordinal <-> nominal/ordinal
def cramers_v_corrected(x, y):
    """Bias-corrected Cramér's V (Bergsma, 2013) between two categorical series.

    Built from the chi-square statistic (no continuity correction) of the
    x-by-y contingency table. Returns NaN when the table is empty.
    """
    table = pd.crosstab(x, y)
    if table.size == 0:
        return np.nan
    chi2_stat = chi2_contingency(table, correction=False)[0]
    total = table.values.sum()
    n_rows, n_cols = table.shape
    # Bias correction: shrink phi² and the effective table dimensions
    phi2_adj = max(0, chi2_stat / total - (n_cols - 1) * (n_rows - 1) / (total - 1))
    rows_adj = n_rows - (n_rows - 1) ** 2 / (total - 1)
    cols_adj = n_cols - (n_cols - 1) ** 2 / (total - 1)
    # The floor keeps the division defined when one dimension collapses to a single level
    scale = max(1e-12, min((cols_adj - 1), (rows_adj - 1)))
    return np.sqrt(phi2_adj / scale)

# Treat ordinal columns as categorical here.
# De-duplicate while keeping declaration order: iterating a set of strings can
# change order from one kernel session to the next (string hash randomisation),
# which made the (a, b) orientation of the result keys non-reproducible.
cats = list(dict.fromkeys(list(cat_cols) + list(ord_cols)))
for a, b in combinations(cats, 2):
    s = D[[a, b]].dropna()
    if s.empty:
        continue
    cramer_pairs[(a, b)] = cramers_v_corrected(s[a], s[b])


{('original_loan_term', 'credit_score'): (np.float64(-0.07325623265283099),
  np.float64(0.0)),
 ('original_loan_term', 'mi_percent'): (np.float64(0.2408243365885056),
  np.float64(0.0)),
 ('original_loan_term', 'original_cltv'): (np.float64(0.33863212960898986),
  np.float64(0.0)),
 ('original_loan_term', 'original_dti'): (np.float64(0.11264715762440759),
  np.float64(0.0)),
 ('original_loan_term', 'original_upb'): (np.float64(0.21521854078109512),
  np.float64(0.0)),
 ('original_loan_term', 'original_ltv'): (np.float64(0.33814580794728977),
  np.float64(0.0)),
 ('original_loan_term',
  'original_interest_rate'): (np.float64(0.4284890279478544), np.float64(0.0)),
 ('number_of_borrowers', 'credit_score'): (np.float64(-0.04810525632171764),
  np.float64(0.0)),
 ('number_of_borrowers', 'mi_percent'): (np.float64(-0.03504330868478439),
  np.float64(0.0)),
 ('number_of_borrowers', 'original_cltv'): (np.float64(-0.04760017214345313),
  np.float64(0.0)),
 ('number_of_borrowers', 'original_dt

In [None]:
# Deal with NA :
# Author's note: the 2020 cohort has a lot of missing values.

# Percentage of missing values per column, keeping only columns with at least
# one NA. Computed once: the previous extra `round(...)` statement scanned the
# whole frame a second time and its result was discarded (only the last
# expression of a cell is displayed).
na_cols = df_train_clean.isna().sum()/df_train_clean.shape[0]*100
na_cols = na_cols[na_cols != 0]
print(round(na_cols,5))

# Planned treatment per column, by strategy
cols_to_drop = ["pre_relief_refi_loan_seq_number"]
dict_imput = {
    "mode"      : ["channel","property_valuation_method","first_time_homebuyer_flag"],
    "median"    : ["original_cltv","mi_percent","original_dti"],
    "special"   : ["credit_score"],
    "na_into_cat" : ["special_eligibility_program","msa_md"]
}

credit_score                         0.01490
first_time_homebuyer_flag            0.00005
msa_md                               8.63984
mi_percent                           0.00024
original_cltv                        0.00019
original_dti                         0.00753
channel                              0.00001
pre_relief_refi_loan_seq_number    100.00000
special_eligibility_program         96.00078
property_valuation_method            0.01490
dtype: float64


In [95]:
# original_ltv for the rows where original_cltv is missing
df_train_clean.loc[df_train_clean["original_cltv"].isna(), "original_ltv"]

4272       97
30746      97
50869      74
53565      94
282517     76
289067     96
383089     93
403961     96
513318     80
537087     74
541029     90
551815     86
711310     94
736687     64
747877     97
1863519    69
2026875    91
5132409    43
Name: original_ltv, dtype: Int16

# Graph, data viz 

In [None]:
# Share of each default_24m modality, in percent (missing values included)
counts = df_train["default_24m"].value_counts(normalize=True, dropna=False).mul(100)
pct_labels = [f"{v:.1f}%" for v in counts.values]

# Plain Matplotlib bar chart (not seaborn)
fig, ax = plt.subplots()
bars = ax.bar(counts.index.astype(str), counts.values, color="lightblue")

# Percentage label on top of each bar
ax.bar_label(bars, labels=pct_labels, padding=3)

ax.set(
    title="Default modality proportion",
    xlabel="Modalité (default_24m)",
    ylabel="Pourcentage (%)",
)

# Keep some headroom above the tallest bar for its label
y_top = max(100, counts.max() * 1.15)
ax.set_ylim(0, y_top)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Data (NaN removed)
x = df_train["original_upb"].dropna().values

# Robust bins (Freedman–Diaconis); fall back to 'auto' if they cannot be used
try:
    bins = np.histogram_bin_edges(x, bins='fd')
    bins_usable = not (np.isinf(bins).any() or len(bins) < 2)
except Exception:
    bins_usable = False
if not bins_usable:
    bins = 'auto'

# Tick formatting: thousands separated by a space
fmt = FuncFormatter(lambda v, pos: f"{v:,.0f}".replace(",", " "))

fig, ax = plt.subplots(figsize=(9, 5), dpi=120)

# Histogram (counts, not densities)
hist_style = dict(edgecolor='white', linewidth=0.8, alpha=0.9)
n, b, patches = ax.hist(x, bins=bins, density=False, **hist_style)

# Mean / median reference lines
moy = np.mean(x)
med = np.median(x)
ax.axvline(moy, linestyle='--', linewidth=1.6, label=f"Moyenne = {moy:,.0f}".replace(",", " "))
ax.axvline(med, linestyle=':',  linewidth=1.6, label=f"Médiane = {med:,.0f}".replace(",", " "))

# Titles, grid, tick formatting and legend
ax.set_title("Distribution de original_upb", pad=10)
ax.set(xlabel="original_upb", ylabel="Effectifs")
ax.grid(axis='y', linestyle=':', alpha=0.5)
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter(fmt)
ax.legend(frameon=False, loc='upper right')

# (Optional) Useful if the distribution is very skewed:
# ax.set_xscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Explore the modalities of the target column `default_24m` on the training split.
# The previous version was a generic template working on df['status'], a column
# that does not exist here, through a `df` variable leaked from the loading
# loop — it failed with KeyError on a fresh run.
from pandas.api.types import CategoricalDtype

target = df_train['default_24m']

# 1) Distinct modalities and 2) count of each one (missing values included)
print(target.unique())
print(target.value_counts(dropna=False))

# 3) Keep only the defaulted loans; a missing target is not counted as a default
is_default = target.eq(1).fillna(False).astype(bool)
df_default = df_train[is_default]

# 4) Number and proportion (between 0 and 1) of defaults
nb_default = is_default.sum()
prop_default = is_default.mean()

# 5) Labelled, ordered categorical view of the target. Built as a separate
#    Series so df_train itself is not mutated and the cell can be re-run safely.
cat = CategoricalDtype(categories=['non-default', 'default'], ordered=True)
default_status = target.map({1: 'default', 0: 'non-default'}).astype(cat)

# 6) Check that the 'default' category exists
'default' in default_status.cat.categories
