## Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 0) Process data

We need to define:
- the definition of default
- the observation horizon (12 months or 24 months)

If the borrower defaults after 24 months, we do not take it into account.


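We define default as being 90 days or more past due, or having a zero-balance code of 02, 03 or 09 (cf. data details), at a reporting date where the time elapsed between the "FIRST PAYMENT DATE" and the "MONTHLY REPORTING PERIOD" is under 12 months. Note: the labelling code in the next cell currently uses a 24-month window and zero-balance codes 03, 09, 15, 16 and 96; the two definitions should be reconciled.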

In [None]:
##### make_labels.py initial code

# Paths to origination and performance datasets
path_orig = "../data/raw/mortgage_data/historical_data_2022Q1/historical_data_2022Q1.txt"
path_perf = "../data/raw/mortgage_data/historical_data_2022Q1/historical_data_time_2022Q1.txt"

# Column names for the Origination dataset
colnames_origination = [
    "credit_score",                         # Borrower credit score
    "first_payment_date",                   # First scheduled payment date (YYYYMM)
    "first_time_homebuyer_flag",            # First-time homebuyer flag
    "maturity_date",                        # Loan maturity date (YYYYMM)
    "msa_md",                               # MSA / Metropolitan Division
    "mi_percent",                           # Mortgage Insurance percentage
    "number_of_units",                      # Number of units in the property
    "occupancy_status",                     # Occupancy status (owner/second/investment)
    "original_cltv",                        # Original Combined Loan-to-Value
    "original_dti",                         # Original Debt-to-Income ratio
    "original_upb",                         # Original Unpaid Principal Balance
    "original_ltv",                         # Original Loan-to-Value
    "original_interest_rate",               # Original Interest Rate
    "channel",                              # Origination channel (Retail, Broker, etc.)
    "ppm_flag",                             # Prepayment penalty flag
    "amortization_type",                    # Amortization type (FRM/ARM)
    "property_state",                       # Property state (2-letter code)
    "property_type",                        # Property type
    "postal_code",                          # Postal code (last 2 digits = 00)
    "loan_sequence_number",                 # Unique loan identifier (primary key)
    "loan_purpose",                         # Loan purpose (Purchase, Refinance)
    "original_loan_term",                   # Original loan term (months)
    "number_of_borrowers",                  # Number of borrowers
    "seller_name",                          # Seller name
    "servicer_name",                        # Servicer name
    "super_conforming_flag",                # Super conforming flag
    "pre_relief_refi_loan_seq_number",      # Pre-relief refinance loan sequence number
    "special_eligibility_program",          # Special eligibility program
    "relief_refinance_indicator",           # Relief refinance indicator
    "property_valuation_method",            # Property valuation method
    "interest_only_indicator",              # Interest-only indicator
    "mi_cancellation_indicator"             # MI cancellation indicator
]

# Column names for the Performance dataset
colnames_performance = [
    "loan_sequence_number",                 # Loan identifier (primary key)
    "monthly_reporting_period",             # Reporting period (YYYYMM)
    "current_actual_upb",                   # Current actual Unpaid Principal Balance
    "current_loan_delinquency_status",      # Loan delinquency status (0,1,2,..., RA)
    "loan_age",                             # Loan age in months
    "remaining_months_to_legal_maturity",   # Remaining months until legal maturity
    "defect_settlement_date",               # Defect settlement date (YYYYMM)
    "modification_flag",                    # Loan modification flag
    "zero_balance_code",                    # Zero balance code (01,02,03,09, etc.)
    "zero_balance_effective_date",          # Zero balance effective date (YYYYMM)
    "current_interest_rate",                # Current interest rate
    "current_non_interest_bearing_upb",     # Current non-interest-bearing UPB
    "ddlpi",                                # Due Date of Last Paid Installment (YYYYMM)
    "mi_recoveries",                        # Mortgage insurance recoveries
    "net_sale_proceeds",                    # Net sale proceeds
    "non_mi_recoveries",                    # Non-MI recoveries
    "total_expenses",                       # Total expenses
    "legal_costs",                          # Legal costs
    "maintenance_and_preservation_costs",   # Maintenance & preservation costs
    "taxes_and_insurance",                  # Taxes and insurance
    "miscellaneous_expenses",               # Miscellaneous expenses
    "actual_loss_calculation",              # Actual loss calculation
    "cumulative_modification_cost",         # Cumulative modification cost
    "step_modification_flag",               # Step modification flag
    "payment_deferral",                     # Payment deferral indicator
    "estimated_ltv",                        # Estimated Loan-to-Value (ELTV)
    "zero_balance_removal_upb",             # Zero balance removal UPB
    "delinquent_accrued_interest",          # Delinquent accrued interest
    "delinquency_due_to_disaster",          # Delinquency due to disaster
    "borrower_assistance_status_code",      # Borrower assistance status code
    "current_month_modification_cost",      # Current month modification cost
    "interest_bearing_upb"                  # Interest-bearing UPB
]

# Load both datasets (pipe-delimited, no header).
# The origination frame must be bound to `df_orig`: that is the name every
# later statement of this notebook reads (shape print, date parsing, merges).
df_orig = pd.read_csv(path_orig, sep="|", header=None, names=colnames_origination)
df_perf = pd.read_csv(path_perf, sep="|", header=None, names=colnames_performance)


# Shape data
print(f"Shape data orig : {df_orig.shape}")
print(f"Shape data perf: {df_perf.shape}")

# Parse the YYYYMM values and convert them to monthly Periods (YYYY-MM),
# which are convenient for month arithmetic.
df_perf["monthly_reporting_period"] = pd.to_datetime(
    df_perf["monthly_reporting_period"].astype(str),
    format="%Y%m",
).dt.to_period("M")
df_orig["first_payment_date"] = pd.to_datetime(
    df_orig["first_payment_date"].astype(str),
    format="%Y%m",
).dt.to_period("M")

# Merge Origination info into Performance dataset
df_perf = pd.merge(
    df_perf,
    df_orig[["loan_sequence_number", "first_payment_date"]],
    on="loan_sequence_number",
    how="left",
)

# Months elapsed between the first payment date and each reporting period.
# Computed with vectorised year/month arithmetic: subtracting two Period
# columns yields one Python offset object per row, and reading `.n` from it
# row by row is slow on a monthly performance file and raises on a missing
# period (a loan absent from the origination file after the left merge).
reporting_month = df_perf["monthly_reporting_period"]
first_payment_month = df_perf["first_payment_date"]
month_gap = (
    (reporting_month.dt.year - first_payment_month.dt.year) * 12
    + (reporting_month.dt.month - first_payment_month.dt.month)
)
# Period field accessors do not propagate missing values as NaN, so rows with
# a missing period on either side are masked explicitly.
both_known = reporting_month.notna() & first_payment_month.notna()
df_perf["months_since_orig"] = month_gap.where(both_known)

# Flag observations within the first 24 months after origination
# (rows with an unknown gap compare as False).
df_perf["within_24m"] = df_perf["months_since_orig"] <= 24



# Keep only the observations made within the first 24 months. Copying AFTER
# the filter gives an independent frame, so adding the label column below
# does not write into a slice of another frame.
df_perf_within = df_perf[df_perf["within_24m"]].copy()

# Define default:
# - delinquency status not in {0,1,2} (=> 90+ days delinquent or RA)
# - OR zero_balance_code in {03,09,15,16,96} (dispositions / foreclosures)
#
# Both codes are compared numerically. read_csv is called without dtypes, so
# a zero-padded code column that also contains blanks is parsed as float:
# "03" becomes 3.0 and astype(str) gives "3.0", which can never equal "03".
zero_balance_num = pd.to_numeric(df_perf_within["zero_balance_code"], errors="coerce")
has_default_disposition = zero_balance_num.isin([3, 9, 15, 16, 96])

delinquency_raw = df_perf_within["current_loan_delinquency_status"]
delinquency_num = pd.to_numeric(delinquency_raw, errors="coerce")
# A status counts as "not in default" when it is 0, 1 or 2, or when it is
# missing altogether (a missing status is not evidence of delinquency).
# Every other value, including non-numeric codes such as "RA", is a default.
is_performing = delinquency_num.isin([0, 1, 2]) | delinquency_raw.isna()

df_perf_within["default"] = np.where(~is_performing | has_default_disposition, 1, 0)

# Aggregate at loan level: max(default) = 1 if loan ever defaulted in 24m
loan_level = (
    df_perf_within.groupby("loan_sequence_number")["default"]
    .max()  # 0 if always current, 1 if ever default
    .reset_index()
)

# Loans without any performance row inside the window keep a missing label.
df_orig = pd.merge(df_orig, loan_level, on = "loan_sequence_number", how = "left")

# 0) Working on the data


When building a credit scoring model, it’s essential to respect the time dimension: the model should be trained on past vintages of loans and evaluated on future vintages. For example, if we have data for 2021Q4, 2022Q1, and 2022Q2, you would train on the first two quarters and keep 2022Q2 strictly for testing. This “out-of-time” validation mimics the real-world situation where a model is always used to predict the future, and it ensures that performance and calibration are not artificially inflated by information leakage across time.

In [None]:
# LOAD DATA

from pathlib import Path
import re
import pandas as pd

# Folder holding one label file per origination quarter, and the filename
# pattern capturing the quarter tag (e.g. "2021Q3").
PATH = Path("../data/processed/default_labels")
PATTERN = re.compile(r"default_labels_24m_(\d{4}Q[1-4])\.csv$", re.I)

# Optional: faster, lighter CSV reads (pandas >= 2.0)
READ_KW = {"engine": "pyarrow", "dtype_backend": "pyarrow"}
# Optional: load only the useful columns
# READ_KW["usecols"] = ["loan_sequence_number", "default_24m", "vintage", ...]

# Keep the files whose name matches PATTERN, sorted by their quarter tag.
label_files = [p for p in PATH.glob("default_labels_24m_*.csv") if PATTERN.match(p.name)]
files = sorted(label_files, key=lambda p: PATTERN.match(p.name).group(1))

# One list of per-quarter frames per split, filled by the loading loop.
buckets = {name: [] for name in ("train", "validation", "test")}
def year_to_split(y: int):
    """Map an origination year to the name of its out-of-time split.

    2020-2022 -> "train", 2023 -> "validation", 2024 -> "test".
    Any other year returns None so the caller can skip or report the file.
    """
    if y == 2024:
        return "test"
    if y == 2023:
        return "validation"
    return "train" if 2020 <= y <= 2022 else None

# Read every label file and route it to the split of its origination year.
for csv_path in files:
    vintage_quarter = pd.Period(PATTERN.match(csv_path.name).group(1), freq="Q")  # e.g. "2021Q3"
    target_split = year_to_split(vintage_quarter.year)
    if target_split is None:
        print(f"Ignoré: {csv_path.name}")
        continue
    quarter_frame = pd.read_csv(csv_path, **READ_KW)
    quarter_frame["vintage"] = vintage_quarter  # handy for groupby / sorting
    buckets[target_split].append(quarter_frame)

# Concatenate once per split (cheaper than growing a frame file by file).
df_train, df_validation, df_test = (
    pd.concat(buckets[split_name], ignore_index=True)
    for split_name in ("train", "validation", "test")
)

# (Optional) de-duplicate if needed
# for d in (df_train, df_validation, df_test):
#     d.drop_duplicates(subset="loan_sequence_number", inplace=True)

# Row count and share of each split
sizes = {"train": len(df_train), "validation": len(df_validation), "test": len(df_test)}
total = sum(sizes.values())
for k, n in sizes.items():
    print(f"{k.capitalize():<12}: {n:>10,} rows  ({n/total:.2%})")

# 1) Data types

In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype, is_integer_dtype, is_bool_dtype
from sklearn.base import BaseEstimator, TransformerMixin

class LoanDataPreprocessor(BaseEstimator, TransformerMixin):
    """
    Typing / parsing / casting / category-encoding step for the loan frames.

    `fit` learns the levels of the data-driven categoricals
    (`property_state`, `property_valuation_method`) from the frame it is
    given; `transform` applies the same casts to any frame, reusing those
    levels so that all splits share identical category sets. Values that are
    not among the known levels of a categorical become missing.

    Every column is optional: a column absent from the input is skipped.

    Parameters
    ----------
    target_col : str, default 'default_24m'
        Name of the optional label column. When present with an integer or
        boolean dtype it is converted to the nullable 'boolean' dtype.
    """

    # Y/N flag columns -> raw codes turned into <NA> (everything blank is False).
    _BOOL_COLS = (
        ('ppm_flag', ()),
        ('interest_only_indicator', ()),
        ('super_conforming_flag', ()),
        ('first_time_homebuyer_flag', ('9',)),
        ('relief_refinance_indicator', ('9',)),   # Y / (blank)
    )

    # (column, sentinel replaced by <NA> or None, nullable integer dtype)
    _INT_COLS = (
        ('credit_score', 9999, 'Int16'),
        ('mi_percent', 999, 'Int16'),
        ('number_of_units', 99, 'Int8'),
        ('original_cltv', 999, 'Int16'),
        ('original_dti', 999, 'Int16'),
        ('original_ltv', 999, 'Int16'),
        ('original_loan_term', None, 'Int16'),
        ('number_of_borrowers', 99, 'Int8'),
        ('original_upb', None, 'Int64'),
    )

    # Fixed-level categoricals -> raw code turned into <NA> (None: no such code).
    _FIXED_CAT_COLS = (
        ('occupancy_status', '9'),
        ('channel', '9'),
        ('amortization_type', None),
        ('property_type', '99'),
        ('loan_purpose', '9'),
        ('special_eligibility_program', '9'),
    )

    def __init__(self, target_col='default_24m'):
        self.target_col = target_col

        # "Fixed" category levels, taken from the data documentation
        self.fixed_cats_ = {
            'occupancy_status': ['P','S','I'],
            'channel': ['R','B','C','T'],
            'amortization_type': ['FRM','ARM'],
            'property_type': ['SF','CO','PU','CP','MH'],
            'loan_purpose': ['P','C','N','R'],
            'special_eligibility_program': ['H','F','R'],
        }

        # Levels learned by fit() (None until then)
        self.learned_cats_ = {
            'property_state': None,            # every upper-cased value seen during fit
            'property_valuation_method': None, # integer codes seen during fit (9 excluded)
        }

    # ---------- Helpers ----------
    @staticmethod
    def _yn_space_to_bool(s: pd.Series, na_vals=('9',)):
        """Convert a Y/N/blank flag column to the nullable 'boolean' dtype.

        'Y' -> True; 'N', blank and missing -> False; codes listed in
        `na_vals` -> <NA>; any other value -> <NA>.
        """
        s = s.astype('string').str.strip().str.upper()
        # blank (space / empty / missing) counts as False by default
        s = s.fillna('').replace({' ': ''})
        out = s.map({'Y': True, 'N': False, '': False})
        if na_vals:
            out[s.isin(na_vals)] = pd.NA
        return out.astype('boolean')

    @staticmethod
    def _to_periodM(col: pd.Series):
        """Parse 6-digit YYYYMM values into a monthly PeriodIndex (others -> NaT)."""
        s = col.astype('string').str.strip()
        s = s.where(s.str.fullmatch(r'\d{6}'), pd.NA)  # YYYYMM
        return pd.PeriodIndex(s, freq='M')

    @staticmethod
    def _to_periodQ(col: pd.Series):
        """Parse 'YYYYQn' values into a quarterly PeriodIndex (others -> NaT)."""
        s = col.astype('string').str.strip().str.upper()
        s = s.where(s.str.fullmatch(r'\d{4}Q[1-4]'), pd.NA)  # e.g. 2016Q4
        return pd.PeriodIndex(s, freq='Q')

    @staticmethod
    def _clean_codes(col: pd.Series):
        """Return the column as stripped, upper-cased nullable strings."""
        return col.astype('string').str.strip().str.upper()

    @staticmethod
    def _valuation_codes(col: pd.Series):
        """Return property-valuation codes as Int8, with 9 and unparsable values as <NA>."""
        return pd.to_numeric(col, errors='coerce').replace({9: pd.NA}).astype('Int8')

    # ---------- Fit ----------
    def fit(self, X, y=None):
        """Learn the levels of the data-driven categorical columns from X.

        X is only read, never modified, so no copy is taken. Previously
        learned levels are discarded first, so a refit never keeps levels
        from an earlier call. Returns self.
        """
        self.learned_cats_ = {'property_state': None, 'property_valuation_method': None}

        if 'property_state' in X.columns:
            states = self._clean_codes(X['property_state'])
            self.learned_cats_['property_state'] = sorted(states.dropna().unique().tolist())

        if 'property_valuation_method' in X.columns:
            pvm = self._valuation_codes(X['property_valuation_method'])
            self.learned_cats_['property_valuation_method'] = sorted(pvm.dropna().unique().tolist())

        return self

    # ---------- Transform ----------
    def transform(self, X):
        """Return a typed copy of X; X itself is left untouched."""
        df = X.copy()

        # --- Dates / periods ---
        # NOTE(review): first_payment_date goes through pd.to_datetime rather than
        # the strict YYYYMM helper used for maturity_date — presumably because the
        # upstream files store it in a date-like text form; confirm.
        if 'first_payment_date' in df.columns:
            df['first_payment_date'] = pd.to_datetime(df['first_payment_date'], errors="coerce").dt.to_period("M")
        if 'maturity_date' in df.columns:
            df['maturity_date'] = self._to_periodM(df['maturity_date'])
        if 'vintage' in df.columns:
            df['vintage'] = self._to_periodQ(df['vintage'])

        # --- Boolean flags ---
        for col, na_codes in self._BOOL_COLS:
            if col in df.columns:
                df[col] = self._yn_space_to_bool(df[col], na_vals=na_codes)

        # --- Optional target -> nullable boolean ---
        if self.target_col in df.columns:
            target = df[self.target_col]
            if is_integer_dtype(target) or is_bool_dtype(target):
                df[self.target_col] = target.map({1: True, 0: False}).astype('boolean')

        # --- Sentinels -> <NA>, then nullable integer cast ---
        for col, sentinel, int_dtype in self._INT_COLS:
            if col not in df.columns:
                continue
            values = df[col]
            if sentinel is not None:
                values = values.replace({sentinel: pd.NA})
            df[col] = values.astype(int_dtype)
        if 'msa_md' in df.columns:
            df['msa_md'] = pd.to_numeric(df['msa_md'], errors='coerce').astype('Int32')

        # --- Identifiers / codes ---
        for col in ('loan_sequence_number', 'pre_relief_refi_loan_seq_number'):
            if col in df.columns:
                df[col] = df[col].astype('string')

        # Postal code: text, left-padded with zeros to 5 characters
        if 'postal_code' in df.columns:
            postal = df['postal_code'].astype('Int64')  # in case it was read as a number
            postal = postal.astype('string').str.strip().str.upper()
            df['postal_code'] = postal.str.zfill(5)

        # --- Categoricals with documented levels ---
        for col, na_code in self._FIXED_CAT_COLS:
            if col not in df.columns:
                continue
            codes = self._clean_codes(df[col])
            if na_code is not None:
                codes = codes.replace({na_code: pd.NA})
            df[col] = codes.astype(CategoricalDtype(categories=self.fixed_cats_[col], ordered=False))

        # --- Categoricals with learned levels ---
        if 'property_state' in df.columns:
            states = self._clean_codes(df['property_state'])
            levels = self.learned_cats_['property_state']
            if levels is None:
                # Not fitted: fall back to the levels of the frame being transformed.
                levels = sorted(states.dropna().unique().tolist())
            df['property_state'] = states.astype(CategoricalDtype(categories=levels, ordered=False))

        if 'property_valuation_method' in df.columns:
            # Integer codes (9 => <NA>) encoded as a categorical of their text form
            pvm = self._valuation_codes(df['property_valuation_method'])
            levels = self.learned_cats_['property_valuation_method']
            if levels is None:
                levels = sorted(pvm.dropna().unique().tolist())
            # Going through text ("1", "2", ...) pins the level order; <NA> is preserved.
            pvm_text = pvm.astype('Int16').astype('string')
            df['property_valuation_method'] = pvm_text.astype(
                CategoricalDtype(categories=[str(v) for v in levels], ordered=False)
            )

        return df


# ---------- Usage ----------
# 1) Fit on TRAIN only, and transform TRAIN
prep = LoanDataPreprocessor(target_col='default_24m')
df_train_prep = prep.fit_transform(df_train)

# 2) Transform VALIDATION with the categories learned on TRAIN
df_validation_prep = prep.transform(df_validation)


PATH_OUTPUT = "../data/processed/merged/non_imputed/"
# to_parquet does not create missing folders: make sure the target exists
# so the cell also works on a fresh checkout.
Path(PATH_OUTPUT).mkdir(parents=True, exist_ok=True)
# Save data
df_train_prep.to_parquet(f"{PATH_OUTPUT}train.parquet")
df_validation_prep.to_parquet(f"{PATH_OUTPUT}validation.parquet")


# Integration in a scikit-learn Pipeline (example)
# pipe = Pipeline([('prep', LoanDataPreprocessor()), ('imp', DataImputer(use_cohort=True)), ('clf', LogisticRegression(...))])
# scores = cross_val_score(pipe, X, y, cv=KFold(...))


# 2) Imputation step

In [None]:
# Naive imputation (TODO)
# A better implementation is further down, two cells below (DataImputer).

In [None]:
# # Older imputation cell, superseded by the DataImputer class defined in the next cell; kept for reference only

# import numpy as np
# import pandas as pd
# from pandas.api.types import CategoricalDtype

# # Toggle: imputation par cohorte (année de vintage, etc.)
# imput_cohort = False

# df = df_train.copy()

# # ---------- 0) Drop évident ----------
# df.drop(columns=['pre_relief_refi_loan_seq_number'], errors='ignore', inplace=True)

# # ---------- Helpers ----------
# vyear = df['vintage'].dt.year if 'vintage' in df.columns else None

# # ---------- 1) Catés : Unknown / NotApplicable ----------
# # NB: first_time_homebuyer_flag est un bool tri-état -> on NE met PAS "Unknown" (on garde NA)
# # channel : garder Unknown plutôt que mode
# if 'channel' in df.columns and isinstance(df['channel'].dtype, CategoricalDtype):
#     df['channel'] = df['channel'].cat.add_categories(['Unknown']).fillna('Unknown')

# # property_valuation_method : recodage numérique + NotApplicable avant 2017 + NotAvailable
# if 'property_valuation_method' in df.columns:
#     pvm = pd.to_numeric(df['property_valuation_method'].astype('string'), errors='coerce')
#     if vyear is not None:
#         # 99 = NotApplicable (avant 2017)
#         pvm = pvm.where(vyear >= 2017, 99)
#     # 9 = NotAvailable
#     pvm = pvm.fillna(9)
#     df['property_valuation_method'] = pvm.astype('Int16').astype('category')

# # special_eligibility_program : garder la colonne + flag binaire
# if 'special_eligibility_program' in df.columns and isinstance(df['special_eligibility_program'].dtype, CategoricalDtype):
#     df['special_eligibility_program'] = df['special_eligibility_program'].cat.add_categories(['Unknown']).fillna('Unknown')
#     df['has_special_program'] = df['special_eligibility_program'].isin(['H','F','R']).astype('int8')

# # msa_md : regrouper NA / non-MSA (0)
# if 'msa_md' in df.columns:
#     df['msa_md'] = df['msa_md'].fillna(0)

# # ---------- 2) CREDIT SCORE : clip + médiane (année × purpose) + flag ----------
# if 'credit_score' in df.columns:
#     df['cs_missing'] = df['credit_score'].isna().astype('int8')
#     cs = pd.to_numeric(df['credit_score'], errors='coerce').clip(lower=300, upper=850)
#     if imput_cohort:
#         if vyear is not None and 'loan_purpose' in df.columns:
#             med = df.groupby([vyear, 'loan_purpose'])['credit_score'].transform('median')
#             cs = cs.fillna(med)
#         elif 'loan_purpose' in df.columns:
#             med = df.groupby(['loan_purpose'])['credit_score'].transform('median')
#             cs = cs.fillna(med)
#     df['credit_score'] = cs.fillna(cs.median()).round().astype('Int16')

# # ---------- 3) MI% : LTV≤80 -> 0 ; sinon médiane (année × LTV bins) + flags ----------
# if 'mi_percent' in df.columns:
#     df['mi_missing'] = df['mi_percent'].isna().astype('int8')
#     mi = pd.to_numeric(df['mi_percent'], errors='coerce').astype('Float32')
#     if 'original_ltv' in df.columns:
#         ltv = pd.to_numeric(df['original_ltv'], errors='coerce').clip(lower=0)  # sécurité
#         # règle métier : si LTV<=80 et MI NaN -> 0
#         mi = mi.mask(ltv.le(80) & mi.isna(), 0.0)
#         # imputation conditionnelle pour le reste
#         ltv_bins = pd.cut(ltv, [0, 80, 90, 95, 100, np.inf], include_lowest=True, right=True)
#         if vyear is not None and imput_cohort:
#             med = df.groupby([vyear, ltv_bins])['mi_percent'].transform('median')
#             mi = mi.fillna(med)
#         else:
#             med = df.groupby(ltv_bins)['mi_percent'].transform('median')
#             mi = mi.fillna(med)
#     df['mi_percent'] = mi.fillna(0.0).astype('Float32')
#     df['has_mi'] = (df['mi_percent'] > 0).astype('int8')

# # ---------- 4) DTI : médiane (année × purpose) + flag ----------
# if 'original_dti' in df.columns:
#     df['dti_missing'] = df['original_dti'].isna().astype('int8')
#     dti = pd.to_numeric(df['original_dti'], errors='coerce')
#     if imput_cohort:
#         if vyear is not None and 'loan_purpose' in df.columns:
#             med = df.groupby([vyear, 'loan_purpose'])['original_dti'].transform('median')
#             dti = dti.fillna(med)
#         elif 'loan_purpose' in df.columns:
#             med = df.groupby(['loan_purpose'])['original_dti'].transform('median')
#             dti = dti.fillna(med)
#     df['original_dti'] = dti.fillna(dti.median()).round().astype('Int16')

# # ---------- 5) CLTV : borne basse = LTV si possible + flag ; puis médiane ----------
# if 'original_cltv' in df.columns:
#     df['cltv_missing'] = df['original_cltv'].isna().astype('int8')
#     cltv = pd.to_numeric(df['original_cltv'], errors='coerce').astype('Float32')
#     if 'original_ltv' in df.columns:
#         ltv = pd.to_numeric(df['original_ltv'], errors='coerce').astype('Float32')
#         # si CLTV NaN & LTV dispo -> CLTV = LTV (borne basse)
#         cltv = cltv.fillna(ltv)
#         # sécurité : jamais CLTV < LTV
#         cltv = np.where(ltv.notna(), np.maximum(cltv, ltv), cltv)
#         cltv = pd.Series(cltv, index=df.index).astype('Float32')
#     if imput_cohort and vyear is not None:
#         med = df.groupby(vyear)['original_cltv'].transform('median')
#         cltv = cltv.fillna(med)
#     # fallback global si reste des NA
#     df['original_cltv'] = pd.Series(cltv, index=df.index).fillna(float(pd.Series(cltv).median())).astype('Float32')

# # ---------- 6) Petites ordinales ----------
# for col in ['original_loan_term', 'number_of_borrowers']:
#     if col in df.columns:
#         df[col + '_missing'] = df[col].isna().astype('int8')
#         # imputations simples au mode (peu de niveaux, impact limité)
#         try:
#             df[col] = df[col].fillna(df[col].mode(dropna=True).iloc[0])
#         except Exception:
#             df[col] = df[col].fillna(method='ffill').fillna(method='bfill')

# # ---------- 7) Contrôle ----------
# na_left = (df.isna().mean() * 100).round(3)
# print("NA restants (%):\n", na_left[na_left > 0].sort_values(ascending=False))

# # Résultat final
# df_train_imputed = df

In [None]:
# Best imputation cell

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.base import BaseEstimator, TransformerMixin

class DataImputer(BaseEstimator, TransformerMixin):
    """Imputer whose statistics are learned in ``fit`` and applied in ``transform``.

    ``fit`` computes medians / modes on the frame it is given and stores them in
    ``self.stats_``. ``transform`` works on a copy of its input, fills missing
    values with those stored statistics and adds indicator columns
    (``cs_missing``, ``mi_missing``, ``has_mi``, ``dti_missing``,
    ``cltv_missing``, ``<col>_missing`` for the small ordinal columns and
    ``has_special_program``) whenever the corresponding source column exists.
    Every step is skipped when its source column is absent.

    Parameters
    ----------
    use_cohort : bool, default True
        When True, medians per ``loan_purpose`` and per (vintage year, key) are
        also learned and tried before the global fallback. The per-LTV-bin MI
        median is learned and used regardless of this flag.
    missing_flag : bool, default False
        When True, ``transform`` appends one ``was_missing_<col>`` int8 column
        per input column, computed before any imputation.
    ltv_bins : sequence of float
        Bin edges passed to ``pd.cut`` to bucket ``original_ltv`` for the
        mortgage-insurance medians.
    """

    def __init__(self, use_cohort=True, missing_flag = False, ltv_bins=(0, 80, 90, 95, 100, np.inf)):
        self.use_cohort = use_cohort
        self.ltv_bins = ltv_bins
        self.missing_flag = missing_flag

    # ---------- helpers ----------
    @staticmethod
    def _mode(x):
        """Return the first mode of ``x`` (NaN excluded), or NaN if there is none."""
        try:
            return x.mode(dropna=True).iloc[0]
        except Exception:
            # e.g. empty / all-NaN series: ``.iloc[0]`` has nothing to return
            return np.nan

    @staticmethod
    def _to_year(series):
        """Return the calendar year of ``series`` or None if it cannot be parsed.

        A None result silently disables every year-based cohort statistic.
        """
        try:
            return pd.to_datetime(series).dt.year
        except Exception:
            return None

    def _map_fill(self, keys_tuple_series, mapping):
        """keys_tuple_series = Series of tuples; mapping = dict {tuple: value}"""
        if mapping is None:
            return pd.Series(np.nan, index=keys_tuple_series.index)
        return keys_tuple_series.map(mapping)

    # ---------- fit on TRAIN ONLY ----------
    def fit(self, X, y=None):
        """Learn the imputation statistics from ``X`` and store them in ``self.stats_``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        df = X.copy()

        # Precompute year if present
        vyear = self._to_year(df['vintage']) if 'vintage' in df.columns else None

        # STORAGE
        self.stats_ = {}

        # ---- CREDIT SCORE medians
        if 'credit_score' in df.columns:
            cs = pd.to_numeric(df['credit_score'], errors='coerce').clip(300, 850)
            self.stats_['credit_score_global'] = float(cs.median())
            self.stats_['credit_score_by_lp'] = None
            self.stats_['credit_score_by_year_lp'] = None

            if self.use_cohort and 'loan_purpose' in df.columns:
                med_lp = cs.groupby(df['loan_purpose']).median()
                self.stats_['credit_score_by_lp'] = med_lp.to_dict()

                if vyear is not None:
                    med_y_lp = pd.Series(cs.values, index=pd.MultiIndex.from_arrays([vyear, df['loan_purpose']])).groupby(level=[0,1]).median()
                    self.stats_['credit_score_by_year_lp'] = {k: float(v) for k,v in med_y_lp.items()}

        # ---- MI% medians by LTV bins (and year)
        if 'mi_percent' in df.columns and 'original_ltv' in df.columns:
            mi = pd.to_numeric(df['mi_percent'], errors='coerce')
            ltv = pd.to_numeric(df['original_ltv'], errors='coerce').clip(lower=0)
            ltv_bins = pd.cut(ltv, self.ltv_bins, include_lowest=True, right=True)

            # keys of this dict are the pd.Interval bins produced by pd.cut
            self.stats_['mi_by_bin'] = mi.groupby(ltv_bins).median().to_dict()
            self.stats_['mi_by_year_bin'] = None
            if self.use_cohort and vyear is not None :
                idx = pd.MultiIndex.from_arrays([vyear, ltv_bins])
                med = pd.Series(mi.values, index=idx).groupby(level=[0,1]).median()
                self.stats_['mi_by_year_bin'] = {k: float(v) for k,v in med.items()}

        # ---- DTI medians
        if 'original_dti' in df.columns:
            dti = pd.to_numeric(df['original_dti'], errors='coerce')
            self.stats_['dti_global'] = float(dti.median())
            self.stats_['dti_by_lp'] = None
            self.stats_['dti_by_year_lp'] = None

            if self.use_cohort and 'loan_purpose' in df.columns:
                self.stats_['dti_by_lp'] = dti.groupby(df['loan_purpose']).median().to_dict()
                if vyear is not None:
                    med = pd.Series(dti.values,
                                    index=pd.MultiIndex.from_arrays([vyear, df['loan_purpose']])
                                   ).groupby(level=[0,1]).median()
                    self.stats_['dti_by_year_lp'] = {k: float(v) for k,v in med.items()}

        # ---- CLTV medians by year (fallback global)
        if 'original_cltv' in df.columns:
            cltv = pd.to_numeric(df['original_cltv'], errors='coerce')
            self.stats_['cltv_global'] = float(cltv.median())
            self.stats_['cltv_by_year'] = None
            if self.use_cohort and vyear is not None:
                med = pd.Series(cltv.values, index=vyear).groupby(level=0).median()
                self.stats_['cltv_by_year'] = {int(k): float(v) for k,v in med.items() if pd.notna(v)}

        # ---- modes for small ordinal
        for col in ['original_loan_term', 'number_of_borrowers']:
            if col in df.columns:
                self.stats_[f'{col}_mode'] = self._mode(df[col])

        return self

    # ---------- transform (apply to TRAIN and TEST) ----------
    def transform(self, X):
        """Return an imputed copy of ``X`` using the statistics stored by ``fit``.

        ``X`` itself is not modified. The column
        ``pre_relief_refi_loan_seq_number`` is dropped when present. Requires
        ``fit`` to have been called (reads ``self.stats_``).
        """
        df = X.copy()
        # 0) Drop
        df.drop(columns=['pre_relief_refi_loan_seq_number'], errors='ignore', inplace=True)

        if self.missing_flag:
            # snapshot of the NaN pattern BEFORE imputation, appended at the end
            cols_impute = df.columns
            missing0 = df[cols_impute].isna().add_prefix('was_missing_').astype('int8')

        # Helpers
        vyear = self._to_year(df['vintage']) if 'vintage' in df.columns else None

        # 1) Categoricals: Unknown / NotApplicable
        if 'channel' in df.columns and isinstance(df['channel'].dtype, CategoricalDtype):
            df['channel'] = df['channel'].cat.add_categories(['Unknown']).fillna('Unknown')

        if 'property_valuation_method' in df.columns:
            pvm = pd.to_numeric(df['property_valuation_method'].astype('string'), errors='coerce')
            if vyear is not None:
                pvm = pvm.where(vyear >= 2017, 99)  # 99 = NotApplicable before 2017
            pvm = pvm.fillna(9)  # 9 = NotAvailable
            df['property_valuation_method'] = pvm.astype('Int16').astype('category')

        if 'special_eligibility_program' in df.columns and isinstance(df['special_eligibility_program'].dtype, CategoricalDtype):
            df['special_eligibility_program'] = df['special_eligibility_program'].cat.add_categories(['Unknown']).fillna('Unknown')
            df['has_special_program'] = df['special_eligibility_program'].isin(['H','F','R']).astype('int8')

        if 'msa_md' in df.columns:
            df['msa_md'] = df['msa_md'].fillna(0)

        # 2) CREDIT SCORE
        if 'credit_score' in df.columns:
            df['cs_missing'] = df['credit_score'].isna().astype('int8')
            cs = pd.to_numeric(df['credit_score'], errors='coerce').clip(300, 850)

            # cohort fill: most specific key first, then coarser ones
            if self.use_cohort and 'loan_purpose' in df.columns:
                if vyear is not None and self.stats_.get('credit_score_by_year_lp'):
                    keys = pd.Series(list(zip(vyear, df['loan_purpose'])), index=df.index)
                    mapped = self._map_fill(keys, self.stats_['credit_score_by_year_lp'])
                    cs = cs.fillna(mapped)
                if self.stats_.get('credit_score_by_lp'):
                    mapped = df['loan_purpose'].map(self.stats_['credit_score_by_lp'])
                    cs = cs.fillna(mapped)

            # global fallback
            cs = cs.fillna(self.stats_.get('credit_score_global', float(np.nan)))
            df['credit_score'] = pd.Series(cs, index=df.index).round().astype('Int16')

        # 3) MI%
        if 'mi_percent' in df.columns:
            df['mi_missing'] = df['mi_percent'].isna().astype('int8')
            mi = pd.to_numeric(df['mi_percent'], errors='coerce').astype('Float32')

            ltv = pd.to_numeric(df['original_ltv'], errors='coerce').clip(lower=0) if 'original_ltv' in df.columns else pd.Series(np.nan, index=df.index)
            # business rule: a missing MI% with LTV <= 80 is set to 0
            mi = mi.mask(ltv.le(80) & mi.isna(), 0.0)

            # cohort median by LTV bins (and year)
            if 'original_ltv' in df.columns:
                ltv_bins = pd.cut(ltv, self.ltv_bins, include_lowest=True, right=True)

                if self.use_cohort and vyear is not None and self.stats_.get('mi_by_year_bin'):
                    keys = pd.Series(list(zip(vyear, ltv_bins)), index=df.index)
                    mapped = self._map_fill(keys, self.stats_['mi_by_year_bin'])
                    mi = mi.fillna(mapped)

                if self.stats_.get('mi_by_bin'):
                    mapped = ltv_bins.map(self.stats_['mi_by_bin'])
                    mi = mi.fillna(mapped)

            # anything still missing is treated as 0
            df['mi_percent'] = mi.fillna(0.0).astype('Float32')
            df['has_mi'] = (df['mi_percent'] > 0).astype('int8')

        # 4) DTI
        if 'original_dti' in df.columns:
            df['dti_missing'] = df['original_dti'].isna().astype('int8')
            dti = pd.to_numeric(df['original_dti'], errors='coerce')

            if self.use_cohort and 'loan_purpose' in df.columns:
                if vyear is not None and self.stats_.get('dti_by_year_lp'):
                    keys = pd.Series(list(zip(vyear, df['loan_purpose'])), index=df.index)
                    mapped = self._map_fill(keys, self.stats_['dti_by_year_lp'])
                    dti = dti.fillna(mapped)
                if self.stats_.get('dti_by_lp'):
                    mapped = df['loan_purpose'].map(self.stats_['dti_by_lp'])
                    dti = dti.fillna(mapped)

            dti = dti.fillna(self.stats_.get('dti_global', float(np.nan)))
            df['original_dti'] = pd.Series(dti, index=df.index).round().astype('Int16')

        # 5) CLTV
        if 'original_cltv' in df.columns:
            df['cltv_missing'] = df['original_cltv'].isna().astype('int8')
            cltv = pd.to_numeric(df['original_cltv'], errors='coerce').astype('Float32')

            if 'original_ltv' in df.columns:
                ltv = pd.to_numeric(df['original_ltv'], errors='coerce').astype('Float32')
                # missing CLTV takes LTV as a lower bound ...
                cltv = pd.Series(cltv, index=df.index).fillna(ltv)
                # ... and CLTV is never allowed to be below LTV
                cltv = pd.Series(np.where(ltv.notna(), np.maximum(cltv, ltv), cltv), index=df.index).astype('Float32')

            if self.use_cohort and vyear is not None and self.stats_.get('cltv_by_year'):
                mapped = pd.Series(vyear, index=df.index).map(self.stats_['cltv_by_year'])
                cltv = pd.Series(cltv, index=df.index).fillna(mapped)

            cltv = pd.Series(cltv, index=df.index).fillna(self.stats_.get('cltv_global', float(np.nan)))
            df['original_cltv'] = cltv.astype('Float32')

        # 6) Small ordinal → mode
        for col in ['original_loan_term', 'number_of_borrowers']:
            if col in df.columns:
                df[col + '_missing'] = df[col].isna().astype('int8')
                mode_val = self.stats_.get(f'{col}_mode', np.nan)
                df[col] = df[col].fillna(mode_val)
                # safety net: the stored mode is NaN (e.g. column all-NaN at fit
                # time), so fall back to neighbouring rows.
                # ``fillna(method=...)`` is deprecated in pandas 2.x; ffill/bfill
                # are the supported equivalents.
                if df[col].isna().any():
                    df[col] = df[col].ffill().bfill()

        # Added missing flag
        if self.missing_flag:
            df = pd.concat([df, missing0], axis=1)

        return df

# ---------- Usage ----------
# 1) Fit on TRAIN only
# NOTE(review): the imputer is fitted on `df_train` but applied to `df_train_prep` /
# `df_validation_prep`; those frames are defined outside this cell. Confirm that the
# fit frame and the transformed frames share the same column dtypes/encoding.
imputer = DataImputer(use_cohort=False, missing_flag=False)
imputer.fit(df_train)

# 2) Transform TRAIN and VALIDATION with the statistics learned above
df_train_imp = imputer.transform(df_train_prep)
df_validation_imp  = imputer.transform(df_validation_prep)

# In cross-validation:
# pipe = Pipeline([('imp', DataImputer(use_cohort=True)), ('clf', LogisticRegression(...))])
# scores = cross_val_score(pipe, X, y, cv=KFold(...))

In [None]:
# Save the imputed frames (and, for CSV, the dtype metadata needed to reload them faithfully)
import os

parquet = True
PATH_OUTPUT_IMPUTE = "../data/processed/merged/imputed/"

# to_parquet / to_csv / open() do not create missing directories, so create it up front.
os.makedirs(PATH_OUTPUT_IMPUTE, exist_ok=True)

# If we can use Parquet (dtypes are stored in the file itself):
if parquet:
    df_train_imp.to_parquet(f"{PATH_OUTPUT_IMPUTE}train.parquet", index=False)
    df_validation_imp.to_parquet(f"{PATH_OUTPUT_IMPUTE}validation.parquet", index=False)

# If we want to keep the CSV structure (dtypes are pickled next to the CSV files):
else:
    import pickle
    from pandas.api.types import CategoricalDtype

    dtypes = df_train_imp.dtypes.to_dict()

    # Split the dtypes into the three groups read_csv needs to be told about separately.
    parse_dates = [c for c, dt in dtypes.items() if str(dt).startswith("datetime64")]
    cat_dtypes  = {c: dt for c, dt in dtypes.items() if isinstance(dt, CategoricalDtype)}
    other_dtypes = {c: ("Int64" if str(dt).startswith("int") and df_train_imp[c].isna().any()
                        else dt)
                    for c, dt in dtypes.items() if c not in parse_dates and c not in cat_dtypes}

    with open(f"{PATH_OUTPUT_IMPUTE}parse_dates.pkl", "wb") as f:
        pickle.dump(parse_dates, f)
    with open(f"{PATH_OUTPUT_IMPUTE}cat_dtypes.pkl", "wb") as f:
        pickle.dump(cat_dtypes, f)
    with open(f"{PATH_OUTPUT_IMPUTE}other_dtypes.pkl", "wb") as f:
        pickle.dump(other_dtypes, f)

    # Save both splits, mirroring the Parquet branch (the validation frame was
    # previously not written in CSV mode).
    df_train_imp.to_csv(f"{PATH_OUTPUT_IMPUTE}train_imputed.csv", index=False)
    df_validation_imp.to_csv(f"{PATH_OUTPUT_IMPUTE}validation_imputed.csv", index=False)

    ## TO RELOAD THE CSV FILE WITH THE EXACT DATA TYPES
    ## (only unpickle files you produced yourself: pickle.load executes arbitrary code)
    # with open(f"{PATH_OUTPUT_IMPUTE}parse_dates.pkl", "rb") as f:
    #     parse_dates = pickle.load(f)
    # with open(f"{PATH_OUTPUT_IMPUTE}cat_dtypes.pkl", "rb") as f:
    #     cat_dtypes = pickle.load(f)
    # with open(f"{PATH_OUTPUT_IMPUTE}other_dtypes.pkl", "rb") as f:
    #     other_dtypes = pickle.load(f)

    # test = pd.read_csv(f"{PATH_OUTPUT_IMPUTE}train_imputed.csv",
    #                    dtype=other_dtypes, parse_dates=parse_dates)

    # for c, dt in cat_dtypes.items():  # re-apply the exact categories
    #     test[c] = test[c].astype(dt)

    # test.dtypes

In [None]:
# EDA PLOT:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def eda_plots_by_type(
    df: pd.DataFrame,
    dict_types: dict,
    target: str = 'default_24m',
    max_cont: int = 6,
    max_cat: int = 6,
    top_n_cat: int = 15,
    min_share_cat: float = 0.01
) -> None:
    """Draw a standard set of EDA figures, choosing the plot by declared column type.

    Parameters
    ----------
    df : DataFrame to explore.
    dict_types : mapping ``column -> type label``. The labels used here are
        'continuous', 'integer', 'category', 'bool' and 'date'; columns absent
        from ``df`` are ignored.
    target : name of the target column. If present in ``df`` it is converted to
        float32 and used to compute mean rates; if absent, only count plots are drawn.
    max_cont : number of continuous columns plotted (histogram + binned rate).
    max_cat : number of categorical columns plotted.
    top_n_cat : maximum number of categories kept per categorical column.
    min_share_cat : minimum frequency share for a category to be kept; the
        remaining categories are grouped under 'Other'.

    At most two integer columns and two date columns are plotted. Figures are
    shown with ``plt.show()``; nothing is returned.
    """
    # --- split columns by declared type
    cont_cols = [c for c,t in dict_types.items() if t == 'continuous' and c in df.columns]
    ord_cols  = [c for c,t in dict_types.items() if t == 'integer' and c in df.columns]
    cat_cols  = [c for c,t in dict_types.items() if t in ('category','bool') and c in df.columns]
    date_cols = [c for c,t in dict_types.items() if t == 'date' and c in df.columns]

    # target
    y = None
    if target in df.columns:
        y = df[target]
        if pd.api.types.is_bool_dtype(y):
            y = y.astype('float32')
        else:
            y = pd.to_numeric(y, errors='coerce').astype('float32')

    # ---------- helpers ----------
    def corr_heatmap(cols, title):
        """Show a Pearson correlation heatmap of the numeric columns in ``cols``."""
        cols = [c for c in cols if pd.api.types.is_numeric_dtype(df[c])]
        if len(cols) < 2: return
        cmat = df[cols].corr(method='pearson')
        fig = plt.figure(figsize=(8,6))
        im = plt.imshow(cmat, interpolation='nearest')
        plt.title(title)
        plt.xticks(np.arange(len(cols)), cols, rotation=60, ha='right')
        plt.yticks(np.arange(len(cols)), cols)
        plt.colorbar(im, fraction=0.046, pad=0.04)
        plt.tight_layout()
        plt.show()

    def plot_hist(col, bins=50):
        """Show a histogram of ``col`` (non-numeric values coerced to NaN and dropped)."""
        s = pd.to_numeric(df[col], errors='coerce').dropna()
        if s.empty: return
        plt.figure(figsize=(6,4))
        plt.hist(s, bins=bins)
        plt.title(f"Distribution: {col}")
        plt.xlabel(col); plt.ylabel("Count")
        plt.tight_layout(); plt.show()

    def plot_continuous_default_rate(col, q=10):
        """Show the mean target (line) and row count (bars) per quantile bin of ``col``."""
        if y is None: return
        s = pd.to_numeric(df[col], errors='coerce')
        m = s.notna() & y.notna()
        s, yt = s[m], y[m]
        if s.empty: return
        try:
            bins = pd.qcut(s, q=q, duplicates='drop')
        except ValueError:
            # fall back to equal-width bins when quantile binning fails
            bins = pd.cut(s, bins=q, include_lowest=True)
        g = pd.DataFrame({'bin': bins, 'y': yt}).groupby('bin').agg(rate=('y','mean'), n=('y','size')).reset_index()
        fig = plt.figure(figsize=(7,4))
        ax1 = plt.gca(); ax2 = ax1.twinx()
        ax1.plot(np.arange(len(g)), g['rate'], marker='o')
        ax2.bar(np.arange(len(g)), g['n'], alpha=0.3)
        ax1.set_title(f"Default rate by {col} (binned)"); ax1.set_xlabel(col+" (bins)")
        ax1.set_ylabel("Default rate"); ax2.set_ylabel("Count")
        ax1.set_xticks(np.arange(len(g))); ax1.set_xticklabels([str(b) for b in g['bin']], rotation=60, ha='right')
        plt.tight_layout(); plt.show()

    def plot_categorical_default_rate(col, top_n=15, min_share=0.01):
        """Show the mean target (bars) and row count (line) per category of ``col``.

        Categories below ``min_share`` or beyond the ``top_n`` most frequent are
        relabelled 'Other'. Without a target, only counts are available and the
        rate is NaN.
        """
        s = df[col].astype('string')
        counts = s.value_counts(dropna=False)
        total = counts.sum()
        keep = counts[counts/total >= min_share].head(top_n).index
        s2 = s.where(s.isin(keep), other='Other')
        if y is not None:
            m = y.notna()
            g = pd.DataFrame({'cat': s2[m], 'y': y[m]}).groupby('cat').agg(rate=('y','mean'), n=('y','size')).reset_index()
        else:
            g = s2.value_counts(dropna=False).rename_axis('cat').reset_index(name='n')
            g['rate'] = np.nan
        g = g.sort_values('n', ascending=False)
        fig = plt.figure(figsize=(7,4))
        ax1 = plt.gca(); ax2 = ax1.twinx()
        ax1.bar(g['cat'], g['rate'])
        ax2.plot(g['cat'], g['n'], marker='o')
        ax1.set_title(f"Default rate by {col} (top {top_n}, ≥{int(min_share*100)}%)")
        ax1.set_ylabel("Default rate"); ax2.set_ylabel("Count"); ax1.set_xlabel(col)
        plt.xticks(rotation=60, ha='right'); plt.tight_layout(); plt.show()

    def plot_integer_effect(col):
        """Plot an integer column: per-value rates if it has <= 15 distinct values,
        otherwise the binned plot used for continuous columns. Without a target,
        a plain count bar chart is drawn."""
        s = pd.to_numeric(df[col], errors='coerce')
        if y is None:
            vc = s.value_counts(dropna=False).sort_index()
            plt.figure(figsize=(6,4))
            plt.bar(vc.index.astype(str), vc.values)
            plt.title(f"Counts: {col}"); plt.xlabel(col); plt.ylabel("Count")
            plt.tight_layout(); plt.show(); return
        m = s.notna() & y.notna(); s, yt = s[m], y[m]
        nun = s.nunique()
        if nun <= 15:
            g = pd.DataFrame({'x': s, 'y': yt}).groupby('x').agg(rate=('y','mean'), n=('y','size')).reset_index()
            fig = plt.figure(figsize=(6,4))
            ax1 = plt.gca(); ax2 = ax1.twinx()
            ax1.bar(g['x'].astype(str), g['rate'])
            ax2.plot(g['x'].astype(str), g['n'], marker='o')
            ax1.set_title(f"Default rate by {col}")
            ax1.set_ylabel("Default rate"); ax2.set_ylabel("Count"); ax1.set_xlabel(col)
            plt.tight_layout(); plt.show()
        else:
            plot_continuous_default_rate(col, q=min(10, nun))

    def plot_time_series(col):
        """Show the monthly mean target (line) and row count (bars) for a date column.

        Period columns are converted to timestamps; other columns go through
        ``pd.to_datetime`` with unparseable values coerced to NaT. Without a
        target, counts per distinct timestamp are plotted instead.
        """
        s = df[col]
        if isinstance(s.dtype, pd.PeriodDtype):
            ts = s.dt.to_timestamp()
        else:
            ts = pd.to_datetime(s, errors='coerce')
        if y is None:
            g = ts.value_counts().sort_index()
            plt.figure(figsize=(8,4))
            plt.plot(g.index, g.values)
            plt.title(f"Counts over time: {col}"); plt.xlabel("Date"); plt.ylabel("Count")
            plt.tight_layout(); plt.show(); return
        data = pd.DataFrame({'t': ts, 'y': y}).dropna()
        if data.empty: return
        # truncate every timestamp to the first day of its month before grouping
        data['t'] = data['t'].dt.to_period('M').dt.to_timestamp()
        g = data.groupby('t').agg(rate=('y','mean'), n=('y','size')).reset_index()
        fig = plt.figure(figsize=(8,4))
        ax1 = plt.gca(); ax2 = ax1.twinx()
        ax1.plot(g['t'], g['rate'], marker='o'); ax2.bar(g['t'], g['n'], alpha=0.3)
        ax1.set_title(f"Default rate over time: {col}")
        ax1.set_xlabel("Date"); ax1.set_ylabel("Default rate"); ax2.set_ylabel("Count")
        plt.tight_layout(); plt.show()

    # ---------- plots ----------
    if len(cont_cols) >= 2:
        corr_heatmap(cont_cols, "Pearson correlation (continuous vars)")

    for c in cont_cols[:max_cont]:
        plot_hist(c, bins=50)
        plot_continuous_default_rate(c, q=10)

    # plot the preferred categorical columns when available, else the first ones declared
    preferred_cats = ['occupancy_status','channel','property_type','loan_purpose','msa_md','property_valuation_method']
    cat_to_plot = [c for c in preferred_cats if c in cat_cols][:max_cat]
    if not cat_to_plot:
        cat_to_plot = cat_cols[:max_cat]
    for c in cat_to_plot:
        plot_categorical_default_rate(c, top_n=top_n_cat, min_share=min_share_cat)

    # at most two integer columns
    for c in ord_cols[:max(1, min(2, len(ord_cols)))]:
        plot_integer_effect(c)

    # at most two date columns
    for c in date_cols[:2]:
        plot_time_series(c)

# --- Usage ---
# NOTE(review): in this part of the notebook `dict_types` is assigned in a LATER cell;
# unless it is also defined earlier, this call fails on Restart & Run All — verify.
eda_plots_by_type(df_train, dict_types,
                  target='default_24m',
                  max_cont=6, max_cat=6, top_n_cat=15, min_share_cat=0.01)


# Model

## Régression logistique

In [None]:
import pandas as pd

# Reload the labelled training data and the imputed train / validation splits from disk.
# NOTE(review): these paths (default_labels_imputed/) differ from PATH_OUTPUT_IMPUTE
# ("../data/processed/merged/imputed/") used by the save cell — confirm which is current.
df_train = pd.read_parquet("../data/processed/default_labels/")
df_train_imp = pd.read_parquet("../data/processed/default_labels_imputed/train.parquet")
df_validation_imp = pd.read_parquet("../data/processed/default_labels_imputed/validation.parquet")

In [None]:
# Rows of the training frame where first_time_homebuyer_flag is missing
df_train[df_train["first_time_homebuyer_flag"].isna()]

In [None]:

# Map each column to a data-type category (used to select columns by kind)

from collections import defaultdict

dict_types = {
 'credit_score': 'continuous',
 'first_payment_date': 'date',
 'first_time_homebuyer_flag': 'bool',            # Y/N, 9 -> NA
 'maturity_date': 'date',
 'msa_md': 'category',
 'mi_percent': 'continuous',                     # not categorical
 'number_of_units': 'category',                  # 1–4, 99 -> NA (categorical is fine)
 'occupancy_status': 'category',
 'original_cltv': 'continuous',
 'original_dti': 'continuous',
 'original_upb': 'continuous',
 'original_ltv': 'continuous',
 'original_interest_rate': 'continuous',
 'channel': 'category',
 'ppm_flag': 'bool',                             # Y/N
 'amortization_type': 'category',                # FRM/ARM
 'property_state': 'category',
 'property_type': 'category',
 'postal_code': 'category',
 'loan_sequence_number': 'id',
 'loan_purpose': 'category',
 'original_loan_term': 'integer',                # number of months, not a category
 'number_of_borrowers': 'integer',               # small integer
 'seller_name': 'category',
 'servicer_name': 'category',
 'super_conforming_flag': 'bool',                # Y / blank
 'pre_relief_refi_loan_seq_number': 'id_ref',    # reference key, not a category
 'special_eligibility_program': 'category',      # H/F/R/9
 'relief_refinance_indicator': 'bool',           # Y / blank
 'property_valuation_method': 'category',        # codes 1–4, 9 -> NA
 'interest_only_indicator': 'bool',              # Y/N
 'mi_cancellation_indicator': 'category',        # Y/N/7/9
 'default_24m': 'bool',
 'vintage': 'date'                               # e.g. YYYY-Q derived from FIRST_PAYMENT_DATE
}

# invert the mapping: type -> list of columns
_dict_types_col = defaultdict(list)
for col, t in dict_types.items():
    _dict_types_col[t].append(col)

dict_types_col = dict(_dict_types_col)  # (optional) convert to a plain dict

cont_cols  = dict_types_col.get('continuous', [])
ord_cols   = dict_types_col.get('integer', [])      # here: original_loan_term, number_of_borrowers (treated as ordinal/binary)
bools_cols = dict_types_col.get('bool', [])
# note: cat_cols includes the bool columns, hence also the target 'default_24m'
cat_cols  = dict_types_col.get('category', []) + bools_cols



# Correlations cell

from scipy.stats import spearmanr, pointbiserialr, pearsonr, kendalltau
from scipy.stats import chi2_contingency
from itertools import combinations
import numpy as np

# One result dict per association measure, keyed by (column_x, column_y).
# Values are (statistic, p_value) tuples.
spearman_pairs  = {}
pearson_pairs   = {}
cramer_pairs    = {}
pb_pairs        = {}
kendall_pairs   = {}

# 1) Spearman: ordinal <-> continuous (monotonic association)
for x in ord_cols:
    for y in cont_cols:
        s = df_train_imp[[x, y]].dropna()
        if s.empty: continue
        r, p = spearmanr(s[x], s[y])
        spearman_pairs[(x, y)] = (r, p)

# 2) Pearson: continuous <-> continuous (no duplicates, no diagonal)
for x, y in combinations(cont_cols, 2):
    s = df_train_imp[[x, y]].dropna()
    if s.empty: continue
    r, p = pearsonr(s[x], s[y])
    pearson_pairs[(x, y)] = (r, p)

# 3) Point-biserial: continuous <-> binary boolean
# keep only the 'bool' columns whose non-missing values are all in {0, 1, True, False}
bin_cols = [c for c in dict_types_col.get('bool', [])
            if set(df_train_imp[c].dropna().unique()).issubset({0,1,True,False})]
for x in cont_cols:
    for b in bin_cols:
        s = df_train_imp[[x, b]].dropna()
        if s.empty: continue
        r, p = pointbiserialr(s[x], s[b].astype(int))
        pb_pairs[(x, b)] = (r, p)   # <-- bug fixed

# 4) Kendall tau-b: ordinal <-> ordinal (and optionally ordinal <-> continuous)
for x, y in combinations(ord_cols, 2):
    s = df_train_imp[[x, y]].dropna()
    if s.empty: continue
    tau, p = kendalltau(s[x], s[y], method="auto")  # tau-b with tie correction
    kendall_pairs[(x, y)] = (tau, p)

# Optional: Kendall for ordinal <-> continuous (stored in the same dict)
for x in ord_cols:
    for y in cont_cols:
        s = df_train_imp[[x, y]].dropna()
        if s.empty: continue
        tau, p = kendalltau(s[x], s[y], method="auto")
        kendall_pairs[(x, y)] = (tau, p)

# 5) Cramér V corrigé : nominal/ordinal <-> nominal/ordinal
def cramers_v_corrected(x, y):
    """Bias-corrected Cramér's V (Bergsma, 2013) between two categorical series.

    Builds the contingency table of ``x`` against ``y`` and returns the
    corrected V as a float, or NaN when the table is empty.
    """
    contingency = pd.crosstab(x, y)
    if contingency.size == 0:
        return np.nan

    n_obs = contingency.values.sum()
    n_rows, n_cols = contingency.shape
    # plain chi-square statistic, without Yates' continuity correction
    chi2_stat = chi2_contingency(contingency, correction=False)[0]

    phi2 = chi2_stat / n_obs
    # Bias correction (Bergsma, 2013): shrink phi² and the table dimensions
    bias = (n_cols - 1) * (n_rows - 1) / (n_obs - 1)
    phi2_adj = max(0, phi2 - bias)
    rows_adj = n_rows - (n_rows - 1) ** 2 / (n_obs - 1)
    cols_adj = n_cols - (n_cols - 1) ** 2 / (n_obs - 1)

    # floor the denominator so a degenerate (single row/column) table cannot divide by zero
    smallest_dim = min((cols_adj - 1), (rows_adj - 1))
    return np.sqrt(phi2_adj / max(1e-12, smallest_dim))

# Cramér's V over every unordered pair of categorical / bool / integer columns.
# The set union removes duplicates, so the iteration order of `cats` is arbitrary.
cats = list(set(cat_cols) | set(ord_cols))  # treat ordinal columns as categorical here
for a, b in combinations(cats, 2):
    s = df_train_imp[[a, b]].dropna()
    if s.empty: continue
    cramer_pairs[(a, b)] = cramers_v_corrected(s[a], s[b])


In [None]:
# Shape the correlation into a long data frame

import pandas as pd
import numpy as np

# ---------- 1) Fusion des dictionnaires en un tableau "long" ----------
def _dict_to_df(d, measure, has_p=True):
    rows = []
    for (x, y), val in d.items():
        # tolère (stat, p) ou (stat, p, n)
        if has_p:
            stat = val[0]
            pval = val[1]
            n = val[2] if (isinstance(val, (tuple, list)) and len(val) >= 3) else np.nan
        else:
            stat = val
            pval = np.nan
            n = np.nan
        rows.append({
            "var_x": x,
            "var_y": y,
            "measure": measure,
            "stat": float(stat),
            "p_value": float(pval) if pd.notna(pval) else np.nan,
            "n": float(n) if pd.notna(n) else np.nan,
            "abs_stat": abs(float(stat)),
        })
    return pd.DataFrame(rows)

# Convert each non-empty result dict to a long frame; Cramér's V carries no p-value.
dfs = []
if len(spearman_pairs): dfs.append(_dict_to_df(spearman_pairs, "spearman", has_p=True))
if len(pearson_pairs):  dfs.append(_dict_to_df(pearson_pairs,  "pearson",  has_p=True))
if len(pb_pairs):       dfs.append(_dict_to_df(pb_pairs,       "pointbiserial", has_p=True))
if len(kendall_pairs):  dfs.append(_dict_to_df(kendall_pairs,  "kendall_tau", has_p=True))
if len(cramer_pairs):   dfs.append(_dict_to_df(cramer_pairs,   "cramers_v", has_p=False))

# Stack everything; if no measure produced a result, fall back to an empty frame
# that still has the expected columns.
assocs_long = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame(
    columns=["var_x","var_y","measure","stat","p_value","n","abs_stat"]
)

# ---------- 2) Ajout des types de variables (utile pour filtrer) ----------
def _type_of(v):
    """Return the short type label of column ``v``.

    The notebook-level lists are checked in the order bin, cont, ord, cat and
    the first match wins; a column found in none of them is "unknown".
    """
    if v in bin_cols:
        label = "bin"
    elif v in cont_cols:
        label = "cont"
    elif v in ord_cols:
        label = "ord"
    elif v in cat_cols:
        label = "cat"
    else:
        label = "unknown"
    return label

# Tag both variables of every pair with their type label (useful for filtering)
assocs_long["type_x"] = assocs_long["var_x"].map(_type_of)
assocs_long["type_y"] = assocs_long["var_y"].map(_type_of)

# ---------- 3) Tri + étoiles de significativité + q-values (FDR BH) ----------
def _stars(p):
    if pd.isna(p): return ""
    return "***" if p < 0.001 else "**" if p < 0.01 else "*" if p < 0.05 else ""

assocs_long["sig"] = assocs_long["p_value"].apply(_stars)

# Benjamini–Hochberg FDR q-values (only if statsmodels is available)
# NOTE(review): `except Exception` also hides any error raised by multipletests itself,
# not just a missing statsmodels; in every such case q_value is set to NaN.
try:
    from statsmodels.stats.multitest import multipletests
    m = assocs_long["p_value"].notna()
    assocs_long.loc[m, "q_value"] = multipletests(assocs_long.loc[m, "p_value"], method="fdr_bh")[1]
except Exception:
    assocs_long["q_value"] = np.nan

# Sort by measure, then by decreasing absolute statistic
assocs_long = assocs_long.sort_values(["measure", "abs_stat"], ascending=[True, False]).reset_index(drop=True)

# ---------- 4) Quick usage examples ----------
# a) Top 20 associations per measure (by absolute value)
top20_by_measure = (
    assocs_long.groupby("measure", group_keys=False)
               .apply(lambda g: g.nlargest(20, "abs_stat"))
               .reset_index(drop=True)
)

# b) Pivots for heatmaps (e.g. Pearson and Cramér's V)
pearson_pivot = (assocs_long.query("measure == 'pearson'")
                              .pivot(index="var_x", columns="var_y", values="stat"))
cramers_pivot = (assocs_long.query("measure == 'cramers_v'")
                              .pivot(index="var_x", columns="var_y", values="stat"))

# ---------- 5) CSV export (written to the current working directory) ----------
assocs_long.to_csv("associations_long.csv", index=False)
top20_by_measure.to_csv("associations_top20_by_measure.csv", index=False)

print("OK • assocs_long :", assocs_long.shape, "| Sauvé: associations_long.csv")
assocs_long



In [None]:
# Scratch cell. The set of measures is computed but not displayed, because it is
# not the last expression of the cell.
set(assocs_long["measure"])
# NOTE(review): `dict_test` is not used anywhere in the visible part of the notebook.
dict_test = {'cramers_v':0.8}

In [None]:
## Transformation cell (drop, transformation..., allow for multiple transformation)
## Naive transformation / Transformation for linear model / For non linear model...
## Regularisation or not...

# pre treatment
# df_train_clean[bools_cols].sum()

# We then drop modality with no variability
# df_train_clean.drop(columns=["relief_refinance_indicator","interest_only_indicator","ppm_flag"], inplace= True)

# Graph, data viz 

In [None]:
# Share (in %) of each value of the target, missing values included
counts = df_train["default_24m"].value_counts(normalize=True, dropna=False) * 100

# Plot with Matplotlib (not seaborn)
fig, ax = plt.subplots()
bars = ax.bar(counts.index.astype(str), counts.values, color="lightblue")

ax.set_title("Default modality proportion")
ax.set_xlabel("Modalité (default_24m)")
ax.set_ylabel("Pourcentage (%)")

# Percentage label on top of each bar
ax.bar_label(bars, labels=[f"{v:.1f}%" for v in counts.values], padding=3)

# Margins and clean rendering
ax.set_ylim(0, max(100, counts.max() * 1.15))   # leave some room for the label
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Data (NaN removed)
x = df_train["original_upb"].dropna().values

# Robust bins (Freedman–Diaconis); fall back to 'auto' if needed
try:
    bins = np.histogram_bin_edges(x, bins='fd')
    if np.isinf(bins).any() or len(bins) < 2:
        raise ValueError
except Exception:
    bins = 'auto'

# Tick format: thousands separated by a space
fmt = FuncFormatter(lambda v, pos: f"{v:,.0f}".replace(",", " "))

fig, ax = plt.subplots(figsize=(9, 5), dpi=120)

# Histogram (counts, not density)
n, b, patches = ax.hist(
    x, bins=bins, density=False,
    edgecolor='white', linewidth=0.8, alpha=0.9
)

# Mean / median reference lines
moy = np.mean(x)
med = np.median(x)
ax.axvline(moy, linestyle='--', linewidth=1.6, label=f"Moyenne = {moy:,.0f}".replace(",", " "))
ax.axvline(med, linestyle=':',  linewidth=1.6, label=f"Médiane = {med:,.0f}".replace(",", " "))

# Titles, labels, grid and legend
ax.set_title("Distribution de original_upb", pad=10)
ax.set_xlabel("original_upb")
ax.set_ylabel("Effectifs")
ax.grid(axis='y', linestyle=':', alpha=0.5)
ax.xaxis.set_major_formatter(fmt)
ax.yaxis.set_major_formatter(fmt)
ax.legend(frameon=False, loc='upper right')

# (Optional) Useful if the distribution is very skewed:
# ax.set_xscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Example: df is your DataFrame and the column is called 'status'
# NOTE(review): this looks like a pasted generic example. Only step 1 uses the notebook's
# `df_train['default_24m']`; `df` and a 'status' column are not defined in the visible
# part of the notebook, so steps 2-6 presumably fail as written — adapt or remove.
# 1) List the modalities (distinct values)
df_train['default_24m'].unique()

# 2) Count each modality
df['status'].value_counts()

# 3) Keep only the rows whose modality is 'default'
df_default = df[df['status'].eq('default')]        # equivalent: df.query("status == 'default'")

# 4) Number and proportion of 'default'
nb_default = df['status'].eq('default').sum()
prop_default = df['status'].eq('default').mean()   # proportion between 0 and 1

# 5) If the column is coded 0/1, map it to labels, then (optionally) to an ordered category
# (this overwrites df['status'] in place, so re-running the cell maps already-mapped labels)
df['status'] = df['status'].map({1: 'default', 0: 'non-default'}).astype('category')

from pandas.api.types import CategoricalDtype
cat = CategoricalDtype(categories=['non-default', 'default'], ordered=True)
df['status'] = df['status'].astype(cat)

# 6) Check that the 'default' category exists (requires a categorical dtype)
'default' in df['status'].cat.categories
