In [2]:
import pandas as pd
from pandas import read_json

# Load the `oos` split (presumably "out-of-sample" — confirm) of the 24-month
# default-label dataset, then show its column names as the cell output.
data_oos = pd.read_parquet("../data/processed/default_labels/window=24m/oos.parquet")
data_oos.columns

Index(['credit_score', 'first_payment_date', 'first_time_homebuyer_flag',
       'maturity_date', 'msa_md', 'mi_percent', 'number_of_units',
       'occupancy_status', 'original_cltv', 'original_dti', 'original_upb',
       'original_ltv', 'original_interest_rate', 'channel', 'ppm_flag',
       'amortization_type', 'property_state', 'property_type', 'postal_code',
       'loan_sequence_number', 'loan_purpose', 'original_loan_term',
       'number_of_borrowers', 'seller_name', 'servicer_name',
       'super_conforming_flag', 'pre_relief_refi_loan_seq_number',
       'special_eligibility_program', 'relief_refinance_indicator',
       'property_valuation_method', 'interest_only_indicator',
       'mi_cancellation_indicator', 'default_24m', '__file_quarter', 'vintage',
       'window', 'quarter'],
      dtype='object')

In [38]:
# Load the scored `oos` frame from the processed/scored directory.
# NOTE(review): the aggregation code further down groups by `vintage` and
# `risk_bucket` and reads `default_24m` and `proba`; the recorded run failed with
# KeyError: 'risk_bucket', so confirm this file actually carries those columns.
oos_classif = pd.read_parquet("../data/processed/scored/oos_scored.parquet")


In [39]:
# Quick look at the scored frame; the recorded output is pandas' truncated text
# repr (first/last rows, columns wrapped across several panels).
print(oos_classif)

         credit_score first_payment_date first_time_homebuyer_flag  \
0               725.0         2021-03-31                         Y   
1               737.0         2021-03-31                         N   
2               744.0         2021-05-31                         N   
3               676.0         2021-04-30                         N   
4               765.0         2021-03-31                         N   
...               ...                ...                       ...   
5677143         791.0         2024-08-31                         N   
5677144         750.0         2024-05-31                         N   
5677145         773.0         2024-07-31                         N   
5677146         817.0         2024-08-31                         N   
5677147         696.0         2024-09-30                         N   

        maturity_date   msa_md  mi_percent  number_of_units occupancy_status  \
0              205102  15804.0         0.0              1.0                P   

In [40]:
import pandas as pd

def vintage_grade_table(
    df,
    vint,
    vintage_col="vintage",
    grade_col="risk_bucket",
    target_col="default_24m",
    proba_col="proba",
):
    """
    Build the per-grade default summary for a single vintage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the four columns named by the ``*_col`` arguments.
    vint : scalar
        Value of ``vintage_col`` to keep (rows are selected with ``==``).
    vintage_col : str, default "vintage"
        Column identifying the vintage of each row.
    grade_col : str, default "risk_bucket"
        Column holding the grade the rows are grouped by.
    target_col : str, default "default_24m"
        Column holding the observed default indicator (summed and averaged).
    proba_col : str, default "proba"
        Column holding the individual predicted probability (averaged).

    Returns
    -------
    pandas.DataFrame
        One row per grade, sorted by ``grade_col``, with the columns:

        - ``grade_col`` : the grade
        - ``n``         : number of rows in the grade
        - ``n_default`` : sum of ``target_col`` (number of observed defaults)
        - ``pd_obs``    : mean of ``target_col`` (observed default rate), rounded to 6 decimals
        - ``pd_hat``    : mean of ``proba_col`` (average predicted probability), rounded to 6 decimals

    Raises
    ------
    KeyError
        If one or more of the required columns is absent from ``df``. The message
        lists every missing column at once instead of failing on the first one
        somewhere inside pandas.
    """
    # Validate up front so a missing column is reported clearly and completely.
    required = [vintage_col, grade_col, target_col, proba_col]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"vintage_grade_table: missing column(s) {missing}; "
            f"available columns: {list(df.columns)}"
        )

    # Boolean-mask selection already returns a new frame and nothing below mutates
    # it, so no extra .copy() is needed (it would only duplicate a large frame).
    sub = df[df[vintage_col] == vint]

    res = (
        sub.groupby(grade_col)
        .agg(
            n=(proba_col, "size"),
            n_default=(target_col, "sum"),
            pd_obs=(target_col, "mean"),
            pd_hat=(proba_col, "mean"),
        )
        .reset_index()
        .sort_values(grade_col)
    )

    # Round the two rates for readability.
    res["pd_obs"] = res["pd_obs"].round(6)
    res["pd_hat"] = res["pd_hat"].round(6)

    return res


# ===== Shared aggregation used by both tables below =====
# Named aggregations: output column -> (input column, reducer).
_PD_AGG_SPEC = {
    "n": ("proba", "size"),
    "n_default": ("default_24m", "sum"),
    "pd_obs": ("default_24m", "mean"),
    "pd_hat": ("proba", "mean"),
}


def _pd_summary(frame, keys):
    """Group ``frame`` by ``keys`` and return counts, default counts and the
    observed / predicted default rates (both rounded to 6 decimals)."""
    out = frame.groupby(keys).agg(**_PD_AGG_SPEC).reset_index()
    out[["pd_obs", "pd_hat"]] = out[["pd_obs", "pd_hat"]].round(6)
    return out


# Fail fast with an actionable message: the recorded run of this cell died with a
# bare KeyError: 'risk_bucket' raised from inside groupby.
_required_cols = ["vintage", "risk_bucket", "default_24m", "proba"]
_missing_cols = [col for col in _required_cols if col not in oos_classif.columns]
if _missing_cols:
    raise KeyError(
        f"oos_classif is missing column(s) {_missing_cols}; "
        f"available columns: {list(oos_classif.columns)}"
    )


# ===== Overall table for the year (all vintages pooled) =====
year_agg = _pd_summary(oos_classif, "risk_bucket").sort_values("risk_bucket")

print("\n===== Année complète (tous vintages) =====")
print(year_agg)


# ===== Vintage x grade aggregate (displayed vintage by vintage) =====
agg = _pd_summary(oos_classif, ["vintage", "risk_bucket"])

# Show the first rows (by grade) of each vintage.
for vint, sub in agg.groupby("vintage"):
    print(f"\n===== Vintage {vint} =====")
    print(sub.sort_values("risk_bucket").head())


KeyError: 'risk_bucket'

In [1]:
import pandas as pd

# Load the binned training set and print its column names. In the recorded output
# every feature column carries a `__BIN` suffix and `default_24m` is the only
# un-suffixed column.
df = pd.read_parquet("../data/processed/binned/train.parquet")
print(df.columns)

Index(['__file_quarter__BIN', 'amortization_type__BIN', 'channel__BIN',
       'first_time_homebuyer_flag__BIN', 'interest_only_indicator__BIN',
       'loan_purpose__BIN', 'number_of_borrowers__BIN', 'number_of_units__BIN',
       'occupancy_status__BIN', 'ppm_flag__BIN', 'property_state__BIN',
       'property_type__BIN', 'property_valuation_method__BIN', 'quarter__BIN',
       'relief_refinance_indicator__BIN', 'special_eligibility_program__BIN',
       'super_conforming_flag__BIN', 'window__BIN', 'has__BIN',
       'credit_score__BIN', 'mi_percent__BIN', 'original_cltv__BIN',
       'original_dti__BIN', 'original_interest_rate__BIN',
       'original_loan_term__BIN', 'original_ltv__BIN', 'original_upb__BIN',
       'default_24m'],
      dtype='object')
