In [2]:
import pandas as pd
from pandas import read_json

data_oos = pd.read_parquet("../data/processed/default_labels/window=24m/oos.parquet")
data_oos.columns

Index(['credit_score', 'first_payment_date', 'first_time_homebuyer_flag',
       'maturity_date', 'msa_md', 'mi_percent', 'number_of_units',
       'occupancy_status', 'original_cltv', 'original_dti', 'original_upb',
       'original_ltv', 'original_interest_rate', 'channel', 'ppm_flag',
       'amortization_type', 'property_state', 'property_type', 'postal_code',
       'loan_sequence_number', 'loan_purpose', 'original_loan_term',
       'number_of_borrowers', 'seller_name', 'servicer_name',
       'super_conforming_flag', 'pre_relief_refi_loan_seq_number',
       'special_eligibility_program', 'relief_refinance_indicator',
       'property_valuation_method', 'interest_only_indicator',
       'mi_cancellation_indicator', 'default_24m', '__file_quarter', 'vintage',
       'window', 'quarter'],
      dtype='object')

In [12]:
import pandas as pd
oos_classif = pd.read_parquet("../data/processed/scored/validation_scored.parquet")
print(oos_classif.columns)


import pandas as pd
df_binned_val = pd.read_parquet("../data/processed/binned/validation.parquet")
print(df_binned_val.columns)

# import pandas as pd
# oos_classif = pd.read_csv("../artifacts/model_from_binned/validation_scored.csv")
# print(oos_classif.columns)
# print(oos_classif)


Index(['vintage', 'loan_sequence_number', 'default_24m', 'score_ttc', 'pd',
       'grade'],
      dtype='object')
Index(['__file_quarter__BIN', 'amortization_type__BIN', 'channel__BIN',
       'first_time_homebuyer_flag__BIN', 'interest_only_indicator__BIN',
       'loan_purpose__BIN', 'number_of_borrowers__BIN', 'number_of_units__BIN',
       'occupancy_status__BIN', 'ppm_flag__BIN', 'property_state__BIN',
       'property_type__BIN', 'property_valuation_method__BIN', 'quarter__BIN',
       'relief_refinance_indicator__BIN', 'special_eligibility_program__BIN',
       'super_conforming_flag__BIN', 'window__BIN', 'credit_score__BIN',
       'mi_percent__BIN', 'original_cltv__BIN', 'original_dti__BIN',
       'original_interest_rate__BIN', 'original_loan_term__BIN',
       'original_ltv__BIN', 'original_upb__BIN', 'default_24m', 'vintage',
       'loan_sequence_number'],
      dtype='object')


In [1]:
import pandas as pd
df = pd.read_parquet("../data/processed/scored/oos_scored.parquet")
print(df["pd"].describe())
print(df["default_24m"].mean())
print(df[["pd", "default_24m"]].corr())

count    5.677148e+06
mean     9.921210e-03
std      1.216917e-02
min      0.000000e+00
25%      1.829386e-03
50%      4.982932e-03
75%      1.306956e-02
max      1.312057e-01
Name: pd, dtype: float64
0.012124221528133493
                   pd  default_24m
pd           1.000000     0.113705
default_24m  0.113705     1.000000


In [4]:
import pandas as pd

import pandas as pd
# df = pd.read_parquet("../data/processed/scored/oos_scored.parquet")
df = pd.read_parquet("../data/processed/scored/validation_scored.parquet")

def tables_par_vintage_defaut(
    df,
    vintage_col="vintage",
    grade_col="grade",
    default_col="default_24m"
):
    """
    Retourne un dict {vintage: DataFrame} où chaque DataFrame contient, pour chaque grade :
      - nb_defaut : nombre de défauts
      - nb_individus : nombre total d'individus
      - taux_defaut : nb_defaut / nb_individus
    """
    # Tableau global (vintage, grade)
    table_global = (
        df
        .groupby([vintage_col, grade_col])[default_col]
        .agg(
            nb_defaut="sum",
            nb_individus="count"
        )
        .reset_index()
    )

    table_global["taux_defaut"] = (
        table_global["nb_defaut"] / table_global["nb_individus"]
    )

    # Dictionnaire : un DataFrame par vintage
    dict_tables = {
        vintage: sub_df.drop(columns=[vintage_col]).reset_index(drop=True)
        for vintage, sub_df in table_global.groupby(vintage_col)
    }

    return dict_tables


res = tables_par_vintage_defaut(df)

# print(res)
print(res["2019Q1"])

   grade  nb_defaut  nb_individus  taux_defaut
0      1         11          3520     0.003125
1      2         51          9956     0.005123
2      3         97         13949     0.006954
3      4        186         20102     0.009253
4      5        287         22058     0.013011
5      6        568         28761     0.019749
6      7        809         31486     0.025694
7      8       1336         36907     0.036199
8      9       2618         49332     0.053069
9     10       6309         63790     0.098903


In [6]:
import pandas as pd
import json



def tables_par_vintage_defaut_avec_pd_ttc(
    df,
    bucket_stats_path="bucket_stats.json",
    vintage_col="vintage",
    grade_col="grade",
    default_col="default_24m",
):
    """
    Construit un dict {vintage: DataFrame} où chaque DataFrame contient, pour chaque grade :
      - nb_defaut   : nombre de défauts observés (somme des 0/1)
      - nb_individus: nombre total de lignes
      - taux_defaut : nb_defaut / nb_individus
      - pd_ttc      : PD TTC issue de bucket_stats.json (section 'train', champ 'pd'),
                      en supposant que 'grade' == 'bucket'.
    """

    # 1) Charger le fichier JSON avec les stats de buckets
    with open(bucket_stats_path, "r") as f:
        bucket_stats = json.load(f)

    # 2) Construire un mapping bucket -> pd à partir de la section "train"
    pd_map = {entry["bucket"]: entry["pd"] for entry in bucket_stats["train"]}

    # 3) Tableau global (vintage, grade) avec nb_defaut & nb_individus
    table_global = (
        df
        .groupby([vintage_col, grade_col])[default_col]
        .agg(
            nb_defaut="sum",
            nb_individus="count"
        )
        .reset_index()
    )

    # 4) Calcul du taux de défaut
    table_global["taux_defaut"] = (
        table_global["nb_defaut"] / table_global["nb_individus"]
    )

    # 5) Ajout de la PD TTC selon le grade/bucket
    table_global["pd_ttc"] = table_global[grade_col].map(pd_map)

    # 6) Un DataFrame par vintage, sans la colonne vintage
    dict_tables = {
        vintage: sub_df.drop(columns=[vintage_col]).reset_index(drop=True)
        for vintage, sub_df in table_global.groupby(vintage_col)
    }

    return dict_tables


# df = ton DataFrame de départ
tables_vintage = tables_par_vintage_defaut_avec_pd_ttc(df, "../artifacts/model_from_binned/bucket_stats.json")

# Exemple : tableau pour la vintage 2021Q1
tables_vintage["2021Q1"]


Unnamed: 0,grade,nb_defaut,nb_individus,taux_defaut,pd_ttc
0,1,1401,54875,0.025531,0.00034
1,2,2034,113543,0.017914,0.000806
2,3,1287,120367,0.010692,0.001429
3,4,1003,129275,0.007759,0.002152
4,5,701,136026,0.005153,0.003379
5,6,549,129412,0.004242,0.005161
6,7,454,151156,0.003004,0.007876
7,8,326,144636,0.002254,0.012376
8,9,207,147170,0.001407,0.021472
9,10,83,93126,0.000891,0.04936


In [39]:
print(oos_classif)

         credit_score first_payment_date first_time_homebuyer_flag  \
0               725.0         2021-03-31                         Y   
1               737.0         2021-03-31                         N   
2               744.0         2021-05-31                         N   
3               676.0         2021-04-30                         N   
4               765.0         2021-03-31                         N   
...               ...                ...                       ...   
5677143         791.0         2024-08-31                         N   
5677144         750.0         2024-05-31                         N   
5677145         773.0         2024-07-31                         N   
5677146         817.0         2024-08-31                         N   
5677147         696.0         2024-09-30                         N   

        maturity_date   msa_md  mi_percent  number_of_units occupancy_status  \
0              205102  15804.0         0.0              1.0                P   

In [40]:
import pandas as pd

def vintage_grade_table(
    df,
    vint,
    vintage_col="vintage",
    grade_col="risk_bucket",
    target_col="default_24m",
    proba_col="proba",
):
    """
    Retourne un tableau avec, pour un vintage donné :
      - une ligne par grade
      - n               : nombre d'individus dans le grade
      - n_default       : nombre de défauts observés
      - pd_obs          : proportion de défaut observée
      - pd_hat          : proba moyenne attribuée à la classe (moyenne des proba individuelles)
    """
    sub = df[df[vintage_col] == vint].copy()

    res = (
        sub.groupby(grade_col)
        .agg(
            n=(proba_col, "size"),
            n_default=(target_col, "sum"),
            pd_obs=(target_col, "mean"),
            pd_hat=(proba_col, "mean"),
        )
        .reset_index()
        .sort_values(grade_col)
    )

    # Optionnel : arrondir un peu pour la lisibilité
    res["pd_obs"] = res["pd_obs"].round(6)
    res["pd_hat"] = res["pd_hat"].round(6)

    return res


# ===== Tableau global sur l'année (tous vintages confondus) =====
year_agg = (
    oos_classif
    .groupby("risk_bucket")
    .agg(
        n=("proba", "size"),
        n_default=("default_24m", "sum"),
        pd_obs=("default_24m", "mean"),
        pd_hat=("proba", "mean"),
    )
    .reset_index()
    .sort_values("risk_bucket")
)


year_agg["pd_obs"] = year_agg["pd_obs"].round(6)
year_agg["pd_hat"] = year_agg["pd_hat"].round(6)

print("\n===== Année complète (tous vintages) =====")
print(year_agg)


# ===== Agrégat par vintage × grade (pour affichage par vintage) =====
agg = (
    oos_classif
    .groupby(["vintage", "risk_bucket"])
    .agg(
        n=("proba", "size"),
        n_default=("default_24m", "sum"),
        pd_obs=("default_24m", "mean"),
        pd_hat=("proba", "mean"),
    )
    .reset_index()
)

agg["pd_obs"] = agg["pd_obs"].round(6)
agg["pd_hat"] = agg["pd_hat"].round(6)

# Afficher le head pour chaque vintage
for vint, sub in agg.groupby("vintage"):
    print(f"\n===== Vintage {vint} =====")
    print(sub.sort_values("risk_bucket").head())


KeyError: 'risk_bucket'

In [2]:
import pandas as pd

# On charge la VALIDATION (là où l'AUC est mauvais)
# On prend le fichier 'imputed' car c'est lui qui entre dans le scoring
df_val = pd.read_parquet("../data/processed/imputed/validation.parquet")

print("--- ANALYSE CIBLE VALIDATION (2019-2020) ---")
if "default_24m" in df_val.columns:
    print(df_val["default_24m"].value_counts(dropna=False))
else:
    print("La colonne default_24m n'est même pas là !")

--- ANALYSE CIBLE VALIDATION (2019-2020) ---
default_24m
0    5566820
1     128068
Name: count, dtype: Int64


In [4]:
import pandas as pd
import numpy as np

path = "../data/processed/scored/validation_scored.parquet"
df = pd.read_parquet(path)

print(f"\n--- ANALYSE DE LA PD (Le coupable de l'AUC 0.50) ---")
print(df["pd"].describe())

# Vérification saturation
zeros = (df["pd"] < 0.000001).sum()
print(f"\nNombre de clients avec PD ≈ 0 : {zeros} / {len(df)}")
print(f"Pourcentage saturé à 0 : {zeros/len(df)*100:.2f}%")

print("\n--- VISUALISATION ---")
# On utilise loan_sequence_number
cols = ["loan_sequence_number", "score", "pd", "grade", "default_24m"]
# On filtre les colonnes existantes au cas où
cols = [c for c in cols if c in df.columns]
print(df[cols].sample(5))


--- ANALYSE DE LA PD (Le coupable de l'AUC 0.50) ---
count    5.694888e+06
mean     2.453348e-09
std      4.834260e-07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      3.168591e-04
Name: pd, dtype: float64

Nombre de clients avec PD ≈ 0 : 5694609 / 5694888
Pourcentage saturé à 0 : 100.00%

--- VISUALISATION ---
        loan_sequence_number  score   pd  grade  default_24m
1038961         F19Q30346847    860  0.0     10            0
4116086         F20Q30893459    817  0.0     10            0
1088161         F19Q30396166    852  0.0     10            0
1592564         F19Q40357746    873  0.0     10            0
3394692         F20Q30170743    870  0.0     10            0
