In [40]:
%run brazil_functions.py

In [8]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from ctgan import CTGAN

In [9]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
  
# data (as pandas dataframes) 
df1 = predict_students_dropout_and_academic_success.data.features 
y_uci = predict_students_dropout_and_academic_success.data.targets 

In [10]:
df1['email'] = df1.index
df1['source'] = 'real'
mapping = {'Dropout': 1, 'Graduate': 0, 'Enrolled': 0}
df1['dropout'] = y_uci['Target'].map(mapping)

In [11]:
n_clusters = 20

X = df1.drop(columns=['email', 'dropout', 'source']).fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
df2 = df1.copy()
df2['cluster'] = clusters
print(pd.Series(clusters).value_counts().sort_index())

0     229
1     105
2     129
3     383
4     168
5      24
6     266
7     110
8     547
9     292
10    521
11     77
12    201
13     13
14    411
15     51
16    506
17     72
18     92
19    227
Name: count, dtype: int64


In [12]:
min_cluster_size = 150
df3 = assign_clusters_with_min_size(df1, n_clusters=20, min_cluster_size=min_cluster_size)

Clusters trop petits à réaffecter : [1, 2, 5, 7, 11, 13, 15, 17, 18]
Nouvelles tailles de clusters :
 0     277
3     401
4     180
6     303
8     644
9     458
10    567
12    279
14    461
16    579
19    275
Name: count, dtype: int64


In [13]:
df4 = augment_minority_clusters(df2)

In [15]:
n_samples = 1000

ctgan1 = CTGAN(
    epochs=50,
    batch_size=100,
    generator_dim=(256, 256),
    discriminator_dim=(256, 256),
    verbose=True
)

# 4. Entraînement
# Le modèle apprendra la distribution de vos données
X = df1.copy()
X = X.drop(columns=['email', 'source']).fillna(0)
ctgan1.fit(X)

synthetic_data = ctgan1.sample(n_samples)
synthetic_data['source'] = 'synth'

df5 = pd.concat([df1, synthetic_data], ignore_index=True)

Gen. (-5.29) | Discrim. (-0.48): 100%|██████████| 50/50 [06:47<00:00,  8.16s/it]


In [None]:
tasks = [
    ("sans cluster", df1, {}),
    ("sans enrichissement", df3, {}),
    (("avec SMOTE", df4, {})),
    # ,("avec enrichissement, calcul déterministe", df4, {"globe": False}), # globe=False
    # ("avec GAN", df5, {})
]

summary_records = []
res = []
for name, dfk, kwargs in tasks:
    # 1) Lancement de la fonction
    df_detail, df_agg, y_all, trained_clfs = run_analysis_bra(
    df=dfk.drop(columns=['dropout']),
    y=dfk['dropout'],
    alpha=0.05,
    nan_fill=0,
    do_plot=False
)
    r =  df_detail.groupby(["method", "model", "cluster", "n_projects"]).agg(
            mean_coverage=("coverage", "mean"),
            mean_width=("width", "mean")
        ).reset_index()
    res.append(r[r['cluster']==-1])
    print(res[-1])

In [38]:
curri_cols = ['Previous qualification', "Curricular units 1st sem", "Curricular units 2nd sem"]

# curri_cols = ['Previous qualification','Admission grade', "Curricular units 1st sem", "Curricular units 2nd sem"]
dyn_cols = [
        col for col in df1.columns
        if any(col.startswith(pref) for pref in curri_cols)
        ]
static_cols = [
c for c in df1.columns
if c not in dyn_cols + ["student_id", "email", "dropout", "source", "cluster"]
]

# 3. DataFrame de base, qu’on ne modifie pas en place
base_df = df1[static_cols].copy()

# 4. Construction cumulative du dictionnaire Xt
Xt = {"t0": base_df.copy()}
cum_df = base_df.copy()
for idx, prefix in enumerate(curri_cols, start=1):
    print(prefix)
    cum_df = cum_df.copy()
    cols = [c for c in df1.columns if c.startswith(prefix)]
    cum_df[cols] = df1[cols]
    Xt[f"t{idx}"] = cum_df

# 5. Construire y pour l’horizon H
H = 1
keys = list(Xt.keys())
y = {}

# cible pour t0
if H < len(keys):
    y["t0"] = Xt[keys[H]].iloc[:, -1].copy()
else:
    y["t0"] = Xt[keys[-1]].iloc[:, -1].copy()

for i, key in enumerate(keys[1:], start=1):
    tgt = i + H
    if tgt < len(keys):
        df_tgt = Xt[keys[tgt]]
    else:
        df_tgt = Xt[keys[-1]]
    y[key] = df_tgt.iloc[:, -2].copy()

# 6. Construire X en ne gardant que les w dernières notes (fenêtre glissante)
w = 1
X = {}

# on parcourt les mêmes clés que pour y, mais on saute celles où i < w
for i, key in enumerate(keys):
    if i < w:
        continue
    # on prend les w derniers item_cols correspondant aux notes t_{i-w+1} … t_i
    window_item_cols = dyn_cols[i-w : i]
    X[key] = df1[static_cols + window_item_cols].copy()

# 7. Conversion en arrays NumPy (alignés sur les mêmes clés)
valid_keys = keys[w:]  # on commence à t{w}
X_array_hori = [X[k].values for k in valid_keys]
y_array_hori = [y[k].values for k in valid_keys]


Previous qualification
Curricular units 1st sem
Curricular units 2nd sem


In [25]:
keys

['t0', 't1', 't2', 't3', 't4']

In [43]:
covs, ecs, models, witdh = [], [], [], []
X_array = X_array_hori
k = len(X_array)
keys = list(Xt.keys())
# Parcours des fenêtres temporelles
for i in tqdm(range(1, k), desc="Fenêtres en ligne"):

    H = len(keys) - i - w
    y = {}

    # cible pour t0
    if H < len(keys):
        y["t0"] = Xt[keys[H]].iloc[:, -1].copy()
    else:
        y["t0"] = Xt[keys[-1]].iloc[:, -1].copy()

    for j, key in enumerate(keys[1:], start=1):
        tgt = j + H
        if tgt < len(keys):
            df_tgt = Xt[keys[tgt]]
        else:
            df_tgt = Xt[keys[-1]]
        y[key] = df_tgt.iloc[:, -2].copy()
    y_array = [y[k].values for k in valid_keys]
# On parcourt i de 1 à len(X_array)-1 (i=0 n'a pas de passé pour entraîner)
    # --- 1) Construction du train sur les fenêtres passées ---
    X_train = np.vstack(X_array[:i])      # fenêtres 0..i-1
    y_train = np.concatenate(y_array[:i])
    # --- 2) Entraînement d'un nouveau modèle ---
    model = TwoSidedSPCI_RFQuant_Offline(alpha=0.05, w=200, random_state=0)
    model.fit(X_train, y_train)
    models.append(model)
    # --- 3) Évaluation sur la fenêtre courante i ---
    X_i, y_i = X_array[i], y_array[i]
    # calcul des bornes supérieures U_t pour chaque échantillon de X_i
    L = np.array([
        model.predict_interval(x.reshape(1, -1))[0]
        for x in X_i
    ])
    U = np.array([
        model.predict_interval(x.reshape(1, -1))[1]
        for x in X_i
    ])
    covs.append(np.mean((U >= y_i) & (L <= y_i)))
    ecs.append(np.mean(np.maximum(0, y_i - U)) + np.mean(np.maximum(0, L - y_i)))
    witdh.append(np.mean(U - L))
    print(f"Fenêtre {i}: cov={covs[-1]:.4f}, excès={ecs[-1]:.4f}, witdh={witdh[-1]:.4f}")

# 4) Rapport final
report = (
    pd.DataFrame({
        "fenêtre": np.arange(1, k),
        "cov"    : covs,
        "excess" : ecs,
    })
    .round(4)
)

print(report)
print(f"\nCouverture moyenne  : {report['cov'].mean():.4f}")
print(f"Excès moyen (width) : {report['excess'].mean():.4f}")
print(report)
print(f"\nCouverture moyenne  : {report['cov'].mean():.4f}")
print(f"Excès moyen (width) : {report['excess'].mean():.4f}")

Fenêtres en ligne:  50%|█████     | 1/2 [11:11<11:11, 671.52s/it]

Fenêtre 1: cov=0.8341, excès=0.1299, witdh=6.6397


Fenêtres en ligne: 100%|██████████| 2/2 [34:24<00:00, 1032.31s/it]

Fenêtre 2: cov=0.8730, excès=0.4166, witdh=4.9132
   fenêtre     cov  excess
0        1  0.8341  0.1299
1        2  0.8730  0.4166

Couverture moyenne  : 0.8536
Excès moyen (width) : 0.2732
   fenêtre     cov  excess
0        1  0.8341  0.1299
1        2  0.8730  0.4166

Couverture moyenne  : 0.8536
Excès moyen (width) : 0.2732





In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import re
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
import umap
import os
from mapie.metrics import (
    classification_coverage_score,
    classification_mean_width_score
)
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from typing import Tuple, Dict, Union, List
from mapie.classification import MapieClassifier
from mapie.mondrian import MondrianCP
from mapie.metrics import (
    classification_coverage_score,
    classification_mean_width_score
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    brier_score_loss
)

In [31]:
def encode(pset):
    res = []
    for pred in pset:
        res.append(np.mean(pred))
    return res

In [42]:
covs, ecs = [], []
X_array = X_array_hori
y_array = y_array_hori
U_t = []
# On parcourt i de 1 à len(X_array)-1 (i=0 n'a pas de passé pour entraîner)
for i in tqdm(range(1, len(X_array)), desc="Fenêtres en ligne"):
    # --- 1) Construction du train sur les fenêtres passées ---
    X_train = np.vstack(X_array[:i])      # fenêtres 0..i-1
    y_train = np.concatenate(y_array[:i])
    # --- 2) Entraînement d'un nouveau modèle ---
    model_one = OneSidedSPCI_LGBM_Offline(alpha=0.1, w=200, random_state=0)
    model_one.fit(X_train, y_train)

    # --- 3) Évaluation sur la fenêtre courante i ---
    X_i, y_i = X_array[i], y_array[i]
    # calcul des bornes supérieures U_t pour chaque échantillon de X_i
    U = np.array([
        model_one.predict_interval(x.reshape(1, -1))[1]
        for x in X_i
    ])
    U_t.append(U)
    # coverage & excess
    covs.append(np.mean(U >= y_i))
    ecs.append(np.mean(np.maximum(0, y_i - U)))
    print("RESULTAT :", np.mean(U >= y_i), np.mean(np.maximum(0, y_i - U)))
# --- 4) Rapport final ---
report = (
    pd.DataFrame({
        "fenêtre": np.arange(1, len(X_array)),
        "cov"    : covs,
        "excess" : ecs,
    })
    .round(4)
)

print(report)
print(f"\nCouverture moyenne  : {report['cov'].mean():.4f}")
print(f"Excès moyen (width) : {report['excess'].mean():.4f}")

Fenêtres en ligne:   0%|          | 0/2 [00:00<?, ?it/s]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  50%|█████     | 1/2 [01:18<01:18, 78.88s/it]

RESULTAT : 0.733499095840868 0.3188456312467857
fit 1 ok
fit 2 ok


Fenêtres en ligne: 100%|██████████| 2/2 [03:34<00:00, 107.23s/it]

RESULTAT : 0.7954339963833634 0.1962198893213437
   fenêtre     cov  excess
0        1  0.7335  0.3188
1        2  0.7954  0.1962

Couverture moyenne  : 0.7645
Excès moyen (width) : 0.2575





In [44]:
dfng = df1.copy()
idx2 = dfng.columns.get_loc('Curricular units 1st sem (without evaluations)')
dfng.insert(idx2+1, "Curricular units 1st sem (next grade)", U_t[1])
idx1 = dfng.columns.get_loc('Previous qualification (grade)')
dfng.insert(idx1+1, "Previous qualification (next grade)", U_t[0])


In [48]:
X = dfng.drop(columns=['email', 'dropout', 'source']).fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=20, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
dfng1 = dfng.copy()
dfng1['cluster'] = clusters
print(pd.Series(clusters).value_counts().sort_index())

0     368
1     208
2     182
3     228
4     177
5     401
6     295
7     344
8     189
9     107
10     98
11    194
12     87
13    351
14    234
15    449
16    279
17     20
18     74
19    139
Name: count, dtype: int64


In [52]:
dfng2 = assign_clusters_with_min_size(dfng, n_clusters=20, min_cluster_size=min_cluster_size)

Clusters trop petits à réaffecter : [9, 10, 12, 17, 18, 19]
Nouvelles tailles de clusters :
 0     417
1     234
2     290
3     318
4     180
5     419
6     328
7     350
8     241
11    211
13    402
14    255
15    474
16    305
Name: count, dtype: int64


In [53]:
dfng3 = augment_minority_clusters(dfng1)

## COMBINE

In [66]:
# -----------------------------------------------------------------------------
# Configuration & constants
# -----------------------------------------------------------------------------
RANDOM_STATE: int = 42            # Ensures full reproducibility
ALPHA: float = 0.05               # Target mis-coverage level
W: int = 1                        # Sliding-window size
nan_fill = 0
threshold = 10.001

# -----------------------------------------------------------------------------
# Data preparation
# -----------------------------------------------------------------------------
DF = dfng2.copy()
DF.fillna(nan_fill, inplace=True)
DF.reset_index(drop=True, inplace=True)

curri_cols = [
    'Previous qualification',
    'Curricular units 1st sem',
    'Curricular units 2nd sem'
]
static_cols = [
    c for c in DF.columns
    if c not in curri_cols + ["student_id", "email", "dropout", "source", "cluster"]
]

Y_TARGET = DF['dropout'].astype(int)
DF.drop(columns='dropout', inplace=True)

# -----------------------------------------------------------------------------
# Base models
# -----------------------------------------------------------------------------
MODELS: Dict[str, object] = {
    "RF": RandomForestClassifier(
        n_estimators=1000,
        min_samples_leaf=2,
        class_weight="balanced",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ),
    "LR": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ),
    "GB": GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# -----------------------------------------------------------------------------
# Conformal prediction evaluation loop
# -----------------------------------------------------------------------------
from tqdm import tqdm

res_fin: List[pd.DataFrame] = []

for name, base_clf in MODELS.items():
    covs_MCP, width_MCP = [], []
    covs_SPCI, width_SPCI = [], []
    covs_comb, width_comb = [], []
    covs_union,  width_union  = [], []
    for n in tqdm(range(W, len(curri_cols)), desc=name):
        gate_clf = RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            class_weight="balanced",
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )
        # 1) split train / tmp / test
        idx_tmp, idx_test, y_tmp, y_test, cl_tmp, cl_test = train_test_split(
            DF.index, Y_TARGET, DF["cluster"],
            test_size=0.20,
            stratify=Y_TARGET,
            random_state=RANDOM_STATE,
        )
        idx_tr, idx_cal, y_tr, y_cal, cl_tr, cl_cal = train_test_split(
            idx_tmp, y_tmp, cl_tmp,
            test_size=0.40/0.80,
            stratify=y_tmp,
            random_state=RANDOM_STATE,
        )
        # **nouveau** split en deux pour CP vs gate
        idx_cal_cp, idx_cal_gate, y_cal_cp, y_cal_gate, cl_cal_cp, cl_cal_gate = train_test_split(
            idx_cal, y_cal, cl_cal,
            test_size=0.5,
            stratify=y_cal,
            random_state=RANDOM_STATE,
        )
        y_cal_cp   = np.array(y_cal_cp)
        y_cal_gate = np.array(y_cal_gate)
        y_test     = np.array(y_test)
        # mask des “réels” pour toutes les évaluations
        mask_real = DF.loc[idx_test, "source"] == "real"

        # 2) build features
        X_tr        = build_X_(DF.loc[idx_tr],       curri_cols, static_cols, n)
        X_cal_cp    = build_X_(DF.loc[idx_cal_cp],   curri_cols, static_cols, n)
        X_cal_gate  = build_X_(DF.loc[idx_cal_gate], curri_cols, static_cols, n)
        X_test      = build_X_(DF.loc[idx_test],     curri_cols, static_cols, n)
        # 3) train base clf + calibrate for MCP
        clf = clone(base_clf)
        clf.fit(X_tr, y_tr)
        calib = CalibratedClassifierCV(clf, cv="prefit", method="sigmoid") \
                    .fit(X_cal_cp, y_cal_cp)

        base_mapie = MapieClassifier(estimator=calib, method="lac", cv="prefit")
        mond_mapie = MondrianCP(mapie_estimator=base_mapie) \
                        .fit(X_cal_cp, y_cal_cp, partition=cl_cal_cp)

        # ---- MCP on TEST ----
        _, yps_van_test = mond_mapie.predict(X_test, alpha=ALPHA, partition=cl_test)
        pset_van_test = yps_van_test[:, :, 0]
        cov_van = classification_coverage_score(y_test[mask_real], pset_van_test[mask_real])
        wid_van = classification_mean_width_score(pset_van_test[mask_real])
        covs_MCP.append(cov_van)
        width_MCP.append(wid_van)
        print("MCP", cov_van, wid_van)
        # ---- SPCI on TEST ----
        model_spci = loaded_models[n - W]
        pos_test   = DF.index.get_indexer(idx_test)
        X_spci_test = X_array_hori[n - W + 1][pos_test]
        intervals = [model_spci.predict_interval(x.reshape(1, -1))
                     for x in X_spci_test]
        L_preds, U_preds = zip(*intervals)

        y_pred_bool_SPCI = np.zeros((len(intervals), 2), dtype=bool)
        for i, (L, U) in enumerate(zip(L_preds, U_preds)):
            if threshold > U:
                y_pred_bool_SPCI[i, 1] = True
            elif threshold < L:
                y_pred_bool_SPCI[i, 0] = True
            else:
                y_pred_bool_SPCI[i, :] = True

        cov_spci = classification_coverage_score(y_test[mask_real],
                                                y_pred_bool_SPCI[mask_real])
        wid_spci = classification_mean_width_score(y_pred_bool_SPCI[mask_real])
        covs_SPCI.append(cov_spci)
        width_SPCI.append(wid_spci)
        print("SPCI", cov_spci, wid_spci)
        ##UNION 
        y_pred_bool_MCP = pset_van_test.astype(bool)
        y_bool_union = y_pred_bool_MCP | y_pred_bool_SPCI
        cov_union = classification_coverage_score(
            y_test[mask_real],
            y_bool_union[mask_real]
        )
        wid_union = classification_mean_width_score(
            y_bool_union[mask_real]
        )
        covs_union.append(cov_union)
        width_union.append(wid_union)
        print("UNION :", cov_union, wid_union)

        # ---- construire la gate sur CAL_GATE ----
        #  a) MCP predictions sur X_cal_gate
        _, yps_van_gate = mond_mapie.predict(
            X_cal_gate, alpha=ALPHA, partition=cl_cal_gate
        )
        pset_cal_cls = yps_van_gate[:, :, 0]

        #  b) SPCI predictions sur X_cal_gate
        pos_cal_gate  = DF.index.get_indexer(idx_cal_gate)
        X_spci_cal    = X_array_hori[n - W + 1][pos_cal_gate]
        intervals_cal = [model_spci.predict_interval(x.reshape(1, -1))
                         for x in X_spci_cal]
        L_cal, U_cal  = zip(*intervals_cal)

        pset_cal_spc = np.zeros_like(pset_cal_cls, dtype=bool)
        for i, (L, U) in enumerate(zip(L_cal, U_cal)):
            if threshold > U:
                pset_cal_spc[i, 1] = True
            elif threshold < L:
                pset_cal_spc[i, 0] = True
            else:
                pset_cal_spc[i, :] = True

        #  c) préparer méta-features & labels pour la gate
        df_sel_arr = []
        labels_g   = []                      # ← on initialise labels_g

        for i in range(len(idx_cal_gate)):
            feat_vec = X_cal_gate[i]
            w_cls    = pset_cal_cls[i].sum()
            w_spc    = pset_cal_spc[i].sum()
            diff     = w_cls - w_spc
            err_cls  = int(y_cal_gate[i] not in np.where(pset_cal_cls[i])[0])
            err_spc  = int(y_cal_gate[i] not in np.where(pset_cal_spc[i])[0])
            if   err_cls == 0 and err_spc == 1:
                gate_y = 0
            elif err_spc == 0 and err_cls == 1:
                gate_y = 1
            elif err_cls == 0 and err_spc == 0:
                gate_y = 0 if w_cls < w_spc else 1
            else:
                gate_y = 2
            labels_g.append(gate_y)           # ← on stocke le label

            meta_vec = np.concatenate([
                 feat_vec,
                 [w_cls, w_spc, diff, err_cls, err_spc]
            ])
            df_sel_arr.append(meta_vec)

        X_gate_train = np.vstack(df_sel_arr)
        gate_clf.fit(X_gate_train, np.array(labels_g))
        # ---- appliquer la gate sur TEST ----
        meta_test_arr = []
        for i in range(len(idx_test)):
            # feat_vec est un array 1D de taille n_features
            feat_vec = X_test[i]
            w_cls = pset_van_test[i].sum()
            w_spc = y_pred_bool_SPCI[i].sum()
            diff = w_cls - w_spc
            # on concatène feat_vec et les 5 features méta
            meta_vec = np.concatenate([
                feat_vec,
                [w_cls, w_spc, diff, 0, 0]    # err_cls=0, err_spc=0
            ])
            meta_test_arr.append(meta_vec)

        # on empile en matrice (n_test × n_features_meta)
        X_gate_test = np.vstack(meta_test_arr)

        # on prédit le choix de la gate
        choices = gate_clf.predict(X_gate_test)
        pset_final = np.zeros_like(pset_van_test, dtype=bool)
        for i, choice in enumerate(choices):
            if choice == 0:
                pset_final[i] = y_pred_bool_MCP[i]
            elif choice == 1:
                pset_final[i] = y_pred_bool_SPCI[i]
            else:
                pset_final[i] = y_pred_bool_MCP[i] | y_pred_bool_SPCI[i]

        cov_c = classification_coverage_score(y_test[mask_real],
                                             pset_final[mask_real])
        wid_c = classification_mean_width_score(pset_final[mask_real])
        covs_comb.append(cov_c)
        width_comb.append(wid_c)
        print("COMBINED", cov_c, wid_c)
    # on agrège les métriques
    n_vals = list(range(W, W + len(covs_MCP)))
    df_metrics = pd.DataFrame({
        "model":             [name] * len(n_vals),
        "n":                 n_vals,
        "coverage_MCP":      covs_MCP,
        "width_MCP":         width_MCP,
        "coverage_SPCI":     covs_SPCI,
        "width_SPCI":        width_SPCI,
        "coverage_union":    covs_union, 
        "width_union":       width_union,
        "coverage_combined": covs_comb,
        "width_combined":    width_comb,
    })
    res_fin.append(df_metrics)


RF:   0%|          | 0/2 [00:00<?, ?it/s]

MCP [0.9627118644067797] [1.3141242937853108]
SPCI [0.9807909604519774] [1.7966101694915255]
UNION : 0.9977401129943503 1.848587570621469


RF:  50%|█████     | 1/2 [01:27<01:27, 87.61s/it]

COMBINED 0.9468926553672317 1.2666666666666666
MCP [0.9627118644067797, 0.9627118644067797] [1.3141242937853108, 1.3231638418079097]
SPCI [0.9807909604519774, 0.9491525423728814] [1.7966101694915255, 1.4903954802259887]
UNION : 0.9830508474576272 1.5909604519774012


RF: 100%|██████████| 2/2 [02:59<00:00, 89.63s/it]


COMBINED 0.9299435028248587 1.2271186440677966


LR:   0%|          | 0/2 [00:00<?, ?it/s]

MCP [0.9694915254237289] [1.3887005649717514]
SPCI [0.9807909604519774] [1.7966101694915255]
UNION : 0.9977401129943503 1.856497175141243


LR:  50%|█████     | 1/2 [01:16<01:16, 76.32s/it]

COMBINED 0.9536723163841808 1.3299435028248587
MCP [0.9694915254237289, 0.9694915254237289] [1.3887005649717514, 1.3954802259887005]
SPCI [0.9807909604519774, 0.9491525423728814] [1.7966101694915255, 1.4903954802259887]
UNION : 0.9864406779661017 1.6282485875706214


LR: 100%|██████████| 2/2 [02:11<00:00, 65.57s/it]


COMBINED 0.9355932203389831 1.264406779661017


GB:   0%|          | 0/2 [00:00<?, ?it/s]

MCP [0.9706214689265537] [1.3333333333333333]
SPCI [0.9807909604519774] [1.7966101694915255]
UNION : 0.9988700564971752 1.8519774011299435


GB:  50%|█████     | 1/2 [00:43<00:43, 43.83s/it]

COMBINED 0.9525423728813559 1.280225988700565
MCP [0.9706214689265537, 0.9706214689265537] [1.3333333333333333, 1.3322033898305086]
SPCI [0.9807909604519774, 0.9491525423728814] [1.7966101694915255, 1.4903954802259887]
UNION : 0.9864406779661017 1.6033898305084746


GB: 100%|██████████| 2/2 [01:28<00:00, 44.40s/it]

COMBINED 0.9344632768361582 1.2350282485875705





In [68]:
res_fin[0]

Unnamed: 0,model,n,coverage_MCP,width_MCP,coverage_SPCI,width_SPCI,coverage_union,width_union,coverage_combined,width_combined
0,RF,1,0.962712,1.314124,0.980791,1.79661,0.99774,1.848588,0.946893,1.266667
1,RF,2,0.962712,1.323164,0.949153,1.490395,0.983051,1.59096,0.929944,1.227119


In [None]:
res_fin[0].iloc[:,1:].mean()

n                    1.500000
coverage_MCP         0.962712
width_MCP            1.318644
coverage_SPCI        0.964972
width_SPCI           1.643503
coverage_union       0.990395
width_union          1.719774
coverage_combined    0.938418
width_combined       1.246893
dtype: float64

In [72]:
res_fin[2].iloc[:,1:].mean()

n                    1.500000
coverage_MCP         0.970621
width_MCP            1.332768
coverage_SPCI        0.964972
width_SPCI           1.643503
coverage_union       0.992655
width_union          1.727684
coverage_combined    0.943503
width_combined       1.257627
dtype: float64

In [74]:
gamma = 0.9
weighted = np.average(
    res_fin[0].iloc[:, 1:],    # toutes vos métriques
    axis=0,
    weights=[gamma, gamma**2]    # vos poids
)
weighted

array([1.47368421, 0.96271186, 1.31840618, 0.96580434, 1.65156111,
       0.99078204, 1.72655367, 0.93886411, 1.24793339])

In [73]:
res_fin[2]

Unnamed: 0,model,n,coverage_MCP,width_MCP,coverage_SPCI,width_SPCI,coverage_union,width_union,coverage_combined,width_combined
0,GB,1,0.970621,1.333333,0.980791,1.79661,0.99887,1.851977,0.952542,1.280226
1,GB,2,0.970621,1.332203,0.949153,1.490395,0.986441,1.60339,0.934463,1.235028


## Weird Stuff

In [None]:
# -----------------------------------------------------------------------------
# Configuration & constants
# -----------------------------------------------------------------------------
RANDOM_STATE: int = 42            # Ensures full reproducibility
ALPHA: float = 0.05               # Target mis‑coverage level
W: int = 1                        # Sliding‑window size (kept as plain int)
nan_fill = 0
# -----------------------------------------------------------------------------
# Data preparation (identical logic)
# -----------------------------------------------------------------------------
DF = dfng2.copy()            
DF.fillna(0, inplace=True)
DF.reset_index(drop=True, inplace=True)

curri_cols = ['Previous qualification', "Curricular units 1st sem", "Curricular units 2nd sem"]
prefixes = curri_cols.copy()
dyn_cols = [
    col for col in DF.columns
    if any(col.startswith(pref) for pref in prefixes)
    ]
static_cols = [
c for c in DF.columns
if c not in dyn_cols + ["student_id", "email", "dropout", "source", "cluster"]
]
# -----------------------------------------------------------------------------
# Model definitions (identical hyper‑parameters)
# -----------------------------------------------------------------------------
MODELS: Dict[str, object] = {
    "RF": RandomForestClassifier(
        n_estimators=1000,
        min_samples_leaf=2,
        class_weight="balanced",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ),
    "LR": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ),
    "GB": GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# -----------------------------------------------------------------------------
# Helper functions (ported verbatim where possible)
# -----------------------------------------------------------------------------
loaded_models = models
N_RENDUS: int = 1

threshold = 10.001
Y_TARGET = DF['dropout'].astype(int)
DF.drop(columns='dropout', inplace=True)
# -----------------------------------------------------------------------------
# Conformal prediction evaluation loop (logic unchanged)
# -----------------------------------------------------------------------------

records: List[dict] = []
trained_clfs: Dict[Tuple[str, int, str], Union[MapieClassifier, MondrianCP]] = {}

clusters_all = DF["cluster"]
idx_all = DF.index.values

res_fin = []
for name, base_clf in MODELS.items():
    covs_MCP, width_MCP = [], []
    covs_SPCI, width_SPCI = [], []
    meta_rows = []
    ACC = []
    covs_combine, width_combine = [], []
    for n in tqdm(range(W, len(prefixes)), desc=name):  # noqa: F821 – prefixes provided
        clf = clone(base_clf)

        # ------------------------------------------------------------------
        # Dataset splitting
        # ------------------------------------------------------------------
        idx_tmp, idx_test, y_tmp, y_test, cl_tmp, cl_test = train_test_split(
            idx_all,
            Y_TARGET,
            clusters_all,
            test_size=0.20,
            stratify=Y_TARGET,
            random_state=RANDOM_STATE,
        )
        idx_tr, idx_cal, y_tr, y_cal, cl_tr, cl_cal = train_test_split(
            idx_tmp,
            y_tmp,
            cl_tmp,
            test_size=0.20 / 0.80,
            stratify=y_tmp,
            random_state=RANDOM_STATE,
        )

        # ------------------------------------------------------------------
        # Feature extraction (delegated to user‑supplied `build_X`)
        # ------------------------------------------------------------------
        X_tr = build_X_(DF.loc[idx_tr], prefixes, static_cols, n)     # noqa: F821
        X_cal = build_X_(DF.loc[idx_cal], prefixes, static_cols, n)
        X_test = build_X_(DF.loc[idx_test], prefixes, static_cols, n)

        # ------------------------------------------------------------------
        # Training & calibration
        # ------------------------------------------------------------------
        clf.fit(X_tr, y_tr)
        calib = CalibratedClassifierCV(clf, cv="prefit", method="sigmoid").fit(X_cal, y_cal)

        base_mapie = MapieClassifier(estimator=calib, method="lac", cv="prefit")
        mond_mapie = MondrianCP(mapie_estimator=base_mapie).fit(X_cal, y_cal, partition=cl_cal)

        # ------------------------------------------------------------------
        # MCP evaluation
        # ------------------------------------------------------------------
        mask_real = DF.loc[idx_test, "source"] == "real"
        _, yps_van = mond_mapie.predict(X_test, alpha=ALPHA, partition=cl_test)
        pset_van = yps_van[:, :, 0]
        # print(pset_van)
        RES_MCP = encode(pset_van)
        # print(RES_MCP)
        cov_van_MCP = classification_coverage_score(y_test[mask_real], pset_van[mask_real])
        width_van_MCP = classification_mean_width_score(pset_van[mask_real])
        print("MCP :", cov_van_MCP, width_van_MCP)

        covs_MCP.append(cov_van_MCP)
        width_MCP.append(width_van_MCP)

        # ------------------------------------------------------------------
        # SPCI evaluation
        # ------------------------------------------------------------------
        model_SPCI = loaded_models[n - W]                       # noqa: F821
        pos_test = DF.index.get_indexer(idx_test)
        X_test_SPCI = X_array_hori[n - W + 1][pos_test]         # noqa: F821

        intervals = [model_SPCI.predict_interval(x.reshape(1, -1)) for x in X_test_SPCI]
        L_preds, U_preds = zip(*intervals)

        y_preds_SPCI: List[List[int]] = []
        for L, U in zip(L_preds, U_preds):
            if threshold > U:
                y_preds_SPCI.append([1])
            elif threshold < L:
                y_preds_SPCI.append([0])
            else:
                y_preds_SPCI.append([0, 1])

        RES_SPCI = encode(y_preds_SPCI)

        y_pred_bool_SPCI = np.zeros((len(y_preds_SPCI), 2), dtype=bool)
        for i_row, labels in enumerate(y_preds_SPCI):
            y_pred_bool_SPCI[i_row, labels] = True

        cov_van_SPCI = classification_coverage_score(y_test[mask_real], y_pred_bool_SPCI[mask_real])
        width_van_SPCI = classification_mean_width_score(y_pred_bool_SPCI[mask_real])
        print("SPCI :", cov_van_SPCI, width_van_SPCI)

        covs_SPCI.append(cov_van_SPCI)
        width_SPCI.append(width_van_SPCI)
        META_DF = pd.DataFrame({
            "id": idx_test,
            "res_mcp": RES_MCP,
            "res_spci": RES_SPCI,
            "y": y_test.values
        })


        # META_DF = pd.concat(meta_rows, ignore_index=True)
        X_meta = META_DF[["res_mcp", "res_spci"]]
        y_meta = META_DF["y"]


        meta_clf_n = LogisticRegression(
            class_weight="balanced", max_iter=1000, random_state=RANDOM_STATE
        ).fit(X_meta, y_meta)
        X_fin = build_X_(DF, prefixes, static_cols, n)
        _, yps_van_all = mond_mapie.predict(X_fin, alpha=ALPHA, partition=clusters_all)
        pset_van_all = yps_van_all[:, :, 0]
        RES_MCP_test = encode(pset_van_all)

        X_all_SPCI = X_array_hori[n - W + 1]    

        intervals_all = [model_SPCI.predict_interval(x.reshape(1, -1)) for x in X_all_SPCI]
        L_preds_all, U_preds_all = zip(*intervals_all)

        y_preds_SPCI_all: List[List[int]] = []
        for L, U in zip(L_preds_all, U_preds_all):
            if threshold > U:
                y_preds_SPCI_all.append([1])
            elif threshold < L:
                y_preds_SPCI_all.append([0])
            else:
                y_preds_SPCI_all.append([0, 1])

        RES_SPCI_test = encode(y_preds_SPCI_all)
        META_DF_test = (pd.DataFrame({
            "res_mcp": RES_MCP_test,
            "res_spci": RES_SPCI_test,
            "y": Y_TARGET.values
        }))
        X_meta_test = META_DF_test[["res_mcp", "res_spci"]]
        y_meta_test = META_DF_test["y"]

        y_pred_fin = meta_clf_n.predict_proba(X_meta_test)[:,1]
        threshold = 0.50                       # à adapter selon vos besoins
        y_pred_bin = (y_pred_fin >= threshold).astype(int)

        # ----------------------------------------------------------
        # 2) Métriques principales
        # ----------------------------------------------------------
        acc       = accuracy_score(y_meta_test, y_pred_bin)
        print("accuracy : ", acc)
        ACC.append(acc)
        pset_combine = []
        for y_pr in y_pred_fin:
            if y_pr > 0.7:
              pset_combine.append([1])
            elif y_pr < 0.3:
                pset_combine.append([0])
            else:
                pset_combine.append([0, 1])
        gamma_pred_combine = np.zeros((len(pset_combine), 2), dtype=bool)
        for i_row, labels in enumerate(pset_combine):
            gamma_pred_combine[i_row, labels] = True
        cov_van_combine = classification_coverage_score(Y_TARGET.values, gamma_pred_combine)
        width_van_combine = classification_mean_width_score(gamma_pred_combine)
        print("combine :", cov_van_combine, width_van_combine)
        covs_combine.append(cov_van_combine)
        width_combine.append(width_van_combine)
    df_metrics_model = pd.DataFrame({
    "model": [name] * len(range(W, len(prefixes))),
    "n": list(range(W, len(prefixes))),
    "coverage_MCP": covs_MCP,
    "width_MCP": width_MCP,
    "coverage_SPCI": covs_SPCI,
    "width_SPCI": width_SPCI,
    "coverage_combined": covs_combine,
    "width_combined": width_combine,
    "accuracy_combined": ACC
    })
    res_fin.append(df_metrics_model)