In [16]:
%run functions-CP.py

In [1]:
%run functions-CP2.py

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import re
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

In [3]:
year = 23  # cohort selector; the cells below only handle 23 and 24
wac = False  # True selects the WAC ("W-WEB-024") dataset instead of the CPE one

In [4]:
# Load the merged CSV export matching the (wac, year) configuration, drop the
# "B-CPE-210" columns, normalise every "*mark" column by its mean and discard
# the rows that have too many missing cells.

# Every supported (wac, year) combination and its CSV file.
DATA_FILES = {
    (True, 23): Path("real_data_fin/merged_horizontal_WAC_2023.csv"),
    (True, 24): Path("real_data_fin/merged_horizontal_WAC_2024.csv"),
    (False, 23): Path("real_data_fin/merged_horizontal2023.csv"),
    (False, 24): Path("real_data_fin/merged_horizontal2024.csv"),
}
# Number of KMeans clusters used by the clustering cells (CPE datasets only).
# NOTE(review): no value has ever been chosen for the WAC datasets; with
# wac=True `n_clusters` must be set by hand before running the clustering.
CPE_N_CLUSTERS = {23: 5, 24: 4}
# A row is kept only if it has strictly fewer missing cells than this limit.
MAX_NAN_PER_ROW = {23: 130, 24: 495}

config_key = (bool(wac), year)
if config_key not in DATA_FILES:
    # Previously an unsupported combination silently left CSV_FILE unbound
    # (or bound to the value of an earlier run).
    raise ValueError(f"Unsupported configuration: wac={wac!r}, year={year!r}")

CSV_FILE = DATA_FILES[config_key]
if not wac:
    n_clusters = CPE_N_CLUSTERS[year]

df_ = pd.read_csv(CSV_FILE)
mask = df_.columns.str.startswith("B-CPE-210")
df_ = df_.loc[:, ~mask]
df = df_.copy()

# Divide each mark column by its own mean so that marks are comparable.
mark_cols = [c for c in df.columns if c.endswith("mark")]
df[mark_cols] = df[mark_cols].div(df[mark_cols].mean())

# Number of missing cells per row; the maximum is printed for inspection.
nb_nan_par_ligne = df.isna().sum(axis=1)
print(nb_nan_par_ligne.max())
df = df[nb_nan_par_ligne < MAX_NAN_PER_ROW[year]]

143


In [5]:
# Copy of the filtered data where every row is tagged as real (non-synthetic).
df1 = df.assign(source="real")

In [6]:
# Choose the columns the clustering is based on, depending on the dataset.
if wac:
    # WAC (WEB) data
    X = df1.loc[:, df1.columns.str.startswith("W-WEB-024")].fillna(0)
elif year == 24:
    # CPE 2024: "B-CPE-100" columns minus the per-task "passed" flags
    pat = re.compile(r"B-CPE-100_cpoolday\d+_\d{2} - task\d+_passed")
    dfcpool = df1.loc[:, df1.columns.str.startswith("B-CPE-100")]
    cols_keep = [c for c in dfcpool.columns if pat.match(c) is None]
    dfcpool_mark = dfcpool[cols_keep]
    X = dfcpool_mark.fillna(0)
elif year == 23:
    # CPE 2023
    X = df1.loc[:, df1.columns.str.startswith("B-CPE-110_settingup")].fillna(0)

# Standardise the features, then run KMeans with the configured cluster count.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_scaled)
clusters = kmeans.labels_

# df2 = real data + cluster label; print the size of each cluster.
df2 = df1.assign(cluster=clusters)
print(pd.Series(clusters).value_counts().sort_index())

0    108
1     49
2    170
3    587
4    137
Name: count, dtype: int64




## GAN à tester

In [23]:
# Same feature selection as the main clustering cell, with a fixed 6-cluster split.
if wac:
    # WAC (WEB) data
    X = df1.loc[:, df1.columns.str.startswith("W-WEB-024")].fillna(0)
elif year == 24:
    # CPE 2024: "B-CPE-100" columns minus the per-task "passed" flags
    pat = re.compile(r"B-CPE-100_cpoolday\d+_\d{2} - task\d+_passed")
    dfcpool = df1.loc[:, df1.columns.str.startswith("B-CPE-100")]
    cols_keep = [c for c in dfcpool.columns if pat.match(c) is None]
    dfcpool_mark = dfcpool[cols_keep]
    X = dfcpool_mark.fillna(0)
elif year == 23:
    # CPE 2023
    X = df1.loc[:, df1.columns.str.startswith("B-CPE-110_settingup")].fillna(0)

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
kmeans = KMeans(n_clusters=6, random_state=42).fit(X_scaled)
clusters = kmeans.labels_
dfganall = df1.assign(cluster=clusters)

print(pd.Series(clusters).value_counts().sort_index())

# Keep only the rows that belong to the three least populated clusters.
counts = dfganall['cluster'].value_counts()
least = counts.nsmallest(3).index.tolist()
dfganleast = dfganall.loc[dfganall['cluster'].isin(least)]

0    331
1    322
2     20
3     88
4     86
5      3
Name: count, dtype: int64




In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Compare three outlier detectors (Isolation Forest, Local Outlier Factor and
# PCA reconstruction error) on the standardised features of df1.

# --- 0) Feature matrix ---
NON_FEATURE_COLS = ("student_id", "cluster", "email", "source")
OUTLIER_FRACTION = 0.3       # fraction of rows each detector is asked to flag
PCA_ERROR_PERCENTILE = 70    # rows above this percentile of reconstruction error are flagged

# Features and rows both come from df1 so that masks and indexes stay aligned.
features = [c for c in df1.columns if c not in NON_FEATURE_COLS]
X = df1[features].fillna(0).values

# Standardisation (needed by the distance/variance based methods below)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 1) Isolation Forest ---
iso = IsolationForest(
    n_estimators=100,
    contamination=OUTLIER_FRACTION,
    random_state=42
)
y_iso = iso.fit_predict(X_scaled)
# y_iso == -1 -> anomaly, 1 -> normal
outliers_iso = df1[y_iso == -1]

# --- 2) Local Outlier Factor (LOF) ---
lof = LocalOutlierFactor(
    n_neighbors=10,
    contamination=OUTLIER_FRACTION,
    novelty=False         # False -> .predict cannot be called on new data
)
y_lof = lof.fit_predict(X_scaled)
# y_lof == -1 -> anomaly, 1 -> normal
outliers_lof = df1[y_lof == -1]

# --- 3) Reconstruction error via PCA ---
# Keep as many components as needed to explain 90% of the variance.
pca = PCA(n_components=0.9, random_state=42)
X_pca = pca.fit_transform(X_scaled)
X_recon = pca.inverse_transform(X_pca)
recon_error = np.mean((X_scaled - X_recon)**2, axis=1)

# Threshold: rows whose error is above the configured percentile.
thresh = np.percentile(recon_error, PCA_ERROR_PERCENTILE)
outliers_pca = df1[recon_error > thresh]

# --- 4) Summary and comparison ---
print("IsolationForest détecte",  len(outliers_iso),  "outliers")
print("LocalOutlierFactor détecte", len(outliers_lof), "outliers")
# The message now reports the percentile that is really used (it said 95%).
print(f"PCA reconstruction >{PCA_ERROR_PERCENTILE}% quantile :", len(outliers_pca), "outliers")

# Pairwise agreement between the detectors, in this order:
# IsolationForest & LOF, IsolationForest & PCA, LOF & PCA.
common1 = set(outliers_iso.index) & set(outliers_lof.index)
print("Détectés par les méthodes :", len(common1), "points")
common2 = set(outliers_iso.index) & set(outliers_pca.index)
print("Détectés par les méthodes :", len(common2), "points")
common3 = set(outliers_lof.index) & set(outliers_pca.index)
print("Détectés par les méthodes :", len(common3), "points")

# Rows flagged by both Isolation Forest and the PCA reconstruction error.
df_common_outliers = df1.loc[list(common2)]


IsolationForest détecte 255 outliers
LocalOutlierFactor détecte 255 outliers
PCA reconstruction >95% quantile : 255 outliers
Détectés par les méthodes : 91 points
Détectés par les méthodes : 152 points
Détectés par les méthodes : 112 points


## SMOTE-like augmentation

In [8]:

from sklearn.neighbors import NearestNeighbors

# SMOTE-like augmentation of the smallest clusters of df2:
# numeric columns are interpolated between a point and one of its nearest
# neighbours (same cluster), categorical columns are drawn from the
# per-cluster empirical distribution. The result is stored in df4.

# --- 0) Working matrix and column groups ---
smote_rng = np.random.default_rng(42)   # seeded so the augmentation is reproducible
TARGET_NEW_PER_CLUSTER = 100            # n_new_per = TARGET_NEW_PER_CLUSTER // cluster size
MAX_NEIGHBORS = 20                      # upper bound on the neighbours used for interpolation

X = df2.drop(columns=['email', 'source']).fillna(0).reset_index(drop=True)
clusters = X['cluster'].values
feature_cols = X.columns.drop('cluster')

# Numeric vs categorical columns
style_cols = [c for c in feature_cols if '_style' in c]
cat_cols = [c for c in feature_cols if c.endswith('passed')] + style_cols
cat_set = set(cat_cols)
num_cols = [c for c in feature_cols if c not in cat_set]
# Positions of the numeric columns inside `feature_cols`. The per-cluster
# arrays below are laid out according to `feature_cols`, so positions must be
# looked up there (looking them up in X.columns only works while 'cluster'
# happens to be the last column).
num_pos = feature_cols.get_indexer(num_cols)

# --- 1) Per-cluster distribution of every categorical column ---
# cluster_cat_probs[cluster][column] = {value: probability, ...}
cluster_cat_probs = {
    cl: {c: grp[c].value_counts(normalize=True).to_dict() for c in cat_cols}
    for cl, grp in X.groupby('cluster')
}

# --- 2) Clusters to augment: those smaller than the first quartile of sizes ---
minor_counts = X['cluster'].value_counts()
th = minor_counts.quantile(0.25)
minor_cs = minor_counts[minor_counts < th].index.tolist()

# --- 3) Generation of the synthetic rows ---
rows_aug = []

for cl in minor_cs:
    idx = np.where(clusters == cl)[0]
    Xk = X.loc[idx, feature_cols].values
    n_new_per = TARGET_NEW_PER_CLUSTER // len(idx)
    if n_new_per == 0:
        # Cluster already larger than the target: nothing to generate.
        continue

    # Too few points to interpolate: duplicate the existing ones.
    if len(Xk) < 2:
        for i in idx:
            base_row = X.loc[i, feature_cols].to_dict()
            base_row['cluster'] = cl   # was missing, which left NaN cluster labels
            rows_aug.extend(dict(base_row) for _ in range(n_new_per))
        continue

    # Nearest neighbours on the numeric part only. kneighbors() called without
    # a query never returns a point as its own neighbour, so at most
    # len(Xk) - 1 neighbours can be requested.
    Xk_num = Xk[:, num_pos]
    nbrs = NearestNeighbors(
        n_neighbors=min(MAX_NEIGHBORS + 1, len(Xk) - 1),
        metric='euclidean'
    )
    nbrs.fit(Xk_num)
    neigh_idxs = nbrs.kneighbors(return_distance=False)

    # Labels/probabilities of each categorical column, prepared once per cluster.
    cat_draws = {
        c: (list(dist.keys()), list(dist.values()))
        for c, dist in cluster_cat_probs[cl].items()
    }

    for i in range(len(Xk)):
        xi_num = Xk_num[i]
        for _ in range(n_new_per):
            # Random neighbour, then a random point on the segment [xi, xj].
            j = smote_rng.choice(neigh_idxs[i])
            lam = smote_rng.random()
            num_new = xi_num + lam * (Xk_num[j] - xi_num)

            # Assemble the synthetic row: numeric part, categorical part, label.
            new_row = dict(zip(num_cols, num_new))
            for c, (labels, probs) in cat_draws.items():
                new_row[c] = smote_rng.choice(labels, p=probs)
            new_row['cluster'] = cl
            rows_aug.append(new_row)

# --- 4) Synthetic DataFrame ---
df_synth = pd.DataFrame(rows_aug)
df_synth['source'] = 'synth'
df_synth['email'] = np.nan

# Final concatenation: real rows (df2) followed by the synthetic ones.
df4 = pd.concat([df2, df_synth], ignore_index=True)
print(df4['cluster'].value_counts().sort_index())


cluster
0    108
1    147
2    170
3    587
4    137
Name: count, dtype: int64


In [7]:
# Enrichment by "inverse PCA": project the standardised data on 2 principal
# components, generate new points in that plane (cubic trend + Gaussian-process
# residual), map them back to the original feature space, append them to df1
# as df5 and re-cluster df5.
dfnum = df.drop(columns=["email"]).fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(dfnum)
pca = PCA(n_components=2, random_state=42)
proj = pca.fit_transform(X_scaled)
x = proj[:, 0]
y = proj[:, 1]
n_new = 450

# Cubic polynomial trend of the second component as a function of the first.
coeffs = np.polyfit(x, y, deg=3)
p = np.poly1d(coeffs)
if wac:
    x_new = np.linspace(min(x),max(x), n_new)
else:
    # NOTE(review): fill_interval_random is not defined in this notebook; it is
    # presumably provided by one of the %run helper scripts. The captured output
    # of this cell is a NameError on this name, so confirm it is loaded first.
    x_new = fill_interval_random(x, n_new=n_new, seed=42)
all_x = np.concatenate([x, x_new])
print("Nombre de points après génération :", len(all_x))

# 1) Fit the GPR on x against the residual y - p(x) (not on y itself)
kernel = RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0)
gpr = GaussianProcessRegressor(kernel=kernel, random_state=42).fit(np.asarray(x).reshape(-1,1), np.asarray(y) - p(x))

# 2) Sample residuals from the fitted GPR at the new x positions
#    (a single draw is requested and flattened below)
resid = gpr.sample_y(x_new.reshape(-1,1), n_samples=1, random_state=42)
y_new = resid.ravel() + p(x_new)
all_y = np.concatenate([y, y_new])

# Map the generated 2-D points back to the original (unscaled) feature space.
X_new_pca = np.vstack([x_new, y_new]).T
X_synth_scaled = pca.inverse_transform(X_new_pca)
X_synth = scaler.inverse_transform(X_synth_scaled)
print("on a bien autant de colonnes qu'au départ :", X_synth.shape[1] == dfnum.shape[1])

# Rebuild a DataFrame with the same columns (and order) as df; columns that
# were not part of dfnum are filled with NaN.
numeric_cols = dfnum.columns.tolist()
df_synth = pd.DataFrame(X_synth, columns=numeric_cols)
for col in df.columns.difference(df_synth.columns):
    df_synth[col] = np.nan
df_synth = df_synth[df.columns]

df_synth["source"] = "synth"

# df5 = real rows (df1) followed by the synthetic rows.
df5 = pd.concat([df1, df_synth], ignore_index=True)
if wac:
    # WAC (WEB) data
    X = df5[[c for c in df.columns if c.startswith("W-WEB-024")]].fillna(0)
elif year == 24:
    # CPE 2024
    dfcpool = df5[[c for c in df5.columns if c.startswith("B-CPE-100")]]
    pat = re.compile(r"B-CPE-100_cpoolday\d+_\d{2} - task\d+_passed")
    cols_keep = [c for c in dfcpool.columns if not pat.match(c)]
    dfcpool_mark = dfcpool[cols_keep]
    X = dfcpool_mark.fillna(0)
elif year == 23:
    # CPE 2023
    X = df5[[c for c in df5.columns if c.startswith("B-CPE-110_settingup")]].fillna(0)
    
# Re-cluster the enriched data; note that `scaler` is rebound here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
df5['cluster'] = clusters
print(pd.Series(clusters).value_counts().sort_index())

NameError: name 'fill_interval_random' is not defined

In [27]:
# Cluster df2 with KMeans, then dissolve every cluster smaller than
# `min_cluster_size` by moving its points to the nearest sufficiently large
# cluster. The result is stored in df3.
min_cluster_size = 50

# 1) Feature selection
if wac:
    # WAC (WEB) data
    X = df2[[c for c in df2.columns if c.startswith("W-WEB-024")]].fillna(0)
elif year == 24:
    dfcpool = df2[[c for c in df2.columns if c.startswith("B-CPE-100")]]
    # NOTE(review): only the day-01 task flags are excluded here, whereas the
    # other clustering cells exclude every day (cpoolday\d+) - confirm intent.
    pat = re.compile(r"B-CPE-100_cpoolday01_\d{2} - task\d+_passed")
    cols_keep = [c for c in dfcpool.columns if not pat.match(c)]
    X = dfcpool[cols_keep].fillna(0)
elif year == 23:
    X = df2[[c for c in df2.columns if c.startswith("B-CPE-110_settingup")]].fillna(0)
else:
    raise ValueError(f"Année inattendue : {year}")

# 2) Standardisation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3) First KMeans
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# Copy so that the reassignment below does not modify kmeans.labels_ in place.
labels = kmeans.fit_predict(X_scaled).copy()
centroids = kmeans.cluster_centers_

# 4) Cluster sizes
sizes = pd.Series(labels).value_counts().sort_index()

# 5) Reassignment of the points of too-small clusters
small_clusters = sizes[sizes < min_cluster_size].index.tolist()
if small_clusters:
    print(f"Clusters trop petits à réaffecter : {small_clusters}")
    valid_clusters = [c for c in range(len(centroids)) if c not in small_clusters]
    if valid_clusters:
        to_move = np.isin(labels, small_clusters)
        # Distance of every point to move to every centroid: (n_points, n_clusters)
        dists = np.linalg.norm(
            X_scaled[to_move][:, None, :] - centroids[None, :, :], axis=2
        )
        # Exclude ALL small clusters as targets. Excluding only the point's own
        # cluster could send it to another small cluster, or back again.
        dists[:, small_clusters] = np.inf
        labels[to_move] = np.argmin(dists, axis=1)
    else:
        # No cluster reaches the minimum size: keep the labels untouched.
        print("Aucun cluster de taille suffisante : aucune réaffectation possible")
    sizes = pd.Series(labels).value_counts().sort_index()
    print("Nouvelles tailles de clusters :\n", sizes)

# 6) Store the final labels in df3
df3 = df2.copy()
df3["cluster"] = labels

Clusters trop petits à réaffecter : [2]
Nouvelles tailles de clusters :
 0    466
1    166
3    218
Name: count, dtype: int64




In [28]:
# Shapes of the three candidate datasets. NOTE(review): df5 is built by the
# PCA-inverse cell, whose captured run failed - confirm df5 is not stale.
df3.shape, df4.shape, df5.shape

((850, 516), (946, 516), (1300, 516))

In [9]:
import warnings
# Silence only the FutureWarning about the 'force_all_finite' ->
# 'ensure_all_finite' rename; every other warning stays visible.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=".*'force_all_finite' was renamed to 'ensure_all_finite'.*"
)


In [10]:
# State of the threshold search performed further down in this cell.
ecs = []  # successive absolute changes of the below-threshold proportion
best = -np.inf  # largest change seen so far
best_prop = None  # not read anywhere in this cell

# Proportion obtained at the previous step, kept for comparison
prev_mean = None
n_rendus = 1  # number of most recent positive marks collected by last_marks
df0 = df.copy()
df0 = df0.fillna(0)
# Mark columns in reverse column order, so last_marks scans from the last one.
mark_cols = [c for c in df0.columns if c.endswith("mark")][::-1]

def last_marks(row):
    """
    Collect the first strictly positive marks of `row`, scanning the global
    `mark_cols` in order and stopping once `n_rendus` marks were found.

    Returns a Series with two entries: "cols" (the column names) and
    "vals" (the matching marks), both as lists.
    """
    cols, vals = [], []
    positive = ((name, row[name]) for name in mark_cols if row[name] > 0)
    for name, value in positive:
        cols.append(name)
        vals.append(value)
        if len(vals) == n_rendus:
            break
    return pd.Series({"cols": cols, "vals": vals})

# Per-row lists of the selected mark columns and values, then the first
# selected value as a scalar column.
tmp = df0.apply(last_marks, axis=1)
df0[["last_cols", "last_vals"]] = tmp
df0["lastvals"] = df0["last_vals"].map(lambda picked: picked[0])
def split_by_kmeans(x, random_state=0):
    """
    Split the 1-D values `x` into two groups with KMeans (k=2).

    Returns:
      - t       : estimated threshold (mean of the two cluster centres)
      - labels  : 0/1 array, 0 = cluster with the lower centre, 1 = the other
      - centers : the two cluster centres
    """
    # sklearn expects a 2-D (n_samples, 1) array
    column = np.asarray(x, dtype=float).reshape(-1, 1)
    model = KMeans(n_clusters=2, random_state=random_state).fit(column)
    centers = model.cluster_centers_.flatten()

    # Relabel so that 0 always designates the cluster with the lower centre.
    weak_cluster = np.argmin(centers)
    labels = (model.labels_ != weak_cluster).astype(int)

    t = float(np.mean(centers))
    return t, labels, centers


# Choose the dropout threshold: scan fractions of the lower KMeans centre and
# locate the largest jump in the proportion of students below the threshold.
t, lbl, ctr = split_by_kmeans(df0["lastvals"])
print(f"Seuil KMeans     : {t:.3f}")
print(f"Centres clusters : {ctr}")

props = np.linspace(0.5, 1, 100)   # candidate fractions, built once
low_center = min(ctr)
MIN_DROPOUT_PCT = 20               # below this share of positives, use the larger threshold

for i, prop in enumerate(props):
    threshold = low_center * prop
    # binary indicator: 1 when the last mark is below the candidate threshold
    y_all = (df0["lastvals"] < threshold).astype(int)
    current_mean = y_all.mean()

    # no previous proportion on the first step
    ec = 0.0 if prev_mean is None else abs(current_mean - prev_mean)
    ecs.append(ec)

    # keep the largest jump and the two fractions surrounding it
    if ec > best:
        best = ec
        best_prop0 = props[i]
        # max(..., 0): on the first step `i - 1` would wrap around to the
        # last candidate (index -1).
        best_prop1 = props[max(i - 1, 0)]
    prev_mean = current_mean

print(f"Meilleur écart = {best:.4f} obtenu pour prop = {best_prop1:.4f}")

# Start from the fraction just before the jump; if that labels too few
# students as dropouts, fall back to the fraction just after the jump.
threshold = low_center * best_prop1
Y_TARGET = (df0["lastvals"] < threshold).astype(int).values
if Y_TARGET.mean() * 100 < MIN_DROPOUT_PCT:
    threshold = low_center * best_prop0
    Y_TARGET = (df0["lastvals"] < threshold).astype(int).values
print(f"proportion de dropout : {Y_TARGET.mean()}")
print(threshold)

Seuil KMeans     : 1.645
Centres clusters : [0.35220526 2.93743438]
Meilleur écart = 0.2683 obtenu pour prop = 0.7980
proportion de dropout : 0.32540437678401524
0.28105268564221186




In [11]:
threshold

0.28105268564221186

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.base import clone

def build_X(df_sub: pd.DataFrame, prefixes: list, static_cols: list, n: int) -> np.ndarray:
    """
    Build the feature matrix restricted to the first `n` prefixes.

    Keeps `static_cols` plus every column of `df_sub` whose name starts with
    one of `prefixes[:n]`; "email" is used as the index and is therefore not
    part of the returned values.
    """
    active_prefixes = tuple(prefixes[:n])
    dynamic = [name for name in df_sub.columns if name.startswith(active_prefixes)]
    selected = df_sub[["email", *static_cols, *dynamic]]
    return selected.set_index("email").values

def compare_classifiers(X_train, X_test, y_train, y_test, mask_test, models, n, do_plot=False):
    """
    Train several binary classifiers and compute their metrics on the subset
    of the test set selected by `mask_test`.

    Args:
        X_train (array-like): Training features
        X_test (array-like): Test features
        y_train (array-like): Training labels
        y_test (array-like): Test labels
        mask_test (array-like of bool): Test rows used for the metrics
            (the caller passes the "real", non-synthetic rows)
        models (dict): Mapping {'model name': estimator}; every estimator is
            cloned before fitting, so the instances passed in stay unfitted
        n (int): Value stored in the "n_projects" column of the result
        do_plot (bool): When True, draw the ROC curve of every model

    Returns:
        pandas.DataFrame: One row of metrics per model
    """
    metrics = []
    roc_curves = []  # (name, fpr, tpr, auc), only filled when do_plot is True

    # Evaluation is restricted to the masked rows; same labels for every model.
    y_test_r = y_test[mask_test]
    # ROC/AUC are undefined when the evaluated subset holds a single class.
    has_both_classes = np.unique(y_test_r).size > 1

    for name, mod in models.items():
        # Training on a fresh clone
        model = clone(mod)
        model.fit(X_train, y_train)

        # Predictions, plus probabilities or decision scores for the ROC
        y_pred = model.predict(X_test)
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)[:, 1]
        else:
            y_prob = model.decision_function(X_test)

        y_pred_r = y_pred[mask_test]
        y_prob_r = y_prob[mask_test]

        auc = roc_auc_score(y_test_r, y_prob_r) if has_both_classes else np.nan
        metrics.append({
            "n_projects": n,
            "Modèle":     name,
            "Accuracy":   accuracy_score(y_test_r, y_pred_r),
            # zero_division=0 everywhere: same value as the default, without
            # the warning, and consistent across the three scores.
            "Precision":  precision_score(y_test_r, y_pred_r, zero_division=0),
            "Recall":     recall_score(y_test_r, y_pred_r, zero_division=0),
            "F1 Score":   f1_score(y_test_r, y_pred_r, zero_division=0),
            "ROC AUC":    auc,
        })

        if do_plot and has_both_classes:
            fpr, tpr, _ = roc_curve(y_test_r, y_prob_r)
            roc_curves.append((name, fpr, tpr, auc))

    if do_plot:
        plt.figure(figsize=(8, 6))
        # One curve per model (previously only the diagonal was drawn)
        for name, fpr, tpr, auc in roc_curves:
            plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")
        # Chance level
        plt.plot([0, 1], [0, 1], 'k--', label='Aléatoire')
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("Courbes ROC des modèles")
        plt.legend(loc='lower right')
        plt.grid(True)
        plt.show()

    # Metrics table
    return pd.DataFrame(metrics)


if __name__ == "__main__":
    # Benchmark the classifiers on df4 (real + synthetic rows) while growing
    # the number of project prefixes used as features.
    res = []
    X_all = df4.fillna(0)

    # Mark columns in column order, then reversed for the "last mark" scan.
    forward_mark_cols = [c for c in X_all.columns if c.endswith("mark")]
    mark_cols = forward_mark_cols[::-1]
    # Unique prefixes (text before the last "_") in column order.
    prefixes = list(dict.fromkeys(c.rsplit("_", 1)[0] for c in forward_mark_cols))
    static_cols = []
    mask_real = (X_all['source'] == 'real').values

    df0 = X_all.copy()

    def last_marks(row):
        # First strictly positive mark found while scanning mark_cols.
        for c in mark_cols:
            v = row[c]
            if v > 0:
                return pd.Series({"cols": [c], "vals": [v]})
        return pd.Series({"cols": [], "vals": []})

    tmp = df0.apply(last_marks, axis=1)
    df0[["last_cols", "last_vals"]] = tmp
    df0["lastvals"] = df0["last_vals"].map(lambda picked: picked[0])

    # Target: 1 when the last positive mark is below the global threshold.
    y = (df0["lastvals"] < threshold).astype(int).values
    print(f"proportion de dropout : {y.mean()}")

    # Models to compare (compare_classifiers clones them before fitting).
    models = {
        "Régression Logistique": LogisticRegression(max_iter=10000),
        "Forêt Aléatoire": RandomForestClassifier(n_estimators=1000, random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "SVM Linéaire": SVC(kernel='linear', probability=True, random_state=42),
    }

    for n in tqdm(range(1, len(prefixes) + 1)):
        X = build_X(X_all, prefixes, static_cols, n)
        # Train/test split; the "real" mask is split alongside the data
        X_train, X_test, y_train, y_test, mask_train, mask_test = train_test_split(
            X, y, mask_real, test_size=0.2, random_state=42
        )
        results = compare_classifiers(X_train, X_test, y_train, y_test, mask_test, models, n)
        res.append(results)

    results_df = pd.concat(res, ignore_index=True)
    print(results_df)
proportion de dropout : 0.35944299390774587


100%|██████████| 8/8 [00:57<00:00,  7.23s/it]

    n_projects                 Modèle  Accuracy  Precision    Recall  \
0            1  Régression Logistique  0.764151   0.717949  0.417910   
1            1        Forêt Aléatoire  0.693396   0.520000  0.388060   
2            1      Gradient Boosting  0.721698   0.605263  0.343284   
3            1           SVM Linéaire  0.726415   0.615385  0.358209   
4            2  Régression Logistique  0.745283   0.614035  0.522388   
5            2        Forêt Aléatoire  0.646226   0.431034  0.373134   
6            2      Gradient Boosting  0.726415   0.588235  0.447761   
7            2           SVM Linéaire  0.726415   0.576271  0.507463   
8            3  Régression Logistique  0.688679   0.507692  0.492537   
9            3        Forêt Aléatoire  0.655660   0.444444  0.358209   
10           3      Gradient Boosting  0.698113   0.526316  0.447761   
11           3           SVM Linéaire  0.721698   0.568966  0.492537   
12           4  Régression Logistique  0.712264   0.544118  0.55




In [14]:
# Mean accuracy of each model over all values of n_projects.
mean_accuracy_by_model = results_df.groupby(["Modèle"]).agg(
    mean_acc=("Accuracy", "mean"),
)
mean_accuracy_by_model.reset_index()

Unnamed: 0,Modèle,mean_acc
0,Forêt Aléatoire,0.730542
1,Gradient Boosting,0.747642
2,Régression Logistique,0.741156
3,SVM Linéaire,0.732901


In [39]:

# Experiments to compare: (label, DataFrame, extra keyword arguments).
tasks = [
    ("sans rien", df1, {}),
    ("sans enrichissement", df3, {}),
    ("avec enrichissement, technique 2", df4, {}),
    # The PCA-inverse enrichment run (df5) is currently disabled.
]

# Dataset-dependent cutoff passed to compute_weighted_conformal_metrics.
if wac:
    cutoff = 12
elif year == 24:
    cutoff = 13
elif year == 23:
    cutoff = 1

summary_records = []
res = []
for name, dfk, kwargs in tasks:
    # 1) Run the analysis
    df_detail, df_agg, y_cible, tr_clf = run_analysis_w(
        df=dfk, threshold=threshold, do_plot=False, **kwargs
    )
    r = (
        df_detail
        .groupby(["method", "model", "cluster"])
        .agg(mean_coverage=("coverage", "mean"), mean_width=("width", "mean"))
        .reset_index()
    )
    # Keep only the rows whose cluster is -1
    res.append(r[r['cluster'] == -1])

    # 2) Conditional aggregation
    df_cond = aggregate_conformal_metrics(df_detail, dfk)

    # 3) Weighted means
    df_w = compute_weighted_conformal_metrics(df_cond, lmbda=0.9, cutoff=cutoff)

    # 4) One summary record per model
    for model in df_cond['model'].unique():
        per_model = df_cond[df_cond['model'] == model]
        summary_records.append({
            'run':            name,
            'model':          model,
            'cov_all_mean':   per_model['cov_all'].mean(),
            'cov_clu_mean':   per_model['cov_cluster'].mean(),
            'cov_all_w':      df_w.loc[model, 'cov_all_w'],
            'cov_clu_w':      df_w.loc[model, 'cov_cluster_w'],
            'width_all_mean':   per_model['width_all'].mean(),
            'width_clu_mean':   per_model['width_cluster'].mean(),
            'width_all_w':    df_w.loc[model, 'width_all_w'],
            'width_clu_w':    df_w.loc[model, 'width_cluster_w'],
        })

# 5) Summary DataFrame indexed by (run, model)
df_summary = pd.DataFrame.from_records(summary_records).set_index(['run', 'model'])

# 6) Display
pd.set_option('display.float_format', '{:.3f}'.format)
print("\n=== Synthèse conformal prediction ===")
print(df_summary)

threshold  = 0.243 → 24.9% positives


RF:   0%|          | 0/24 [00:00<?, ?it/s]

LR:   0%|          | 0/24 [00:00<?, ?it/s]

GB:   0%|          | 0/24 [00:00<?, ?it/s]

threshold  = 0.243 → 24.9% positives


RF:   0%|          | 0/24 [00:00<?, ?it/s]

LR:   0%|          | 0/24 [00:00<?, ?it/s]

GB:   0%|          | 0/24 [00:00<?, ?it/s]

threshold  = 0.243 → 24.1% positives


RF:   0%|          | 0/24 [00:00<?, ?it/s]

LR:   0%|          | 0/24 [00:00<?, ?it/s]

GB:   0%|          | 0/24 [00:00<?, ?it/s]


=== Synthèse conformal prediction ===
                                        cov_all_mean  cov_clu_mean  cov_all_w  \
run                              model                                          
sans rien                        RF            0.975         0.000      0.974   
                                 LR            0.974         0.000      0.972   
                                 GB            0.972         0.000      0.970   
sans enrichissement              RF            0.975         0.976      0.974   
                                 LR            0.974         0.974      0.972   
                                 GB            0.972         0.972      0.970   
avec enrichissement, technique 2 RF            0.962         0.966      0.955   
                                 LR            0.955         0.960      0.957   
                                 GB            0.962         0.966      0.957   

                                        cov_clu_w  width_all_mean  \


In [41]:
# res[0] holds the cluster == -1 rows of the first task ("sans rien", df1).
print("vanilla-CP")
res[0]

vanilla-CP


Unnamed: 0,method,model,cluster,mean_coverage,mean_width
0,mondrian,GB,-1,0.972,1.627
1,mondrian,LR,-1,0.974,1.698
2,mondrian,RF,-1,0.975,1.652
3,vanilla,GB,-1,0.972,1.627
4,vanilla,LR,-1,0.974,1.698
5,vanilla,RF,-1,0.975,1.652


In [42]:
# res[1] holds the cluster == -1 rows of the second task ("sans enrichissement", df3).
print("Mondrian")
res[1]

Mondrian


Unnamed: 0,method,model,cluster,mean_coverage,mean_width
0,mondrian,GB,-1,0.974,1.7
4,mondrian,LR,-1,0.979,1.746
8,mondrian,RF,-1,0.971,1.697
12,vanilla,GB,-1,0.972,1.627
16,vanilla,LR,-1,0.974,1.698
20,vanilla,RF,-1,0.975,1.652


In [43]:
# res[2] holds the cluster == -1 rows of the third task ("avec enrichissement, technique 2", df4).
print("Mondrian + Enrichment")
res[2]

Mondrian + Enrichment


Unnamed: 0,method,model,cluster,mean_coverage,mean_width
0,mondrian,GB,-1,0.966,1.609
5,mondrian,LR,-1,0.965,1.729
10,mondrian,RF,-1,0.972,1.64
15,vanilla,GB,-1,0.962,1.559
20,vanilla,LR,-1,0.955,1.671
25,vanilla,RF,-1,0.962,1.551


## Enrichment with $G_{t+1}$

In [226]:
from tqdm import tqdm
import umap
from sklearn.preprocessing import MinMaxScaler

In [227]:
# Resolve the merged-grades CSV from the (`wac`, `year`) configuration.
_CSV_BY_CONFIG = {
    (True, 23): Path("real_data_fin/merged_horizontal_WAC_2023.csv"),
    (True, 24): Path("real_data_fin/merged_horizontal_WAC_2024.csv"),
    (False, 23): Path("real_data_fin/merged_horizontal2023.csv"),
    (False, 24): Path("real_data_fin/merged_horizontal2024.csv"),
}
if (bool(wac), year) not in _CSV_BY_CONFIG:
    # Fail loudly rather than silently re-reading whatever CSV_FILE an
    # earlier cell left behind in the kernel.
    raise ValueError(f"Année inattendue : {year}")
CSV_FILE = _CSV_BY_CONFIG[(bool(wac), year)]

df_ = pd.read_csv(CSV_FILE)

# Drop every column whose name starts with "B-CPE-210".
mask = df_.columns.str.startswith("B-CPE-210")
df_ = df_.loc[:, ~mask]

# Work on a copy so `df_` keeps the un-normalised values.
df = df_.copy()

# Divide each "...mark" column by its own mean.
mark_cols = [c for c in df.columns if c.endswith("mark")]
df[mark_cols] = df[mark_cols].div(df[mark_cols].mean())

In [228]:

# Column names (without 'email') as a Series.
cols = df.drop(columns=['email']).columns.to_series()

# The "suffix" of a column is its second '_'-separated token.
suffixes = cols.apply(lambda name: name.split('_')[1])
ordered_suffixes = suffixes.unique()

# One sub-DataFrame per suffix, keyed in order of first appearance.
dfs = {
    suffix: df[[
        col for col in df.columns
        if col != 'email' and col.split('_')[1] == suffix
    ]]
    for suffix in ordered_suffixes
}

# Report the shape of every group.
for suffix, subdf in dfs.items():
    print(f"Suffixe = {suffix} → shape {subdf.shape}")

Suffixe = cpoolday01 → shape (851, 24)
Suffixe = cpoolday02 → shape (851, 22)
Suffixe = cpoolday03 → shape (851, 26)
Suffixe = cpoolday04 → shape (851, 20)
Suffixe = cpoolday05 → shape (851, 24)
Suffixe = cpoolday06 → shape (851, 50)
Suffixe = cpoolday07 → shape (851, 20)
Suffixe = cpoolday08 → shape (851, 18)
Suffixe = cpoolday09 → shape (851, 20)
Suffixe = cpoolday10 → shape (851, 18)
Suffixe = cpoolday11 → shape (851, 30)
Suffixe = cpoolday12 → shape (851, 16)
Suffixe = cpoolday13 → shape (851, 24)
Suffixe = myls → shape (851, 14)
Suffixe = settingup → shape (851, 32)
Suffixe = mytop → shape (851, 15)
Suffixe = organized → shape (851, 13)
Suffixe = secured → shape (851, 14)
Suffixe = mysudo → shape (851, 11)
Suffixe = minishell1 → shape (851, 19)
Suffixe = robotfactory → shape (851, 12)
Suffixe = minishell2 → shape (851, 25)
Suffixe = amazed → shape (851, 14)
Suffixe = 42sh → shape (851, 32)


In [229]:
# UMAP settings shared by every group: 3 output components, Hamming metric,
# fixed random_state.
umap_params = dict(
    n_components=3,
    n_neighbors=50,
    min_dist=0.1,
    metric='hamming',
    random_state=42,
)
umap_names = [f"UMAP_{i+1}_passed" for i in range(3)]

Xt = {}
for suffix, subdf in tqdm(dfs.items()):
    # Columns ending in 'passed' are the ones reduced with UMAP.
    test_cols = [c for c in subdf.columns if c.endswith('passed')]
    subdf_tests = subdf[test_cols].fillna(0)

    reducer = umap.UMAP(**umap_params)
    embedding = reducer.fit_transform(subdf_tests.values)
    subdf_tests_umap3 = pd.DataFrame(embedding, columns=umap_names, index=df.index)

    # Rescale each UMAP component to [0, 1].
    scaler = MinMaxScaler(feature_range=(0, 1))
    subdf_tests_umap3_norm = pd.DataFrame(
        scaler.fit_transform(subdf_tests_umap3),
        index=subdf_tests_umap3.index,
        columns=subdf_tests_umap3.columns,
    )

    # Put the 3 UMAP columns first, then the remaining columns of the group.
    # ignore_index=True relabels the columns 0..n-1.
    remaining = subdf.drop(columns=test_cols)
    Xt[suffix] = pd.concat(
        [subdf_tests_umap3_norm, remaining], axis=1, ignore_index=True
    ).fillna(0)
    print(suffix, 'done')

  warn(
  warn(
  4%|▍         | 1/24 [00:11<04:23, 11.48s/it]

cpoolday01 done


  warn(
  warn(
  8%|▊         | 2/24 [00:23<04:20, 11.85s/it]

cpoolday02 done


  warn(
  warn(
 12%|█▎        | 3/24 [00:36<04:20, 12.39s/it]

cpoolday03 done


  warn(
  warn(
 17%|█▋        | 4/24 [00:50<04:17, 12.89s/it]

cpoolday04 done


  warn(
  warn(
 21%|██        | 5/24 [01:02<03:58, 12.58s/it]

cpoolday05 done


  warn(
  warn(
 25%|██▌       | 6/24 [01:11<03:27, 11.52s/it]

cpoolday06 done


  warn(
  warn(
 29%|██▉       | 7/24 [01:24<03:22, 11.91s/it]

cpoolday07 done


  warn(
  warn(
 33%|███▎      | 8/24 [01:38<03:23, 12.70s/it]

cpoolday08 done


  warn(
  warn(
 38%|███▊      | 9/24 [01:53<03:19, 13.33s/it]

cpoolday09 done


  warn(
  warn(
 42%|████▏     | 10/24 [02:09<03:18, 14.17s/it]

cpoolday10 done


  warn(
  warn(
 46%|████▌     | 11/24 [02:23<03:04, 14.16s/it]

cpoolday11 done


  warn(
  warn(
 50%|█████     | 12/24 [02:39<02:56, 14.71s/it]

cpoolday12 done


  warn(
  warn(
 54%|█████▍    | 13/24 [02:55<02:45, 15.05s/it]

cpoolday13 done


  warn(
  warn(
 58%|█████▊    | 14/24 [03:07<02:20, 14.09s/it]

myls done


  warn(
  warn(
 62%|██████▎   | 15/24 [03:16<01:54, 12.69s/it]

settingup done


  warn(
  warn(
 67%|██████▋   | 16/24 [03:25<01:32, 11.54s/it]

mytop done


  warn(
  warn(
 71%|███████   | 17/24 [03:36<01:19, 11.39s/it]

organized done


  warn(
  warn(
 75%|███████▌  | 18/24 [03:46<01:04, 10.82s/it]

secured done


  warn(
  warn(
 79%|███████▉  | 19/24 [03:59<00:57, 11.48s/it]

mysudo done


  warn(
  warn(
 83%|████████▎ | 20/24 [04:06<00:40, 10.13s/it]

minishell1 done


  warn(
  warn(
 88%|████████▊ | 21/24 [04:18<00:31, 10.63s/it]

robotfactory done


  warn(
  warn(
 92%|█████████▏| 22/24 [04:24<00:18,  9.31s/it]

minishell2 done


  warn(
  warn(
 96%|█████████▌| 23/24 [04:34<00:09,  9.43s/it]

amazed done


  warn(
  warn(
100%|██████████| 24/24 [04:45<00:00, 11.88s/it]

42sh done





In [230]:
keys = list(Xt.keys())
w = 3  # number of preceding groups concatenated into each feature window
H = 0  # offset added to the current position when picking the target group
X = []  # one concatenated feature frame per window
y = []  # one target Series per window

for i in range(w, len(keys)):
    suffix = keys[i]

    # Feature window: the w groups before position i, most recent first,
    # concatenated side by side.
    x_i_frames = [Xt[keys[i - j]] for j in range(1, w + 1)]
    X.append(pd.concat(x_i_frames, axis=1))

    # Target: the column at position 3 of the group at i + H,
    # falling back to the last group when i + H runs past the end.
    target_key = keys[i + H] if H + i < len(keys) else keys[-1]
    y_i = Xt[target_key].iloc[:, 3]
    y.append(y_i)

X_array_hori = [frame.values for frame in X]
y_array_hori = [target.values for target in y]

In [231]:
X_array = X_array_hori
y_array = y_array_hori
U_t = []

# Windows 1 .. len(X_array)-2 are scored: window 0 has no past to train on,
# and the range stops one short of the last window.
scored_windows = range(1, len(X_array) - 1)
for i in tqdm(scored_windows, desc="Fenêtres en ligne"):
    # Training set: every window strictly before i, stacked row-wise.
    X_train = np.vstack(X_array[:i])
    y_train = np.concatenate(y_array[:i])

    # A fresh model is fitted for each window.
    model = OneSidedSPCI_LGBM_Offline(alpha=0.1, w=300, random_state=0)
    model.fit(X_train, y_train)

    X_i, y_i = X_array[i], y_array[i]

    # Element [1] of predict_interval is kept for each row of the current window.
    upper_bounds = [
        model.predict_interval(row.reshape(1, -1))[1]
        for row in X_i
    ]
    U = np.array(upper_bounds)
    U_t.append(U)

Fenêtres en ligne:   0%|          | 0/19 [00:00<?, ?it/s]

fit 1 ok
fit 2 ok


Fenêtres en ligne:   5%|▌         | 1/19 [00:12<03:48, 12.68s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  11%|█         | 2/19 [00:42<06:23, 22.56s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  16%|█▌        | 3/19 [01:24<08:25, 31.57s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  21%|██        | 4/19 [02:21<10:25, 41.67s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  26%|██▋       | 5/19 [03:35<12:26, 53.36s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  32%|███▏      | 6/19 [05:09<14:30, 66.98s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  37%|███▋      | 7/19 [06:59<16:12, 81.04s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  42%|████▏     | 8/19 [09:12<17:55, 97.74s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  47%|████▋     | 9/19 [11:28<18:17, 109.77s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  53%|█████▎    | 10/19 [14:01<18:28, 123.12s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  58%|█████▊    | 11/19 [16:51<18:19, 137.43s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  63%|██████▎   | 12/19 [19:54<17:39, 151.34s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  68%|██████▊   | 13/19 [23:14<16:35, 165.96s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  74%|███████▎  | 14/19 [26:53<15:09, 181.86s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  79%|███████▉  | 15/19 [30:45<13:07, 196.98s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  84%|████████▍ | 16/19 [34:35<10:21, 207.04s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  89%|████████▉ | 17/19 [38:42<07:17, 219.00s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne:  95%|█████████▍| 18/19 [43:02<03:51, 231.47s/it]

fit 1 ok
fit 2 ok


Fenêtres en ligne: 100%|██████████| 19/19 [47:39<00:00, 150.52s/it]


In [232]:
# Start from a fresh copy so re-running the cell does not insert the columns twice.
df6 = df1.copy()
cols = df6.drop(columns=['email', 'source']).columns.to_series()

# The "suffix" of a column is its second '_'-separated token.
suffixes = cols.apply(lambda x: x.split('_')[1])
ordered_suffixes = suffixes.unique()
prefixes = df6.columns.str.split("_").str[0]

# "<prefix>_<suffix>" labels in order of first appearance, skipping the first
# w-1 and reversed.
# NOTE(review): 'source' is still present here (only 'email' is dropped),
# unlike in `cols` — confirm the labels line up with the intended groups.
prefixes2 = df6.drop(columns=['email']).columns.str.split("_").str[:2].str.join("_").unique()[w-1:][::-1]

# `suffixes` is indexed by column name, so `suffixes[i]` relied on the
# deprecated integer-key positional fallback (FutureWarning). Compare by
# position on the underlying array instead.
suffix_values = suffixes.to_numpy()
change_points = [
    i for i in range(len(cols) - 1)
    if suffix_values[i] != suffix_values[i + 1]
][::-1]

# Walk the bounds from the last window backwards and insert each one as a
# new column next to the matching change point.
for i, ut in enumerate(reversed(U_t)):
    loc = change_points[i]
    col_name = f"{prefixes2[i]}_next_grade"
    df6.insert(loc - 1, col_name, ut)

  change_points = [i for i in range(len(cols) -1) if suffixes[i]!= suffixes[i+1]][::-1]


In [233]:
# Single run on the frame enriched with the `_next_grade` columns.
tasks = [("", df6, {})]

summary_records = []
res = []
for name, dfk, kwargs in tasks:
    # Run the analysis for this task.
    df_detail, df_agg, y_cible, tr_clf = run_analysis_w(
        df=dfk, threshold=threshold, do_plot=False, **kwargs
    )

    # Mean coverage and width per (method, model, cluster).
    per_group = df_detail.groupby(["method", "model", "cluster"])
    r = per_group.agg(
        mean_coverage=("coverage", "mean"),
        mean_width=("width", "mean"),
    ).reset_index()

    # Keep only the rows whose cluster value is -1.
    res.append(r.loc[r['cluster'] == -1])

threshold  = 0.242 → 25.0% positives


RF:   0%|          | 0/24 [00:00<?, ?it/s]

LR:   0%|          | 0/24 [00:00<?, ?it/s]

GB:   0%|          | 0/24 [00:00<?, ?it/s]

In [234]:
# Print a label, then display res[0]: the cluster == -1 rows of the single run on df6.
print("Vanilla + SPCI")
res[0]

Vanilla + SPCI


Unnamed: 0,method,model,cluster,mean_coverage,mean_width
0,mondrian,GB,-1,0.949,1.549
1,mondrian,LR,-1,0.968,1.695
2,mondrian,RF,-1,0.947,1.534
3,vanilla,GB,-1,0.949,1.549
4,vanilla,LR,-1,0.968,1.695
5,vanilla,RF,-1,0.947,1.534


In [235]:
# Select the clustering features for the active configuration.
if wac:
    # WAC (WEB) data: every column starting with "W-WEB-024".
    X = df1[[c for c in df1.columns if c.startswith("W-WEB-024")]].fillna(0)
elif year == 24:
    # CPE 2024: B-CPE-100 columns, minus the per-task "_passed" columns.
    dfcpool = df1[[c for c in df1.columns if c.startswith("B-CPE-100")]]
    pat = re.compile(r"B-CPE-100_cpoolday\d+_\d{2} - task\d+_passed")
    cols_keep = [c for c in dfcpool.columns if not pat.match(c)]
    dfcpool_mark = dfcpool[cols_keep]
    X = dfcpool_mark.fillna(0)
elif year == 23:
    # CPE 2023: every column starting with "B-CPE-110_settingup".
    X = df1[[c for c in df1.columns if c.startswith("B-CPE-110_settingup")]].fillna(0)
else:
    # Without this branch the cell would reuse whatever `X` is bound to
    # (the sliding-window cell re-binds it to a list of DataFrames).
    raise ValueError(f"Année inattendue : {year}")

# Standardise the features, then cluster.
# `n_clusters` is read from kernel state; note that the small-cluster
# reassignment cell further down re-binds it to 4.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Attach the labels to a copy of the enriched frame.
df7 = df6.copy()
df7['cluster'] = clusters
print(pd.Series(clusters).value_counts().sort_index())

0    257
1    466
2     13
3    115
Name: count, dtype: int64




In [236]:
min_cluster_size = 50  # clusters with fewer points than this are dissolved
n_clusters       = 4

# 1) Feature selection.
# `df2` is defined outside this section.
# NOTE(review): the labels are written into a copy of df7 below, so df2 is
# assumed to have the same rows in the same order — confirm.
if wac:
    # WAC (WEB) data
    X = df2[[c for c in df2.columns if c.startswith("W-WEB-024")]].fillna(0)
elif year == 24:
    dfcpool = df2[[c for c in df2.columns if c.startswith("B-CPE-100")]]
    # NOTE(review): this pattern only matches cpoolday01, whereas the previous
    # clustering cell uses cpoolday\d+ — confirm which one is intended.
    pat = re.compile(r"B-CPE-100_cpoolday01_\d{2} - task\d+_passed")
    cols_keep = [c for c in dfcpool.columns if not pat.match(c)]
    X = dfcpool[cols_keep].fillna(0)
elif year == 23:
    X = df2[[c for c in df2.columns if c.startswith("B-CPE-110_settingup")]].fillna(0)
else:
    raise ValueError(f"Année inattendue : {year}")

# 2) Standardisation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3) Initial KMeans
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(X_scaled)
centroids = kmeans.cluster_centers_

# 4) Cluster sizes
sizes = pd.Series(labels).value_counts().sort_index()

# 5) Move the points of undersized clusters to the nearest valid centroid
small_clusters = sizes[sizes < min_cluster_size].index.tolist()
if small_clusters:
    print(f"Clusters trop petits à réaffecter : {small_clusters}")
    if len(small_clusters) >= len(centroids):
        # Every cluster is undersized: there is no valid target, keep the labels.
        print("Aucun cluster valide : réaffectation ignorée")
    else:
        to_move = np.isin(labels, small_clusters)
        # Distance from each point to move to every centroid.
        dists = np.linalg.norm(
            X_scaled[to_move][:, None, :] - centroids[None, :, :], axis=2
        )
        # Exclude ALL undersized clusters as targets, not only the point's own:
        # otherwise a point could land in another undersized cluster.
        dists[:, small_clusters] = np.inf
        labels[to_move] = dists.argmin(axis=1)
    # Recompute the sizes after reassignment
    sizes = pd.Series(labels).value_counts().sort_index()
    print("Nouvelles tailles de clusters :\n", sizes)

df8 = df7.copy()
df8["cluster"] = labels

Clusters trop petits à réaffecter : [2]
Nouvelles tailles de clusters :
 0    477
1    130
3    244
Name: count, dtype: int64




In [245]:
from sklearn.neighbors import NearestNeighbors

# --- 0) Working frame and column groups ---
# X keeps 'cluster' plus every feature; NaNs become 0 and the index is reset
# so that positions returned by np.where can be used with .loc.
X = df7.copy()
X = X.drop(columns=['email', 'source']).fillna(0).reset_index(drop=True)
clusters    = X['cluster'].values
feature_cols = X.columns.drop('cluster')

# Columns treated as binary: names ending in 'passed' or containing '_style'.
# Everything else is treated as numeric.
style_cols = [c for c in feature_cols if "_style" in c]
bin_cols = [c for c in feature_cols if c.endswith('passed')] + style_cols
bin_set = set(bin_cols)
num_cols = [c for c in feature_cols if c not in bin_set]

# Positions of the numeric columns inside `feature_cols`, which is the layout
# of the Xk arrays below. Indexing through X.columns was only correct while
# 'cluster' happened to be the last column.
num_idx = [feature_cols.get_loc(c) for c in num_cols]

# Seeded generator so the synthetic rows are reproducible.
rng = np.random.default_rng(42)

# --- 1) Per-cluster mean of each binary column (frequency of 1) ---
cluster_probs = (
    X
    .groupby('cluster')[bin_cols]
    .mean()
)
dfy = X.copy()

# --- 2) Which clusters to augment ---
# Clusters whose size is strictly below the 25% quantile of cluster sizes.
minor_counts = X['cluster'].value_counts()
th    = minor_counts.quantile(0.25)
minor_cs     = minor_counts[minor_counts < th].index.tolist()

# --- 3) Generate the synthetic rows ---
rows_aug = []

for cl in minor_cs:
    idx = np.where(clusters == cl)[0]
    Xk  = X.loc[idx, feature_cols].values
    # Number of synthetic rows generated per real row of this cluster.
    n_new_per    = 100 // len(idx)

    # A single point has no neighbour to interpolate with: duplicate it.
    if len(Xk) < 2:
        for i in idx:
            dup = X.loc[i, feature_cols].to_dict()
            # Carry the cluster label, as the interpolated rows below do;
            # without it these rows end up with a NaN cluster.
            dup['cluster'] = cl
            rows_aug.extend(dict(dup) for _ in range(n_new_per))
        continue

    # Neighbours are searched on the numeric columns only.
    k_neighbors  = min(20, len(idx)-2)
    Xk_num = Xk[:, num_idx]
    nbrs = NearestNeighbors(
        n_neighbors=min(k_neighbors+1, len(Xk)),
        metric='euclidean'
    )
    nbrs.fit(Xk_num)
    neigh_idxs = nbrs.kneighbors(return_distance=False)

    for i in range(len(Xk)):
        for _ in range(n_new_per):
            # Pick one neighbour other than the point itself.
            nbr_list = [j for j in neigh_idxs[i] if j != i]
            j = rng.choice(nbr_list)

            # SMOTE-like interpolation on the numeric columns.
            lam = rng.random()
            num_new = Xk_num[i] + lam * (Xk_num[j] - Xk_num[i])

            # Binary columns: draw 1 with the cluster's observed frequency.
            bin_new = {}
            for c in bin_cols:
                p = cluster_probs.loc[cl, c]
                bin_new[c] = int(rng.random() < p)

            # Assemble the new row.
            new_row = {c: num_new[k] for k, c in enumerate(num_cols)}
            new_row.update(bin_new)
            new_row['cluster'] = cl
            rows_aug.append(new_row)

# --- 4) Build the synthetic frame and append it to the real rows ---
df_synth = pd.DataFrame(rows_aug)

df_synth['source'] = "synth"
df_synth['email'] = np.nan
df9 = pd.concat([df7, df_synth], ignore_index=True)
print(pd.Series(df9['cluster']).value_counts().sort_index())

cluster
0    257
1    466
2    104
3    115
Name: count, dtype: int64


In [246]:
tasks = [
    ("avec clustering", df8, {}),
    ("avec enrichissement et clustering", df9, {}),
]

# Cutoff passed to the weighted metrics; it depends only on the configuration.
if wac:
    cutoff = 12
elif year == 24:
    cutoff = 13
elif year == 23:
    cutoff = 1

summary_records = []
res = []
for name, dfk, kwargs in tasks:
    # 1) Run the analysis for this task.
    df_detail, df_agg, y_cible, tr_clf = run_analysis_w(
        df=dfk, threshold=threshold, do_plot=False, **kwargs
    )

    # Mean coverage and width per (method, model, cluster); keep the
    # cluster == -1 rows for the display cells.
    per_group = df_detail.groupby(["method", "model", "cluster"])
    r = per_group.agg(
        mean_coverage=("coverage", "mean"),
        mean_width=("width", "mean"),
    ).reset_index()
    res.append(r.loc[r['cluster'] == -1])

    # 2) Conditional aggregation, then 3) weighted means.
    df_cond = aggregate_conformal_metrics(df_detail, dfk)
    df_w = compute_weighted_conformal_metrics(df_cond, lmbda=0.9, cutoff=cutoff)

    # 4) One summary record per model.
    for model in df_cond['model'].unique():
        per_model = df_cond.loc[df_cond['model'] == model]
        summary_records.append({
            'run':            name,
            'model':          model,
            'cov_all_mean':   per_model['cov_all'].mean(),
            'cov_clu_mean':   per_model['cov_cluster'].mean(),
            'cov_all_w':      df_w.loc[model, 'cov_all_w'],
            'cov_clu_w':      df_w.loc[model, 'cov_cluster_w'],
            'width_all_mean': per_model['width_all'].mean(),
            'width_clu_mean': per_model['width_cluster'].mean(),
            'width_all_w':    df_w.loc[model, 'width_all_w'],
            'width_clu_w':    df_w.loc[model, 'width_cluster_w'],
        })

# 5) Summary table indexed by (run, model).
df_summary = pd.DataFrame.from_records(summary_records).set_index(['run', 'model'])

# 6) Display with three decimals.
pd.set_option('display.float_format', '{:.3f}'.format)
print("\n=== Synthèse conformal prediction ===")
print(df_summary)

threshold  = 0.242 → 25.0% positives


RF:   0%|          | 0/24 [00:00<?, ?it/s]

LR:   0%|          | 0/24 [00:00<?, ?it/s]

GB:   0%|          | 0/24 [00:00<?, ?it/s]

threshold  = 0.242 → 23.2% positives


RF:   0%|          | 0/24 [00:00<?, ?it/s]

LR:   0%|          | 0/24 [00:00<?, ?it/s]

GB:   0%|          | 0/24 [00:00<?, ?it/s]


=== Synthèse conformal prediction ===
                                         cov_all_mean  cov_clu_mean  \
run                               model                               
avec clustering                   RF            0.947         0.948   
                                  LR            0.968         0.969   
                                  GB            0.949         0.950   
avec enrichissement et clustering RF            0.959         0.962   
                                  LR            0.960         0.964   
                                  GB            0.957         0.961   

                                         cov_all_w  cov_clu_w  width_all_mean  \
run                               model                                         
avec clustering                   RF         0.950      0.949           1.534   
                                  LR         0.971      0.972           1.695   
                                  GB         0.955      0.955       

In [247]:
# Print a label, then display res[0]: the cluster == -1 rows of the "avec clustering" run (df8).
print("Mondrian + SPCI")
res[0]

Mondrian + SPCI


Unnamed: 0,method,model,cluster,mean_coverage,mean_width
0,mondrian,GB,-1,0.963,1.636
4,mondrian,LR,-1,0.982,1.789
8,mondrian,RF,-1,0.95,1.605
12,vanilla,GB,-1,0.949,1.549
16,vanilla,LR,-1,0.968,1.695
20,vanilla,RF,-1,0.947,1.534


In [248]:
# Print a label, then display res[1]: the cluster == -1 rows of the "avec enrichissement et clustering" run (df9).
print("Mondrian + Enrichment + SPCI")
res[1]

Mondrian + Enrichment + SPCI


Unnamed: 0,method,model,cluster,mean_coverage,mean_width
0,mondrian,GB,-1,0.969,1.643
5,mondrian,LR,-1,0.97,1.739
10,mondrian,RF,-1,0.969,1.659
15,vanilla,GB,-1,0.957,1.536
20,vanilla,LR,-1,0.96,1.639
25,vanilla,RF,-1,0.959,1.55
