In [1]:
import pandas as pd
from pathlib import Path
import os
import joblib
import numpy as np
from mapie.metrics import (
    classification_coverage_score,
    classification_mean_width_score
)
import re
import matplotlib.pyplot as plt
from utils.model_production_data_processing_utils import cluster_with_min_size

# Project root: the parent of the directory the kernel was started in.
root = Path.cwd().parent

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from model_production_main import load_and_preprocess_data, prepare_features
from utils.model_production_data_processing_utils import compute_threshold_kmeans, build_X_s, build_umap_windows_by_suffix

In [3]:
def SPCI_lg_to_set(G, threshold):
    """Turn an interval ``G = (L, U)`` into a binary prediction set.

    Returns ``[1]`` when the upper bound is strictly below ``threshold``,
    ``[0]`` when the lower bound is strictly above it, and ``[0, 1]`` in
    every other case (the interval contains or touches the threshold).
    The upper-bound check is evaluated first.
    """
    lower, upper = G
    if upper < threshold:
        return [1]
    if lower > threshold:
        return [0]
    return [0, 1]

In [5]:
# Load and preprocess the raw CSV through the project helper.
# NOTE(review): the meaning of the second argument (24) is not visible in this
# notebook; the same number appears in the y_true_24 / *_24.joblib file names — confirm.
df1 = load_and_preprocess_data(root / "data/DATA.csv", 24)


Data preprocessing done


In [None]:
# Evaluate the "vanilla" conformal classifiers: for each base model and each
# number n of leading item prefixes, compute the coverage and mean width of the
# prediction sets at level alpha.
y_true = pd.read_csv(root / "data/y_true_24")

# Columns ending in "mark"; their name minus the last "_" token gives the
# ordered, de-duplicated list of prefixes consumed by build_X_s.
mark_cols = [c for c in df1.columns if c.endswith("mark")]
prefixes = list(dict.fromkeys(c.rsplit("_", 1)[0] for c in mark_cols))
static_cols = []

# NOTE(review): `threshold` is computed but not used anywhere in this cell.
threshold = compute_threshold_kmeans(df1)
# Prepare features
X = prepare_features(df1, 24)

# Perform clustering
df2, info = cluster_with_min_size(
    df1, X, n_clusters=4, min_cluster_size=50, random_state=42
)

# joblib.load unpickles arbitrary objects: only load model files you trust.
mod1_obj = joblib.load(root / "models" / "models_clustering_24.joblib")
# NOTE(review): this loads the exact same file as mod1_obj and mod2_obj is not
# used below — confirm whether a different model file was intended.
mod2_obj = joblib.load(root / "models" / "models_clustering_24.joblib")
alpha = 0.1
w1 = 3

covs = {}
wids = {}
# NOTE(review): none of the four returned values is used in this cell.
Xt, keys, X_arr, y_arr = build_umap_windows_by_suffix(
        df1, w=w1, H=0, target_col_idx=3, verbose=True
    )

# The NaN-filled frame and the per-n feature matrices do not depend on the base
# model, so build them once instead of once per (base_model, n) pair.
df2_filled = df2.fillna(0)
n_values = range(1, 17)
features_by_n = {
    n: build_X_s(df2_filled, prefixes, static_cols, n) for n in n_values
}

for base_model in ['RF', 'GB']:
    for n in n_values:
        key = (base_model, n, "vanilla")
        x = features_by_n[n]
        model = mod1_obj[key]
        # Vanilla models are called without a `partition` argument.
        yp_van, yps_van = model.predict(x, alpha=alpha)
        # Keep the prediction sets for the single requested alpha (last axis index 0).
        pset_van = yps_van[:, :, 0]
        cov = classification_coverage_score(y_true, pset_van)
        wid = classification_mean_width_score(pset_van)
        covs[(base_model, n)] = cov
        wids[(base_model, n)] = wid

In [6]:
# List every df1 column whose name starts with 'B-CPE-100'.
[c for c in df1.columns if c.startswith('B-CPE-100')]

['B-CPE-100_cpoolday01_01 - task01_passed',
 'B-CPE-100_cpoolday01_02 - task02_passed',
 'B-CPE-100_cpoolday01_03 - task03_passed',
 'B-CPE-100_cpoolday01_04 - task04_passed',
 'B-CPE-100_cpoolday01_05 - task05_passed',
 'B-CPE-100_cpoolday01_06 - task06_passed',
 'B-CPE-100_cpoolday01_07 - task07_passed',
 'B-CPE-100_cpoolday01_08 - task08_passed',
 'B-CPE-100_cpoolday01_09 - Success of task01_passed',
 'B-CPE-100_cpoolday01_10 - Success of task02_passed',
 'B-CPE-100_cpoolday01_11 - Success of task03_passed',
 'B-CPE-100_cpoolday01_12 - Success of task04_passed',
 'B-CPE-100_cpoolday01_13 - Success of task05_passed',
 'B-CPE-100_cpoolday01_14 - Success of task06_passed',
 'B-CPE-100_cpoolday01_15 - Success of task07_passed',
 'B-CPE-100_cpoolday01_16 - Success of task08_passed',
 'B-CPE-100_cpoolday01_mark',
 'B-CPE-100_cpoolday01_virtualMark',
 'B-CPE-100_cpoolday01_prerequisitesMark',
 'B-CPE-100_cpoolday01_stylePenalty',
 'B-CPE-100_cpoolday01_styleFatal',
 'B-CPE-100_cpoolday01_s

In [20]:
# Coverage score per (base_model, n), as filled by the evaluation loop.
covs

{('RF', 1): 0.94,
 ('RF', 2): 0.928235294117647,
 ('RF', 3): 0.9482352941176471,
 ('RF', 4): 0.9705882352941176,
 ('RF', 5): 0.971764705882353,
 ('RF', 6): 0.9705882352941176,
 ('RF', 7): 0.9705882352941176,
 ('RF', 8): 0.9694117647058823,
 ('RF', 9): 0.9705882352941176,
 ('RF', 10): 0.9694117647058823,
 ('RF', 11): 0.971764705882353,
 ('RF', 12): 0.9694117647058823,
 ('RF', 13): 0.9682352941176471,
 ('RF', 14): 0.9658823529411765,
 ('RF', 15): 0.9635294117647059,
 ('RF', 16): 0.9635294117647059,
 ('GB', 1): 0.9494117647058824,
 ('GB', 2): 0.9364705882352942,
 ('GB', 3): 0.9376470588235294,
 ('GB', 4): 0.9458823529411765,
 ('GB', 5): 0.96,
 ('GB', 6): 0.9658823529411765,
 ('GB', 7): 0.9635294117647059,
 ('GB', 8): 0.9611764705882353,
 ('GB', 9): 0.9705882352941176,
 ('GB', 10): 0.9670588235294117,
 ('GB', 11): 0.9694117647058823,
 ('GB', 12): 0.9694117647058823,
 ('GB', 13): 0.9658823529411765,
 ('GB', 14): 0.9694117647058823,
 ('GB', 15): 0.9635294117647059,
 ('GB', 16): 0.95294117647

In [21]:
# Mean prediction-set width per (base_model, n), as filled by the evaluation loop.
wids

{('RF', 1): 1.6541176470588235,
 ('RF', 2): 1.4376470588235295,
 ('RF', 3): 1.3647058823529412,
 ('RF', 4): 1.5823529411764705,
 ('RF', 5): 1.4470588235294117,
 ('RF', 6): 1.4364705882352942,
 ('RF', 7): 1.3776470588235294,
 ('RF', 8): 1.3905882352941177,
 ('RF', 9): 1.3964705882352941,
 ('RF', 10): 1.4011764705882352,
 ('RF', 11): 1.4094117647058824,
 ('RF', 12): 1.3894117647058823,
 ('RF', 13): 1.3623529411764705,
 ('RF', 14): 1.3176470588235294,
 ('RF', 15): 1.1929411764705882,
 ('RF', 16): 1.1094117647058823,
 ('GB', 1): 1.6894117647058824,
 ('GB', 2): 1.5776470588235294,
 ('GB', 3): 1.48,
 ('GB', 4): 1.4776470588235293,
 ('GB', 5): 1.5011764705882353,
 ('GB', 6): 1.4952941176470589,
 ('GB', 7): 1.463529411764706,
 ('GB', 8): 1.4611764705882353,
 ('GB', 9): 1.5082352941176471,
 ('GB', 10): 1.5305882352941176,
 ('GB', 11): 1.5035294117647058,
 ('GB', 12): 1.5035294117647058,
 ('GB', 13): 1.4176470588235295,
 ('GB', 14): 1.4458823529411764,
 ('GB', 15): 1.3235294117647058,
 ('GB', 16

In [2]:
# Load the DATA_SPCI_ng_24 table.
# NOTE(review): the same file is read into df3 again in a later cell.
df3 = pd.read_csv(root / "data/DATA_SPCI_ng_24.csv")

In [103]:
import numpy as np
from utils.models_production_utils import build_X_s

def gate_predict_minimal(
    dataframe, X_arr, n, base_model,
    models_c_ng, models_lg, models_comb,
    threshold, w2, prefixes, static_cols,
    alpha=0.05, partition=None  # partition=df['clusters'] for Mondrian models
):
    """
    Build two binary prediction sets per sample (MCP and SPCI) and let a
    "gate" model choose which one to keep.

    Parameters
    ----------
    dataframe, prefixes, static_cols, n
        Forwarded to ``build_X_s`` to build the MCP feature matrix. The name
        ``build_X_s`` is resolved from the enclosing module at call time.
    X_arr
        Indexable collection; ``X_arr[n - w2]`` must be an iterable of
        per-sample inputs for ``predict_interval``.
    base_model
        Base-model label used in the model dictionary keys.
    models_c_ng
        Mapping ``(base_model, n, "vanilla") -> model`` exposing
        ``predict(X, alpha=...[, partition=...])`` and returning
        ``(y_pred, prediction_sets)``.
    models_lg
        Mapping ``n -> model`` exposing ``predict_interval(x)`` returning ``(L, U)``.
    models_comb
        Mapping ``(base_model, n) -> gate`` exposing ``predict(X_gate)``.
    threshold
        Scalar compared with the interval bounds to derive the SPCI set.
    w2
        Offset subtracted from ``n`` to index ``X_arr``.
    alpha
        Level forwarded to the MCP model.
    partition
        Optional partition forwarded to the MCP model when not ``None``.

    Returns
    -------
    dict with keys
      - "p_final": bool array (n_samples, 2) -> final prediction set (after the gate)
      - "choice": array of gate decisions (0=MCP, 1=SPCI, 2=union)
      - "y_hat": int array -> 0/1 when the final set is a singleton, -1 otherwise
    """
    # 1) Features for each branch
    X_CP = build_X_s(dataframe, prefixes, static_cols, n)  # same columns/order as at training time
    # NOTE(review): if n < w2 this index is negative and, for a list/array,
    # silently wraps around to the end — confirm n >= w2 for every caller.
    idx_spci = n - w2
    X_SPCI = X_arr[idx_spci]

    # 2) Fetch the models
    key_mcp = (base_model, n, "vanilla")  # or "mondrian" if that is how the model was trained
    model_mcp = models_c_ng[key_mcp]
    model_spc = models_lg[n]
    gate = models_comb[(base_model, n)]

    # 3) MCP prediction sets
    if partition is None:
        _, yps_mcp_gate = model_mcp.predict(X_CP, alpha=alpha)
    else:
        _, yps_mcp_gate = model_mcp.predict(X_CP, alpha=alpha, partition=partition)
    p_mcp = yps_mcp_gate[:, :, 0].astype(bool)  # (n_samples, 2)

    # 4) SPCI prediction sets from the [L, U] intervals and the threshold
    intervals = np.array([model_spc.predict_interval(x) for x in X_SPCI], dtype=float)
    lower = intervals[:, 0]
    upper = intervals[:, 1]
    interval_above = threshold < lower   # whole interval above the threshold -> class 0
    interval_below = threshold > upper   # whole interval below the threshold -> class 1
    p_spc = np.zeros_like(p_mcp, dtype=bool)
    p_spc[interval_above, 0] = True
    p_spc[interval_below, 1] = True
    p_spc[~(interval_above | interval_below), :] = True  # ambiguous -> {0, 1}

    # 5) Gate features (same as training): X_CP + [w_cls, w_spc, diff]
    w_cls = p_mcp.sum(axis=1)
    w_spc = p_spc.sum(axis=1)
    diff = w_cls - w_spc
    X_gate = np.hstack([X_CP, w_cls.reshape(-1, 1), w_spc.reshape(-1, 1), diff.reshape(-1, 1)])

    # 6) Gate decision: 0=MCP, 1=SPCI, 2=union.
    # asarray makes the element-wise comparisons below valid even when the gate
    # returns a plain list.
    choice = np.asarray(gate.predict(X_gate))

    # 7) Compose the final set. Start from the union so that rows with
    # choice == 2 — and rows with an unexpected gate label, which previously
    # were left as uninitialised memory — get the most conservative set.
    p_final = p_mcp | p_spc
    use_mcp = (choice == 0)
    use_spc = (choice == 1)
    p_final[use_mcp] = p_mcp[use_mcp]
    p_final[use_spc] = p_spc[use_spc]

    # 8) Minimal point label: 0/1 for singleton sets, otherwise -1
    singletons = (p_final.sum(axis=1) == 1)
    y_hat = np.where(singletons, p_final.argmax(axis=1), -1)

    return {
        "p_final": p_final,   # bools, shape (n_samples, 2)
        "choice": choice,     # 0/1/2
        "y_hat": y_hat        # 0/1, or -1 when not a singleton
    }


In [3]:
# Project root (parent of the working directory); this statement also appears in the first cell.
root = Path.cwd().parent

In [4]:
# Load the models stored in models_clustering_24.joblib.
# joblib.load unpickles arbitrary objects: only load files you trust.
obj = joblib.load(root / "models" / "models_clustering_24.joblib")

In [66]:
# Load the models stored in models_clustering_SPCI_ng_24.joblib.
# joblib.load unpickles arbitrary objects: only load files you trust.
obj2 = joblib.load(root / "models" / "models_clustering_SPCI_ng_24.joblib")

In [67]:
# Inspect the container type of the loaded models.
type(obj2)

dict

In [68]:
# Inspect the model keys; the saved output shows (base_model, n, variant) tuples.
obj2.keys()

dict_keys([('RF', 1, 'vanilla'), ('RF', 1, 'mondrian'), ('RF', 2, 'vanilla'), ('RF', 2, 'mondrian'), ('RF', 3, 'vanilla'), ('RF', 3, 'mondrian'), ('RF', 4, 'vanilla'), ('RF', 4, 'mondrian'), ('RF', 5, 'vanilla'), ('RF', 5, 'mondrian'), ('RF', 6, 'vanilla'), ('RF', 6, 'mondrian'), ('RF', 7, 'vanilla'), ('RF', 7, 'mondrian'), ('RF', 8, 'vanilla'), ('RF', 8, 'mondrian'), ('RF', 9, 'vanilla'), ('RF', 9, 'mondrian'), ('RF', 10, 'vanilla'), ('RF', 10, 'mondrian'), ('RF', 11, 'vanilla'), ('RF', 11, 'mondrian'), ('RF', 12, 'vanilla'), ('RF', 12, 'mondrian'), ('RF', 13, 'vanilla'), ('RF', 13, 'mondrian'), ('RF', 14, 'vanilla'), ('RF', 14, 'mondrian'), ('RF', 15, 'vanilla'), ('RF', 15, 'mondrian'), ('RF', 16, 'vanilla'), ('RF', 16, 'mondrian'), ('RF', 17, 'vanilla'), ('RF', 17, 'mondrian'), ('RF', 18, 'vanilla'), ('RF', 18, 'mondrian'), ('RF', 19, 'vanilla'), ('RF', 19, 'mondrian'), ('RF', 20, 'vanilla'), ('RF', 20, 'mondrian'), ('RF', 21, 'vanilla'), ('RF', 21, 'mondrian'), ('RF', 22, 'vanilla'

In [93]:
# Reload the DATA_SPCI_ng_24 table and display it.
# NOTE(review): the rendered frame includes the `email` column, so the saved
# output contains personal data — consider clearing the output or showing a
# redacted `.head()` before sharing the notebook.
df3 = pd.read_csv(root / "data/DATA_SPCI_ng_24.csv")
df3

Unnamed: 0,email,B-CPE-100_cpoolday01_01 - task01_passed,B-CPE-100_cpoolday01_02 - task02_passed,B-CPE-100_cpoolday01_03 - task03_passed,B-CPE-100_cpoolday01_04 - task04_passed,B-CPE-100_cpoolday01_05 - task05_passed,B-CPE-100_cpoolday01_06 - task06_passed,B-CPE-100_cpoolday01_07 - task07_passed,B-CPE-100_cpoolday01_08 - task08_passed,B-CPE-100_cpoolday01_09 - Success of task01_passed,...,B-CPE-200_corewar_mark,B-CPE-200_corewar_virtualMark,B-CPE-200_corewar_prerequisitesMark,B-CPE-200_corewar_stylePenalty,B-CPE-200_corewar_styleFatal,B-CPE-200_corewar_styleMajor,B-CPE-200_corewar_styleMinor,B-CPE-200_corewar_styleInfo,source,clusters
0,aaron-fidele.ngwabana-ngwemi@epitech.eu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,real,0
1,aaron.abitbol@epitech.eu,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,,,0.5,0.0,0.0,0.0,0.0,0.0,real,1
2,aaron.aniambossou@epitech.eu,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,,,2.0,-3.0,0.0,1.0,0.0,0.0,real,1
3,aaron.joseph@epitech.eu,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,,2.0,-4.0,0.0,1.0,1.0,0.0,real,3
4,aaron.platon@epitech.eu,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,,2.0,0.0,0.0,0.0,0.0,0.0,real,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
845,zhamilya.kozhagulova@epitech.eu,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,,,,,,,,real,3
846,zhantore.svanov@epitech.eu,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,,,,,,,,real,3
847,zian.ferrage@epitech.eu,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,,,,,,,,,real,1
848,zie-ange-mohamed.diawara@epitech.eu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,real,0


In [47]:
# Load the raw table and count the missing cells of every row.
df = pd.read_csv(root / "data/DATA.csv")
nb_nan_par_ligne = df.isnull().sum(axis="columns")

# Keep only the rows with fewer than 495 missing cells.
df = df.loc[nb_nan_par_ligne < 495]

In [91]:
# Number of rows left after the missing-value filter.
len(df)

850

In [48]:
def build_X_s(df_sub: pd.DataFrame, prefixes: list, static_cols: list, n: int) -> np.ndarray:
    """Return the feature matrix for the first ``n`` prefixes.

    The selected columns are, in order: ``static_cols``, then every column of
    ``df_sub`` (in frame order) whose name starts with one of ``prefixes[:n]``.
    The ``email`` column is used as the index and is therefore not part of the
    returned values.
    """
    active_prefixes = tuple(prefixes[:n])
    # str.startswith accepts a tuple and is True if any of the prefixes matches.
    dyn_cols = [name for name in df_sub.columns if name.startswith(active_prefixes)]
    selected = df_sub[["email"] + static_cols + dyn_cols]
    return selected.set_index("email").values

In [49]:
# Columns whose name ends with "mark".
mark_cols = [c for c in df.columns if c.endswith("mark")]
# Prefix = column name without its last "_"-separated token; the dict keeps
# first-seen order while removing duplicates.
prefixes = list({c.rsplit("_", 1)[0]: None for c in mark_cols})
# No static (non-prefixed) feature columns are used.
static_cols = []

In [75]:
# Feature matrix restricted to the first 3 prefixes, with NaN replaced by 0.
X = build_X_s(df.fillna(0), prefixes, static_cols, 3)

In [76]:
# Matches column names that begin with "B-CPE-100_cpooldayNN_NN - taskK_passed".
# Names such as "..._09 - Success of task01_passed" do not match and are kept.
pat = re.compile(r"B-CPE-100_cpoolday\d+_\d{2} - task\d+_passed")

# Restrict to the B-CPE-100 columns, drop the ones matched by `pat`,
# and replace the remaining NaN with 0.
dfcpool = df.loc[:, [c for c in df.columns if c.startswith("B-CPE-100")]]
cols_keep = [c for c in dfcpool.columns if pat.match(c) is None]
dfcpool_mark = dfcpool.loc[:, cols_keep]
X_pool = dfcpool_mark.fillna(0)

In [77]:
# Cluster the rows of df on the X_pool features (4 clusters requested, minimum size 50).
# The saved output shows one cluster of 30 rows being reassigned, leaving 3 clusters.
# NOTE(review): this rebinds df2/info, which the earlier evaluation cell also assigns.
df2, info = cluster_with_min_size(
    df, X_pool, n_clusters=4, min_cluster_size=50, random_state=42)

DF plain + clustering done
nombre d'élèves par cluster :
0    117
1    490
2     30
3    213
Name: count, dtype: int64
Clusters trop petits à réaffecter : [2]
Nouvelles tailles de clusters :
0    117
1    490
3    243
Name: count, dtype: int64


In [90]:
# Mean prediction-set width of the GB "vanilla" models for n = 1..15, then their average.
# NOTE(review): this loop stops at n = 15 while the evaluation cell uses range(1, 17) — confirm.
res = []
df2_filled = df2.fillna(0)  # loop-invariant: fill NaN once, not on every iteration
for n in range(1, 16):
    mod = obj2[('GB', n, 'vanilla')]
    X = build_X_s(df2_filled, prefixes, static_cols, n)
    # Vanilla models are called without a `partition` argument.
    yp_van, yps_van = mod.predict(X, alpha=0.1)
    pset_van = yps_van[:, :, 0]
    # Compute the metric once and reuse it for both the print and the list.
    width = classification_mean_width_score(pset_van)
    print(width)
    res.append(width)
print("moy", np.mean(res))

1.391764705882353
1.4823529411764707
1.3541176470588234
1.3741176470588234
1.3858823529411766
1.596470588235294
1.6258823529411766
1.5294117647058822
1.531764705882353
1.5541176470588236
1.5411764705882354
1.591764705882353
1.3952941176470588
1.3976470588235295
1.3529411764705883
moy 1.4736470588235295


In [None]:
# Preview the first rows of the filtered frame.
# NOTE(review): df is read from the same DATA.csv family that exposes an `email`
# column elsewhere in this notebook — avoid saving this output if it shows personal data.
df.head()

In [None]:
# 1) For every column except `email`, extract the second "_"-separated token
#    of its name (called "suffix" here) and keep the tokens in first-seen order.
col_series = df.drop(columns=['email']).columns.to_series()
suffixes = col_series.apply(lambda x: x.split("_")[1])
ordered_suffixes = suffixes.unique()

# 2) Group the columns by suffix in a single pass (instead of rescanning the
#    whole column list once per suffix); column order inside a group is preserved.
cols_by_suffix = {}
for col, suffix in zip(col_series, suffixes):
    cols_by_suffix.setdefault(suffix, []).append(col)

# 3) One sub-frame (copy) per suffix.
dfs = {}
for suffix in ordered_suffixes:
    cols_for_suffix = cols_by_suffix[suffix]
    subdf = df[cols_for_suffix].copy()
    dfs[suffix] = subdf
    print(f"Suffixe = {suffix} → shape {subdf.shape}")


In [None]:
# Show the columns with the most missing values, largest count first.
# NOTE(review): `n` is also used as a loop variable in earlier cells; this rebinds it.
n = 20  # number of columns to display
print(df.isna().sum().sort_values(ascending=False).head(n))
