In [None]:
import pandas as pd
from pathlib import Path
import os
import joblib
import numpy as np
from mapie.metrics import (
    classification_coverage_score,
    classification_mean_width_score
)
import re
import matplotlib.pyplot as plt
from utils.model_production_data_processing_utils import cluster_with_min_size

# Project root = parent of the current working directory.
# NOTE(review): presumably the notebook is launched from a sub-folder of the project — confirm.
root = Path(os.getcwd()).parent

In [None]:
from model_production_main import load_and_preprocess_data, prepare_features
from utils.model_production_data_processing_utils import compute_threshold_kmeans, build_X_s, build_umap_windows_by_suffix

In [None]:
def SPCI_lg_to_set(G, threshold):
    """
    Turn an interval into a binary label set by comparing it with a threshold.

    Parameters
    ----------
    G : pair (lower, upper)
        Interval bounds; must unpack into exactly two values.
    threshold : comparable to the bounds
        Cut-off separating the two labels.

    Returns
    -------
    list
        [1] when the upper bound is strictly below the threshold,
        [0] when the lower bound is strictly above the threshold,
        [0, 1] otherwise (the interval touches or straddles the threshold).
    """
    lower, upper = G
    # The "entirely below" test is evaluated first, exactly like the original chain.
    if upper < threshold:
        return [1]
    if lower > threshold:
        return [0]
    return [0, 1]

In [None]:
# Load the object serialized in models/models_SPCI_lg_24.joblib and display its keys.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
mod = joblib.load(root / "models" / "models_SPCI_lg_24.joblib")
mod.keys()

In [None]:
def to_labelsets(pset_matrix: np.ndarray):
    """
    Map each row of a per-class inclusion matrix to one of [0], [1] or [0, 1].

    Any non-zero entry counts as "class included". Only the first two columns
    are read. An empty set (neither class included) falls back to the
    conservative answer [0, 1].

    Parameters
    ----------
    pset_matrix : array-like, 2-D with at least two columns
        Row i, column k is non-zero when class k belongs to the set of sample i.

    Returns
    -------
    list of list of int
        One label set per row, in row order.

    Raises
    ------
    ValueError
        If the input is not 2-D or has fewer than two columns.
    """
    ps = (np.asarray(pset_matrix) != 0)  # boolean inclusion mask
    # Guard against inputs that do not provide two class columns.
    if ps.ndim != 2 or ps.shape[1] < 2:
        raise ValueError(f"pset_matrix doit être de forme (n_samples, 2), reçu {ps.shape}")

    out = []
    # zip pairs the two columns row by row; iterating over the bare tuple
    # `ps[:, 0], ps[:, 1]` would instead try to unpack each whole column.
    for inc0, inc1 in zip(ps[:, 0], ps[:, 1]):
        if inc0 and inc1:
            out.append([0, 1])
        elif inc0:
            out.append([0])
        elif inc1:
            out.append([1])
        else:
            # Empty set (rare): conservative fallback to both labels.
            out.append([0, 1])
    return out

In [None]:
# Load data/DATA.csv through the project helper.
# NOTE(review): the meaning of the second argument (24) is defined by load_and_preprocess_data — confirm.
df1 = load_and_preprocess_data(root / "data/DATA.csv", 24)

In [None]:
# Reference labels passed to the coverage metric below.
y_true = pd.read_csv(root / "data/y_true_24")
# Columns ending in "mark", and the ordered, de-duplicated prefixes obtained by
# cutting each of those names at its last underscore.
mark_cols = [c for c in df1.columns if c.endswith("mark")]
prefixes = list(dict.fromkeys(c.rsplit("_",1)[0] for c in mark_cols))
static_cols = []

threshold = compute_threshold_kmeans(df1)
# Prepare features
X = prepare_features(df1, 24)

# Perform clustering
df2, info = cluster_with_min_size(
    df1, X, n_clusters=4, min_cluster_size=50, random_state=42
)

mod1_obj = joblib.load(root / "models" / "models_clustering_24.joblib")
# NOTE(review): this loads exactly the same file as mod1_obj — possibly a
# copy/paste slip; confirm which file was intended.
mod2_obj = joblib.load(root / "models" / "models_clustering_24.joblib")
alpha = 0.1
w1 = 3

covs = {}
wids = {}
Xt, keys, X_arr, y_arr = build_umap_windows_by_suffix(
        df1, w=w1, H=0, target_col_idx=3, verbose=True
    )

# Replace NaN by 0 once: the filled frame is the same for every (base_model, n),
# so there is no need to copy the whole DataFrame at each iteration.
df2_filled = df2.fillna(0)

# Coverage and mean set width for every (base_model, n) "vanilla" model.
for base_model in ['RF', 'GB']:
    for n  in range(1, 17):
        key = (base_model, n, "vanilla")
        x = build_X_s(df2_filled, prefixes, static_cols, n)
        model = mod1_obj[key]
        yp_van, yps_van = model.predict(x, alpha=alpha) # partition=df2['clusters'])
        # First slice along the last axis — presumably the single alpha level; TODO confirm.
        pset_van = yps_van[:, :, 0]
        cov = classification_coverage_score(y_true, pset_van)
        wid = classification_mean_width_score(pset_van)
        covs[(base_model, n)] = cov
        wids[(base_model, n)] = wid

In [None]:

emails = df2['email'].astype(str).values

pred_dfs = {}  # will hold one DataFrame per base_model (RF, GB)

# Replace NaN by 0 once; the filled frame does not depend on base_model or n.
df2_filled = df2.fillna(0)

for base_model in ['RF', 'GB']:
    cols = {}
    for n in range(1, 17):
        key = (base_model, n, "vanilla")
        x_n = build_X_s(df2_filled, prefixes, static_cols, n)

        model = mod1_obj[key]
        _yp_van, yps_van = model.predict(x_n, alpha=alpha)

        # pset_van = per-class inclusion flags of the prediction set of each row.
        # Each row is converted to a plain Python list: [0], [1] or [0, 1]
        # (an empty set falls back to [0, 1]).
        pset_van = yps_van[:, :, 0]
        cols[n] = [[0,1] if (a and b) else [0] if a else [1] if b else [0,1]
           for a, b in pset_van]

    df_pred = pd.DataFrame(cols, index=emails)
    df_pred.index.name = 'email'
    pred_dfs[base_model] = df_pred

# Two separate DataFrames (columns = n)
preds_RF = pred_dfs['RF']
preds_GB = pred_dfs['GB']

# (Optional) Everything side by side with MultiIndex columns (level 0 = base_model, level 1 = n)
preds_all = pd.concat(pred_dfs, axis=1)  # columns such as ('RF', 1), ('RF', 2), ..., ('GB', 16)


In [None]:

# Preview the first 20 rows of the RF prediction-set table.
preds_RF.head(20)

In [None]:
# Preview the first 20 rows of the GB prediction-set table.
preds_GB.head(20)
# preds_all.head()


In [None]:
# Coverage score stored per (base_model, n) key.
covs

In [None]:
# Mean prediction-set width stored per (base_model, n) key.
wids

In [None]:
# Read data/DATA_SPCI_ng_24.csv into df3 (the same file is read again in a later cell).
df3 = pd.read_csv(root / "data/DATA_SPCI_ng_24.csv")

In [None]:
import numpy as np
from utils.models_production_utils import build_X_s

def gate_predict_minimal(
    dataframe, X_arr, n, base_model,
    models_c_ng, models_lg, models_comb,
    threshold, w2, prefixes, static_cols,
    alpha=0.05, partition=None  # partition=df['clusters'] for the Mondrian variant
):
    """
    Combine two prediction-set sources through a learned "gate" model.

    For every sample the gate chooses between the classifier-based set (MCP),
    the interval-based set (SPCI), or the union of both.

    Parameters
    ----------
    dataframe : DataFrame passed to ``build_X_s`` to build the classifier features.
    X_arr : indexable; ``X_arr[n - w2]`` must yield the per-sample inputs of the
        interval model.
    n : int, number of leading prefixes used; also the key of the interval model.
    base_model : first element of the ``(base_model, n, "vanilla")`` model key.
    models_c_ng : mapping ``(base_model, n, "vanilla")`` -> model exposing
        ``predict(X, alpha=..., [partition=...])`` returning ``(y_pred, sets)``.
    models_lg : mapping ``n`` -> model exposing ``predict_interval(x)``.
    models_comb : mapping ``(base_model, n)`` -> gate model exposing ``predict``.
    threshold : cut-off compared with the interval bounds.
    w2 : int, offset subtracted from ``n`` to index ``X_arr``.
    prefixes, static_cols : forwarded to ``build_X_s``.
    alpha : level forwarded to the classifier's ``predict``.
    partition : optional; forwarded to ``predict`` only when not None.

    Returns
    -------
    dict with keys
        "p_final" : bool array (n_samples, 2), final set after gating;
        "choice"  : gate output (0 = MCP, 1 = SPCI, 2 = union);
        "y_hat"   : 0/1 when the final set is a singleton, -1 otherwise
                    (both labels, or no label at all).

    Notes
    -----
    Rows for which the gate returns a label other than 0, 1 or 2 keep the
    conservative full set {0, 1} instead of uninitialized memory.
    NOTE(review): a negative ``n - w2`` silently indexes ``X_arr`` from the
    end if ``X_arr`` is a sequence — confirm callers always pass ``n >= w2``.
    """
    # 1) Features for each branch
    X_CP = build_X_s(dataframe, prefixes, static_cols, n)      # same columns/order as at training time
    idx_spci = n - w2
    X_SPCI = X_arr[idx_spci]

    # 2) Fetch the models
    key_mcp = (base_model, n, "vanilla")  # or "mondrian" if trained that way
    model_mcp = models_c_ng[key_mcp]
    model_spc = models_lg[n]
    gate = models_comb[(base_model, n)]

    # 3) Classifier prediction sets
    if partition is None:
        y_pred_mcp_gate, yps_mcp_gate = model_mcp.predict(X_CP, alpha=alpha)
    else:
        y_pred_mcp_gate, yps_mcp_gate = model_mcp.predict(X_CP, alpha=alpha, partition=partition)
    p_mcp = yps_mcp_gate[:, :, 0].astype(bool)  # (n_samples, 2)

    # 4) Interval-based sets from [L, U] and the threshold
    intervals = np.array([model_spc.predict_interval(x) for x in X_SPCI], dtype=float)
    L_cal = intervals[:, 0]
    U_cal = intervals[:, 1]
    above = threshold < L_cal   # whole interval above the threshold -> label 0
    below = threshold > U_cal   # whole interval below the threshold -> label 1
    p_spc = np.zeros_like(p_mcp, dtype=bool)
    p_spc[above, 0] = True
    p_spc[below, 1] = True
    p_spc[~(above | below), :] = True  # ambiguous -> {0, 1}

    # 5) Gate features (same as training): X_CP + [w_cls, w_spc, diff]
    w_cls = p_mcp.sum(axis=1)
    w_spc = p_spc.sum(axis=1)
    diff = w_cls - w_spc
    X_gate = np.hstack([X_CP, w_cls.reshape(-1,1), w_spc.reshape(-1,1), diff.reshape(-1,1)])

    # 6) Gate decision: 0=MCP, 1=SPCI, 2=union
    choice = gate.predict(X_gate)

    # 7) Compose the final set. Start from the full set so that any row not
    #    selected by one of the three masks has a defined, conservative value
    #    (np.empty_like would leave arbitrary memory there).
    p_final = np.ones_like(p_mcp, dtype=bool)
    use_mcp = (choice == 0)
    use_spc = (choice == 1)
    use_uni = (choice == 2)
    p_final[use_mcp] = p_mcp[use_mcp]
    p_final[use_spc] = p_spc[use_spc]
    p_final[use_uni] = (p_mcp[use_uni] | p_spc[use_uni])

    # 8) Minimal point label: 0/1 for a singleton, -1 otherwise
    singletons = (p_final.sum(axis=1) == 1)
    y_hat = np.where(singletons, p_final.argmax(axis=1), -1)

    return {
        "p_final": p_final,   # bools, shape (n_samples, 2)
        "choice": choice,     # 0/1/2
        "y_hat": y_hat        # 0/1, or -1 when not a singleton
    }


In [None]:
# Re-derive the project root (same expression as in the first cell of the notebook).
root = Path(os.getcwd()).parent

In [None]:
# Load the object serialized in models/models_clustering_24.joblib.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
obj = joblib.load(root / "models" / "models_clustering_24.joblib")

In [None]:
# Load the object serialized in models/models_clustering_SPCI_ng_24.joblib.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
obj2 = joblib.load(root / "models" / "models_clustering_SPCI_ng_24.joblib")

In [None]:
# Inspect the Python type of the loaded object.
type(obj2)

In [None]:
# List the keys available in the loaded object.
obj2.keys()

In [None]:
# Re-read data/DATA_SPCI_ng_24.csv (already read into df3 in an earlier cell) and display it.
# NOTE(review): a bare `df3` renders the frame with pandas' default truncation; `df3.head()` would be lighter.
df3 = pd.read_csv(root / "data/DATA_SPCI_ng_24.csv")
df3

In [None]:
# Read the raw CSV and count missing values per row.
df = pd.read_csv(root / "data/DATA.csv")
nb_nan_par_ligne = df.isna().sum(axis=1)

# Keep only rows with fewer than 495 missing values.
# NOTE(review): 495 is a magic number — presumably tied to the column count of DATA.csv; confirm.
df = df[nb_nan_par_ligne < 495]

In [None]:
# Number of rows remaining after the missing-value filter.
len(df)

In [None]:
def build_X_s(df_sub: pd.DataFrame, prefixes: list, static_cols: list, n: int) -> np.ndarray:
    """
    Build the feature matrix restricted to the first ``n`` prefixes.

    Selected columns, in order: ``static_cols`` followed by every column of
    ``df_sub`` (in frame order) whose name starts with one of ``prefixes[:n]``.
    The "email" column is used as the index and therefore is not part of the
    returned values.

    Note: this notebook-level definition replaces any ``build_X_s`` imported
    earlier in the kernel. Matching is a plain ``str.startswith``, so a prefix
    also selects columns of any longer prefix that begins with it.

    Parameters
    ----------
    df_sub : DataFrame containing an "email" column.
    prefixes : ordered list of column-name prefixes.
    static_cols : list of column names that are always kept.
    n : number of leading prefixes to keep.

    Returns
    -------
    np.ndarray of shape (len(df_sub), len(static_cols) + number of matched columns).
    """
    active_prefixes = prefixes[:n]

    def _is_dynamic(column):
        # True when the column belongs to one of the first n prefixes.
        return any(column.startswith(prefix) for prefix in active_prefixes)

    dyn_cols = list(filter(_is_dynamic, df_sub.columns))
    wanted = ["email"] + static_cols + dyn_cols
    indexed = df_sub[wanted].set_index("email")
    return indexed.values

In [None]:
# Columns of df ending in "mark", and the ordered, de-duplicated prefixes obtained
# by cutting each of those names at its last underscore (dict.fromkeys keeps first-seen order).
mark_cols = [c for c in df.columns if c.endswith("mark")]
prefixes = list(dict.fromkeys(c.rsplit("_",1)[0] for c in mark_cols))
# No static (always-kept) columns.
static_cols = []

In [None]:
# Feature matrix for the first 3 prefixes, with NaN replaced by 0.
X = build_X_s(df.fillna(0), prefixes, static_cols, 3)

In [None]:
# Keep only the columns whose name starts with "B-CPE-100".
dfcpool = df[[c for c in df.columns if c.startswith("B-CPE-100")]]
# Columns matching this pattern (per-task "..._passed" columns) are excluded below.
# re.match anchors at the start of the name only.
pat = re.compile(r"B-CPE-100_cpoolday\d+_\d{2} - task\d+_passed")
cols_keep = [c for c in dfcpool.columns if not pat.match(c)]
dfcpool_mark = dfcpool[cols_keep]
# Missing values are replaced by 0 before clustering.
X_pool = dfcpool_mark.fillna(0)

In [None]:
# Cluster df using the X_pool features.
# NOTE: this rebinds df2 and info, which an earlier cell computed from df1 and X.
df2, info = cluster_with_min_size(
    df, X_pool, n_clusters=4, min_cluster_size=50, random_state=42)

In [None]:
# Mean prediction-set width of the ('GB', n, 'vanilla') models in obj2, for n = 1..15.
# NOTE(review): other cells iterate over range(1, 17); confirm that stopping at 15 is intended.
# NOTE: this loop rebinds `mod` and `X`, which earlier cells also define.
res = []
# NaN -> 0 once: the filled frame does not depend on n.
df2_filled = df2.fillna(0)
for n in range(1, 16):
    mod = obj2[('GB', n, 'vanilla')]
    X = build_X_s(df2_filled, prefixes, static_cols, n)
    yp_van, yps_van = mod.predict(X, alpha=0.1) # partition=df2['clusters'])
    pset_van = yps_van[:, :, 0]
    # Compute the metric once and reuse it for both the log line and the list.
    width = classification_mean_width_score(pset_van)
    print(width)
    res.append(width)
print("moy", np.mean(res))

In [None]:
# Preview the first rows of the filtered frame.
df.head()

In [None]:
# 1) Grouping token of every column except 'email': the second "_"-separated
#    piece of the column name (a name without "_" raises IndexError).
col_series = df.columns.drop(['email']).to_series()
suffixes = col_series.map(lambda name: name.split("_")[1])
ordered_suffixes = suffixes.unique()  # first-seen order
# 2) One sub-DataFrame (copy) per token, columns kept in their original order
dfs = {}
for suffix in ordered_suffixes:
    # Positional boolean mask: reuses the tokens computed above.
    cols_for_suffix = col_series[(suffixes == suffix).to_numpy()].tolist()
    subdf = df[cols_for_suffix].copy()
    dfs[suffix] = subdf
    print(f"Suffixe = {suffix} → shape {subdf.shape}")


In [None]:
# Show the n columns with the most missing values.
# NOTE: this rebinds `n`, which is also used as a loop variable in earlier cells.
n = 20  # number of columns to display
print(df.isna().sum().sort_values(ascending=False).head(n))
