In [None]:
import pandas as pd
from pathlib import Path
import os
import joblib
import numpy as np
from mapie.metrics import (
    classification_coverage_score,
    classification_mean_width_score
)
import re
import matplotlib.pyplot as plt
from utils.model_production_data_processing_utils import cluster_with_min_size
import umap
from sklearn.cluster import KMeans

# Project root = parent of the directory the notebook kernel was started in.
root = Path.cwd().parent

In [None]:
from model_production_main import load_and_preprocess_data, prepare_features
from utils.model_production_data_processing_utils import compute_threshold_kmeans, build_X_s, build_umap_windows_by_suffix

In [None]:
# Frame returned by load_and_preprocess_data for data/DATA.csv.
# NOTE(review): the second argument (24) is presumably a cohort/year tag — confirm.
df1 = load_and_preprocess_data(root / "data" / "DATA.csv", 24)

In [None]:
# Display the frame loaded from data/DATA.csv for a visual check.
df1

In [None]:
# Frame returned by load_and_preprocess_data for data/DATA_2025.csv.
# NOTE(review): the second argument (25) is presumably a cohort/year tag — confirm.
df25 = load_and_preprocess_data(root / "data" / "DATA_2025.csv", 25)

In [None]:
# Print every column whose name ends with "city", and every column whose name
# starts with "city". A name satisfying both checks is printed twice.
for col_name in df25.columns:
    for is_hit in (col_name.endswith("city"), col_name.startswith("city")):
        if is_hit:
            print(col_name)

In [None]:
# Distinct values found in the 'city' column of the 2025 frame.
df25['city'].unique()

In [None]:
# Score the 2025 data with the conformal model stored under key ('GB', 1, "vanilla").
# df25 is rebuilt from disk here (same call as the earlier loading cell).
df25 = load_and_preprocess_data(root / "data/DATA_2025.csv", 25)
# Raw, unprocessed copy of the same CSV; a later cell copies the "*mark" columns back from it.
df2025 = pd.read_csv(root / "data/DATA_2025.csv")
# NOTE(review): load_and_preprocess_data receives 25 but prepare_features receives 2025,
# whereas the 2024 data passes 24 to both helpers — confirm which value is expected.
X = prepare_features(df25, 2025)
# `info` is not used anywhere in the visible cells.
df2, info = cluster_with_min_size(
    df25, X, n_clusters=4, min_cluster_size=50, random_state=42
)
# Column names ending in "mark", then their prefixes (text before the last "_"),
# de-duplicated while preserving first-seen order.
mark_cols = [c for c in df25.columns if c.endswith("mark")]
prefixes = list(dict.fromkeys(c.rsplit("_",1)[0] for c in mark_cols))
static_cols = []
# Missing values are replaced by 0 before the feature matrix is built.
X_CP = build_X_s(df2.fillna(0), prefixes, static_cols, 1)
# joblib.load unpickles arbitrary Python objects: only load trusted model files.
mod_CP = joblib.load(root / "models" / "models_clustering_24.joblib")
key = ('GB', 1, "vanilla")
model_CP = mod_CP[key]
yp_van, yps_van = model_CP.predict(X_CP, alpha=0.1)  # partition=df2['clusters'])
# Index 0 of the last axis is kept as the boolean prediction set
# (presumably the single requested alpha — confirm against the model API).
pset_van = yps_van[:, :, 0].astype(bool)

In [None]:
def map_pset_to_label(pset: np.ndarray) -> np.ndarray:
    """
    Map binary conformal prediction sets to integer labels.

    Parameters
    ----------
    pset : array-like of shape (n, 2)
        Column 0 is truthy when class 0 belongs to the set, column 1 when
        class 1 does. Values are interpreted by truthiness.

    Returns
    -------
    np.ndarray of shape (n,), dtype int
        [True, False]  -> 0
        [False, True]  -> 1
        [True, True]   -> 2
        [False, False] -> -1  (empty set, treated as invalid)

    Raises
    ------
    ValueError
        If `pset` is not two-dimensional with exactly two columns.
    """
    pset = np.asarray(pset, dtype=bool)
    if pset.ndim != 2 or pset.shape[1] != 2:
        raise ValueError(
            f"pset must have shape (n, 2), got {pset.shape}"
        )

    # Encode each row as a 2-bit number (bit 0: class 0 present, bit 1: class 1
    # present) and translate it through a lookup table.
    code = pset[:, 0].astype(int) + 2 * pset[:, 1].astype(int)
    lookup = np.array([-1, 0, 1, 2], dtype=int)  # none, {0}, {1}, {0, 1}
    return lookup[code]

# Encode each prediction set in pset_van as one integer label and store the
# result on the 2025 frame as column 'prediction1'.
labels_van = map_pset_to_label(pset_van)
df25['prediction1'] = labels_van

In [None]:
# Project the 'passed' columns of the G-CPE-100 module to 2-D with UMAP, cluster the
# embedding with KMeans, and attach the coordinates and cluster ids to df25.
dfcpool = df25[[c for c in df25.columns if c.startswith("G-CPE-100")]]
# NOTE(review): this pattern requires a "B-CPE-100" prefix, but every column of
# dfcpool starts with "G-CPE-100" and re.match anchors at the start of the string,
# so no column is ever excluded and cols_keep equals all columns of dfcpool.
# Confirm whether the filter is meant to be active before changing the prefix:
# enabling it would remove the per-task "passed" columns selected just below.
pat = re.compile(r"B-CPE-100_cpoolday\d+_\d{2} - task\d+_passed")
cols_keep = [c for c in dfcpool.columns if not pat.match(c)]
dfcpool_mark = dfcpool[cols_keep]
# This rebinds X, which an earlier cell used for the prepare_features output.
X = dfcpool_mark.fillna(0)
ta = [c for c in X.columns if c.endswith('passed')]
X_bin = X[ta]

reducer = umap.UMAP(
    n_neighbors=8,          # lower -> more local structure
    min_dist=0.25,          # higher -> spreads the clusters out a bit
    spread=1.0,
    n_components=2,
    metric="hamming",
    random_state=42,
)
emb = reducer.fit_transform(X_bin)

# Clustering is done on the 2-D embedding, not on the original columns.
labels = KMeans(n_clusters=7, n_init="auto", random_state=42).fit_predict(emb)

# Mutates df25 in place: adds (or overwrites) the three columns below.
df25['UMAP1'] = emb[:, 0]
df25['UMAP2'] = emb[:, 1]
df25['clusters'] = labels

In [None]:
# Overwrite every "*mark" column of df25 with the same-named column of the raw
# CSV frame df2025.
for mark_col in (name for name in df25.columns if name.endswith('mark')):
    df25[mark_col] = df2025[mark_col]

In [None]:
# Inspect one of the "*mark" columns of df25.
df25['G-CPE-100_cpoolday01_mark']

In [None]:
from IPython.display import FileLink

# Write the export under the project data directory (root / "data") rather than a
# hard-coded absolute "/data/..." path, so that the later cell reading
# root / "data/DATA_2025_pred_proj.csv" finds the file on any machine.
pred_csv_path = root / "data" / "DATA_2025_pred_proj.csv"
df25.to_csv(pred_csv_path, index=False, encoding="utf-8")
FileLink(str(pred_csv_path))

In [None]:
# Scatter plot of the 2-D embedding `emb`, one colour per value of `labels`.
fig, ax = plt.subplots(figsize=(7, 6))
unique = np.unique(labels)
for lab in unique:
    mask = labels == lab
    ax.scatter(
        emb[mask, 0],
        emb[mask, 1],
        s=12,
        alpha=0.8,
        # "Bruit" = noise: the -1 branch only matters when `labels` comes from a
        # clusterer that emits -1 for noise points.
        label=f"Cluster {lab}" if lab != -1 else "Bruit (-1)",
    )
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
# Replaces the placeholder title 'title' so the figure is readable on its own.
ax.set_title("UMAP embedding coloured by cluster")
ax.legend(markerscale=1.5, fontsize=9, frameon=False)
fig.tight_layout()
plt.show()

In [None]:
from tqdm import tqdm

def to_bool2_from_spci_intervals(intervals, thr):
    """
    Convert SPCI intervals into boolean binary prediction sets.

    intervals: array of shape (n_samples, 2) holding (L, U) per row.
    thr: threshold the interval bounds are compared against.

    Returns a boolean matrix of shape (n_samples, 2) encoding the set
    {0}, {1} or {0, 1}: column 0 is True when the set contains '0',
    column 1 is True when it contains '1'.

    Rule:
      U < thr    -> {1}
      L > thr    -> {0}
      otherwise  -> {0, 1}
    """
    lower, upper = intervals[:, 0], intervals[:, 1]
    entirely_below = upper < thr    # whole interval under the threshold -> {1}
    entirely_above = lower > thr    # whole interval over the threshold  -> {0}
    # Neither strictly below nor strictly above: keep both classes.
    straddles = ~(entirely_below | entirely_above)
    has_zero = entirely_above | straddles
    has_one = entirely_below | straddles
    return np.column_stack([has_zero, has_one])

## WORKING CELL
# Set-up for the evaluation loops on df1 (the frame loaded from data/DATA.csv).
# This cell rebinds mark_cols, prefixes, static_cols, X, df2 and info, which an
# earlier cell had bound to the 2025 data.

y_true = pd.read_csv(root / "data/y_true_24")
# Column names ending in "mark", then their prefixes (text before the last "_"),
# de-duplicated while preserving first-seen order.
mark_cols = [c for c in df1.columns if c.endswith("mark")]
prefixes = list(dict.fromkeys(c.rsplit("_",1)[0] for c in mark_cols))
static_cols = []

# Threshold later passed to to_bool2_from_spci_intervals to turn intervals into sets.
threshold = compute_threshold_kmeans(df1)
# Prepare features
X = prepare_features(df1, 24)

# Perform clustering
df2, info = cluster_with_min_size(
    df1, X, n_clusters=4, min_cluster_size=50, random_state=42
)

# w1 is not referenced in any visible cell; w2 is the window passed to
# build_umap_windows_by_suffix and the cut-off above which the evaluation loops
# also run the SPCI and gate models.
w1 = 3
w2 = 10

# Result containers filled by the evaluation loops, keyed by (base_model, n).
covs = {}
wids = {}
covs_spci = {}
wids_spci = {}
covs_comb = {}
wids_comb = {}

# Only X_arr is used by the visible evaluation loops (indexed as X_arr[n - w2]).
Xt, keys, X_arr, y_arr = build_umap_windows_by_suffix(
        df1, w=w2, H=0, target_col_idx=3, verbose=True
    )


In [None]:
# Frame used as input of the "ng" conformal models in the second evaluation loop.
df3 = pd.read_csv(root / "data" / "DATA_SPCI_ng_24.csv")

In [None]:

# Load the pre-trained model containers from the shared models folder.
# joblib.load unpickles arbitrary Python objects: only load trusted files.
models_dir = root / "models"
mod_SPCI = joblib.load(models_dir / "models_SPCI_lg_24.joblib")
mod_comb = joblib.load(models_dir / "models_comb1_24.joblib")
mod_CP = joblib.load(models_dir / "models_clustering_24.joblib")
mod_CP_ng = joblib.load(models_dir / "models_clustering_SPCI_ng_24.joblib")
mod_comb_ng = joblib.load(models_dir / "models_comb2_24.joblib")

In [None]:
# List the keys of the loaded gate-model container; the evaluation loop indexes
# it with (base_model, n) tuples.
mod_comb.keys()

In [None]:
# For each n in 1 .. max_N-1: score the conformal classifier stored in mod_CP and,
# once n exceeds the window w2, also the SPCI interval model and the gate model
# that arbitrates between the two. Results go into the covs*/wids* dicts.
y_true_arr = np.asarray(y_true).ravel().astype(int)
alpha = 0.1
max_N = 22

for base_model in ['GB']:
    for n in tqdm(range(1, max_N)):
        key = (base_model, n, "vanilla")
        X_CP = build_X_s(df2.fillna(0), prefixes, static_cols, n)
        model_CP = mod_CP[key]
        yp_van, yps_van = model_CP.predict(X_CP, alpha=alpha)  # partition=df2['clusters'])
        # Index 0 of the last axis is kept as the boolean prediction set
        # (presumably the single requested alpha — confirm against the model API).
        pset_van = yps_van[:, :, 0].astype(bool)
        cov = classification_coverage_score(y_true_arr, pset_van)
        wid = classification_mean_width_score(pset_van)
        covs[(base_model, n)] = cov
        wids[(base_model, n)] = wid

        if n <= w2:
            # Up to w2 only the conformal classifier is evaluated.
            print(f"n={n:2d} | MCP: cov={cov:.3f}, wid={wid:.3f} | ")
            continue

        # ---- n > w2: SPCI and combined predictions ----
        # CAREFUL with the index: X_arr is addressed with an offset of w2
        # (original author's note: same adjustment as in train_combined_models).
        model_SPCI = mod_SPCI[n]
        X_SPCI = X_arr[n - w2]
        intervals = np.array([model_SPCI.predict_interval(x) for x in X_SPCI], dtype=float)
        pset_spci = to_bool2_from_spci_intervals(intervals, threshold)
        cov_spci = classification_coverage_score(y_true_arr, pset_spci)
        wid_spci = classification_mean_width_score(pset_spci)
        covs_spci[(base_model, n)] = cov_spci
        wids_spci[(base_model, n)] = wid_spci

        # Gate model choosing between the two prediction sets.
        gate_model = mod_comb[(base_model, n)]

        # Gate meta-features: size of each prediction set and their difference
        # (original author's note: same features as in train_combined_models).
        w_cls = pset_van.sum(axis=1)   # MCP set size
        w_spc = pset_spci.sum(axis=1)  # SPCI set size
        diff = w_cls - w_spc
        meta_features = np.column_stack([w_cls, w_spc, diff])
        X_gate = np.hstack([X_CP, meta_features])

        # Gate output: 0 = MCP, 1 = SPCI, anything else (2) = union of both sets.
        gate_preds = gate_model.predict(X_gate)

        # Vectorised row-wise selection instead of a per-sample Python loop; the
        # (n, 1) column broadcasts against the (n, 2) prediction sets.
        gate_col = np.asarray(gate_preds).reshape(-1, 1)
        pset_combined = np.where(
            gate_col == 0,
            pset_van,
            np.where(gate_col == 1, pset_spci, pset_van | pset_spci),
        )

        # Metrics of the combined model.
        cov_comb = classification_coverage_score(y_true_arr, pset_combined)
        wid_comb = classification_mean_width_score(pset_combined)
        covs_comb[(base_model, n)] = cov_comb
        wids_comb[(base_model, n)] = wid_comb

        print(f"n={n:2d} | MCP: cov={cov:.3f}, wid={wid:.3f} | "
              f"SPCI: cov={cov_spci:.3f}, wid={wid_spci:.3f} | "
              f"Comb: cov={cov_comb:.3f}, wid={wid_comb:.3f}")

        # Distribution of gate decisions; computed but not printed, so only the
        # last iteration's values remain available in the kernel.
        gate_choices, gate_counts = np.unique(gate_preds, return_counts=True)

In [None]:
# Same evaluation as the previous cell, but with the "ng" variants: features are
# built from df3, the conformal models come from mod_CP_ng and the gate models
# from mod_comb_ng. The SPCI models (mod_SPCI) and X_arr are the same objects.
# NOTE(review): results are written into the same covs / wids / covs_spci /
# wids_spci / covs_comb / wids_comb dicts under the same (base_model, n) keys as
# the previous evaluation cell, so running both leaves only this cell's numbers —
# confirm that overwriting is intended.
y_true_arr = np.asarray(y_true).ravel().astype(int)
alpha = 0.1
max_N = 22

for base_model in ['GB']:
    for n in tqdm(range(1, max_N)):
        key = (base_model, n, "vanilla")
        X_CP = build_X_s(df3.fillna(0), prefixes, static_cols, n)
        model_CP_ng = mod_CP_ng[key]
        yp_van, yps_van = model_CP_ng.predict(X_CP, alpha=alpha)  # partition=df2['clusters'])
        # Index 0 of the last axis is kept as the boolean prediction set
        # (presumably the single requested alpha — confirm against the model API).
        pset_van = yps_van[:, :, 0].astype(bool)
        cov = classification_coverage_score(y_true_arr, pset_van)
        wid = classification_mean_width_score(pset_van)
        covs[(base_model, n)] = cov
        wids[(base_model, n)] = wid

        if n <= w2:
            # Up to w2 only the conformal classifier is evaluated.
            print(f"n={n:2d} | MCP: cov={cov:.3f}, wid={wid:.3f} | ")
            continue

        # ---- n > w2: SPCI and combined predictions ----
        # CAREFUL with the index: X_arr is addressed with an offset of w2
        # (original author's note: same adjustment as in train_combined_models).
        model_SPCI = mod_SPCI[n]
        X_SPCI = X_arr[n - w2]
        intervals = np.array([model_SPCI.predict_interval(x) for x in X_SPCI], dtype=float)
        pset_spci = to_bool2_from_spci_intervals(intervals, threshold)
        cov_spci = classification_coverage_score(y_true_arr, pset_spci)
        wid_spci = classification_mean_width_score(pset_spci)
        covs_spci[(base_model, n)] = cov_spci
        wids_spci[(base_model, n)] = wid_spci

        # Gate model choosing between the two prediction sets.
        gate_model = mod_comb_ng[(base_model, n)]

        # Gate meta-features: size of each prediction set and their difference
        # (original author's note: same features as in train_combined_models).
        w_cls = pset_van.sum(axis=1)   # MCP set size
        w_spc = pset_spci.sum(axis=1)  # SPCI set size
        diff = w_cls - w_spc
        meta_features = np.column_stack([w_cls, w_spc, diff])
        X_gate = np.hstack([X_CP, meta_features])

        # Gate output: 0 = MCP, 1 = SPCI, anything else (2) = union of both sets.
        gate_preds = gate_model.predict(X_gate)

        # Vectorised row-wise selection instead of a per-sample Python loop; the
        # (n, 1) column broadcasts against the (n, 2) prediction sets.
        gate_col = np.asarray(gate_preds).reshape(-1, 1)
        pset_combined = np.where(
            gate_col == 0,
            pset_van,
            np.where(gate_col == 1, pset_spci, pset_van | pset_spci),
        )

        # Metrics of the combined model.
        cov_comb = classification_coverage_score(y_true_arr, pset_combined)
        wid_comb = classification_mean_width_score(pset_combined)
        covs_comb[(base_model, n)] = cov_comb
        wids_comb[(base_model, n)] = wid_comb

        print(f"n={n:2d} | MCP: cov={cov:.3f}, wid={wid:.3f} | "
              f"SPCI: cov={cov_spci:.3f}, wid={wid_spci:.3f} | "
              f"Comb: cov={cov_comb:.3f}, wid={wid_comb:.3f}")

        # Distribution of gate decisions; computed but not printed, so only the
        # last iteration's values remain available in the kernel.
        gate_choices, gate_counts = np.unique(gate_preds, return_counts=True)

In [None]:
# Read the exported 2025 predictions/projection CSV back from the project data folder.
df_res = pd.read_csv(root / "data" / "DATA_2025_pred_proj.csv")

In [None]:
# Display the frame read back from DATA_2025_pred_proj.csv.
df_res