In [None]:
import pandas as pd
from pathlib import Path
import os
import joblib
import numpy as np
from mapie.metrics import (
    classification_coverage_score,
    classification_mean_width_score
)
import re
import matplotlib.pyplot as plt
from utils.model_production_data_processing_utils import cluster_with_min_size
import umap

# Project root: the parent of the current working directory.
root = Path.cwd().parent

In [None]:
from model_production_main import load_and_preprocess_data, prepare_features
from utils.model_production_data_processing_utils import build_X_s, build_umap_windows_by_suffix

In [None]:
from sklearn.cluster import KMeans


def compute_threshold_kmeans(
    df: pd.DataFrame,
    *,
    n_rendus: int = 3,
    min_prop: float = 0.5,
    max_prop: float = 1.0,
    n_grid: int = 100,
    min_dropout_pct: float = 20.0,
    random_state: int = 0
) -> "tuple[float, float]":
    """
    Derive a threshold from each row's most recent positive mark.

    Every row is summarised by the first strictly positive value found when
    scanning the columns whose name ends with 'mark' from the last such
    column to the first (0.0 when the row has none; NaN counts as 0). A
    2-cluster KMeans is fitted on these values and the smaller centre is
    scaled by a grid of proportions. The grid step with the largest change in
    the share of rows below the scaled centre is located, and the threshold
    is taken just before that jump.

    Parameters
    ----------
    df : DataFrame
        Must contain at least one column whose name ends with 'mark', and at
        least two rows. The column order is assumed to be chronological
        (oldest first) -- this is not checked here.
    n_rendus : int
        Kept for backward compatibility. Only the single most recent positive
        mark of each row is used, so this value does not influence the result.
    min_prop, max_prop : float
        Range of the coefficients applied to the smaller KMeans centre.
    n_grid : int
        Number of evenly spaced coefficients between min_prop and max_prop
        (must be >= 1).
    min_dropout_pct : float
        If the percentage of rows below the pre-jump threshold is lower than
        this value, the post-jump threshold is used instead.
    random_state : int
        Seed passed to KMeans.

    Returns
    -------
    tuple[float, float]
        ``(threshold, dropout_pct)``: the selected threshold and the
        percentage of rows whose last positive mark is strictly below it.

    Raises
    ------
    ValueError
        If no column ends with 'mark', if ``n_grid < 1``, or if ``df`` has
        fewer than two rows (KMeans with two clusters needs two samples).
    """
    mark_cols = [c for c in df.columns if c.endswith("mark")]
    if not mark_cols:
        raise ValueError("Aucune colonne se terminant par 'mark' trouvée.")
    if n_grid < 1:
        raise ValueError("n_grid doit être supérieur ou égal à 1.")

    # 1) Most recent positive mark per row.
    # Columns are reversed so that index 0 is the last 'mark' column of df.
    marks = df[mark_cols[::-1]].fillna(0).to_numpy(dtype=float)
    is_positive = marks > 0
    # argmax on a boolean row returns the position of its first True; rows
    # without any positive mark are handled by the np.where fallback to 0.0.
    first_positive = is_positive.argmax(axis=1)
    x = np.where(
        is_positive.any(axis=1),
        marks[np.arange(marks.shape[0]), first_positive],
        0.0,
    )
    if x.size < 2:
        raise ValueError(
            "Au moins deux lignes sont nécessaires pour un KMeans à 2 clusters."
        )

    # 2) Two-cluster KMeans on the 1-D values; only the lower centre is used.
    kmeans = KMeans(n_clusters=2, random_state=random_state).fit(x.reshape(-1, 1))
    low_center = float(np.min(kmeans.cluster_centers_))

    # 3) Sweep the grid and keep the pair of consecutive proportions with the
    #    largest change in the share of rows below the scaled centre. The
    #    strict '>' keeps the first pair when several jumps are tied.
    props = np.linspace(min_prop, max_prop, n_grid)
    best_gap = -np.inf
    best_prop_prev = None
    best_prop_curr = None
    prev_share = None
    for i, prop in enumerate(props):
        share = float((x < low_center * prop).mean())
        if i > 0:
            gap = abs(share - prev_share)
            if gap > best_gap:
                best_gap = gap
                best_prop_prev = props[i - 1]
                best_prop_curr = prop
        prev_share = share

    # A single-point grid has no consecutive pair: use that point for both.
    if best_prop_prev is None or best_prop_curr is None:
        best_prop_prev = props[0]
        best_prop_curr = props[min(1, len(props) - 1)]

    # 4) Start from the pre-jump threshold; if it flags too few rows, move to
    #    the post-jump threshold.
    threshold = low_center * best_prop_prev
    dropout_pct = float((x < threshold).mean() * 100.0)
    if dropout_pct < min_dropout_pct:
        threshold = low_center * best_prop_curr
        dropout_pct = float((x < threshold).mean() * 100.0)

    # Kept from the original implementation: echoes the final percentage.
    print(dropout_pct)
    return float(threshold), dropout_pct


In [None]:
# Raw 2022 export, read as-is; the cell output is its (rows, columns) shape.
df22 = pd.read_csv(root / "data/DATA_2022.csv")
df22.shape

In [None]:
# Raw 2023 export, read as-is; the cell output is its (rows, columns) shape.
df23 = pd.read_csv(root / "data/DATA_2023.csv")
df23.shape

In [None]:
# Raw 2024 export, read as-is; the cell output is its (rows, columns) shape.
df24 = pd.read_csv(root / "data/DATA_2024.csv")
df24.shape

In [None]:
# Raw 2025 export, read as-is; the cell output is its (rows, columns) shape.
df25 = pd.read_csv(root / "data/DATA_2025.csv")
df25.shape

In [None]:
# Display the raw 2025 frame (the notebook renders the cell's last expression).
df25

In [None]:
# Preprocess the 2022 export with the project loader, then derive its threshold.
# NOTE(review): the second argument (22) presumably identifies the year — confirm in load_and_preprocess_data.
# compute_threshold_kmeans returns a (threshold, dropout percentage) pair, so `threshold` holds
# a tuple here; the name is rebound by each of the per-year cells.
df2022 = load_and_preprocess_data(root / "data/DATA_2022.csv", 22)
threshold = compute_threshold_kmeans(df2022)
threshold

In [None]:
# Preprocess the 2023 export with the project loader, then derive its threshold.
# NOTE(review): the second argument (23) presumably identifies the year — confirm in load_and_preprocess_data.
# compute_threshold_kmeans returns a (threshold, dropout percentage) pair, so `threshold` holds
# a tuple here; the name is rebound by each of the per-year cells.
df2023 = load_and_preprocess_data(root / "data/DATA_2023.csv", 23)
threshold = compute_threshold_kmeans(df2023)
threshold

In [None]:
# Preprocess the 2024 export with the project loader, then derive its threshold.
# NOTE(review): the second argument (24) presumably identifies the year — confirm in load_and_preprocess_data.
# compute_threshold_kmeans returns a (threshold, dropout percentage) pair, so `threshold` holds
# a tuple here; the name is rebound by each of the per-year cells.
df2024 = load_and_preprocess_data(root / "data/DATA_2024.csv", 24)
threshold = compute_threshold_kmeans(df2024)
threshold

In [None]:
# Columns shared by the three raw yearly exports (2022, 2023 and 2024).
common_cols = df22.columns.intersection(df23.columns).intersection(df24.columns)

# Stack the three years row-wise on the shared columns, with a fresh integer index.
df_all = pd.concat(
    [year_df[common_cols] for year_df in (df22, df23, df24)],
    ignore_index=True,
)

# Rescale every 'mark' column by its own mean over the stacked frame
# before computing the threshold on the pooled data.
mark_cols = [col for col in df_all.columns if col.endswith("mark")]
df_all[mark_cols] = df_all[mark_cols] / df_all[mark_cols].mean()
threshold = compute_threshold_kmeans(df_all)
threshold

In [None]:
# Display the pooled frame (the notebook renders the cell's last expression).
df_all

In [None]:
# Re-reads the raw 2024 export into `df24`; an earlier cell already loads the same file
# with the same statement, so this only refreshes the frame.
df24 = pd.read_csv(root / "data/DATA_2024.csv")
df24.shape

In [None]:
# 2022 column names that start with the 'B-CPE-110' prefix.
[col_name for col_name in df22.columns if col_name.startswith('B-CPE-110')]

In [None]:
# Columns present in the 2023 export but absent from the 2022 one
# (despite the `df1` in the name, the left-hand frame here is df23).
cols_only_df1 = df23.columns.difference(df22.columns)
cols_only_df1

In [None]:
# Among the columns found only in the 2023 export, those starting with 'B-CPE-100'.
[col_name for col_name in cols_only_df1 if col_name.startswith('B-CPE-100')]

In [None]:
# Columns shared by the 2022 and 2023 exports; `.shape` shows how many there are.
# NOTE: this rebinds `common_cols`, which the cell building `df_all` computes over
# three years (2022-2024) — re-running that cell restores the three-year version.
common_cols = df22.columns.intersection(df23.columns)
common_cols.shape

In [None]:
# Count missing values per row of the 2022 export.
nb_nan_par_ligne = df22.isna().sum(axis=1)
# Disabled filter: would keep only the rows with fewer than 356 missing values.
# df22 = df22[nb_nan_par_ligne < 356]
# Ascending sort, so the tail holds the 10 rows with the most missing values.
nb_nan_par_ligne.sort_values().tail(10)

In [None]:
# Count missing values per row of the 2023 export (rebinds `nb_nan_par_ligne`).
nb_nan_par_ligne = df23.isna().sum(axis=1)
# Disabled filter: would keep only the rows with fewer than 356 missing values.
# df23 = df23[nb_nan_par_ligne < 356]
# Print the frame's shape and the 20 rows with the most missing values (ascending sort, tail).
print(df23.shape, nb_nan_par_ligne.sort_values().tail(20))

In [None]:
# Count missing values per row of the 2023 export again (same computation as the previous cell).
# NOTE(review): the disabled filter below mentions df22 while the count is taken on df23 —
# looks like a copy-paste leftover; confirm which frame this cell was meant to inspect.
nb_nan_par_ligne = df23.isna().sum(axis=1)
# df22 = df22[nb_nan_par_ligne < 356]
# Ascending sort, so the tail holds the 10 rows with the most missing values.
nb_nan_par_ligne.sort_values().tail(10)