# Fonctions d'obtention des donn√©es
## Import des librairies

In [1]:
import pandas as pd
import requests
from config import URL_HOLIDAYS, URL_SCHOOL, URL_METEO_HOURLY
from datetime import datetime

## Class de lecture des donn√©es velib historiques

In [2]:
class VelibCsvReader:
    def __init__(self):
        self.file_path = "./data/dataset/historique_stations.csv"

    def read_dataframe(self):
        """Combine status + info sur les stations"""
        dataframe = pd.read_csv(
            filepath_or_buffer=self.file_path,
            sep=","
        )
        return dataframe

## Class API pour donn√©es sur les vacances

In [3]:
class HolidaysAPI:
    """
    R√©cup√©ration des jours f√©ri√©s et des vacances scolaires fran√ßaises.
    """

    def fetch_public_holidays(self) -> pd.DataFrame:
        try:
            data = requests.get(URL_HOLIDAYS, timeout=10).json()
            df = pd.DataFrame(list(data.items()), columns=["date", "holiday_name"])
            df["date"] = pd.to_datetime(df["date"])
            return df
        except Exception as e:
            raise RuntimeError(f"Erreur API jours f√©ri√©s : {e}")

    def fetch_school_vacations(self) -> pd.DataFrame:
        try:
            data = requests.get(URL_SCHOOL, timeout=10).json()
            results = data.get("results", [])
            df = pd.DataFrame(results)
            if not df.empty:
                df = df[["start_date", "end_date", "zones", "description"]]
                df["start_date"] = pd.to_datetime(df["start_date"])
                df["end_date"] = pd.to_datetime(df["end_date"])
            return df
        except Exception as e:
            raise RuntimeError(f"Erreur API vacances scolaires : {e}")

## Class API pour les donn√©es m√©t√©o historiques

In [4]:
import pandas as pd
from typing import Dict

class WeatherCsvReader:
    def __init__(self, file_path: str = "./data/dataset/historical_meteo.csv"):
        self.file_path = file_path
        
        # Mapping colonnes brutes -> colonnes standardis√©es
        self.WEATHER_COL_MAP = {
            "AAAAMMJJHH": "timestamp",
            "NUM_POSTE":  "station_id",
            "NOM_USUEL":  "station_name",
            "LAT":        "lat_deg",
            "LON":        "lon_deg",
            "ALTI":       "alt_m",
            "RR1":        "precip_mm",
            "DRR1":       "precip_dur_min",
            "T":          "temp_c",
            "TD":         "dewpoint_c",
            "U":          "humidity_rel_pct",
            "FF":         "wind_speed_10m_ms",
            "DD":         "wind_dir_deg",
            "FXI":        "wind_gust_10m_ms",
            "PSTAT":      "pressure_hpa_station",
            "PMER":       "pressure_hpa_sea",

            "N":          "cloud_oktas",
            "INS":        "insolation_min",            # plan B si N absent
            "GLO":        "global_radiation_j_cm2",    # plan B bis
            "WW":         "wmo_present_weather",       # optionnel mais utile
        }

        # Sch√©ma final (cible) ‚Äî timestamp est pars√© √† part
        self.WEATHER_DTYPES: Dict[str, str] = {
            "station_id":          "string",
            "station_name":        "string",
            "lat_deg":             "float64",
            "lon_deg":             "float64",
            "alt_m":               "Int64",
            "timestamp":           "datetime64[ns]",
            "precip_mm":           "float64",
            "precip_dur_min":      "Int64",
            "temp_c":              "float64",
            "dewpoint_c":          "float64",
            "humidity_rel_pct":    "float64",
            "wind_speed_10m_ms":   "float64",
            "wind_dir_deg":        "float64",
            "wind_gust_10m_ms":    "float64",
            "pressure_hpa":        "float64",
            
            "cloud_oktas":   "float64",
            "insolation_min":        "float64",
            "global_radiation_j_cm2":    "float64",
            "wmo_present_weather":        "float64",
        }

    def read_dataframe(self) -> pd.DataFrame:
        """Lecture brute du CSV (s√©parateur ';')."""
        try:
            return pd.read_csv(self.file_path, sep=";")
        except Exception as e:
            raise RuntimeError(f"Erreur API m√©t√©o : {e}")

    def _select_rename(self, df: pd.DataFrame) -> pd.DataFrame:
        """Garde uniquement les colonnes connues puis les renomme (sans boucles)."""
        cols_src = df.columns.intersection(self.WEATHER_COL_MAP.keys())
        return df.loc[:, cols_src].rename(columns=self.WEATHER_COL_MAP)

    def read_standardized(self) -> pd.DataFrame:
        """
        Pipeline vectoris√© :
        - select/rename
        - parse timestamp (AAAAMMJJHH)
        - coalesce pression (PSTAT prioritaire sur PMER)
        - cast global selon sch√©ma
        - ordre de colonnes propre
        """
        df_raw = self.read_dataframe()
        df = self._select_rename(df_raw)

        # Parse timestamp (format AAAAMMJJHH)
        df = df.assign(
            timestamp=pd.to_datetime(
                df["timestamp"].astype("string"),
                format="%Y%m%d%H",
                errors="coerce"
            )
        )

        # Coalesce pression (priorit√© station -> mer) si colonnes pr√©sentes
        # (si absentes, bfill s'applique sur colonnes manquantes sans boucle)
        pressure_sources = df.filter(items=["pressure_hpa_station", "pressure_hpa_sea"])
        if not pressure_sources.empty:
            df["pressure_hpa"] = pressure_sources.bfill(axis=1).iloc[:, 0]

        # Ordre + cast en une seule passe (les colonnes manquantes seront ajout√©es vides)
        df = (
            df
            .reindex(columns=self.WEATHER_DTYPES.keys())  # ordre final
            .astype(self.WEATHER_DTYPES, errors="ignore")  # cast "df.cast(schema)" version pandas
        )

        return df


# Exploration des donn√©es

In [5]:
velib_historical_dataframe = VelibCsvReader().read_dataframe()
print("Donn√©es V√©los :")
display(velib_historical_dataframe.head(10))

holidays_instance = HolidaysAPI()
public_holidays_dataframe = holidays_instance.fetch_public_holidays()
vacations_dataframe = holidays_instance.fetch_school_vacations()
public_holidays_dataframe["type"] = "holiday"
vacations_dataframe["type"] = "vacation"
calendar_dataframe = pd.concat([public_holidays_dataframe, vacations_dataframe], ignore_index=True)
print("Donn√©es vacances :")
display(calendar_dataframe.head(10))

weather_dataframe = WeatherCsvReader().read_standardized()
print("Donn√©es m√©t√©o :")
display(weather_dataframe.head(10))



Donn√©es V√©los :


Unnamed: 0,time,capacity,available_mechanical,available_electrical,station_name,station_geo,operative
0,2020-11-26T12:59Z,35,4,5,Benjamin Godard - Victor Hugo,"48.86598,2.27572",True
1,2020-11-26T12:59Z,55,23,4,Andr√© Mazet - Saint-Andr√© des Arts,"48.85376,2.33910",True
2,2020-11-26T12:59Z,20,0,0,Charonne - Robert et Sonia Delauney,"48.85591,2.39257",True
3,2020-11-26T12:59Z,21,0,1,Toudouze - Clauzel,"48.87930,2.33736",True
4,2020-11-26T12:59Z,30,3,1,Mairie du 12√®me,"48.84086,2.38755",True
5,2020-11-26T12:59Z,46,18,10,Harpe - Saint-Germain,"48.85152,2.34367",True
6,2020-11-26T12:59Z,60,5,2,Jourdan - Stade Charl√©ty,"48.81943,2.34334",True
7,2020-11-26T12:59Z,40,15,1,Jouffroy d'Abbans - Wagram,"48.88197,2.30113",True
8,2020-11-26T12:59Z,39,12,2,Guersant - Gouvion-Saint-Cyr,"48.88288,2.28767",True
9,2020-11-26T12:59Z,60,2,2,Alibert - Jemmapes,"48.87104,2.36610",True


Donn√©es vacances :


Unnamed: 0,date,holiday_name,type,start_date,end_date,zones,description
0,2030-01-01,1er janvier,holiday,NaT,NaT,,
1,2030-04-22,Lundi de P√¢ques,holiday,NaT,NaT,,
2,2030-05-01,1er mai,holiday,NaT,NaT,,
3,2030-05-08,8 mai,holiday,NaT,NaT,,
4,2030-05-30,Ascension,holiday,NaT,NaT,,
5,2030-06-10,Lundi de Pentec√¥te,holiday,NaT,NaT,,
6,2030-07-14,14 juillet,holiday,NaT,NaT,,
7,2030-08-15,Assomption,holiday,NaT,NaT,,
8,2030-11-01,Toussaint,holiday,NaT,NaT,,
9,2030-11-11,11 novembre,holiday,NaT,NaT,,


Donn√©es m√©t√©o :


Unnamed: 0,station_id,station_name,lat_deg,lon_deg,alt_m,timestamp,precip_mm,precip_dur_min,temp_c,dewpoint_c,humidity_rel_pct,wind_speed_10m_ms,wind_dir_deg,wind_gust_10m_ms,pressure_hpa,cloud_oktas,insolation_min,global_radiation_j_cm2,wmo_present_weather
0,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 00:00:00,0.0,0,0.7,-3.3,74.0,8.9,140.0,14.0,1013.8,8.0,0.0,0.0,0.0
1,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 01:00:00,0.0,0,0.9,-3.3,74.0,9.7,150.0,14.7,1013.1,8.0,0.0,0.0,0.0
2,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 02:00:00,0.0,0,0.7,-2.2,81.0,10.2,140.0,14.3,1012.6,8.0,0.0,0.0,0.0
3,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 03:00:00,0.0,20,0.8,-2.2,81.0,11.5,130.0,16.5,1011.0,8.0,0.0,0.0,0.0
4,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 04:00:00,0.0,0,0.9,-2.0,82.0,12.9,120.0,18.0,1008.7,8.0,0.0,0.0,0.0
5,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 05:00:00,0.0,10,0.5,-1.2,89.0,13.7,120.0,20.2,1006.4,8.0,0.0,0.0,70.0
6,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 06:00:00,0.0,60,0.6,-0.5,92.0,14.0,120.0,22.3,1005.0,8.0,0.0,0.0,68.0
7,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 07:00:00,0.4,60,0.6,0.0,96.0,14.6,120.0,22.1,1003.6,8.0,0.0,0.0,68.0
8,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 08:00:00,1.2,60,0.8,0.2,96.0,15.0,120.0,23.6,1001.7,8.0,0.0,0.0,68.0
9,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 09:00:00,5.1,60,0.9,0.4,97.0,14.1,110.0,21.2,1001.2,8.0,0.0,0.0,68.0


# Mod√©lisation des donn√©es
## Fonctions de mod√©lisation

In [31]:
import pandas as pd
import numpy as np

class FeatureBuilder:
    def __init__(self, velib, weather, calendar):
        self.velib = velib
        self.weather = weather
        self.calendar = calendar

    def _preprocess(self, velib, weather, calendar):
        """Pr√©pare et fusionne les datasets (fusion horaire simple)"""
        print("üßπ Pr√©paration des donn√©es...")

        # Nettoyage des dates (UTC)
        weather["timestamp"] = pd.to_datetime(weather["timestamp"], errors="coerce", utc=True)
        velib["time"] = pd.to_datetime(velib["time"], errors="coerce", utc=True)

        # Cl√© horaire
        weather["ts_hour"] = weather["timestamp"].dt.floor("h")
        velib["ts_hour"] = velib["time"].dt.floor("h")

        # ===== Flags m√©t√©o tr√®s simples (0/1) =====
        precip = pd.to_numeric(weather.get("precip_mm", 0), errors="coerce").fillna(0.0)
        precip_dur = pd.to_numeric(weather.get("precip_dur_min", 0), errors="coerce").fillna(0.0)
        ws = pd.to_numeric(weather.get("wind_speed_10m_ms", 0), errors="coerce").fillna(0.0)
        wg = pd.to_numeric(weather.get("wind_gust_10m_ms", 0), errors="coerce").fillna(0.0)
        cloud_oktas = pd.to_numeric(weather.get("cloud_oktas", np.nan), errors="coerce")

        # R√®gles binaires
        weather["pluie"]  = ((precip >= 0.1) | (precip_dur >= 5)).astype(int)
        weather["vent"]   = ((ws >= 8.0) | (wg >= 10.8)).astype(int)
        weather["soleil"] = cloud_oktas.le(2).fillna(False).astype(int)
        weather["nuage"]  = cloud_oktas.ge(6).fillna(False).astype(int)

        # 1 ligne par heure
        weather_flags = (
            weather.sort_values("timestamp")
                   .drop_duplicates(subset=["ts_hour"], keep="last")
                   [["ts_hour", "pluie", "vent", "soleil", "nuage"]]
                   .copy()
        )
        weather_flags[["pluie", "vent", "soleil", "nuage"]] = weather_flags[["pluie", "vent", "soleil", "nuage"]].fillna(0).astype(int)

        # Fusion m√©t√©o ‚Üî V√©lib
        df = velib.merge(weather_flags, on="ts_hour", how="left")
        for c in ["pluie", "vent", "soleil", "nuage"]:
            if c not in df.columns:
                df[c] = 0
        df[["pluie", "vent", "soleil", "nuage"]] = df[["pluie", "vent", "soleil", "nuage"]].fillna(0).astype(int)

        # Ajout calendrier
        df["date"] = df["time"].dt.tz_convert("Europe/Paris").dt.date
        cal = calendar.copy()
        cal["date"] = pd.to_datetime(cal["start_date"]).dt.date
        cal["holiday_flag"] = 1
        df = df.merge(cal[["date", "holiday_flag"]].drop_duplicates(), on="date", how="left")
        df["holiday_flag"] = df["holiday_flag"].fillna(0).astype(int)

        return df

    def _feature_engineering(self, df):
        """Cr√©e des variables d√©riv√©es utiles pour la pr√©diction"""
        print("‚öôÔ∏è Construction des features...")

        # TOTAL v√©los dispo & bornes libres
        df["available_total"] = df["available_mechanical"].fillna(0) + df["available_electrical"].fillna(0)
        df["docks_available"] = df["capacity"].fillna(0) - df["available_total"]

        # Taux d'occupation
        df["fill_rate"] = (df["available_total"] / df["capacity"].replace(0, pd.NA)).fillna(0).clip(0, 1)

        # ‚úÖ Calcul des ratios s√ªrs
        ratio_empty = (df["docks_available"] / df["capacity"].replace(0, np.nan)).replace([np.inf, -np.inf], np.nan).fillna(0).clip(0, 1)
        ratio_full = (df["available_total"] / df["capacity"].replace(0, np.nan)).replace([np.inf, -np.inf], np.nan).fillna(0).clip(0, 1)

        # ‚úÖ D√©finition automatique des seuils (adapt√©s √† ta distribution)
        empty_threshold = ratio_empty.quantile(0.70)  # stations avec beaucoup de place
        full_threshold  = ratio_full.quantile(0.30)   # stations avec assez de v√©los

        print(f"üìè Seuils appliqu√©s: target_empty >= {empty_threshold:.2f} | target_full >= {full_threshold:.2f}")

        # ‚úÖ Cibles binaires √©quilibr√©es
        df["target_empty"] = (ratio_empty >= empty_threshold).astype(int)
        df["target_full"]  = (ratio_full >= full_threshold).astype(int)

        # Heures / jours
        df = df.sort_values(["station_name", "time"])
        df["hour"] = df["time"].dt.hour
        df["day_of_week"] = df["time"].dt.day_name()

        # Moyenne glissante (3h) par station
        df["rolling_fill_rate"] = (
            df.groupby("station_name")["fill_rate"].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
        )

        # Week-end
        df["is_weekend"] = df["day_of_week"].isin(["Saturday", "Sunday"]).astype(int)

        return df

    def run(self):
        """Ex√©cution compl√®te du pipeline de feature engineering"""
        merged = self._preprocess(self.velib, self.weather, self.calendar)
        if merged is None:
            raise RuntimeError("[FeatureBuilder.run] _preprocess a renvoy√© None (attendu: DataFrame).")
        features = self._feature_engineering(merged)
        return features


# üß™ Exemple d'ex√©cution
feature_dataframe = FeatureBuilder(
    velib_historical_dataframe,
    weather_dataframe,
    calendar_dataframe
).run()


üßπ Pr√©paration des donn√©es...
‚öôÔ∏è Construction des features...


  df["fill_rate"] = (df["available_total"] / df["capacity"].replace(0, pd.NA)).fillna(0).clip(0, 1)


üìè Seuils appliqu√©s: target_empty >= 0.83 | target_full >= 0.17


## Data Exploration

In [None]:
resultat = feature_dataframe["station_name"].groupby("station_name").count()

display(resultat)


519          Jean Bleuzen - Square du 11 Novembre
1916         Jean Bleuzen - Square du 11 Novembre
3313         Jean Bleuzen - Square du 11 Novembre
4710         Jean Bleuzen - Square du 11 Novembre
6107         Jean Bleuzen - Square du 11 Novembre
                            ...                  
10980677                          √éle de la Jatte
10982075                          √éle de la Jatte
10983473                          √éle de la Jatte
10984872                          √éle de la Jatte
10986271                          √éle de la Jatte
Name: station_name, Length: 10986730, dtype: object

In [None]:
from __future__ import annotations
from typing import List, Optional, Tuple, Dict
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
import joblib


class VelibSimpleModel:
    """
    Mod√®le simple et lisible pour pr√©dire si une station sera vide/pleine.

    Principes :
    - Utilise uniquement les colonnes d√©j√† pr√©sentes dans votre DataFrame d'exemple
      (pas de m√©t√©o, pas de mapping).
    - Encodage temporel cyclique (√† partir de 'time').
    - Imputation l√©g√®re + standardisation pour les num√©riques, OHE pour station_id.
    - API courte: fit / predict_proba / predict_label / save / load
    - Param√®tres regroup√©s en dict (to_config / from_config).
    """

    # Colonnes de base attendues (selon votre DataFrame d'entr√©e)
    BASE_NUM_SCALED: List[str] = [
        # features temporelles
        "hour_ssin", "hour_ccos", "dow_sin", "dow_cos", "month",
    ]
    BASE_BIN_PASSTHROUGH: List[str] = [
        "holiday_flag", "is_weekend", "operative",
        "pluie", "vent", "soleil", "nuage",
    ]
    BASE_CATEG: List[str] = ["station_name"]

    def __init__(
        self,
        target_col: str = "target_empty",        # ou "target_full"
        model_type: str = "gb",                  # "gb" ou "logit"
        timezone: str = "Europe/Paris",
        random_state: int = 42,
        test_size: float = 0.2,
        verbose: bool = True,
    ):
        print("[__init__] ‚Üí D√©but initialisation du mod√®le")
        self.target_col = target_col
        self.model_type = model_type
        self.timezone = timezone
        self.random_state = random_state
        self.test_size = test_size
        self.verbose = verbose

        # Objets entra√Æn√©s
        self.feature_list_: List[str] = []
        self.preprocessor_: Optional[ColumnTransformer] = None
        self.pipeline_: Optional[Pipeline] = None
        print("[__init__] ‚úì Fin initialisation du mod√®le")

    # ------------- Helpers ---------------

    @staticmethod
    def _ensure_columns(df: pd.DataFrame, cols: List[str]) -> None:
        print("[_ensure_columns] ‚Üí V√©rification des colonnes requises...")
        missing = [c for c in cols if c not in df.columns]
        if missing:
            raise ValueError(f"Colonnes manquantes: {missing}")
        print("[_ensure_columns] ‚úì Toutes les colonnes sont pr√©sentes.")

    def _add_time_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Ajoute hour/dow/month + encodage cyclique √† partir de la colonne 'time'.
        On ne d√©pend pas des colonnes 'hour'/'day_of_week' existantes pour garder la logique simple et robuste.
        """
        print("[_add_time_features] ‚Üí D√©but g√©n√©ration des features temporelles...")
        if "time" not in df.columns:
            raise ValueError("La colonne 'time' est requise.")
        ts = pd.to_datetime(df["time"], utc=True).dt.tz_convert(self.timezone)

        out = df.copy()
        out["hour"] = ts.dt.hour
        out["dow"] = ts.dt.weekday     # 0 = lundi
        out["month"] = ts.dt.month

        out["hour_ssin"] = np.sin(2 * np.pi * out["hour"] / 24)
        out["hour_ccos"] = np.cos(2 * np.pi * out["hour"] / 24)
        out["dow_sin"]   = np.sin(2 * np.pi * out["dow"] / 7)
        out["dow_cos"]   = np.cos(2 * np.pi * out["dow"] / 7)
        print("[_add_time_features] ‚úì Features temporelles ajout√©es.")
        return out

    def _build_preprocessor(self) -> None:
        """
        Pr√©processeur:
        - Num√©riques (imputation m√©diane + standardisation)
        - Binaires (imputation la plus fr√©quente, pas de scaling)
        - Cat√©gorielles (OHE handle_unknown='ignore')
        """
        print("[_build_preprocessor] ‚Üí Construction du pr√©processeur...")
        # Gestion valeurs manquantes + standardisation
        num_pipe = Pipeline(steps=[
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler()),
        ])
        # Gestion valeurs manquantes (pas scaler pour garder 0/1 lisible)
        bin_pipe = Pipeline(steps=[
            ("imp", SimpleImputer(strategy="most_frequent")),
        ])
        # Transforme variables cat√©gorielles en binaire
        cat_pipe = Pipeline(steps=[
            ("imp", SimpleImputer(strategy="constant", fill_value="__MISSING__")),
            ("ohe", OneHotEncoder(
                handle_unknown="ignore",
                dtype=np.float32,
                sparse_output=True,
            )),
        ])

        # Liste des features finales
        self.feature_list_ = (
            self.BASE_NUM_SCALED
            + self.BASE_BIN_PASSTHROUGH
            + self.BASE_CATEG
        )

        # Applique les 3 pipelines de transformation + drop autres colonnes
        self.preprocessor_ = ColumnTransformer(
            transformers=[
                ("num", num_pipe, self.BASE_NUM_SCALED),
                ("bin", bin_pipe, self.BASE_BIN_PASSTHROUGH),
                ("cat", cat_pipe, self.BASE_CATEG),
            ],
            remainder="drop",
            sparse_threshold=1.0,
        )
        print("[_build_preprocessor] ‚úì Pr√©processeur construit.")

    def _prepare_features(
        self, df: pd.DataFrame, with_target: bool
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """
        - Ajoute les features temporelles
        - V√©rifie la pr√©sence des colonnes attendues
        - Renvoie X (et y si with_target)
        """
        print("[_prepare_features] ‚Üí Pr√©paration des features...")
        # Ajoute les features temporelles
        df2 = self._add_time_features(df)

        # Construire le pr√©processeur si pas encore fait
        if self.preprocessor_ is None:
            self._build_preprocessor()

        # V√©rifier les colonnes d'entr√©e attendues
        needed = (
            set(self.BASE_NUM_SCALED + self.BASE_BIN_PASSTHROUGH + self.BASE_CATEG)
            - {"hour_ssin", "hour_ccos", "dow_sin", "dow_cos", "month"}  # cr√©√©es ici
        )
        self._ensure_columns(df2, sorted(needed))

        X = df2[self.feature_list_] # Liste de colonne cr√©√©e dans _build_preprocessor()

        y = None
        if with_target: # V√©rification de la pr√©sence de la colonne cible (√† pr√©dire)
            if self.target_col not in df2.columns:
                raise ValueError(f"Colonne cible manquante: '{self.target_col}'")
            y = df2[self.target_col].astype(int)

        # Renvoi les colonnes de param√®tre ainsi que la colonne cible 
        print("[_prepare_features] ‚úì Features pr√©par√©es.")
        return X, y

    # ------------- API publique ---------------

    def fit(self, df: pd.DataFrame) -> Dict[str, float]:
        """
        Entra√Æne le mod√®le choisi et renvoie des m√©triques simples (AUC).
        """
        print("[fit] ‚Üí D√©but entra√Ænement du mod√®le...")
        
        self._build_preprocessor()# (r√©)initialise un pr√©processeur propre
        X, y = self._prepare_features(df, with_target=True) # Pr√©paration des features

        # Choix du classifieur
        if self.model_type == "logit":
            classifier = LogisticRegression(max_iter=300, solver="saga", verbose=1, random_state=self.random_state)
        else:
            classifier = GradientBoostingClassifier(random_state=self.random_state, verbose=1)

        # Assemblage pr√©processeur + classifieur
        self.pipeline_ = Pipeline(steps=[("preprocessor", self.preprocessor_), ("classifier", classifier)])

        X_tr, X_va, y_tr, y_va = train_test_split(
            X, y,
            test_size=self.test_size,
            stratify=y,
            random_state=self.random_state,
        )

        print("[fit] ‚Üí Entra√Ænement en cours...")
        self.pipeline_.fit(X_tr, y_tr)

        print("[fit] ‚úì Entra√Ænement termin√©. √âvaluation en cours...")

        proba = self.pipeline_.predict_proba(X_va)[:, 1]
        y_pred = (proba >= 0.5).astype(int)  # seuil simple

        acc = accuracy_score(y_va, y_pred)
        f1  = f1_score(y_va, y_pred)

        metrics = {
            "val_accuracy": float(acc),
            "val_f1": float(f1),
            "n_samples_train": int(len(X_tr)),
            "n_samples_val": int(len(X_va)),
        }

        if self.verbose:
            print(f"[{self.target_col}] {self.model_type.upper()}  "
                f"ACC={metrics['val_accuracy']:.3f}  F1={metrics['val_f1']:.3f}  "
                f"(train={metrics['n_samples_train']}, val={metrics['n_samples_val']})")


        print("[fit] ‚úì Fin de l'entra√Ænement et des m√©triques.")
        return metrics

    def predict_proba(self, df: pd.DataFrame) -> np.ndarray:
        """Probabilit√© d'√™tre positif (ex: vide si target_empty)."""
        print("[predict_proba] ‚Üí D√©but pr√©diction des probabilit√©s...")
        check_is_fitted(self.pipeline_, "named_steps")
        # NE PAS reconstruire le pr√©processeur ici
        X, _ = self._prepare_features(df, with_target=False)
        print("[predict_proba] ‚úì Fin pr√©diction des probabilit√©s.")
        return self.pipeline_.predict_proba(X)[:, 1]

    def predict_label(self, df: pd.DataFrame, threshold: float = 0.5) -> pd.Series:
        """Label binaire selon un seuil."""
        print("[predict_label] ‚Üí D√©but pr√©diction des labels...")
        p = self.predict_proba(df)
        print("[predict_label] ‚úì Fin pr√©diction des labels.")
        return pd.Series((p >= threshold).astype(int), index=df.index, name=f"{self.target_col}_pred")

    def save(self, path: str) -> None:
        """Sauvegarde le pipeline complet (pr√©traitement + mod√®le)."""
        print("[save] ‚Üí Sauvegarde du mod√®le...")
        check_is_fitted(self.pipeline_, "named_steps")
        joblib.dump(self.pipeline_, path)
        print("[save] ‚úì Mod√®le sauvegard√©.")

    def load(self, path: str) -> None:
        """Charge un pipeline entra√Æn√© (pr√©processeur inclus)."""
        print("[load] ‚Üí Chargement du mod√®le...")
        self.pipeline_ = joblib.load(path)
        # on r√©cup√®re le pr√©processeur et la liste de features du pipeline sauvegard√©
        if hasattr(self.pipeline_, "named_steps") and "preprocessor" in self.pipeline_.named_steps:
            self.preprocessor_ = self.pipeline_.named_steps["preprocessor"]
        # La feature_list_ est utile seulement pour _prepare_features (ordre des colonnes en entr√©e)
        # On la reconstruit √† partir des attributs de classe pour rester d√©terministe :
        self.feature_list_ = (
            self.BASE_NUM_SCALED + self.BASE_BIN_PASSTHROUGH + self.BASE_CATEG
        )
        print("[load] ‚úì Mod√®le charg√© avec succ√®s.")

    # --------- Config dict ---------

    @classmethod
    def from_config(cls, cfg: Dict) -> "VelibSimpleModel":
        print("[from_config] ‚Üí Cr√©ation du mod√®le depuis un dictionnaire de config...")
        model = cls(**cfg)
        print("[from_config] ‚úì Mod√®le cr√©√© depuis la config.")
        return model

    def to_config(self) -> Dict:
        print("[to_config] ‚Üí Export de la configuration du mod√®le...")
        cfg = {
            "target_col": self.target_col,
            "model_type": self.model_type,
            "timezone": self.timezone,
            "random_state": self.random_state,
            "test_size": self.test_size,
            "verbose": self.verbose,
        }
        print("[to_config] ‚úì Configuration export√©e.")
        return cfg


In [45]:
import pandas as pd

# 1. Charger vos donn√©es
df = feature_dataframe # pd.read_csv("velib_data.csv")  # ou df = votre_dataframe d√©j√† charg√©

# 2. Cr√©er le mod√®le avec vos param√®tres
config = {
    "target_col": "target_full",     # ou "target_full"
    "model_type": "logit",               # "gb" ou "logit"
    "timezone": "Europe/Paris",
    "random_state": 42,
    "test_size": 0.2,
    "verbose": True
}
model = VelibSimpleModel.from_config(config)

# 3. Lancer l'entra√Ænement
metrics = model.fit(feature_dataframe)

# 4. Afficher les r√©sultats
print("üìä R√©sultats de l'entra√Ænement :")
for k, v in metrics.items():
    print(f"  - {k}: {v}")

# 5. (Optionnel) Sauvegarder le mod√®le
model.save("target_full.joblib")


[from_config] ‚Üí Cr√©ation du mod√®le depuis un dictionnaire de config...
[__init__] ‚Üí D√©but initialisation du mod√®le
[__init__] ‚úì Fin initialisation du mod√®le
[from_config] ‚úì Mod√®le cr√©√© depuis la config.
[fit] ‚Üí D√©but entra√Ænement du mod√®le...
[_build_preprocessor] ‚Üí Construction du pr√©processeur...
[_build_preprocessor] ‚úì Pr√©processeur construit.
[_prepare_features] ‚Üí Pr√©paration des features...
[_add_time_features] ‚Üí D√©but g√©n√©ration des features temporelles...
[_add_time_features] ‚úì Features temporelles ajout√©es.
[_ensure_columns] ‚Üí V√©rification des colonnes requises...
[_ensure_columns] ‚úì Toutes les colonnes sont pr√©sentes.
[_prepare_features] ‚úì Features pr√©par√©es.
[fit] ‚Üí Entra√Ænement en cours...
convergence after 216 epochs took 1767 seconds
[fit] ‚úì Entra√Ænement termin√©. √âvaluation en cours...


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 29.5min finished


[target_full] LOGIT  ACC=0.777  F1=0.851  (train=8789384, val=2197346)
[fit] ‚úì Fin de l'entra√Ænement et des m√©triques.
üìä R√©sultats de l'entra√Ænement :
  - val_accuracy: 0.7769035918785663
  - val_f1: 0.8512563802098346
  - n_samples_train: 8789384
  - n_samples_val: 2197346
[save] ‚Üí Sauvegarde du mod√®le...
[save] ‚úì Mod√®le sauvegard√©.
