# Fonctions d'obtention des données
## Import des librairies

In [1]:
import pandas as pd
import requests
from config import URL_HOLIDAYS, URL_SCHOOL, URL_METEO_HOURLY
from datetime import datetime

## Class de lecture des données velib historiques

In [2]:
class VelibCsvReader:
    def __init__(self):
        self.file_path = "./data/dataset/historique_stations.csv"

    def read_dataframe(self):
        """Combine status + info sur les stations"""
        dataframe = pd.read_csv(
            filepath_or_buffer=self.file_path,
            sep=","
        )
        return dataframe

## Class API pour données sur les vacances

In [3]:
class HolidaysAPI:
    """
    Récupération des jours fériés et des vacances scolaires françaises.
    """

    def fetch_public_holidays(self) -> pd.DataFrame:
        try:
            data = requests.get(URL_HOLIDAYS, timeout=10).json()
            df = pd.DataFrame(list(data.items()), columns=["date", "holiday_name"])
            df["date"] = pd.to_datetime(df["date"])
            return df
        except Exception as e:
            raise RuntimeError(f"Erreur API jours fériés : {e}")

    def fetch_school_vacations(self) -> pd.DataFrame:
        try:
            data = requests.get(URL_SCHOOL, timeout=10).json()
            results = data.get("results", [])
            df = pd.DataFrame(results)
            if not df.empty:
                df = df[["start_date", "end_date", "zones", "description"]]
                df["start_date"] = pd.to_datetime(df["start_date"])
                df["end_date"] = pd.to_datetime(df["end_date"])
            return df
        except Exception as e:
            raise RuntimeError(f"Erreur API vacances scolaires : {e}")

## Class API pour les données météo historiques

In [10]:
import pandas as pd
from typing import Dict

class WeatherCsvReader:
    def __init__(self, file_path: str = "./data/dataset/historical_meteo.csv"):
        self.file_path = file_path
        
        # Mapping colonnes brutes -> colonnes standardisées
        self.WEATHER_COL_MAP = {
            "AAAAMMJJHH": "timestamp",
            "NUM_POSTE":  "station_id",
            "NOM_USUEL":  "station_name",
            "LAT":        "lat_deg",
            "LON":        "lon_deg",
            "ALTI":       "alt_m",
            "RR1":        "precip_mm",
            "DRR1":       "precip_dur_min",
            "T":          "temp_c",
            "TD":         "dewpoint_c",
            "U":          "humidity_rel_pct",
            "FF":         "wind_speed_10m_ms",
            "DD":         "wind_dir_deg",
            "FXI":        "wind_gust_10m_ms",
            "PSTAT":      "pressure_hpa_station",
            "PMER":       "pressure_hpa_sea",

            "N":          "cloud_oktas",
            "INS":        "insolation_min",            # plan B si N absent
            "GLO":        "global_radiation_j_cm2",    # plan B bis
            "WW":         "wmo_present_weather",       # optionnel mais utile
        }

        # Schéma final (cible) — timestamp est parsé à part
        self.WEATHER_DTYPES: Dict[str, str] = {
            "station_id":          "string",
            "station_name":        "string",
            "lat_deg":             "float64",
            "lon_deg":             "float64",
            "alt_m":               "Int64",
            "timestamp":           "datetime64[ns]",
            "precip_mm":           "float64",
            "precip_dur_min":      "Int64",
            "temp_c":              "float64",
            "dewpoint_c":          "float64",
            "humidity_rel_pct":    "float64",
            "wind_speed_10m_ms":   "float64",
            "wind_dir_deg":        "float64",
            "wind_gust_10m_ms":    "float64",
            "pressure_hpa":        "float64",
            
            "cloud_oktas":   "float64",
            "insolation_min":        "float64",
            "global_radiation_j_cm2":    "float64",
            "wmo_present_weather":        "float64",
        }

    def read_dataframe(self) -> pd.DataFrame:
        """Lecture brute du CSV (séparateur ';')."""
        try:
            return pd.read_csv(self.file_path, sep=";")
        except Exception as e:
            raise RuntimeError(f"Erreur API météo : {e}")

    def _select_rename(self, df: pd.DataFrame) -> pd.DataFrame:
        """Garde uniquement les colonnes connues puis les renomme (sans boucles)."""
        cols_src = df.columns.intersection(self.WEATHER_COL_MAP.keys())
        return df.loc[:, cols_src].rename(columns=self.WEATHER_COL_MAP)

    def read_standardized(self) -> pd.DataFrame:
        """
        Pipeline vectorisé :
        - select/rename
        - parse timestamp (AAAAMMJJHH)
        - coalesce pression (PSTAT prioritaire sur PMER)
        - cast global selon schéma
        - ordre de colonnes propre
        """
        df_raw = self.read_dataframe()
        df = self._select_rename(df_raw)

        # Parse timestamp (format AAAAMMJJHH)
        df = df.assign(
            timestamp=pd.to_datetime(
                df["timestamp"].astype("string"),
                format="%Y%m%d%H",
                errors="coerce"
            )
        )

        # Coalesce pression (priorité station -> mer) si colonnes présentes
        # (si absentes, bfill s'applique sur colonnes manquantes sans boucle)
        pressure_sources = df.filter(items=["pressure_hpa_station", "pressure_hpa_sea"])
        if not pressure_sources.empty:
            df["pressure_hpa"] = pressure_sources.bfill(axis=1).iloc[:, 0]

        # Ordre + cast en une seule passe (les colonnes manquantes seront ajoutées vides)
        df = (
            df
            .reindex(columns=self.WEATHER_DTYPES.keys())  # ordre final
            .astype(self.WEATHER_DTYPES, errors="ignore")  # cast "df.cast(schema)" version pandas
        )

        return df


# Exploration des données

In [11]:
velib_historical_dataframe = VelibCsvReader().read_dataframe()
print("Données Vélos :")
display(velib_historical_dataframe.head(10))

holidays_instance = HolidaysAPI()
public_holidays_dataframe = holidays_instance.fetch_public_holidays()
vacations_dataframe = holidays_instance.fetch_school_vacations()
public_holidays_dataframe["type"] = "holiday"
vacations_dataframe["type"] = "vacation"
calendar_dataframe = pd.concat([public_holidays_dataframe, vacations_dataframe], ignore_index=True)
print("Données vacances :")
display(calendar_dataframe.head(10))

weather_dataframe = WeatherCsvReader().read_standardized()
print("Données météo :")
display(weather_dataframe.head(10))



Données Vélos :


Unnamed: 0,time,capacity,available_mechanical,available_electrical,station_name,station_geo,operative
0,2020-11-26T12:59Z,35,4,5,Benjamin Godard - Victor Hugo,"48.86598,2.27572",True
1,2020-11-26T12:59Z,55,23,4,André Mazet - Saint-André des Arts,"48.85376,2.33910",True
2,2020-11-26T12:59Z,20,0,0,Charonne - Robert et Sonia Delauney,"48.85591,2.39257",True
3,2020-11-26T12:59Z,21,0,1,Toudouze - Clauzel,"48.87930,2.33736",True
4,2020-11-26T12:59Z,30,3,1,Mairie du 12ème,"48.84086,2.38755",True
5,2020-11-26T12:59Z,46,18,10,Harpe - Saint-Germain,"48.85152,2.34367",True
6,2020-11-26T12:59Z,60,5,2,Jourdan - Stade Charléty,"48.81943,2.34334",True
7,2020-11-26T12:59Z,40,15,1,Jouffroy d'Abbans - Wagram,"48.88197,2.30113",True
8,2020-11-26T12:59Z,39,12,2,Guersant - Gouvion-Saint-Cyr,"48.88288,2.28767",True
9,2020-11-26T12:59Z,60,2,2,Alibert - Jemmapes,"48.87104,2.36610",True


Données vacances :


Unnamed: 0,date,holiday_name,type,start_date,end_date,zones,description
0,2030-01-01,1er janvier,holiday,NaT,NaT,,
1,2030-04-22,Lundi de Pâques,holiday,NaT,NaT,,
2,2030-05-01,1er mai,holiday,NaT,NaT,,
3,2030-05-08,8 mai,holiday,NaT,NaT,,
4,2030-05-30,Ascension,holiday,NaT,NaT,,
5,2030-06-10,Lundi de Pentecôte,holiday,NaT,NaT,,
6,2030-07-14,14 juillet,holiday,NaT,NaT,,
7,2030-08-15,Assomption,holiday,NaT,NaT,,
8,2030-11-01,Toussaint,holiday,NaT,NaT,,
9,2030-11-11,11 novembre,holiday,NaT,NaT,,


Données météo :


Unnamed: 0,station_id,station_name,lat_deg,lon_deg,alt_m,timestamp,precip_mm,precip_dur_min,temp_c,dewpoint_c,humidity_rel_pct,wind_speed_10m_ms,wind_dir_deg,wind_gust_10m_ms,pressure_hpa,cloud_oktas,insolation_min,global_radiation_j_cm2,wmo_present_weather
0,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 00:00:00,0.0,0,0.7,-3.3,74.0,8.9,140.0,14.0,1013.8,8.0,0.0,0.0,0.0
1,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 01:00:00,0.0,0,0.9,-3.3,74.0,9.7,150.0,14.7,1013.1,8.0,0.0,0.0,0.0
2,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 02:00:00,0.0,0,0.7,-2.2,81.0,10.2,140.0,14.3,1012.6,8.0,0.0,0.0,0.0
3,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 03:00:00,0.0,20,0.8,-2.2,81.0,11.5,130.0,16.5,1011.0,8.0,0.0,0.0,0.0
4,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 04:00:00,0.0,0,0.9,-2.0,82.0,12.9,120.0,18.0,1008.7,8.0,0.0,0.0,0.0
5,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 05:00:00,0.0,10,0.5,-1.2,89.0,13.7,120.0,20.2,1006.4,8.0,0.0,0.0,70.0
6,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 06:00:00,0.0,60,0.6,-0.5,92.0,14.0,120.0,22.3,1005.0,8.0,0.0,0.0,68.0
7,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 07:00:00,0.4,60,0.6,0.0,96.0,14.6,120.0,22.1,1003.6,8.0,0.0,0.0,68.0
8,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 08:00:00,1.2,60,0.8,0.2,96.0,15.0,120.0,23.6,1001.7,8.0,0.0,0.0,68.0
9,97502001,ST-PIERRE,46.766333,-56.179167,21,2020-01-01 09:00:00,5.1,60,0.9,0.4,97.0,14.1,110.0,21.2,1001.2,8.0,0.0,0.0,68.0


# Modélisation des données
## Fonctions de modélisation

In [7]:
import pandas as pd


class FeatureBuilder:
    def __init__(self, velib, weather, calendar):
        self.velib = velib
        self.weather = weather
        self.calendar = calendar

    def _preprocess(self, velib, weather, calendar):
        """Prépare et fusionne les datasets (fusion horaire simple)"""
        print("🧹 Préparation des données...")

        # Nettoyage des dates
        weather["timestamp"] = pd.to_datetime(weather["timestamp"], errors="coerce", utc=True)
        velib["time"] = pd.to_datetime(velib["time"], errors="coerce", utc=True)

        # Création d'une clé horaire commune
        weather["ts_hour"] = weather["timestamp"].dt.floor("h")
        velib["ts_hour"] = velib["time"].dt.floor("h")

        # (Optionnel mais recommandé) supprimer doublons météo par heure
        weather = weather.drop_duplicates(subset=["ts_hour"], keep="last")

        # Fusion météo ↔ Vélib sur l'heure (plus de produit cartésien)
        df = pd.merge(velib, weather, on="ts_hour", how="left", suffixes=("", "_weather"))

        # Ajout indicateurs calendrier (fusion par date)
        df["date"] = df["timestamp"].dt.tz_convert("Europe/Paris").dt.date
        calendar["date"] = pd.to_datetime(calendar["start_date"]).dt.date
        calendar["holiday_flag"] = 1
        df = pd.merge(df, calendar[["date", "holiday_flag"]].drop_duplicates(), on="date", how="left")
        df["holiday_flag"] = df["holiday_flag"].fillna(0).astype(int)

        return df

    def _feature_engineering(self, df):
        """Crée des variables dérivées utiles pour la prédiction"""
        print("⚙️ Construction des features...")

        # TOTAL vélos dispo & bornes libres
        df["available_total"] = df["available_mechanical"].fillna(0) + df["available_electrical"].fillna(0)
        df["docks_available"] = df["capacity"].fillna(0) - df["available_total"]

        # Taux d'occupation
        df["fill_rate"] = (df["available_total"] / df["capacity"].replace(0, pd.NA)).fillna(0).clip(0, 1)

        # Cibles binaires (choisis l’une pour ton modèle)
        # - "VIDE": plus de vélos (≤ seuil)
        df["target_empty"] = (df["available_total"] <= 1).astype(int)   # seuil hackathon : 0 ou 1 vélo

        # - "PLEINE": plus de place pour reposer le vélo (≤ seuil)
        df["target_full"]  = (df["docks_available"] <= 1).astype(int)   # seuil hackathon : 0 ou 1 place

        # Heures et jours
        df["hour"] = df["timestamp"].dt.hour
        df["day_of_week"] = df["timestamp"].dt.day_name()

        # Moyenne glissante par station (3 dernières heures)
        df = df.sort_values(["station_id", "timestamp"])
        df["rolling_fill_rate"] = (
            df.groupby("station_id")["fill_rate"]
              .transform(lambda x: x.rolling(window=3, min_periods=1).mean())
        )

        # Température normalisée
        # df["temp_norm"] = (df["temperature_2m"] - df["temperature_2m"].mean()) / df["temperature_2m"].std()

        # Indicateur jour/week-end
        df["is_weekend"] = df["day_of_week"].isin(["Saturday", "Sunday"]).astype(int)

        return df

    def run(self):
        """Exécution complète du pipeline de feature engineering"""
        velib, weather, calendar = [
            self.velib,
            self.weather, 
            self.calendar
        ]
        merged = self._preprocess(velib, weather, calendar)
        features = self._feature_engineering(merged)
        return features


feature_dataframe = FeatureBuilder(
    velib_historical_dataframe,
    weather_dataframe,
    calendar_dataframe
).run()


🧹 Préparation des données...
⚙️ Construction des features...


  df["fill_rate"] = (df["available_total"] / df["capacity"].replace(0, pd.NA)).fillna(0).clip(0, 1)


## Data Exploration

In [8]:
display(feature_dataframe.head(10))

Unnamed: 0,time,capacity,available_mechanical,available_electrical,station_name,station_geo,operative,ts_hour,station_id,station_name_weather,...,holiday_flag,available_total,docks_available,fill_rate,target_empty,target_full,hour,day_of_week,rolling_fill_rate,is_weekend
0,2020-11-26 12:59:00+00:00,35,4,5,Benjamin Godard - Victor Hugo,"48.86598,2.27572",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,9,26,0.257143,0,0,12,Thursday,0.257143,0
1,2020-11-26 12:59:00+00:00,55,23,4,André Mazet - Saint-André des Arts,"48.85376,2.33910",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,27,28,0.490909,0,0,12,Thursday,0.374026,0
2,2020-11-26 12:59:00+00:00,20,0,0,Charonne - Robert et Sonia Delauney,"48.85591,2.39257",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,0,20,0.0,1,0,12,Thursday,0.249351,0
3,2020-11-26 12:59:00+00:00,21,0,1,Toudouze - Clauzel,"48.87930,2.33736",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,1,20,0.047619,1,0,12,Thursday,0.179509,0
4,2020-11-26 12:59:00+00:00,30,3,1,Mairie du 12ème,"48.84086,2.38755",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,4,26,0.133333,0,0,12,Thursday,0.060317,0
5,2020-11-26 12:59:00+00:00,46,18,10,Harpe - Saint-Germain,"48.85152,2.34367",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,28,18,0.608696,0,0,12,Thursday,0.263216,0
6,2020-11-26 12:59:00+00:00,60,5,2,Jourdan - Stade Charléty,"48.81943,2.34334",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,7,53,0.116667,0,0,12,Thursday,0.286232,0
7,2020-11-26 12:59:00+00:00,40,15,1,Jouffroy d'Abbans - Wagram,"48.88197,2.30113",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,16,24,0.4,0,0,12,Thursday,0.375121,0
8,2020-11-26 12:59:00+00:00,39,12,2,Guersant - Gouvion-Saint-Cyr,"48.88288,2.28767",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,14,25,0.358974,0,0,12,Thursday,0.29188,0
9,2020-11-26 12:59:00+00:00,60,2,2,Alibert - Jemmapes,"48.87104,2.36610",True,2020-11-26 12:00:00+00:00,97502001,ST-PIERRE,...,0,4,56,0.066667,0,0,12,Thursday,0.275214,0
