# Modélisation Température

In [46]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

INPUT = "../datas/processed/capteur_C013_features.csv"
MODEL_DIR = "../models"
os.makedirs(MODEL_DIR, exist_ok=True)

pd.set_option("display.max_columns", 200)


In [47]:
df = pd.read_csv(INPUT, parse_dates=["timestamp"])
df = df.sort_values("timestamp").reset_index(drop=True)
df.head()


Unnamed: 0,timestamp,nom_salle,capacite_salle,temperature,temperature_ext,humidite,pression,nb_eleves_presents,jour,heure,minute,jour_semaine,est_weekend,salle_ouverte,heure_sin,heure_cos,ratio_occupation,ecart_temp_int_ext,ouvert_x_occupation,delta_temperature,delta_humidite,delta_pression,delta_temperature_ext,delta_occupation,temperature_lag1,temperature_lag2,temperature_ext_lag1,temperature_ext_lag2,humidite_lag1,humidite_lag2,pression_lag1,pression_lag2,nb_eleves_presents_lag1,nb_eleves_presents_lag2,temperature_moy_gliss_2,temperature_moy_gliss_6,temperature_ext_moy_gliss_2,temperature_ext_moy_gliss_6,humidite_moy_gliss_2,humidite_moy_gliss_6,nb_eleves_presents_moy_gliss_2,nb_eleves_presents_moy_gliss_6
0,2025-08-20 22:00:00,C013,30,32.0,34.7,35.5,1014.8,0,2025-08-20,22,0,2,0,0,-0.5,0.866025,0.0,-2.7,0,-1.0,0.1,0.1,-1.7,0.0,32.0,32.0,34.7,34.7,35.5,35.5,1014.8,1014.8,0.0,0.0,32.0,32.0,34.7,34.7,35.5,35.5,0.0,0.0
1,2025-08-20 22:30:00,C013,30,31.0,33.0,35.6,1014.9,0,2025-08-20,22,30,2,0,0,-0.382683,0.92388,0.0,-2.0,0,-1.0,0.1,0.1,-1.7,0.0,32.0,32.0,34.7,34.7,35.5,35.5,1014.8,1014.8,0.0,0.0,31.5,31.5,33.85,33.85,35.55,35.55,0.0,0.0
2,2025-08-20 23:00:00,C013,30,30.7,33.0,40.3,1014.7,0,2025-08-20,23,0,2,0,0,-0.258819,0.965926,0.0,-2.3,0,-0.3,4.7,-0.2,0.0,0.0,31.0,32.0,33.0,34.7,35.6,35.5,1014.9,1014.8,0.0,0.0,30.85,31.233333,33.0,33.566667,37.95,37.133333,0.0,0.0
3,2025-08-20 23:30:00,C013,30,29.9,31.7,36.6,1014.5,0,2025-08-20,23,30,2,0,0,-0.130526,0.991445,0.0,-1.8,0,-0.8,-3.7,-0.2,-1.3,0.0,30.7,31.0,33.0,33.0,40.3,35.6,1014.7,1014.9,0.0,0.0,30.3,30.9,32.35,33.1,38.45,37.0,0.0,0.0
4,2025-08-21 00:00:00,C013,30,29.9,31.0,40.6,1014.5,0,2025-08-21,0,0,3,0,0,0.0,1.0,0.0,-1.1,0,0.0,4.0,0.0,-0.7,0.0,29.9,30.7,31.7,33.0,36.6,40.3,1014.5,1014.7,0.0,0.0,29.9,30.7,31.35,32.68,38.6,37.72,0.0,0.0


In [48]:
H1, H2 = 2, 4
df["temperature_t+2"] = df["temperature"].shift(-H1)
df["temperature_t+4"] = df["temperature"].shift(-H2)

df["dT_t+2"] = df["temperature_t+2"] - df["temperature"]
df["dT_t+4"] = df["temperature_t+4"] - df["temperature"]

df_model = df.dropna(subset=["temperature_t+2", "temperature_t+4"]).reset_index(
    drop=True
)
df_model.shape


(332, 46)

In [49]:
feature_cols = [
    "temperature",
    "temperature_ext",
    "humidite",
    "pression",
    "nb_eleves_presents",
    "salle_ouverte",
    "heure_sin",
    "heure_cos",
    "jour_semaine",
    "est_weekend",
    "temperature_lag1",
    "temperature_lag2",
    "temperature_ext_lag1",
    "temperature_ext_lag2",
    "humidite_lag1",
    "humidite_lag2",
    "pression_lag1",
    "pression_lag2",
    "nb_eleves_presents_lag1",
    "nb_eleves_presents_lag2",
    "temperature_moy_gliss_2",
    "temperature_moy_gliss_6",
    "temperature_ext_moy_gliss_2",
    "temperature_ext_moy_gliss_6",
    "humidite_moy_gliss_2",
    "humidite_moy_gliss_6",
    "nb_eleves_presents_moy_gliss_2",
    "nb_eleves_presents_moy_gliss_6",
    "ratio_occupation",
    "ecart_temp_int_ext",
    "ouvert_x_occupation",
]

target_cols = ["temperature_t+2", "temperature_t+4"]
target_cols_d = ["dT_t+2", "dT_t+4"]

X = df_model[feature_cols].copy()
Y = df_model[target_cols].copy()
Yd = df_model[target_cols_d].copy()

X.shape, Y.shape, Yd.shape


((332, 31), (332, 2), (332, 2))

In [50]:


def eval_regression(y_true: pd.DataFrame, y_pred: np.ndarray, label=""):
    cols = list(y_true.columns)
    print(f"\nÉvaluation {label}")
    for i, col in enumerate(cols):
        y_t, y_p = y_true.iloc[:, i].to_numpy(), y_pred[:, i]
        mae = mean_absolute_error(y_t, y_p)
        rmse = root_mean_squared_error(y_t, y_p)
        print(f"{col:>16s} | MAE: {mae:.3f} | RMSE: {rmse:.3f}")

tscv = TimeSeriesSplit(n_splits=5)


In [51]:

def fit_eval_ridge(X, Y, model_name="ridge_temperature_levels"):
    for fold, (tr, te) in enumerate(tscv.split(X), 1):
        Xtr, Xte = X.iloc[tr], X.iloc[te]
        Ytr, Yte = Y.iloc[tr], Y.iloc[te]

        pipe = Pipeline(
            [
                ("scaler", StandardScaler()),
                ("reg", MultiOutputRegressor(Ridge(alpha=1.0, random_state=42))),
            ]
        )
        pipe.fit(Xtr, Ytr)
        y_pred = pipe.predict(Xte)
        eval_regression(Yte, y_pred, label=f"{model_name} - fold {fold}")

    pipe.fit(X, Y)
    joblib.dump(pipe, os.path.join(MODEL_DIR, f"{model_name}.joblib"))
    return pipe


ridge_levels = fit_eval_ridge(X, Y, model_name="ridge_temperature_levels")
ridge_deltas = fit_eval_ridge(X, Yd, model_name="ridge_temperature_deltas")



Évaluation ridge_temperature_levels - fold 1
 temperature_t+2 | MAE: 1.078 | RMSE: 1.305
 temperature_t+4 | MAE: 1.747 | RMSE: 2.111

Évaluation ridge_temperature_levels - fold 2
 temperature_t+2 | MAE: 1.544 | RMSE: 1.718
 temperature_t+4 | MAE: 2.347 | RMSE: 2.678

Évaluation ridge_temperature_levels - fold 3
 temperature_t+2 | MAE: 0.779 | RMSE: 0.971
 temperature_t+4 | MAE: 1.178 | RMSE: 1.402

Évaluation ridge_temperature_levels - fold 4
 temperature_t+2 | MAE: 0.780 | RMSE: 0.984
 temperature_t+4 | MAE: 0.928 | RMSE: 1.104

Évaluation ridge_temperature_levels - fold 5
 temperature_t+2 | MAE: 0.660 | RMSE: 0.811
 temperature_t+4 | MAE: 0.735 | RMSE: 0.896

Évaluation ridge_temperature_deltas - fold 1
          dT_t+2 | MAE: 1.088 | RMSE: 1.335
          dT_t+4 | MAE: 1.813 | RMSE: 2.176

Évaluation ridge_temperature_deltas - fold 2
          dT_t+2 | MAE: 1.485 | RMSE: 1.665
          dT_t+4 | MAE: 2.288 | RMSE: 2.586

Évaluation ridge_temperature_deltas - fold 3
          dT_t+2

In [52]:
def fit_eval_rf(X, Y, model_name="rf_temperature_levels"):
    for fold, (tr, te) in enumerate(tscv.split(X), 1):
        Xtr, Xte = X.iloc[tr], X.iloc[te]
        Ytr, Yte = Y.iloc[tr], Y.iloc[te]

        rf = MultiOutputRegressor(
            RandomForestRegressor(
                n_estimators=400,
                max_depth=None,
                min_samples_leaf=2,
                random_state=42,
                n_jobs=-1,
            )
        )
        rf.fit(Xtr, Ytr)
        y_pred = rf.predict(Xte)
        eval_regression(Yte, y_pred, label=f"{model_name} - fold {fold}")

    rf.fit(X, Y)
    joblib.dump(rf, os.path.join(MODEL_DIR, f"{model_name}.joblib"))
    return rf


rf_levels = fit_eval_rf(X, Y, model_name="rf_temperature_levels")
rf_deltas = fit_eval_rf(X, Yd, model_name="rf_temperature_deltas")



Évaluation rf_temperature_levels - fold 1
 temperature_t+2 | MAE: 1.808 | RMSE: 2.185
 temperature_t+4 | MAE: 1.677 | RMSE: 2.122

Évaluation rf_temperature_levels - fold 2
 temperature_t+2 | MAE: 0.786 | RMSE: 0.970
 temperature_t+4 | MAE: 0.840 | RMSE: 0.977

Évaluation rf_temperature_levels - fold 3
 temperature_t+2 | MAE: 0.761 | RMSE: 0.911
 temperature_t+4 | MAE: 0.783 | RMSE: 1.012

Évaluation rf_temperature_levels - fold 4
 temperature_t+2 | MAE: 0.800 | RMSE: 0.994
 temperature_t+4 | MAE: 0.779 | RMSE: 0.982

Évaluation rf_temperature_levels - fold 5
 temperature_t+2 | MAE: 0.532 | RMSE: 0.665
 temperature_t+4 | MAE: 0.565 | RMSE: 0.692

Évaluation rf_temperature_deltas - fold 1
          dT_t+2 | MAE: 0.775 | RMSE: 0.984
          dT_t+4 | MAE: 1.094 | RMSE: 1.403

Évaluation rf_temperature_deltas - fold 2
          dT_t+2 | MAE: 0.811 | RMSE: 1.017
          dT_t+4 | MAE: 0.692 | RMSE: 0.870

Évaluation rf_temperature_deltas - fold 3
          dT_t+2 | MAE: 0.594 | RMSE: 0.

In [53]:


param_grid = {
    "estimator__n_estimators": [200, 400, 600, 800],
    "estimator__max_depth": [5, 10, 20, None],
    "estimator__min_samples_leaf": [1, 2, 5],
    "estimator__max_features": ["sqrt", "log2", 0.7],
}

base_rf = MultiOutputRegressor(RandomForestRegressor(random_state=42, n_jobs=-1))
search = RandomizedSearchCV(
    base_rf,
    param_distributions=param_grid,
    n_iter=20,
    cv=TimeSeriesSplit(n_splits=5),
    scoring="neg_mean_absolute_error",
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

search.fit(X, Y)
print("Meilleurs hyperparamètres :", search.best_params_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=5, estimator__n_estimators=400; total time=   0.8s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=5, estimator__n_estimators=400; total time=   0.8s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=5, estimator__n_estimators=400; total time=   0.9s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=5, estimator__n_estimators=400; total time=   0.9s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=5, estimator__n_estimators=400; total time=   1.0s
[CV] END estimator__max_depth=5, estimator__max_features=log2, estimator__min_samples_leaf=2, estimator__n_estimators=800; total time=   1.6s
[CV] END estimator__max_depth=5, estimator__max_features=log2, estimato

In [54]:
BEST_RF_PARAMS = dict(
    n_estimators=400,
    max_depth=20,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
)


def fit_eval_rf_tuned(X, Y, model_name="rf_temperature_levels_tuned"):
    for fold, (tr, te) in enumerate(tscv.split(X), 1):
        Xtr, Xte = X.iloc[tr], X.iloc[te]
        Ytr, Yte = Y.iloc[tr], Y.iloc[te]

        rf = MultiOutputRegressor(RandomForestRegressor(**BEST_RF_PARAMS))
        rf.fit(Xtr, Ytr)
        y_pred = rf.predict(Xte)
        eval_regression(Yte, y_pred, label=f"{model_name} - fold {fold}")

    rf.fit(X, Y)
    path = os.path.join(MODEL_DIR, f"{model_name}.joblib")
    joblib.dump(rf, path)
    print("Modèle sauvegardé :", path)
    return rf


rf_levels = fit_eval_rf_tuned(X, Y, model_name="rf_temperature_levels_tuned")
rf_deltas = fit_eval_rf_tuned(X, Yd, model_name="rf_temperature_deltas_tuned")



Évaluation rf_temperature_levels_tuned - fold 1
 temperature_t+2 | MAE: 1.242 | RMSE: 1.607
 temperature_t+4 | MAE: 1.346 | RMSE: 1.798

Évaluation rf_temperature_levels_tuned - fold 2
 temperature_t+2 | MAE: 0.948 | RMSE: 1.088
 temperature_t+4 | MAE: 1.034 | RMSE: 1.289

Évaluation rf_temperature_levels_tuned - fold 3
 temperature_t+2 | MAE: 0.706 | RMSE: 0.887
 temperature_t+4 | MAE: 0.838 | RMSE: 1.029

Évaluation rf_temperature_levels_tuned - fold 4
 temperature_t+2 | MAE: 0.762 | RMSE: 0.938
 temperature_t+4 | MAE: 0.793 | RMSE: 0.979

Évaluation rf_temperature_levels_tuned - fold 5
 temperature_t+2 | MAE: 0.557 | RMSE: 0.711
 temperature_t+4 | MAE: 0.663 | RMSE: 0.859
Modèle sauvegardé : ../models/rf_temperature_levels_tuned.joblib

Évaluation rf_temperature_deltas_tuned - fold 1
          dT_t+2 | MAE: 0.775 | RMSE: 1.033
          dT_t+4 | MAE: 1.152 | RMSE: 1.479

Évaluation rf_temperature_deltas_tuned - fold 2
          dT_t+2 | MAE: 0.788 | RMSE: 0.981
          dT_t+4 | M

In [55]:
MODEL_LVL_PATH = os.path.join(MODEL_DIR, "rf_temperature_levels_tuned.joblib")
MODEL_DELTA_PATH = os.path.join(MODEL_DIR, "rf_temperature_deltas_tuned.joblib")

model_levels = joblib.load(MODEL_LVL_PATH)
model_deltas = joblib.load(MODEL_DELTA_PATH)

target_cols, target_cols_d

(['temperature_t+2', 'temperature_t+4'], ['dT_t+2', 'dT_t+4'])

In [56]:
def predire_temperature(df_features_courant: pd.DataFrame):
    
    assert all(col in df_features_courant.columns for col in X.columns), (
        "Les features d'entrée ne correspondent pas au jeu d'entraînement."
    )

    x_last = df_features_courant[X.columns].iloc[[-1]]

    y_levels = model_levels.predict(x_last)[0]
    y_deltas = model_deltas.predict(x_last)[0]

    return {
        "niveaux": {
            "temperature_t+1h": round(float(y_levels[0]), 1),
            "temperature_t+2h": round(float(y_levels[1]), 1),
        },
        "deltas": {
            "dT_+1h": round(float(y_deltas[0]), 1),
            "dT_+2h": round(float(y_deltas[1]), 1),
        },
    }


In [57]:
df_courant = df_model.copy()
preds_temp = predire_temperature(df_courant)
preds_temp


{'niveaux': {'temperature_t+1h': 28.7, 'temperature_t+2h': 28.7},
 'deltas': {'dT_+1h': -0.1, 'dT_+2h': -0.1}}

## Synthèse

### 1. Objectifs
- **Prévoir** la valeur de la variable cible (température / humidité / pression) aux horizons :
  - **+1h** (t+2 pas de 30 min),
  - **+2h** (t+4 pas de 30 min).
- **Prévoir la variation** (delta) de la cible aux mêmes horizons pour indiquer la **tendance** (hausse/baisse).

### 2. Données & cibles
- Données en entrée : **features enrichies** du notebook 02 (`../datas/processed/capteur_C013_features.csv`).
- Cibles créées par décalage temporel:
  - **Niveaux** : `cible_t+2`, `cible_t+4`.
  - **Deltas** : `dCible_t+2 = cible_t+2 - cible_t`, `dCible_t+4 = cible_t+4 - cible_t`.
- Les lignes de fin de série sans labels ont été **supprimées**.

### 3. Features utilisées
- **Signaux instantanés** : `temperature`, `temperature_ext`, `humidite`, `pression`, `nb_eleves_presents`.
- **Contexte temporel** : `salle_ouverte`, `heure_sin`, `heure_cos`, `jour_semaine`, `est_weekend`.
- **Mémoire** : `*_lag1`, `*_lag2`.
- **Tendances locales** : `*_moy_gliss_2` (1h), `*_moy_gliss_6` (3h).
- **Métier / interactions** : `ratio_occupation`, `ecart_temp_int_ext`, `ouvert_x_occupation`.

### 4. Modèles entraînés
- **Baseline linéaire** : `Ridge` (multi-sorties).
- **Modèle non linéaire** : `RandomForestRegressor` en **multi-sorties** pour prédire les deux horizons.
- **Validation** : `TimeSeriesSplit(n_splits=5)` pour respecter l’ordre temporel.

### 5. Évaluation (MAE / RMSE)
- Impression des **scores par fold** et par horizon ( +1h / +2h ), pour **niveaux** et **deltas**.
- Graphiques **Observé vs Prédit** sur le **dernier fold** :
  - Séries temporelles (courbes) et scatter avec diagonale **y = x**.
- **Lecture typique** : erreurs < 1.0 en MAE sur +1h / +2h pour la température (selon les folds et le tuning).

### 6. Tuning des hyperparamètres (RF)
- Recherche (RandomizedSearch) → meilleurs paramètres (exemple) :
  - `n_estimators=400`, `max_depth=20`, `min_samples_leaf=1`, `max_features="sqrt"`.
- **Bénéfices** : meilleure stabilité entre folds, compromis biais/variance plus robuste.
- 
