# Feature Engineering

In [3]:
import pandas as pd
import numpy as np
import os

os.makedirs("../datas/processed", exist_ok=True)

SOURCE_CSV = "../datas/raw/capteur_C013.csv"
OUTPUT_CSV = "../datas/processed/capteur_C013_features.csv"

pd.set_option("display.max_columns", 100)


In [7]:
df = pd.read_csv(SOURCE_CSV, parse_dates=["timestamp"])
df = df.sort_values("timestamp").drop_duplicates(subset=["timestamp"])
print(df.shape)
df.head()

(336, 8)


Unnamed: 0,nom_salle,capacite_salle,timestamp,pression,temperature,nb_eleves_presents,temperature_ext,humidite
0,C013,30,2025-08-20 22:00:00,1014.8,32.0,0,34.7,35.5
1,C013,30,2025-08-20 22:30:00,1014.9,31.0,0,33.0,35.6
2,C013,30,2025-08-20 23:00:00,1014.7,30.7,0,33.0,40.3
3,C013,30,2025-08-20 23:30:00,1014.5,29.9,0,31.7,36.6
4,C013,30,2025-08-21 00:00:00,1014.5,29.9,0,31.0,40.6


In [8]:
df["jour"] = df["timestamp"].dt.date
df["heure"] = df["timestamp"].dt.hour
df["minute"] = df["timestamp"].dt.minute
df["jour_semaine"] = df["timestamp"].dt.dayofweek  # 0=lundi
df["est_weekend"] = (df["jour_semaine"] >= 5).astype(int)

hhmm = df["heure"] + df["minute"] / 60.0
df["salle_ouverte"] = ((hhmm >= 7.5) & (hhmm < 19.5)).astype(int)

phase = hhmm / 24.0 * 2 * np.pi
df["heure_sin"] = np.sin(phase)
df["heure_cos"] = np.cos(phase)

df[
    [
        "timestamp",
        "jour",
        "heure",
        "minute",
        "jour_semaine",
        "est_weekend",
        "salle_ouverte",
        "heure_sin",
        "heure_cos",
    ]
].head()

Unnamed: 0,timestamp,jour,heure,minute,jour_semaine,est_weekend,salle_ouverte,heure_sin,heure_cos
0,2025-08-20 22:00:00,2025-08-20,22,0,2,0,0,-0.5,0.866025
1,2025-08-20 22:30:00,2025-08-20,22,30,2,0,0,-0.382683,0.92388
2,2025-08-20 23:00:00,2025-08-20,23,0,2,0,0,-0.258819,0.965926
3,2025-08-20 23:30:00,2025-08-20,23,30,2,0,0,-0.130526,0.991445
4,2025-08-21 00:00:00,2025-08-21,0,0,3,0,0,0.0,1.0


In [9]:
if "capacite_salle" in df.columns:
    df["ratio_occupation"] = df["nb_eleves_presents"] / df["capacite_salle"]
else:
    df["ratio_occupation"] = np.nan

df["ecart_temp_int_ext"] = df["temperature"] - df["temperature_ext"]

df["ouvert_x_occupation"] = df["salle_ouverte"] * df["nb_eleves_presents"]

df[
    [
        "nb_eleves_presents",
        "capacite_salle",
        "ratio_occupation",
        "ecart_temp_int_ext",
        "ouvert_x_occupation",
    ]
].head()


Unnamed: 0,nb_eleves_presents,capacite_salle,ratio_occupation,ecart_temp_int_ext,ouvert_x_occupation
0,0,30,0.0,-2.7,0
1,0,30,0.0,-2.0,0
2,0,30,0.0,-2.3,0
3,0,30,0.0,-1.8,0
4,0,30,0.0,-1.1,0


In [10]:
df["delta_temperature"] = df["temperature"].diff()
df["delta_humidite"] = df["humidite"].diff()
df["delta_pression"] = df["pression"].diff()
df["delta_temperature_ext"] = df["temperature_ext"].diff()
df["delta_occupation"] = df["nb_eleves_presents"].diff()

for col in [
    "temperature",
    "temperature_ext",
    "humidite",
    "pression",
    "nb_eleves_presents",
]:
    df[f"{col}_lag1"] = df[col].shift(1)
    df[f"{col}_lag2"] = df[col].shift(2)

df[[c for c in df.columns if "lag" in c or "delta_" in c]].head()


Unnamed: 0,delta_temperature,delta_humidite,delta_pression,delta_temperature_ext,delta_occupation,temperature_lag1,temperature_lag2,temperature_ext_lag1,temperature_ext_lag2,humidite_lag1,humidite_lag2,pression_lag1,pression_lag2,nb_eleves_presents_lag1,nb_eleves_presents_lag2
0,,,,,,,,,,,,,,,
1,-1.0,0.1,0.1,-1.7,0.0,32.0,,34.7,,35.5,,1014.8,,0.0,
2,-0.3,4.7,-0.2,0.0,0.0,31.0,32.0,33.0,34.7,35.6,35.5,1014.9,1014.8,0.0,0.0
3,-0.8,-3.7,-0.2,-1.3,0.0,30.7,31.0,33.0,33.0,40.3,35.6,1014.7,1014.9,0.0,0.0
4,0.0,4.0,0.0,-0.7,0.0,29.9,30.7,31.7,33.0,36.6,40.3,1014.5,1014.7,0.0,0.0


In [11]:
def add_roll_feats(frame, col, windows=(2, 6)):
    for w in windows:
        frame[f"{col}_moy_gliss_{w}"] = (
            frame[col].rolling(window=w, min_periods=1).mean()
        )


for col in ["temperature", "temperature_ext", "humidite", "nb_eleves_presents"]:
    add_roll_feats(df, col)

df[[c for c in df.columns if "moy_gliss" in c]].head()


Unnamed: 0,temperature_moy_gliss_2,temperature_moy_gliss_6,temperature_ext_moy_gliss_2,temperature_ext_moy_gliss_6,humidite_moy_gliss_2,humidite_moy_gliss_6,nb_eleves_presents_moy_gliss_2,nb_eleves_presents_moy_gliss_6
0,32.0,32.0,34.7,34.7,35.5,35.5,0.0,0.0
1,31.5,31.5,33.85,33.85,35.55,35.55,0.0,0.0
2,30.85,31.233333,33.0,33.566667,37.95,37.133333,0.0,0.0
3,30.3,30.9,32.35,33.1,38.45,37.0,0.0,0.0
4,29.9,30.7,31.35,32.68,38.6,37.72,0.0,0.0


In [12]:
df = df.ffill().bfill()
df.isnull().sum().sort_values(ascending=False).head(10)

nom_salle               0
pression_lag2           0
delta_occupation        0
temperature_lag1        0
temperature_lag2        0
temperature_ext_lag1    0
temperature_ext_lag2    0
humidite_lag1           0
humidite_lag2           0
pression_lag1           0
dtype: int64

In [13]:
colonnes_out = [
    "timestamp",
    "nom_salle",
    "capacite_salle",
    "temperature",
    "temperature_ext",
    "humidite",
    "pression",
    "nb_eleves_presents",
    "jour",
    "heure",
    "minute",
    "jour_semaine",
    "est_weekend",
    "salle_ouverte",
    "heure_sin",
    "heure_cos",
    "ratio_occupation",
    "ecart_temp_int_ext",
    "ouvert_x_occupation",
    "delta_temperature",
    "delta_humidite",
    "delta_pression",
    "delta_temperature_ext",
    "delta_occupation",
    "temperature_lag1",
    "temperature_lag2",
    "temperature_ext_lag1",
    "temperature_ext_lag2",
    "humidite_lag1",
    "humidite_lag2",
    "pression_lag1",
    "pression_lag2",
    "nb_eleves_presents_lag1",
    "nb_eleves_presents_lag2",
    "temperature_moy_gliss_2",
    "temperature_moy_gliss_6",
    "temperature_ext_moy_gliss_2",
    "temperature_ext_moy_gliss_6",
    "humidite_moy_gliss_2",
    "humidite_moy_gliss_6",
    "nb_eleves_presents_moy_gliss_2",
    "nb_eleves_presents_moy_gliss_6",
]
colonnes_out = [c for c in colonnes_out if c in df.columns]
df_out = df[colonnes_out].copy()

df_out.to_csv(OUTPUT_CSV, index=False)
print("Export réalisé :", OUTPUT_CSV)
df_out.head()


Export réalisé : ../datas/processed/capteur_C013_features.csv


Unnamed: 0,timestamp,nom_salle,capacite_salle,temperature,temperature_ext,humidite,pression,nb_eleves_presents,jour,heure,minute,jour_semaine,est_weekend,salle_ouverte,heure_sin,heure_cos,ratio_occupation,ecart_temp_int_ext,ouvert_x_occupation,delta_temperature,delta_humidite,delta_pression,delta_temperature_ext,delta_occupation,temperature_lag1,temperature_lag2,temperature_ext_lag1,temperature_ext_lag2,humidite_lag1,humidite_lag2,pression_lag1,pression_lag2,nb_eleves_presents_lag1,nb_eleves_presents_lag2,temperature_moy_gliss_2,temperature_moy_gliss_6,temperature_ext_moy_gliss_2,temperature_ext_moy_gliss_6,humidite_moy_gliss_2,humidite_moy_gliss_6,nb_eleves_presents_moy_gliss_2,nb_eleves_presents_moy_gliss_6
0,2025-08-20 22:00:00,C013,30,32.0,34.7,35.5,1014.8,0,2025-08-20,22,0,2,0,0,-0.5,0.866025,0.0,-2.7,0,-1.0,0.1,0.1,-1.7,0.0,32.0,32.0,34.7,34.7,35.5,35.5,1014.8,1014.8,0.0,0.0,32.0,32.0,34.7,34.7,35.5,35.5,0.0,0.0
1,2025-08-20 22:30:00,C013,30,31.0,33.0,35.6,1014.9,0,2025-08-20,22,30,2,0,0,-0.382683,0.92388,0.0,-2.0,0,-1.0,0.1,0.1,-1.7,0.0,32.0,32.0,34.7,34.7,35.5,35.5,1014.8,1014.8,0.0,0.0,31.5,31.5,33.85,33.85,35.55,35.55,0.0,0.0
2,2025-08-20 23:00:00,C013,30,30.7,33.0,40.3,1014.7,0,2025-08-20,23,0,2,0,0,-0.258819,0.965926,0.0,-2.3,0,-0.3,4.7,-0.2,0.0,0.0,31.0,32.0,33.0,34.7,35.6,35.5,1014.9,1014.8,0.0,0.0,30.85,31.233333,33.0,33.566667,37.95,37.133333,0.0,0.0
3,2025-08-20 23:30:00,C013,30,29.9,31.7,36.6,1014.5,0,2025-08-20,23,30,2,0,0,-0.130526,0.991445,0.0,-1.8,0,-0.8,-3.7,-0.2,-1.3,0.0,30.7,31.0,33.0,33.0,40.3,35.6,1014.7,1014.9,0.0,0.0,30.3,30.9,32.35,33.1,38.45,37.0,0.0,0.0
4,2025-08-21 00:00:00,C013,30,29.9,31.0,40.6,1014.5,0,2025-08-21,0,0,3,0,0,0.0,1.0,0.0,-1.1,0,0.0,4.0,0.0,-0.7,0.0,29.9,30.7,31.7,33.0,36.6,40.3,1014.5,1014.7,0.0,0.0,29.9,30.7,31.35,32.68,38.6,37.72,0.0,0.0


## Synthèse

### 1. Qualité & préparation
- Données triées par `timestamp`.
- Aucune valeur manquante détectée après création des features.
- Granularité : 30 minutes, horizon d’analyse de 7 jours.

### 2. Features créées
- **Temporelles** : `jour`, `heure`, `minute`, `jour_semaine`, `est_weekend`, `salle_ouverte` (7h30–19h30).
- **Cyclicité** : `heure_sin`, `heure_cos`.
- **Contexte occupation** :  
  - `ratio_occupation = nb_eleves_presents / capacite_salle`  
  - `ecart_temp_int_ext = temperature - temperature_ext`  
  - `ouvert_x_occupation = salle_ouverte * nb_eleves_presents`
- **Mémoire & dynamique** :  
  - **Deltas** : `delta_temperature`, `delta_humidite`, `delta_pression`, `delta_temperature_ext`, `delta_occupation`  
  - **Lags** (30 min, 1 h) : `*_lag1`, `*_lag2` pour température, humidité, pression, ext., occupation
- **Tendances locales (lissage)** : moyennes glissantes 1 h et 3 h (`*_moy_gliss_2`, `*_moy_gliss_6`) pour température, température_ext, humidité, occupation.

### 3. Pourquoi ?
- **Température_ext** : principal moteur de la **température intérieure**.  
- **Occupation / ratio_occupation** : apporte la charge thermique humaine.  
- **Cyclicité & salle_ouverte** : distingue les régimes jour/nuit et l’effet « HVAC + élèves ».  
- **Lags / deltas / moyennes glissantes** : donnent au modèle **mémoire** et **tendance**, cruciales en séries temporelles.

### 4. Export
- Dataset enrichi sauvegardé dans :  
  **`../datas/processed/capteur_C013_features.csv`** 