In [34]:
# Import
import pandas as pd
import numpy as np

In [35]:
# Load the ML dataset from its CSV file
RAW_DATASET_PATH = "../data/dataset_ml.csv"
df = pd.read_csv(RAW_DATASET_PATH)

In [36]:
# Preview the first five rows of the freshly loaded dataset
df.head()

Unnamed: 0,user_id,mois,depense_totale,depense_prev,target,profil,salaire,loyer,nb_enfants,tier_conso,conso_factor
0,1,2020-02-01,2583.554464,2548.787804,2280.959964,Famille Modeste,2025,605,3,Standard,1.9
1,1,2020-03-01,2280.959964,2583.554464,2227.944459,Famille Modeste,2025,605,3,Standard,1.9
2,1,2020-04-01,2227.944459,2280.959964,2315.965943,Famille Modeste,2025,605,3,Standard,1.9
3,1,2020-05-01,2315.965943,2227.944459,2406.175426,Famille Modeste,2025,605,3,Standard,1.9
4,1,2020-06-01,2406.175426,2315.965943,2402.397344,Famille Modeste,2025,605,3,Standard,1.9


In [37]:
# (number of rows, number of columns) of the loaded dataset
df.shape

(88500, 11)

### Nettoyage de base

In [38]:
# Keep only the rows that have a target value, then count the missing
# values left in every column.
df = df.dropna(axis="index", subset=["target"])
df.isna().sum()

user_id           0
mois              0
depense_totale    0
depense_prev      0
target            0
profil            0
salaire           0
loyer             0
nb_enfants        0
tier_conso        0
conso_factor      0
dtype: int64

In [39]:
# Number of rows per `profil` category
df["profil"].value_counts()

profil
Jeune actif        22951
Famille Modeste    19765
Cadre              16933
Étudiant           12744
Famille Aisée       8378
Retraité            7729
Name: count, dtype: int64

In [40]:
# Number of rows per `tier_conso` category
df["tier_conso"].value_counts()

tier_conso
Standard    52687
Premium     22420
Eco         13393
Name: count, dtype: int64

In [42]:
# Column dtypes, non-null counts and memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88500 entries, 0 to 88499
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   user_id         88500 non-null  int64  
 1   mois            88500 non-null  object 
 2   depense_totale  88500 non-null  float64
 3   depense_prev    88500 non-null  float64
 4   target          88500 non-null  float64
 5   profil          88500 non-null  object 
 6   salaire         88500 non-null  int64  
 7   loyer           88500 non-null  int64  
 8   nb_enfants      88500 non-null  int64  
 9   tier_conso      88500 non-null  object 
 10  conso_factor    88500 non-null  float64
dtypes: float64(4), int64(4), object(3)
memory usage: 7.4+ MB


### Conversion des dates

In [12]:
# Parse `mois` into real timestamps and order the rows chronologically inside
# each user; the group-wise shift features computed later rely on this order.
df = df.assign(mois=pd.to_datetime(df["mois"])).sort_values(by=["user_id", "mois"])

### Extraction de features temporelles

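On extrait :
- le mois (1–12)
- le trimestre
- l’année
- est-ce un début d’année ? (indicateur prévu, non calculé dans la cellule ci-dessous)
- est-ce une période de fêtes ? (novembre–décembre)
- un index temporel : rang du mois dans l’historique de chaque utilisateur (équivalent à un index global tant que tous les utilisateurs couvrent la même période, sans mois manquant)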

In [13]:
# Calendar components read from the month timestamp
month_stamp = df["mois"].dt
df["month"] = month_stamp.month
df["quarter"] = month_stamp.quarter
df["year"] = month_stamp.year

# Holiday-season flag: 1 for November and December, 0 otherwise
df["periode_fetes"] = ((df["month"] == 11) | (df["month"] == 12)).astype(int)

# Time index: 0-based position of each row inside its user's history, in the
# current row order of df. It is a per-user counter, not a calendar index.
# NOTE(review): it only matches a global month index if every user starts on
# the same month and has no missing month -- confirm on the data.
df["time_idx"] = df.groupby("user_id").cumcount()

### Visualisation des variables à encoder

In [14]:
# Row counts per `profil` category, checked just before encoding
df["profil"].value_counts()

profil
Jeune actif        22951
Famille Modeste    19765
Cadre              16933
Étudiant           12744
Famille Aisée       8378
Retraité            7729
Name: count, dtype: int64

In [15]:
# Row counts per `tier_conso` category, checked just before encoding
df["tier_conso"].value_counts()

tier_conso
Standard    52687
Premium     22420
Eco         13393
Name: count, dtype: int64

### Encodage des variables catégorielles

On encode en codes entiers (un code par catégorie, dans l’ordre trié des libellés) :
- profil
- tier_conso

In [16]:
# Turn each text column into a pandas categorical and store its integer
# category code in a companion column.
for source_col, code_col in (("profil", "profil_code"), ("tier_conso", "tier_code")):
    df[source_col] = df[source_col].astype("category")
    df[code_col] = df[source_col].cat.codes

In [17]:
# Remove the original text columns; only their integer codes are kept
df = df.drop(["profil", "tier_conso"], axis="columns")

In [18]:
# Inspect the encoded profile codes. Mapping (codes follow the sorted labels):
#   0 : Cadre
#   1 : Famille Aisée
#   2 : Famille Modeste
#   3 : Jeune actif
#   4 : Retraité
#   5 : Étudiant
df["profil_code"].value_counts()

profil_code
3    22951
2    19765
0    16933
5    12744
1     8378
4     7729
Name: count, dtype: int64

In [19]:
# Inspect the encoded consumption-tier codes. Mapping:
#   0 : Eco
#   1 : Premium
#   2 : Standard
df["tier_code"].value_counts()

tier_code
2    52687
1    22420
0    13393
Name: count, dtype: int64

### Feature Engineering
 
Les modèles temporels marchent mieux avec plusieurs `lags` (rétroviseur) :
- Transformer un simple tableau de chiffres en données temporelles intelligentes
- Donner de la mémoire à notre modèle
- La « rolling mean » (tendance de fond) : moyenne des dépenses des 3 mois précédents, c’est-à-dire le train de vie moyen récent

In [20]:
# Lag features: the user's own spending 1, 2 and 3 rows (months) earlier.
# The shift is done per user_id, so a user's first rows get NaN instead of
# borrowing values from the previous user. This relies on df being ordered
# by (user_id, mois).
spend_by_user = df.groupby("user_id")["depense_totale"]
for n_back in (1, 2, 3):
    df[f"lag_{n_back}"] = spend_by_user.shift(n_back)

# Mean of the three previous months, with the window evaluated inside each
# user's own series. A plain .rolling(3) on the shifted column would slide
# across user boundaries and stay correct only thanks to the leading NaN of
# each user combined with the default min_periods; grouping makes the
# isolation explicit. The first three rows of every user are NaN.
df["rolling_mean_3"] = df.groupby("user_id")["lag_1"].transform(
    lambda prev_spend: prev_spend.rolling(3).mean()
)

In [21]:
# Drop every row that still has a missing value. After the lag step these are
# each user's first three months, for which lag_3 and rolling_mean_3 are NaN
# (the earlier null check reported no other missing values).
df = df.dropna()

In [29]:
# Preview the first five rows of the prepared frame, including the new features
df.head()

Unnamed: 0,user_id,mois,depense_totale,depense_prev,target,salaire,loyer,nb_enfants,conso_factor,month,quarter,year,periode_fetes,time_idx,profil_code,tier_code,lag_1,lag_2,lag_3,rolling_mean_3
3,1,2020-05-01,2315.965943,2227.944459,2406.175426,2025,605,3,1.9,5,2,2020,0,3,2,2,2227.944459,2280.959964,2583.554464,2364.152962
4,1,2020-06-01,2406.175426,2315.965943,2402.397344,2025,605,3,1.9,6,2,2020,0,4,2,2,2315.965943,2227.944459,2280.959964,2274.956789
5,1,2020-07-01,2402.397344,2406.175426,2828.328576,2025,605,3,1.9,7,3,2020,0,5,2,2,2406.175426,2315.965943,2227.944459,2316.695276
6,1,2020-08-01,2828.328576,2402.397344,1574.711335,2025,605,3,1.9,8,3,2020,0,6,2,2,2402.397344,2406.175426,2315.965943,2374.846238
7,1,2020-09-01,1574.711335,2828.328576,2531.992039,2025,605,3,1.9,9,3,2020,0,7,2,2,2828.328576,2402.397344,2406.175426,2545.633782


In [30]:
# Write the prepared dataset to CSV, leaving the DataFrame index out
PREPARED_DATASET_PATH = "../data/dataset_ml_prepared.csv"
df.to_csv(PREPARED_DATASET_PATH, index=False)
print("Dataset préparé sauvegardé !")

Dataset préparé sauvegardé !
