In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sqlalchemy import create_engine
import logging
from tqdm import tqdm

In [2]:
# Root folder that holds one sub-folder of CSV exports per year.
DOSSIER_RACINE = "data_part1/Data_All_Variables_2019_2024"
# Year sub-folder names as strings, "2019" through "2024" inclusive.
ANNEES = list(map(str, range(2019, 2025)))
# Only files whose name ends with this suffix are loaded.
EXTENSION = ".csv"
# Text encoding passed to the CSV reader.
ENCODAGE = "utf-8"

In [7]:
# 🧹 Step 2: load and concatenate every CSV file
def charger_tous_les_csv(dossier_racine, annees):
    """Load every CSV found under ``dossier_racine/<annee>`` into one DataFrame.

    Each year sub-folder is scanned for files whose name ends with the
    module-level ``EXTENSION``. Every matching file is read with ``;`` as the
    delimiter and the module-level ``ENCODAGE`` as encoding, then tagged with
    a ``source_annee`` column holding the name of the year folder it came from.
    Files that fail to parse are reported and skipped (best effort).

    Parameters
    ----------
    dossier_racine : str
        Root folder containing one sub-folder per year.
    annees : iterable of str
        Names of the year sub-folders to scan.

    Returns
    -------
    pandas.DataFrame
        All successfully read files stacked vertically with a fresh index.

    Raises
    ------
    ValueError
        If not a single file could be loaded (missing folders, no matching
        files, or every read failed).
    """
    # Materialise once so the years can be both iterated and reported below.
    annees = list(annees)
    dataframes = []
    for annee in annees:
        chemin_annee = os.path.join(dossier_racine, annee)
        if not os.path.isdir(chemin_annee):
            # Make skipped years visible instead of dropping them silently.
            print(f"Dossier introuvable, ignoré : {chemin_annee}")
            continue
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order of the concatenated frame reproducible across runs.
        for fichier in tqdm(sorted(os.listdir(chemin_annee)), desc=annee):
            if not fichier.endswith(EXTENSION):
                continue
            chemin_fichier = os.path.join(chemin_annee, fichier)
            try:
                df = pd.read_csv(chemin_fichier, encoding=ENCODAGE, delimiter=';')
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                print(f"Erreur avec {fichier}: {e}")
                continue
            df["source_annee"] = annee
            dataframes.append(df)

    if not dataframes:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the ValueError type but say what was actually searched.
        raise ValueError(
            f"Aucun fichier chargé depuis {dossier_racine!r} pour les années {annees}"
        )
    return pd.concat(dataframes, ignore_index=True)

In [16]:
# Read every yearly CSV export into a single raw frame.
df = charger_tous_les_csv(dossier_racine=DOSSIER_RACINE, annees=ANNEES)
# Report the (rows, columns) shape of what was loaded.
print(f"✅ Données chargées : {df.shape}")
# Preview the first rows as the cell's rich output.
df.head()

100%|██████████| 13/13 [00:00<00:00, 217.37it/s]
100%|██████████| 12/12 [00:00<00:00, 510.14it/s]
100%|██████████| 13/13 [00:00<00:00, 521.44it/s]
100%|██████████| 12/12 [00:00<00:00, 166.95it/s]
100%|██████████| 12/12 [00:00<00:00, 62.69it/s]
100%|██████████| 11/11 [00:00<00:00, 469.51it/s]

✅ Données chargées : (365376, 7)





Unnamed: 0,FechaHora,ROOT.meteoPDL.AirTemperature.hf,ROOT.meteoPDL.Pyr1IrradianceCompensated.hf,ROOT.meteoPDL.RelativeHumidity.hf,ROOT.meteoPDL.WindSpeed.hf,Unnamed: 5,source_annee
0,14/04/2019 00:00:59.000,31.700001,0.02737,17.251152,0.84757,,2019
1,14/04/2019 00:30:59.000,31.874424,0.09228,16.5,1.893272,,2019
2,14/04/2019 01:00:59.000,32.0,0.0,15.3,2.09578,,2019
3,14/04/2019 01:30:59.000,31.981928,0.021053,15.171184,2.0,,2019
4,14/04/2019 02:00:59.000,28.387892,0.57,21.265627,2.2422,,2019


In [23]:
def nettoyer_dataframe(df):
    """Return a cleaned copy of the raw concatenated frame.

    Steps, in order:
      1. drop the ``'Unnamed: 5'`` column when present;
      2. rename the first five columns positionally to ``datetime``,
         ``airtemperature``, ``irradiance``, ``humidity`` and ``wind_speed``
         (any further columns keep their names);
      3. remove exact duplicate rows;
      4. remove rows whose five data columns are all missing;
      5. parse ``datetime`` with the ``%d/%m/%Y %H:%M:%S.%f`` format, turning
         unparseable values into ``NaT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame whose first five columns are, in order, the timestamp and
        the four measurements. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; the original index labels of kept rows are preserved.
    """
    columns_name = ['datetime', 'airtemperature', 'irradiance', 'humidity', 'wind_speed']

    # errors='ignore': the column is not guaranteed to exist in every input.
    df = df.drop(columns=['Unnamed: 5'], errors='ignore')
    df.columns = columns_name + list(df.columns[len(columns_name):])

    df = df.drop_duplicates()
    # Restrict the emptiness check to the data columns: extra columns such as
    # source_annee are always filled, so an unrestricted how='all' could never
    # drop a row.
    df = df.dropna(how='all', subset=columns_name)
    # Work on an explicit copy so the column assignment below cannot write
    # into (or warn about) a view of the filtered frame.
    df = df.copy()
    df['datetime'] = pd.to_datetime(df['datetime'], format="%d/%m/%Y %H:%M:%S.%f", errors='coerce')

    return df


In [24]:
# Run the cleaning step on the raw frame; the result is bound to a new name.
df_clean = df.pipe(nettoyer_dataframe)
# Show the first ten cleaned rows as the cell output.
df_clean.head(n=10)

Unnamed: 0,datetime,airtemperature,irradiance,humidity,wind_speed,source_annee
0,2019-04-14 00:00:59,31.700001,0.02737,17.251152,0.84757,2019
1,2019-04-14 00:30:59,31.874424,0.09228,16.5,1.893272,2019
2,2019-04-14 01:00:59,32.0,0.0,15.3,2.09578,2019
3,2019-04-14 01:30:59,31.981928,0.021053,15.171184,2.0,2019
4,2019-04-14 02:00:59,28.387892,0.57,21.265627,2.2422,2019
5,2019-04-14 02:30:59,30.5,0.0,17.071901,2.683918,2019
6,2019-04-14 03:00:59,29.0,0.368471,19.300001,0.697779,2019
7,2019-04-14 03:30:59,29.961927,0.0,16.338074,1.557688,2019
8,2019-04-14 04:00:59,30.428099,0.0,15.343803,1.1,2019
9,2019-04-14 04:30:59,30.200001,0.096841,14.770206,1.355424,2019


In [25]:
# Print the cleaned frame's column dtypes, non-null counts and memory usage.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365372 entries, 0 to 365375
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   datetime        365372 non-null  datetime64[ns]
 1   airtemperature  365372 non-null  float64       
 2   irradiance      365372 non-null  float64       
 3   humidity        365372 non-null  float64       
 4   wind_speed      365372 non-null  float64       
 5   source_annee    365372 non-null  object        
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 19.5+ MB
