**Notebook** : setup_meteo_folder.ipynb

**Description** : Préparation du dossier meteo/ pour diohine/modeleCommun

In [34]:
from pathlib import Path
from typing import List, Union

import geopandas as gpd
import numpy as np
import pandas as pd

## 1. Création des dossiers `meteo/` et `meteo/simulee/` dans le dossier `diohine/modeleCommun`

In [2]:
# Project root: the resolved parent of the current working directory
base_dir = Path.cwd().parent.resolve()

# meteo/ folder and its simulee/ sub-folder under diohine/modeleCommun
meteo_dir = base_dir.joinpath("diohine", "modeleCommun", "meteo")
simulee_dir = meteo_dir.joinpath("simulee")

In [3]:
# Make sure meteo/ exists and report whether it had to be created
if meteo_dir.exists():
    print(f"Dossier déjà existant : {meteo_dir}")
else:
    meteo_dir.mkdir(parents=True, exist_ok=True)
    print(f"Création du dossier : {meteo_dir}")

Création du dossier : C:\Users\Cheikhou\Desktop\Ferlo_Sine\maelia-instanciation-ferlo-sine\diohine\modeleCommun\meteo


In [4]:
# Make sure simulee/ exists and report whether it had to be created
if simulee_dir.exists():
    print(f"Dossier déjà existant : {simulee_dir}")
else:
    simulee_dir.mkdir(parents=True, exist_ok=True)
    print(f"Création du dossier : {simulee_dir}")

Création du dossier : C:\Users\Cheikhou\Desktop\Ferlo_Sine\maelia-instanciation-ferlo-sine\diohine\modeleCommun\meteo\simulee


In [5]:
# Recap of the folder layout, one line per directory
print("\nStructure prévue :")
for planned_dir in (meteo_dir, simulee_dir):
    print(f"- {planned_dir}")


Structure prévue :
- C:\Users\Cheikhou\Desktop\Ferlo_Sine\maelia-instanciation-ferlo-sine\diohine\modeleCommun\meteo
- C:\Users\Cheikhou\Desktop\Ferlo_Sine\maelia-instanciation-ferlo-sine\diohine\modeleCommun\meteo\simulee


## 2. Data Cleaning des données climatiques brutes issues de CNRM-ESM2-1

### 2.1. Exploratory Data Analysis

In [12]:
# Path to the raw climate file, spelled out by hand
fichier_climat = base_dir.joinpath("data", "climat", "niakharData", "CNRM-ESM2-1_ssp126.csv")

# Load it as comma-separated text (change `sep` if the file uses another delimiter)
df = pd.read_csv(fichier_climat, sep=",")

# Preview the first five rows
df.head(5)

Unnamed: 0,time,rsds,tasmin,tasmax,hurs,sfcWind,pr
0,date,W/m2,Celsius,Celsius,%,m/s,mm/day
1,1950-01-01 12:00:00,271.7951354980469,17.899871826171875,34.546234130859375,26.503536224365234,5.849405765533447,0.0
2,1950-01-02 12:00:00,273.6487121582031,16.197845458984375,34.462249755859375,29.381072998046875,4.4467082023620605,0.0
3,1950-01-03 12:00:00,272.430908203125,12.836456298828125,35.140869140625,33.332759857177734,2.7305986881256104,0.0
4,1950-01-04 12:00:00,268.4210205078125,13.605072021484375,34.72869873046875,41.25194549560547,2.702117681503296,0.0


In [10]:
# List the column labels of the raw frame
df.columns

Index(['time', 'rsds', 'tasmin', 'tasmax', 'hurs', 'sfcWind', 'pr'], dtype='object')

In [13]:
# Show the dtype pandas assigned to each column of the raw frame
df.dtypes

time       object
rsds       object
tasmin     object
tasmax     object
hurs       object
sfcWind    object
pr         object
dtype: object

In [15]:
# Summary statistics of the raw frame.
# NOTE(review): the recorded output lists count/unique/top/freq rather than
# mean/std, which suggests the columns are still non-numeric at this stage.
df.describe()

Unnamed: 0,time,rsds,tasmin,tasmax,hurs,sfcWind,pr
count,36891,36891.0,36891.0,36891.0,36891.0,36891.0,36891.0
unique,36891,36781.0,35123.0,34352.0,36834.0,36827.0,11415.0
top,2050-12-31 12:00:00,331.67315673828125,24.517486572265625,34.53302001953125,82.84820556640625,3.3301892280578613,0.0
freq,1,2.0,4.0,4.0,2.0,2.0,25475.0


In [16]:
# Look at 10 random rows. The seed is fixed so that Restart & Run All shows the
# same rows every time instead of a different sample on each execution.
df.sample(10, random_state=42)

Unnamed: 0,time,rsds,tasmin,tasmax,hurs,sfcWind,pr
5662,1965-07-02 12:00:00,248.86489868164065,23.81201171875,33.266632080078125,77.70514678955078,4.259986400604248,1.58824356039986
11805,1982-04-27 12:00:00,357.9740905761719,17.611846923828125,37.33615112304688,47.61207962036133,2.7390637397766118,0.0
15201,1991-08-14 12:00:00,236.53253173828125,24.1866455078125,31.469390869140625,86.41510772705078,2.6207427978515625,4.613034788053483
25198,2018-12-27 12:00:00,235.9857940673828,21.143035888671875,35.261993408203125,39.1005744934082,5.169499397277832,0.0
24618,2017-05-26 12:00:00,349.20489501953125,24.959930419921875,38.21331787109375,60.6336555480957,4.807073593139648,0.0
18526,2000-09-20 12:00:00,272.7701110839844,23.318634033203125,32.276153564453125,83.2411880493164,4.059689998626709,1.0703166743041947
32120,2037-12-09 12:00:00,231.59243774414065,19.354400634765625,33.470245361328125,48.09737396240234,4.486200332641602,0.0
5375,1964-09-18 12:00:00,236.5559539794922,23.13763427734375,30.803375244140625,89.35624694824219,3.330995321273804,13.118197303265331
27294,2024-09-22 12:00:00,267.9747009277344,24.5523681640625,33.058837890625,81.36646270751953,2.6771817207336426,0.0519421955686993
430,1951-03-06 12:00:00,326.1204833984375,16.541259765625,32.185272216796875,53.8969612121582,6.9023284912109375,0.0


### 2.2. Nettoyage des données

In [20]:
# Input (raw) and output (processed) climate folders share the same parent
climat_dir = base_dir / "data" / "climat"
input_dir = climat_dir / "niakharData"
output_dir = climat_dir / "processed"

# The output folder may not exist yet; create it together with any missing parents
output_dir.mkdir(parents=True, exist_ok=True)

# Read the raw climate file as comma-separated text
fichier_climat = input_dir.joinpath("CNRM-ESM2-1_ssp126.csv")
df_raw = pd.read_csv(fichier_climat, sep=",")

In [21]:
# Quick sanity check of what was just loaded: column labels, then the first rows
colonnes_brutes = list(df_raw.columns)
print("Colonnes d'origine :", colonnes_brutes)
print("Premières lignes :")
display(df_raw.head(5))

Colonnes d'origine : ['time', 'rsds', 'tasmin', 'tasmax', 'hurs', 'sfcWind', 'pr']
Premières lignes :


Unnamed: 0,time,rsds,tasmin,tasmax,hurs,sfcWind,pr
0,date,W/m2,Celsius,Celsius,%,m/s,mm/day
1,1950-01-01 12:00:00,271.7951354980469,17.899871826171875,34.546234130859375,26.503536224365234,5.849405765533447,0.0
2,1950-01-02 12:00:00,273.6487121582031,16.197845458984375,34.462249755859375,29.381072998046875,4.4467082023620605,0.0
3,1950-01-03 12:00:00,272.430908203125,12.836456298828125,35.140869140625,33.332759857177734,2.7305986881256104,0.0
4,1950-01-04 12:00:00,268.4210205078125,13.605072021484375,34.72869873046875,41.25194549560547,2.702117681503296,0.0


In [22]:
# Columns that hold numeric measurements and must become floats
cols_to_convert = ["rsds", "tasmin", "tasmax", "hurs", "sfcWind", "pr"]

# Build the cleaned frame in a single chain, leaving df_raw untouched:
#   1. drop row 0, which holds the units rather than data
#   2. renumber the index from 0
#   3. parse 'time' into datetimes
#   4. cast the measurement columns to float
#   5. rename the columns to their target names
df_clean = (
    df_raw
    .drop(index=0)
    .reset_index(drop=True)
    .assign(time=lambda frame: pd.to_datetime(frame["time"], format="%Y-%m-%d %H:%M:%S"))
    .astype({name: float for name in cols_to_convert})
    .rename(columns={
        "time": "DATE",
        "pr": "RRmm",
        "tasmin": "Tmin",
        "tasmax": "Tmax",
        "rsds": "RGI",
    })
)

In [24]:
# Quick check of the cleaned frame. The label now says "after cleaning": the
# previous "Colonnes d'origine" text was copied from the raw-data preview and
# mislabelled the renamed columns as the original ones.
print("Colonnes après nettoyage :", df_clean.columns.tolist())
print("Premières lignes :")
display(df_clean.head())

Colonnes d'origine : ['DATE', 'RGI', 'Tmin', 'Tmax', 'hurs', 'sfcWind', 'RRmm']
Premières lignes :


Unnamed: 0,DATE,RGI,Tmin,Tmax,hurs,sfcWind,RRmm
0,1950-01-01 12:00:00,271.795135,17.899872,34.546234,26.503536,5.849406,0.0
1,1950-01-02 12:00:00,273.648712,16.197845,34.46225,29.381073,4.446708,0.0
2,1950-01-03 12:00:00,272.430908,12.836456,35.140869,33.33276,2.730599,0.0
3,1950-01-04 12:00:00,268.421021,13.605072,34.728699,41.251945,2.702118,0.0
4,1950-01-05 12:00:00,265.328827,14.521393,34.071106,45.360634,3.337594,0.0


### 2.4. Approximation de l'ETP (formule simplifiée Hargreaves-Samani)

In [25]:
# Reference evapotranspiration (mm/day) with the Hargreaves-Samani equation
# (FAO-56, eq. 52):
#     ET0 = 0.0023 * (Tmean + 17.8) * sqrt(Tmax - Tmin) * Ra
# where Tmean = (Tmax + Tmin) / 2 and Ra is the extraterrestrial radiation
# expressed as equivalent evaporation (mm/day).
# The previous expression used (Tmax + Tmin + 17.8) and left Ra out, which is
# not the Hargreaves-Samani formula and yields values around 0.6 mm/day.
#
# This cell needs DATE as datetimes, so it must run before DATE is formatted
# to text for the export.

# NOTE(review): approximate latitude of the Diohine / Niakhar site in decimal
# degrees north - confirm against the study-area coordinates.
LATITUDE_DEG = 14.5

# Extraterrestrial radiation Ra from latitude and day of year (FAO-56, eq. 21-25)
day_of_year = df_clean["DATE"].dt.dayofyear
lat_rad = np.deg2rad(LATITUDE_DEG)
inv_rel_distance = 1 + 0.033 * np.cos(2 * np.pi * day_of_year / 365)
solar_declination = 0.409 * np.sin(2 * np.pi * day_of_year / 365 - 1.39)
sunset_hour_angle = np.arccos(-np.tan(lat_rad) * np.tan(solar_declination))
ra_mj = (24 * 60 / np.pi) * 0.0820 * inv_rel_distance * (
    sunset_hour_angle * np.sin(lat_rad) * np.sin(solar_declination)
    + np.cos(lat_rad) * np.cos(solar_declination) * np.sin(sunset_hour_angle)
)  # MJ/m²/day; 0.0820 MJ/m²/min is the solar constant
ra_mm = 0.408 * ra_mj  # MJ/m²/day -> mm/day of equivalent evaporation

t_mean = (df_clean["Tmax"] + df_clean["Tmin"]) / 2
# Clip at 0 so a day where Tmax < Tmin gives ETP = 0 instead of NaN from the square root
t_range = (df_clean["Tmax"] - df_clean["Tmin"]).clip(lower=0)

df_clean["ETP"] = 0.0023 * (t_mean + 17.8) * np.sqrt(t_range) * ra_mm

### 2.5. Conversion RGI

In [26]:
# Convert RGI from W/m² to MJ/m²/day:
#   W/m² * 86400 s/day = J/m²/day, then / 1_000_000 = MJ/m²/day.
# The value is recomputed from the untouched raw `rsds` column (skipping the
# units row at index 0, as the cleaning step does) rather than from
# df_clean["RGI"] itself, so re-running this cell cannot apply the factor twice.
# `.to_numpy()` drops the raw index, which is offset by one from df_clean's.
rsds_w_m2 = df_raw["rsds"].drop(index=0).astype(float).to_numpy()
df_clean["RGI"] = (rsds_w_m2 * 86400) / 1_000_000

### 2.6. Sauvegarder dans `climat/processed/`

In [27]:
# Arrange the columns in the order given by the documentation
# (no ID_PDG yet; it is added later)

# Format DATE as DD/MM/YYYY text. The dtype guard keeps the cell re-runnable:
# once DATE is already text, calling `.dt.strftime` again would raise
# AttributeError.
if pd.api.types.is_datetime64_any_dtype(df_clean["DATE"]):
    df_clean["DATE"] = df_clean["DATE"].dt.strftime("%d/%m/%Y")

df_clean = df_clean[["DATE", "RRmm", "Tmin", "Tmax", "ETP", "RGI"]]

# Write the cleaned table as a semicolon-separated CSV without the index column
output_file = output_dir / "climat_diohine_clean_global.csv"
df_clean.to_csv(output_file, sep=";", index=False)

print(f"Fichier nettoyé sauvegardé ici : {output_file}")

Fichier nettoyé sauvegardé ici : C:\Users\Cheikhou\Desktop\Ferlo_Sine\maelia-instanciation-ferlo-sine\data\climat\processed\climat_diohine_clean_global.csv


## 3. Ajout des `ID_PDG`

In [35]:
## Helper Functions

def extract_attributes_from_shapefile(
    shapefile_path: Union[str, Path],
    attributes: Union[str, List[str]],
    include_geometry: bool = False
) -> pd.DataFrame:
    """
    Read a shapefile and return a table restricted to the requested attributes.

    Parameters:
    - shapefile_path (str or Path): Path to the shapefile.
    - attributes (str, list or tuple of str): Attribute column name(s) to
      extract. A single name may be passed as a plain string.
    - include_geometry (bool): If True, the geometry column is appended to the
      result unless it is already among the requested attributes.

    Returns:
    - pd.DataFrame: The selected columns, in the requested order (geometry last
      when it is appended).

    Raises:
    - ValueError: If one or more requested attributes are not columns of the
      shapefile.
    """
    shapefile_path = Path(shapefile_path)

    # Normalise to a fresh list: a bare string would otherwise be iterated
    # character by character, and tuples have no .copy() method.
    if isinstance(attributes, str):
        selected_columns = [attributes]
    else:
        selected_columns = list(attributes)

    gdf = gpd.read_file(shapefile_path)

    # Fail early, naming every requested attribute that is absent
    missing_attrs = [attr for attr in selected_columns if attr not in gdf.columns]
    if missing_attrs:
        raise ValueError(f"The following attributes are not present in the shapefile: {missing_attrs}")

    if include_geometry:
        geometry_column = gdf.geometry.name  # Typically 'geometry'
        # Avoid a duplicated column when the caller already asked for it
        if geometry_column not in selected_columns:
            selected_columns.append(geometry_column)

    return gdf[selected_columns]

In [36]:
# Path to the shapefile (corrected)
shapefile_path = base_dir.joinpath("data", "climat", "shapefiles", "Parcellaire_Diohine_2.shp")

# Attribute columns to pull out of the shapefile (adjust as needed)
attributes_to_extract = ["ID_PDG"]

# Run the extraction helper
df_attributes = extract_attributes_from_shapefile(
    shapefile_path=shapefile_path,
    attributes=attributes_to_extract,
)

# Preview the first rows
df_attributes.head()

Unnamed: 0,ID_PDG
0,1
1,2
2,3
3,4
4,5
