In [1]:
import geopandas as gpd
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import box
import matplotlib.pyplot as plt
from pyproj import CRS
import time
import math


import importlib
import fonctions_annexes_biodiv
importlib.reload(fonctions_annexes_biodiv)
from fonctions_annexes_biodiv import generer_dictionnaire_taxonomie

%matplotlib qt
path = 'C:/Users/anormand/Documents/Projet Python/Biodiv/Data'  # chemin vers le fichier


In [2]:
def degrees_per_km(latitude):
    """
    Calculate the degree equivalent of 1 km for both latitude and longitude
    at a specific latitude.

    :param latitude: Latitude in degrees
    :return: Tuple (lat_deg_per_km, lon_deg_per_km)
    """
    # 1 degree of latitude is ~111.32 km everywhere
    lat_deg_per_km = 1 / 111.32

    # 1 degree of longitude varies with latitude
    lon_deg_per_km = 1 / 111.32 / math.cos(math.radians(latitude))

    return lat_deg_per_km, lon_deg_per_km

def create_country_grid_WGS84(gdf, country_code,col_code="color_code", grid_size_km=10,midpoint_lat=None,display=False):
    """
    Generates a grid within the specified country's boundaries with cell names based on 
    coordinates and grid size using WGS 84 reference system (EPSG:4326).
    
    Parameters:
    - gdf: GeoDataFrame containing global boundaries with a column for the country's unique code (e.g., "color_code").
    - country_name: Name of the country for which to generate the grid.
    - grid_size_km: Size of the grid cells in kilometers (default is 10 km x 10 km).
    
    Returns:
    - grid: GeoDataFrame with grid cells and a 'name' column for each cell's unique identifier.
    """
    # Convert grid size from kilometers to degrees (approximation for WGS 84)
    # 1 degree of latitude ≈ 111 km, but longitude degrees vary by latitude
    # Filter to get the specified country
    country = gdf[gdf[col_code].isin(country_code)]


    if country.empty:
        raise ValueError(f"Country '{country_code}' not found in the GeoDataFrame.")

    # Define a function to create a grid of specified resolution (in degrees) with alignment to a reference point
    def create_grid(country, grid_size=10, reference_point=(0, 0),midpoint_lat=None):
        # Unpack reference point
        ref_x, ref_y = reference_point
        
        country_geometry = country.geometry.unary_union  # Combine all geometries into one
    
        # Get the bounds of the input GeoDataFrame
        minx, miny, maxx, maxy = country_geometry.bounds


            # Step 3: Adjust bounding box to align with (0, 0)
        # Use approximate degrees per km for the middle of the country
        if midpoint_lat is None:
            midpoint_lat = (miny + maxy) / 2
        lat_deg_per_km, lon_deg_per_km = degrees_per_km(midpoint_lat)
        dy = grid_size * lat_deg_per_km
        dx = grid_size * lon_deg_per_km
    
        # Align the starting point to the reference point
        start_x = ref_x + ((minx - ref_x) // dx) * dx
        start_y = ref_y + ((miny - ref_y) // dy) * dy
    
        grid_cells = []
        x = start_x
        while x < maxx:
            y = start_y
            while y < maxy:
                cell = box(x, y, x + dx, y + dy)
                min_lon, min_lat, max_lon, max_lat = cell.bounds
    
                # Append the grid cell along with its bounds
                grid_cells.append({
                    "geometry": cell,
                    "min_lon": min_lon,
                    "min_lat": min_lat,
                    "max_lon": max_lon,
                    "max_lat": max_lat
                })
                y += dy
            x += dx
    
        # Create a GeoDataFrame from the grid cells with the additional columns
        grid_gdf = gpd.GeoDataFrame(
            grid_cells, 
            columns=["geometry", "min_lon", "min_lat", "max_lon", "max_lat"], 
            crs=country.crs
        )
        
        return grid_gdf

    
    # Create the grid and clip it to the country's boundary

    grid = create_grid(country, grid_size=grid_size_km,midpoint_lat=midpoint_lat)
    grid = grid[grid.intersects(country.unary_union)]
    
    # Generate names based on cell centroid latitude and longitude
    def generate_grid_name(cell, grid_size_km):
        centroid = cell.geometry.centroid
        lon, lat = centroid.x, centroid.y
        lon_label = f"E{int(abs(lon) * 10000):05d}" if lon >= 0 else f"W{int(abs(lon) * 10000):05d}"
        lat_label = f"N{int(abs(lat) * 10000):05d}" if lat >= 0 else f"S{int(abs(lat) * 10000):05d}"
        return f"{grid_size_km}km{lon_label}{lat_label}"

    # Apply naming to each grid cell
    grid["cell_name"] = grid.apply(lambda cell: generate_grid_name(cell, grid_size_km), axis=1)
    #grid["country_code"] = country_code

    if display is True:
        # Plot the grid and the country boundary for reference
        fig, ax = plt.subplots(figsize=(10, 10))
        gdf.plot(ax=ax, edgecolor="black", linewidth=0.5)
        country.plot(ax=ax, edgecolor="red", linewidth=2, facecolor="none")
        grid.plot(ax=ax, color="lightblue", edgecolor="grey", alpha=0.6)
    
        plt.show()
    
    return grid

def add_grid_to_country(df_country, grid,cle_geo):
    """
    Optimized version of adding the corresponding grid cell to each row in df_country based on latitude and longitude.
    
    Parameters:
    - df_country: DataFrame containing the columns 'decimalLatitude' and 'decimalLongitude'.
    - grid: DataFrame containing the grid cells with 'name', 'min_lon', 'min_lat', 'max_lon', and 'max_lat' columns.
    
    Returns:
    - df_country: Updated DataFrame with an additional 'grid_name' column indicating the grid cell for each point.
    """
    # Convert grid bounds to NumPy arrays for efficient vectorized comparison
    min_lons = grid['min_lon'].values
    max_lons = grid['max_lon'].values
    min_lats = grid['min_lat'].values
    max_lats = grid['max_lat'].values
    grid_names = grid[cle_geo].values
    
    # Initialize an array to store the grid names
    grid_names_for_points = []
    
    # Iterate over each point in df_country and apply vectorized comparison
    for lon, lat in zip(df_country['decimalLongitude'], df_country['decimalLatitude']):
        # Find the grid cell by comparing the point coordinates with grid bounds
        matching_grid = np.where((min_lons <= lon) & (lon <= max_lons) & (min_lats <= lat) & (lat <= max_lats))[0]
        
        if matching_grid.size > 0:
            grid_names_for_points.append(grid_names[matching_grid[0]])  # Take the first matching grid cell
        else:
            grid_names_for_points.append(None)  # No matching grid
    
    # Add the grid names to the DataFrame
    df_country['grid_name'] = grid_names_for_points
    
    return df_country

def formater_maille_espece_GBIF(df,cle_geo='codeMaille10Km',cle_ID='cdRef',annee_min=None,bornes_temporelles=None):
    df_dico=generer_dictionnaire_taxonomie(df,cle_ID)
    # Convertir la colonne 'year' en int
    df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(0).astype(int)

    # Convertir la colonne 'cdNom' en int
    df[cle_ID] = df[cle_ID].astype(int)

    if annee_min is not None:
        df=df[df['year']>=annee_min]
    
    # Choisir des bornes temporelles et assigner une période aux données
    if bornes_temporelles is not None:
        df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles, 
                       labels=[f'Période {i+1}: {bornes_temporelles[i]+1} à {bornes_temporelles[i+1]}' for i in range(len(bornes_temporelles) - 1)],
                       include_lowest=False)  # include_lowest=True inclut la borne inférieureure
        # Compter le nombre de données dans chaque intervalle
        compte_par_periode = df['periode'].value_counts()
        print(compte_par_periode)
         # Compter les occurrences d'observation de chaque taxon pour chaque code et période
        df_maille_espece = df.groupby([cle_geo, cle_ID,'periode'], observed=True).size().reset_index(name='nombreObs')
        #df_maille_espece = df.groupby([cle_geo, cle_ID,'periode'], observed=True)['individualCount'].sum().reset_index(name='nombreObs')
        #eventuellement remplacer ['individualCount'].sum() par .size() 
       
    else:
        # Compter les occurrences d'observation de chaque taxon pour chaque code
        df_maille_espece = df.groupby([cle_geo, cle_ID], observed=True).size().reset_index(name='nombreObs')
        #eventuellement remplacer ['individualCount'].sum() par .size() 
        
    df_maille_espece=pd.merge(df_maille_espece,df_dico,on=cle_ID)
    
    return df_maille_espece

In [3]:
departement_fichier = r'C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\SIG\carte_departements.geojson'  # Remplace par le chemin vers ton fichier
departement_gpd = gpd.read_file(departement_fichier)

PNR_fichier = r'C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\SIG\N_ENP_PNR_S_000.shx'  # Remplace par le chemin vers ton fichier
PNR_gpd = gpd.read_file(PNR_fichier)
PNR_gpd=PNR_gpd[['NOM_SITE','geometry']]

PN_fichier = r'C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\SIG\N_ENP_PN_S_000.shx'  # Remplace par le chemin vers ton fichier
PN_gpd = gpd.read_file(PN_fichier)
PN_gpd=PN_gpd[['NOM_SITE','geometry']]

# Reprojeter en WGS84 (EPSG:4326)
PNR_gpd = PNR_gpd.to_crs(epsg=4326)
PN_gpd = PN_gpd.to_crs(epsg=4326)
departement_gpd = departement_gpd.to_crs(epsg=4326)



In [4]:
departement_gpd

Unnamed: 0,code,nom,geometry
0,01,Ain,"POLYGON ((4.78021 46.17668, 4.78024 46.18905, ..."
1,02,Aisne,"POLYGON ((3.17296 50.01131, 3.17382 50.01186, ..."
2,03,Allier,"POLYGON ((3.03207 46.79491, 3.03424 46.7908, 3..."
3,04,Alpes-de-Haute-Provence,"POLYGON ((5.67604 44.19143, 5.67817 44.19051, ..."
4,05,Hautes-Alpes,"POLYGON ((6.26057 45.12685, 6.26417 45.12641, ..."
...,...,...,...
91,91,Essonne,"POLYGON ((2.22656 48.7761, 2.22866 48.77451, 2..."
92,92,Hauts-de-Seine,"POLYGON ((2.29097 48.95097, 2.29162 48.95077, ..."
93,93,Seine-Saint-Denis,"POLYGON ((2.55306 49.00982, 2.55814 49.01201, ..."
94,94,Val-de-Marne,"POLYGON ((2.3319 48.81701, 2.33371 48.81677, 2..."


In [5]:
# Charger les données (si ce n'est pas encore un GeoDataFrame)
 # Remplace par ton fichier si besoin

# Nettoyer les noms des sites
PN_gpd["NOM_SITE"] = PN_gpd["NOM_SITE"].str.replace(r"\s*\[aire d'adhésion\]\s*", "", regex=True)
PN_gpd["NOM_SITE"] = PN_gpd["NOM_SITE"].str.replace(r"\s*\[Aire d'adhésion\]\s*", "", regex=True)


# Fusionner les géométries par site
PN_gpd_fusionne = PN_gpd.dissolve(by="NOM_SITE")

# Sauvegarde si besoin
#gdf_fusionne.to_file("output.geojson", driver="GeoJSON")
PN_gpd_fusionne.reset_index(inplace=True)

In [6]:
PN_gpd_fusionne

Unnamed: 0,NOM_SITE,geometry
0,Calanques,"POLYGON ((5.34577 43.28208, 5.34579 43.28206, ..."
1,Cévennes,"POLYGON ((3.44487 44.02465, 3.44482 44.02463, ..."
2,Ecrins,"POLYGON ((6.01676 44.9867, 6.01599 44.98834, 6..."
3,Forêts,"MULTIPOLYGON (((4.67735 47.73921, 4.67768 47.7..."
4,Mercantour,"POLYGON ((7.58032 44.02968, 7.57988 44.02961, ..."
5,Port-Cros,"MULTIPOLYGON (((6.20402 43.14906, 6.20408 43.1..."
6,Pyrénées,"POLYGON ((0.11582 42.83451, 0.11638 42.83422, ..."
7,Vanoise,"POLYGON ((6.5105 45.44497, 6.51057 45.44497, 6..."


In [7]:
# Sauvegarde du fichier pré-traité sous le format 1 ligne = 1 observation
"""
df_cleaned.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+
                  country_name.replace(' ', '_')+
                  "/Filtered"+
                  "/data_GBIF_"+country_name.replace(' ', '_')+"_"+str(grid_size_km)+"Km_filtered.csv",
                  index=False)
"""

'\ndf_cleaned.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+\n                  country_name.replace(\' \', \'_\')+\n                  "/Filtered"+\n                  "/data_GBIF_"+country_name.replace(\' \', \'_\')+"_"+str(grid_size_km)+"Km_filtered.csv",\n                  index=False)\n'

In [8]:
def importer_data_zone(zone_name,var_name,zone_gpd,country_name,grid_size_km):

    # Génération de la grille pour le pays choisi
    cle_geo="codeMaille"+str(grid_size_km)+'Km'
    zone = zone_gpd[zone_gpd[var_name].isin(zone_name)]
    zone_geometry = zone.geometry.unary_union  # Combine all geometries into one

    # Génération de la grille pour le pays choisi
    # Get the bounds of the input GeoDataFrame
    minx, miny, maxx, maxy = zone_geometry.bounds
    
    # Step 3: Adjust bounding box to align with (0, 0)
    # Use approximate degrees per km for the middle of the country
    midpoint_lat = (miny + maxy) / 2
    
    local_grid = create_country_grid_WGS84(zone_gpd, zone_name,var_name,grid_size_km=grid_size_km,midpoint_lat=midpoint_lat,display=True)
    nouveaux_noms_columns = {'cell_name': cle_geo}
    local_grid = local_grid.rename(columns=nouveaux_noms_columns)
    nouveaux_noms_columns = {'country_code': 'zone_name'}
    local_grid = local_grid.rename(columns=nouveaux_noms_columns)
    zone_name_str = "_".join(zonename.replace(" ", "_") for zonename in zone_name)
    # Sauvegarder le fichier grille
    local_grid.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG"+r"/"+zone_name_str+"_grid_"+str(grid_size_km)+"km.geojson", 
                                 driver="GeoJSON")
    ### Importer les données de biodiversité du pays ###

    colonnes_a_importer=['kingdom','phylum','class','order','family','genus','species','verbatimScientificName',
                             'taxonRank','countryCode','occurrenceStatus', 'individualCount', 
                         'decimalLongitude','decimalLatitude','eventDate','taxonKey','speciesKey','occurrenceID', 'year']
    fichier = "C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')+"/Raw/extractGBIF_"+country_name.replace(' ', '_')+"_12112024.csv"  # Remplace par le chemin vers ton fichier

    
    # Lire le fichier en filtrant directement
    chunksize = 10000000  # Lecture par morceaux pour gros fichiers
    filtered_data = []
    
    # Lire en chunks (utile pour les fichiers volumineux)
    for chunk in pd.read_csv(fichier, chunksize=chunksize, sep='\t',on_bad_lines='skip',usecols=colonnes_a_importer):
        # Convertir en float en gérant les erreurs
        chunk["decimalLatitude"] = pd.to_numeric(chunk["decimalLatitude"], errors='coerce')
        chunk["decimalLongitude"] = pd.to_numeric(chunk["decimalLongitude"], errors='coerce')
    
        # Filtrer les valeurs valides
        chunk_filtered = chunk.dropna(subset=["decimalLatitude", "decimalLongitude"])
        chunk_filtered = chunk_filtered[
            (chunk_filtered["decimalLongitude"] >= minx) & (chunk_filtered["decimalLongitude"] <= maxx) &
            (chunk_filtered["decimalLatitude"] >= miny) & (chunk_filtered["decimalLatitude"] <= maxy)
        ]
        filtered_data.append(chunk_filtered)
    
    # Concaténer les morceaux filtrés
    df_biodiv = pd.concat(filtered_data, ignore_index=True)

    return df_biodiv

def formater_data_PN_all_in_one(df_biodiv,zone_name,var_name,zone_gpd,country_name,grid_size_km,
                                cle_ID='speciesKey',cle_date='year',
                                annee_mini=0,bornes_temporelles=[1800, 1990,2010, 2024]):
    # Génération de la grille pour le pays choisi
    cle_geo="codeMaille"+str(grid_size_km)+'Km'
    zone = zone_gpd[zone_gpd[var_name].isin(zone_name)]
    zone_geometry = zone.geometry.unary_union  # Combine all geometries into one

    # Génération de la grille pour le pays choisi
    # Get the bounds of the input GeoDataFrame
    minx, miny, maxx, maxy = zone_geometry.bounds
    
    # Step 3: Adjust bounding box to align with (0, 0)
    # Use approximate degrees per km for the middle of the country
    midpoint_lat = (miny + maxy) / 2
    
    local_grid = create_country_grid_WGS84(zone_gpd, zone_name,var_name,grid_size_km=grid_size_km,midpoint_lat=midpoint_lat,display=True)
    nouveaux_noms_columns = {'cell_name': cle_geo}
    local_grid = local_grid.rename(columns=nouveaux_noms_columns)
    nouveaux_noms_columns = {'country_code': 'zone_name'}
    local_grid = local_grid.rename(columns=nouveaux_noms_columns)
    zone_name_str = "_".join(zonename.replace(" ", "_") for zonename in zone_name)
    # Sauvegarder le fichier grille
    local_grid.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG"+r"/"+zone_name_str+"_grid_"+str(grid_size_km)+"km.geojson", 
                                 driver="GeoJSON")
                       
    # Compter le nombre de NaN par colonne
    df_biodiv['eventDate'] = pd.to_datetime(df_biodiv['eventDate'], errors='coerce', utc=True)
    
    df_biodiv['year'] = df_biodiv['year'].fillna(df_biodiv['eventDate'].dt.year)

    # Associer chaque observation à une maille de la grille
    
    # Assurez-vous que 'decimalLongitude' et 'decimalLatitude' sont des colonnes numériques
    df_biodiv['decimalLongitude'] = pd.to_numeric(df_biodiv['decimalLongitude'], errors='coerce')
    df_biodiv['decimalLatitude'] = pd.to_numeric(df_biodiv['decimalLatitude'], errors='coerce')
    
    # Filtrer les lignes où les coordonnées sont NaN
    df_biodiv = df_biodiv.dropna(subset=['decimalLongitude', 'decimalLatitude',cle_ID]).reset_index(drop=True)
    
    # Ajouter le grid aux points
    df_biodiv_with_grid = add_grid_to_country(df_biodiv, local_grid,cle_geo)
    

    # Pré-traitement des données
    
    # Lire uniquement les colonnes spécifiées du fichier dans un DataFrame
    df=df_biodiv_with_grid.copy()
    
    n_especes_entrée=len(df[cle_ID].unique())
    n_obs_entrée=len(df)
    print("En entrée : nombre d'espèces observées :",n_especes_entrée)
    print("En entrée : nombre d'obs :",n_obs_entrée)
    
    colonnes_obligatoires=[cle_ID]
    # Supprimer les lignes ou une donnée cruciale manque
    df_cleaned = df.dropna(subset=colonnes_obligatoires)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    #valid_ranks = ['SPECIES', 'SUBSPECIES', 'VARIETY']
    #df_cleaned = df_cleaned[df_cleaned['taxonRank'].isin(valid_ranks)].reset_index(drop=True)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    df_cleaned = df_cleaned[df_cleaned['occurrenceStatus']=='PRESENT'].reset_index(drop=True)
    
    df_cleaned[cle_ID] = df_cleaned[cle_ID].astype(int)
    
    df_cleaned.rename(columns={'grid_name': cle_geo}, inplace=True)
    
    # Ajouter des colonnes year, month, day et day_of_year
    #df_cleaned[cle_date] = pd.to_datetime(df_cleaned[cle_date], utc=True, errors='coerce')
    
    # Now safely access .dt.year, .dt.month, etc., from df_cleaned
    #df_cleaned['year'] = df_cleaned[cle_date].dt.year
    
    # Convertir 'individualCount' en numérique, en forçant les erreurs à NaN
    df_cleaned['individualCount'] = pd.to_numeric(df_cleaned['individualCount'], errors='coerce')
    
    # Remplacer les NaN par 1
    df_cleaned['individualCount'].fillna(1, inplace=True)
    
    
    print(f"En sortie : nombre d'espèces observées :{len(df_cleaned[cle_ID].unique())} soit une perte de {100-(round(len(df_cleaned[cle_ID].unique())/n_especes_entrée*100))}%")
    print(f"En sortie : nombre d'obs :{len(df_cleaned)} soit une perte de {100-round(len(df_cleaned)/n_obs_entrée*100)} %") 
    # Grouper les données par maille et par période 

    annee_mini=1
    bornes_temporelles=[1800, 1990,2010, 2024] 
    
    df_maille_espece=formater_maille_espece_GBIF(df_cleaned,cle_geo,cle_ID,annee_mini,bornes_temporelles)
    
    print(f"En sortie : nombre d'obs :{len(df_cleaned[df_cleaned['year']>annee_mini])} soit une perte de {100-round(len(df_cleaned[df_cleaned['year']>annee_mini])/n_obs_entrée*100)} %") 

    dico_noms_vernaculaires_merged = pd.read_csv(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\TAXO_GBIF\dico_noms_vernaculaires_merged.csv")

    df_maille_espece=pd.merge(df_maille_espece,dico_noms_vernaculaires_merged,on=cle_ID,how="left")
    
    # Sauvegarder les données groupées
    
    colonnes_a_conserver = ['kingdom','phylum','class','order','family','genus','species',
                         'nombreObs',cle_ID,cle_geo,'periode','taxonRank','vernacularName_fr','vernacularName_en','occurrenceID']
    df_to_save=df_maille_espece[colonnes_a_conserver]
    df_to_save.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                      +"/data_GBIF_"+zone_name_str+'_'+cle_geo+"_periodes.csv", index=False)

  local_grid.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG"+r"/"+zone_name_str+"_grid_"+str(grid_size_km)+"km.geojson",
  local_grid.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG"+r"/"+zone_name_str+"_grid_"+str(grid_size_km)+"km.geojson",


In [9]:
zone_name=["66","11","81","34","12","48","30","07","26","84","13","05","04","83","06"]
var_name="code"
zone_gpd=departement_gpd
country_name="France"
grid_size_km=1

df_biodiv=importer_data_zone(zone_name,var_name,zone_gpd,country_name,grid_size_km)

  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  for chunk in pd.read_csv(fichier, chunksize=chunksize, sep='\t',on_bad_lines='skip',usecols=colonnes_a_importer):
  for chunk in pd.read_csv(fichier, chunksize=chunksize, sep='\t',on_bad_lines='skip',usecols=colonnes_a_importer):
  for chunk in pd.read_csv(fichier, chunksize=chunksize, sep='\t',on_bad_lines='skip',usecols=colonnes_a_importer):
  for chunk in pd.read_csv(fichier, chunksize=chunksize, sep='\t',on_bad_lines='skip',usecols=colonnes_a_importer):
  for chunk in pd.read_csv(fichier, chunksize=chunksize, sep='\t',on_bad_lines='skip',usecols=colonnes_a_importer):
  for chunk in pd.read_csv(fichier, chunksize=chunksize, sep='\t',on_bad_lines='skip',usecols=colonnes_a_importer):
  for chunk in pd.read_csv(fichier, chunksize=chunksize, sep='\t',on_bad_lines='skip',us

In [None]:
for(zonename)in zone_name:
    print(zonename)
    grid_size_km=[2]
    for grid_size in grid_size_km:
        formater_data_PN_all_in_one(df_biodiv,[zonename],var_name,zone_gpd,country_name,grid_size,
                                        cle_ID='speciesKey',cle_date='year',
                                        annee_mini=0,bornes_temporelles=[1800, 1990,2010, 2024])

66


  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


En entrée : nombre d'espèces observées : 58019
En entrée : nombre d'obs : 53611778


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :57939 soit une perte de 0%
En sortie : nombre d'obs :53479029 soit une perte de 0 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles,


periode
Période 3: 2011 à 2024    28039835
Période 2: 1991 à 2010    19648760
Période 1: 1801 à 1990     4580716
Name: count, dtype: int64
En sortie : nombre d'obs :52395557 soit une perte de 2 %
11


  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


En entrée : nombre d'espèces observées : 58019
En entrée : nombre d'obs : 53611778


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :57939 soit une perte de 0%
En sortie : nombre d'obs :53479029 soit une perte de 0 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles,


periode
Période 3: 2011 à 2024    28039835
Période 2: 1991 à 2010    19648760
Période 1: 1801 à 1990     4580716
Name: count, dtype: int64
En sortie : nombre d'obs :52395557 soit une perte de 2 %
81


  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


En entrée : nombre d'espèces observées : 58019
En entrée : nombre d'obs : 53611778


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :57939 soit une perte de 0%
En sortie : nombre d'obs :53479029 soit une perte de 0 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles,


periode
Période 3: 2011 à 2024    28039835
Période 2: 1991 à 2010    19648760
Période 1: 1801 à 1990     4580716
Name: count, dtype: int64
En sortie : nombre d'obs :52395557 soit une perte de 2 %
34


  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


En entrée : nombre d'espèces observées : 58019
En entrée : nombre d'obs : 53611778


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :57939 soit une perte de 0%
En sortie : nombre d'obs :53479029 soit une perte de 0 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles,


periode
Période 3: 2011 à 2024    28039835
Période 2: 1991 à 2010    19648760
Période 1: 1801 à 1990     4580716
Name: count, dtype: int64
En sortie : nombre d'obs :52395557 soit une perte de 2 %
12


  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


En entrée : nombre d'espèces observées : 58019
En entrée : nombre d'obs : 53611778


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :57939 soit une perte de 0%
En sortie : nombre d'obs :53479029 soit une perte de 0 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles,


periode
Période 3: 2011 à 2024    28039835
Période 2: 1991 à 2010    19648760
Période 1: 1801 à 1990     4580716
Name: count, dtype: int64
En sortie : nombre d'obs :52395557 soit une perte de 2 %
48


  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


En entrée : nombre d'espèces observées : 58019
En entrée : nombre d'obs : 53611778


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :57939 soit une perte de 0%
En sortie : nombre d'obs :53479029 soit une perte de 0 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles,


periode
Période 3: 2011 à 2024    28039835
Période 2: 1991 à 2010    19648760
Période 1: 1801 à 1990     4580716
Name: count, dtype: int64
En sortie : nombre d'obs :52395557 soit une perte de 2 %
30


  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


En entrée : nombre d'espèces observées : 58019
En entrée : nombre d'obs : 53611778


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :57939 soit une perte de 0%
En sortie : nombre d'obs :53479029 soit une perte de 0 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles,


periode
Période 3: 2011 à 2024    28039835
Période 2: 1991 à 2010    19648760
Période 1: 1801 à 1990     4580716
Name: count, dtype: int64
En sortie : nombre d'obs :52395557 soit une perte de 2 %
07


  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


En entrée : nombre d'espèces observées : 58019
En entrée : nombre d'obs : 53611778


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :57939 soit une perte de 0%
En sortie : nombre d'obs :53479029 soit une perte de 0 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles,


periode
Période 3: 2011 à 2024    28039835
Période 2: 1991 à 2010    19648760
Période 1: 1801 à 1990     4580716
Name: count, dtype: int64
En sortie : nombre d'obs :52395557 soit une perte de 2 %
26


  zone_geometry = zone.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


In [14]:
type(zonename)

str

In [15]:
[zonename]

['66']