In [1]:
import geopandas as gpd
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import box
import matplotlib.pyplot as plt
from pyproj import CRS
import time
import math


import importlib
import fonctions_annexes_biodiv
importlib.reload(fonctions_annexes_biodiv)
from fonctions_annexes_biodiv import generer_dictionnaire_taxonomie

%matplotlib qt
path = 'C:/Users/anormand/Documents/Projet Python/Biodiv/Data'  # chemin vers le fichier


In [2]:
def degrees_per_km(latitude):
    """
    Calculate the degree equivalent of 1 km for both latitude and longitude
    at a specific latitude.

    :param latitude: Latitude in degrees
    :return: Tuple (lat_deg_per_km, lon_deg_per_km)
    """
    # 1 degree of latitude is ~111.32 km everywhere
    lat_deg_per_km = 1 / 111.32

    # 1 degree of longitude varies with latitude
    lon_deg_per_km = 1 / 111.32 / math.cos(math.radians(latitude))


    return lat_deg_per_km, lon_deg_per_km

def create_country_grid_WGS84(gdf, country_code,col_code="color_code", grid_size_km=10,midpoint_lat=None,display=False):
    """
    Generates a grid within the specified country's boundaries with cell names based on 
    coordinates and grid size using WGS 84 reference system (EPSG:4326).
    
    Parameters:
    - gdf: GeoDataFrame containing global boundaries with a column for the country's unique code (e.g., "color_code").
    - country_name: Name of the country for which to generate the grid.
    - grid_size_km: Size of the grid cells in kilometers (default is 10 km x 10 km).
    
    Returns:
    - grid: GeoDataFrame with grid cells and a 'name' column for each cell's unique identifier.
    """
    # Convert grid size from kilometers to degrees (approximation for WGS 84)
    # 1 degree of latitude ≈ 111 km, but longitude degrees vary by latitude
    # Filter to get the specified country
    country= gdf[gdf[col_code] == country_code]

    if country.empty:
        raise ValueError(f"Country '{country_code}' not found in the GeoDataFrame.")

    # Define a function to create a grid of specified resolution (in degrees) with alignment to a reference point
    def create_grid(country, grid_size=10, reference_point=(0, 0),midpoint_lat=None):
        # Unpack reference point
        ref_x, ref_y = reference_point
        
        country_geometry = country.geometry.unary_union  # Combine all geometries into one
    
        # Get the bounds of the input GeoDataFrame
        minx, miny, maxx, maxy = country_geometry.bounds


            # Step 3: Adjust bounding box to align with (0, 0)
        # Use approximate degrees per km for the middle of the country
        if midpoint_lat is None:
            midpoint_lat = (miny + maxy) / 2
        lat_deg_per_km, lon_deg_per_km = degrees_per_km(midpoint_lat)
        dy = grid_size * lat_deg_per_km
        dx = grid_size * lon_deg_per_km
    
        # Align the starting point to the reference point
        start_x = ref_x + ((minx - ref_x) // dx) * dx
        start_y = ref_y + ((miny - ref_y) // dy) * dy
    
        grid_cells = []
        x = start_x
        while x < maxx:
            y = start_y
            while y < maxy:
                cell = box(x, y, x + dx, y + dy)
    
                min_lon, min_lat, max_lon, max_lat = cell.bounds
    
                # Append the grid cell along with its bounds
                grid_cells.append({
                    "geometry": cell,
                    "min_lon": min_lon,
                    "min_lat": min_lat,
                    "max_lon": max_lon,
                    "max_lat": max_lat
                })
                y += dy
            x += dx
    
        # Create a GeoDataFrame from the grid cells with the additional columns
        grid_gdf = gpd.GeoDataFrame(
            grid_cells, 
            columns=["geometry", "min_lon", "min_lat", "max_lon", "max_lat"], 
            crs=country.crs
        )
        
        return grid_gdf

    
    # Create the grid and clip it to the country's boundary

    grid = create_grid(country, grid_size=grid_size_km,midpoint_lat=midpoint_lat)
    grid = grid[grid.intersects(country.unary_union)]
    
    # Generate names based on cell centroid latitude and longitude
    def generate_grid_name(cell, grid_size_km):
        centroid = cell.geometry.centroid
        lon, lat = centroid.x, centroid.y
        lon_label = f"E{int(abs(lon) * 100):03d}" if lon >= 0 else f"W{int(abs(lon) * 100):03d}"
        lat_label = f"N{int(abs(lat) * 100):03d}" if lat >= 0 else f"S{int(abs(lat) * 100):03d}"
        return f"{grid_size_km}km{lon_label}{lat_label}"

    # Apply naming to each grid cell
    grid["cell_name"] = grid.apply(lambda cell: generate_grid_name(cell, grid_size_km), axis=1)
    grid["country_code"] = country_code

    if display is True:
        # Plot the grid and the country boundary for reference
        fig, ax = plt.subplots(figsize=(10, 10))
        gdf.plot(ax=ax, edgecolor="black", linewidth=0.5)
        country.plot(ax=ax, edgecolor="red", linewidth=2, facecolor="none")
        grid.plot(ax=ax, color="lightblue", edgecolor="grey", alpha=0.6)
    
        plt.show()
    
    return grid

def add_grid_to_country(df_country, grid,cle_geo):
    """
    Optimized version of adding the corresponding grid cell to each row in df_country based on latitude and longitude.
    
    Parameters:
    - df_country: DataFrame containing the columns 'decimalLatitude' and 'decimalLongitude'.
    - grid: DataFrame containing the grid cells with 'name', 'min_lon', 'min_lat', 'max_lon', and 'max_lat' columns.
    
    Returns:
    - df_country: Updated DataFrame with an additional 'grid_name' column indicating the grid cell for each point.
    """
    # Convert grid bounds to NumPy arrays for efficient vectorized comparison
    min_lons = grid['min_lon'].values
    max_lons = grid['max_lon'].values
    min_lats = grid['min_lat'].values
    max_lats = grid['max_lat'].values
    grid_names = grid[cle_geo].values
    
    # Initialize an array to store the grid names
    grid_names_for_points = []
    
    # Iterate over each point in df_country and apply vectorized comparison
    for lon, lat in zip(df_country['decimalLongitude'], df_country['decimalLatitude']):
        # Find the grid cell by comparing the point coordinates with grid bounds
        matching_grid = np.where((min_lons <= lon) & (lon <= max_lons) & (min_lats <= lat) & (lat <= max_lats))[0]
        
        if matching_grid.size > 0:
            grid_names_for_points.append(grid_names[matching_grid[0]])  # Take the first matching grid cell
        else:
            grid_names_for_points.append(None)  # No matching grid
    
    # Add the grid names to the DataFrame
    df_country['grid_name'] = grid_names_for_points
    
    return df_country

def formater_maille_espece_GBIF(df,cle_geo='codeMaille10Km',cle_ID='cdRef',annee_min=None,bornes_temporelles=None):
    df_dico=generer_dictionnaire_taxonomie(df,cle_ID)
    # Convertir la colonne 'year' en int
    df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(0).astype(int)

    # Convertir la colonne 'cdNom' en int
    df[cle_ID] = df[cle_ID].astype(int)

    if annee_min is not None:
        df=df[df['year']>=annee_min]
    
    # Choisir des bornes temporelles et assigner une période aux données
    if bornes_temporelles is not None:
        df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles, 
                       labels=[f'Période {i+1}: {bornes_temporelles[i]+1} à {bornes_temporelles[i+1]}' for i in range(len(bornes_temporelles) - 1)],
                       include_lowest=False)  # include_lowest=True inclut la borne inférieureure
        # Compter le nombre de données dans chaque intervalle
        compte_par_periode = df['periode'].value_counts()
        print(compte_par_periode)
         # Compter les occurrences d'observation de chaque taxon pour chaque code et période
        df_maille_espece = df.groupby([cle_geo, cle_ID,'periode'], observed=True).size().reset_index(name='nombreObs')
        #df_maille_espece = df.groupby([cle_geo, cle_ID,'periode'], observed=True)['individualCount'].sum().reset_index(name='nombreObs')
        #eventuellement remplacer ['individualCount'].sum() par .size() 
       
    else:
        # Compter les occurrences d'observation de chaque taxon pour chaque code
        df_maille_espece = df.groupby([cle_geo, cle_ID], observed=True).size().reset_index(name='nombreObs')
        #eventuellement remplacer ['individualCount'].sum() par .size() 
        
    df_maille_espece=pd.merge(df_maille_espece,df_dico,on=cle_ID)
    
    return df_maille_espece

In [3]:
# Load the world boundaries

world_terrestre = gpd.read_file(path+"/SIG_global/world-administrative-boundaries.geojson")
world_maritime = gpd.read_file(path+"/SIG_global/eez_v11.gpkg")


In [None]:
departement_fichier = r'C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\SIG\carte_departements.geojson'  # Remplace par le chemin vers ton fichier
departement_gpd = gpd.read_file(departement_fichier)

PNR_fichier = r'C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\SIG\N_ENP_PNR_S_000.shx'  # Remplace par le chemin vers ton fichier
PNR_gpd = gpd.read_file(PNR_fichier)
PNR_gpd=PNR_gpd[['NOM_SITE','geometry']]

PN_fichier = r'C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\SIG\N_ENP_PN_S_000.shx'  # Remplace par le chemin vers ton fichier
PN_gpd = gpd.read_file(PN_fichier)
PN_gpd=PN_gpd[['NOM_SITE','geometry']]

In [None]:
PN_gpd

In [None]:
# Load the GeoJSON file
world_terrestre = gpd.read_file(path + "/SIG_global/world-administrative-boundaries.geojson")

# Update the name from "United Republic of Tanzania" to "Tanzania"
world_terrestre.loc[world_terrestre['name'] == "Czech Republic", 'name'] = "Czechia"

# Save the updated GeoDataFrame back to a GeoJSON file if needed
world_terrestre.to_file(path + "/SIG_global/world-administrative-boundaries.geojson", driver="GeoJSON")


In [None]:
world_terrestre[world_terrestre['name'].str.contains("Cze",case=False)]

In [4]:
# Choix du pays

country_name="France"
grid_size_km=20
cle_geo="codeMaille"+str(grid_size_km)+'Km'
country_code = world_terrestre[world_terrestre["name"] == country_name]["color_code"].iloc[0]

In [5]:
# Génération de la grille pour le pays choisi
country = world_terrestre[world_terrestre["color_code"] == country_code]
country_geometry = country.geometry.unary_union  # Combine all geometries into one

# Get the bounds of the input GeoDataFrame
minx, miny, maxx, maxy = country_geometry.bounds

# Step 3: Adjust bounding box to align with (0, 0)
# Use approximate degrees per km for the middle of the country
midpoint_lat = (miny + maxy) / 2

country_grid_terrestre = create_country_grid_WGS84(world_terrestre, country_code,"color_code",grid_size_km=grid_size_km,midpoint_lat=midpoint_lat,display=False)
nouveaux_noms_columns = {'cell_name': cle_geo}
country_grid_terrestre = country_grid_terrestre.rename(columns=nouveaux_noms_columns)

country_maritime = world_maritime[world_maritime["ISO_TER1"] == country_code]

country_grid_maritime = gpd.GeoDataFrame()
if not country_maritime.empty:
    country_grid_maritime = create_country_grid_WGS84(world_maritime, country_code,"ISO_TER1" ,grid_size_km=grid_size_km,midpoint_lat=midpoint_lat,display=False)
    nouveaux_noms_columns = {'cell_name': cle_geo}
    country_grid_maritime = country_grid_maritime.rename(columns=nouveaux_noms_columns)

# Concatenate the two DataFrames
combined_grid = pd.concat([country_grid_maritime, country_grid_terrestre])

# Check for duplicates based on all columns
duplicates = combined_grid[combined_grid.duplicated(subset=[cle_geo])]
combined_grid_unique = combined_grid.drop_duplicates(subset=[cle_geo])

# Display duplicates
print("Doublons trouvés :")
duplicates

  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


Doublons trouvés :


Unnamed: 0,geometry,min_lon,min_lat,max_lon,max_lat,codeMaille20Km,country_code
459698,"POLYGON ((-4.67155 47.96982, -4.67155 48.14948...",-4.851225,47.969817,-4.671550,48.149479,20kmW476N4805,FRA
459700,"POLYGON ((-4.67155 48.32914, -4.67155 48.5088,...",-4.851225,48.329141,-4.671550,48.508803,20kmW476N4841,FRA
459701,"POLYGON ((-4.67155 48.5088, -4.67155 48.68847,...",-4.851225,48.508803,-4.671550,48.688466,20kmW476N4859,FRA
460260,"POLYGON ((-4.49188 47.96982, -4.49188 48.14948...",-4.671550,47.969817,-4.491875,48.149479,20kmW458N4805,FRA
460261,"POLYGON ((-4.49188 48.14948, -4.49188 48.32914...",-4.671550,48.149479,-4.491875,48.329141,20kmW458N4823,FRA
...,...,...,...,...,...,...,...
504067,"POLYGON ((9.52278 42.75961, 9.52278 42.93927, ...",9.343100,42.759612,9.522775,42.939274,20kmE943N4284,FRA
504068,"POLYGON ((9.52278 42.93927, 9.52278 43.11894, ...",9.343100,42.939274,9.522775,43.118936,20kmE943N4302,FRA
504625,"POLYGON ((9.70245 42.04096, 9.70245 42.22063, ...",9.522775,42.040963,9.702450,42.220625,20kmE961N4213,FRA
504626,"POLYGON ((9.70245 42.22063, 9.70245 42.40029, ...",9.522775,42.220625,9.702450,42.400287,20kmE961N4231,FRA


In [6]:
# Sauvegarder le fichier grille
country_grid_terrestre.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_terrestre_"+str(grid_size_km)+"km.geojson", 
                             driver="GeoJSON")
if not country_grid_maritime.empty:
    country_grid_maritime.to_file(
        r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_" + country_name.replace(' ', '_') + "\SIG\country_grid_maritime_" + str(grid_size_km) + "km.geojson", 
        driver="GeoJSON"
    )
combined_grid_unique.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_combined_"+str(grid_size_km)+"km.geojson", 
                             driver="GeoJSON")

  country_grid_terrestre.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_terrestre_"+str(grid_size_km)+"km.geojson",
  r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_" + country_name.replace(' ', '_') + "\SIG\country_grid_maritime_" + str(grid_size_km) + "km.geojson",
  combined_grid_unique.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_combined_"+str(grid_size_km)+"km.geojson",


In [None]:
### Importer les données de biodiversité du pays ###
cle_ID='speciesKey'
cle_geo="codeMaille"+str(grid_size_km)+'Km'
colonnes_a_importer=['kingdom','phylum','class','order','family','genus','species','verbatimScientificName',
                         'taxonRank','countryCode','occurrenceStatus', 'individualCount', 
                     'decimalLongitude','decimalLatitude','eventDate','taxonKey','speciesKey','occurrenceID', 'year']
fichier = "C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')+"/Raw/extractGBIF_"+country_name.replace(' ', '_')+"_12112024.csv"  # Remplace par le chemin vers ton fichier

df_biodiv = pd.read_csv(fichier, sep='\t',on_bad_lines='skip',usecols=colonnes_a_importer,nrows=4572978 )

In [None]:
# Compter le nombre de NaN par colonne
df_biodiv['eventDate'] = pd.to_datetime(df_biodiv['eventDate'], errors='coerce', utc=True)

df_biodiv['year'] = df_biodiv['year'].fillna(df_biodiv['eventDate'].dt.year)
# Compter le nombre de NaN par colonne
nan_counts = df_biodiv.isna().sum()

# Afficher le résultat
print(nan_counts)

In [None]:
# Associer chaque observation à une maille de la grille

# Assurez-vous que 'decimalLongitude' et 'decimalLatitude' sont des colonnes numériques
df_biodiv['decimalLongitude'] = pd.to_numeric(df_biodiv['decimalLongitude'], errors='coerce')
df_biodiv['decimalLatitude'] = pd.to_numeric(df_biodiv['decimalLatitude'], errors='coerce')

# Filtrer les lignes où les coordonnées sont NaN
df_biodiv = df_biodiv.dropna(subset=['decimalLongitude', 'decimalLatitude',cle_ID]).reset_index(drop=True)

# Ajouter le grid aux points
df_biodiv_with_grid = add_grid_to_country(df_biodiv, combined_grid_unique,cle_geo)

# Afficher les premières lignes
df_biodiv_with_grid.head()

In [None]:
# Pré-traitement des données

# Lire uniquement les colonnes spécifiées du fichier dans un DataFrame
df=df_biodiv_with_grid.copy()

cle_ID='speciesKey'
cle_date='year'
cle_geo="codeMaille"+str(grid_size_km)+'Km'

n_especes_entrée=len(df[cle_ID].unique())
n_obs_entrée=len(df)
print("En entrée : nombre d'espèces observées :",n_especes_entrée)
print("En entrée : nombre d'obs :",n_obs_entrée)

colonnes_obligatoires=[cle_ID]
# Supprimer les lignes ou une donnée cruciale manque
df_cleaned = df.dropna(subset=colonnes_obligatoires)

# Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
#valid_ranks = ['SPECIES', 'SUBSPECIES', 'VARIETY']
#df_cleaned = df_cleaned[df_cleaned['taxonRank'].isin(valid_ranks)].reset_index(drop=True)

# Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
df_cleaned = df_cleaned[df_cleaned['occurrenceStatus']=='PRESENT'].reset_index(drop=True)

df_cleaned[cle_ID] = df_cleaned[cle_ID].astype(int)

df_cleaned.rename(columns={'grid_name': cle_geo}, inplace=True)

# Ajouter des colonnes year, month, day et day_of_year
#df_cleaned[cle_date] = pd.to_datetime(df_cleaned[cle_date], utc=True, errors='coerce')

# Now safely access .dt.year, .dt.month, etc., from df_cleaned
#df_cleaned['year'] = df_cleaned[cle_date].dt.year

# Convertir 'individualCount' en numérique, en forçant les erreurs à NaN
df_cleaned['individualCount'] = pd.to_numeric(df_cleaned['individualCount'], errors='coerce')

# Remplacer les NaN par 1
df_cleaned['individualCount'].fillna(1, inplace=True)


print(f"En sortie : nombre d'espèces observées :{len(df_cleaned[cle_ID].unique())} soit une perte de {100-(round(len(df_cleaned[cle_ID].unique())/n_especes_entrée*100))}%")
print(f"En sortie : nombre d'obs :{len(df_cleaned)} soit une perte de {100-round(len(df_cleaned)/n_obs_entrée*100)} %") 



In [None]:
# Sauvegarde du fichier pré-traité sous le format 1 ligne = 1 observation
"""
df_cleaned.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+
                  country_name.replace(' ', '_')+
                  "/Filtered"+
                  "/data_GBIF_"+country_name.replace(' ', '_')+"_"+str(grid_size_km)+"Km_filtered.csv",
                  index=False)
"""

In [None]:
# Grouper les données par maille et par période 
annee_mini=1
bornes_temporelles=[1800, 1990,2010, 2024] 

df_maille_espece=formater_maille_espece_GBIF(df_cleaned,cle_geo,cle_ID,annee_mini,bornes_temporelles)

print(f"En sortie : nombre d'obs :{len(df_cleaned[df_cleaned['year']>annee_mini])} soit une perte de {100-round(len(df_cleaned[df_cleaned['year']>annee_mini])/n_obs_entrée*100)} %") 

In [None]:
dico_noms_vernaculaires_merged = pd.read_csv(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\TAXO_GBIF\dico_noms_vernaculaires_merged.csv")

df_maille_espece=pd.merge(df_maille_espece,dico_noms_vernaculaires_merged,on=cle_ID,how="left")

In [None]:
# Sauvegarder les données groupées

colonnes_a_conserver = ['kingdom','phylum','class','order','family','genus','species',
                     'nombreObs',cle_ID,cle_geo,'periode','taxonRank','vernacularName_fr','vernacularName_en','occurrenceID']
df_to_save=df_maille_espece[colonnes_a_conserver]
df_to_save.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                  +"/data_GBIF_"+country_name.replace(' ', '_')+'_'+cle_geo+"_periodes.csv", index=False)

In [4]:
def formater_data_gbif_all_in_one(country_name,grid_size_km,cle_ID='speciesKey',annee_mini=0,bornes_temporelles=[1800, 1990,2010, 2024]):
    
    world_terrestre = gpd.read_file(path+"/SIG_global/world-administrative-boundaries.geojson")
    world_maritime = gpd.read_file(path+"/SIG_global/eez_v11.gpkg")
    
    cle_geo="codeMaille"+str(grid_size_km)+'Km'
    country_code = world_terrestre[world_terrestre["name"] == country_name]["color_code"].iloc[0]
    
    # Génération de la grille pour le pays choisi
    country = world_terrestre[world_terrestre["color_code"] == country_code]
    country_geometry = country.geometry.unary_union  # Combine all geometries into one
    
    # Get the bounds of the input GeoDataFrame
    minx, miny, maxx, maxy = country_geometry.bounds
    
    # Step 3: Adjust bounding box to align with (0, 0)
    # Use approximate degrees per km for the middle of the country
    midpoint_lat = (miny + maxy) / 2
    
    country_grid_terrestre = create_country_grid_WGS84(world_terrestre, country_code,"color_code",grid_size_km=grid_size_km,midpoint_lat=midpoint_lat)
    nouveaux_noms_columns = {'cell_name': cle_geo}
    country_grid_terrestre = country_grid_terrestre.rename(columns=nouveaux_noms_columns)
    
    country_maritime = world_maritime[world_maritime["ISO_TER1"] == country_code]
    
    country_grid_maritime = gpd.GeoDataFrame()
    if not country_maritime.empty:
        country_grid_maritime = create_country_grid_WGS84(world_maritime, country_code,"ISO_TER1" ,grid_size_km=grid_size_km,midpoint_lat=midpoint_lat)
        nouveaux_noms_columns = {'cell_name': cle_geo}
        country_grid_maritime = country_grid_maritime.rename(columns=nouveaux_noms_columns)
    
    # Concatenate the two DataFrames
    combined_grid = pd.concat([country_grid_maritime, country_grid_terrestre])
    
    # Check for duplicates based on all columns
    duplicates = combined_grid[combined_grid.duplicated(subset=[cle_geo])]
    combined_grid_unique = combined_grid.drop_duplicates(subset=[cle_geo])

    # Sauvegarder le fichier grille
    country_grid_terrestre.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_terrestre_"+str(grid_size_km)+"km.geojson", 
                                 driver="GeoJSON")
    if not country_grid_maritime.empty:
        country_grid_maritime.to_file(
            r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_" + country_name.replace(' ', '_') + "\SIG\country_grid_maritime_" + str(grid_size_km) + "km.geojson", 
            driver="GeoJSON"
        )
    combined_grid_unique.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_combined_"+str(grid_size_km)+"km.geojson", 
                                 driver="GeoJSON")
    
    ### Importer les données de biodiversité du pays ###
    
    cle_geo="codeMaille"+str(grid_size_km)+'Km'
    
    fichier = "C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')+"/Raw/extractGBIF_"+country_name.replace(' ', '_')+"_12112024.csv"  # Remplace par le chemin vers ton fichier
    
    colonnes_a_importer=['kingdom','phylum','class','order','family','genus','species','verbatimScientificName',
                         'taxonRank','countryCode','occurrenceStatus', 'individualCount', 'decimalLongitude','decimalLatitude','eventDate','taxonKey','speciesKey','occurrenceID', 'year'
                         ]
    
    df_biodiv = pd.read_csv(
        fichier, 
        sep='\t', 
        on_bad_lines='skip',  # Skip problematic lines
        quoting=3,            # Ignore quotes
        usecols=colonnes_a_importer
        )
    
    df_biodiv[cle_ID] = pd.to_numeric(df_biodiv[cle_ID], errors='coerce')
    
    # Associer chaque observation à une maille de la grille
    
    # Assurez-vous que 'decimalLongitude' et 'decimalLatitude' sont des colonnes numériques
    df_biodiv['decimalLongitude'] = pd.to_numeric(df_biodiv['decimalLongitude'], errors='coerce')
    df_biodiv['decimalLatitude'] = pd.to_numeric(df_biodiv['decimalLatitude'], errors='coerce')
    
    # Filtrer les lignes où les coordonnées sont NaN
    df_biodiv = df_biodiv.dropna(subset=['decimalLongitude', 'decimalLatitude',cle_ID]).reset_index(drop=True)
    
    # Ajouter le grid aux points
    df_biodiv_with_grid = add_grid_to_country(df_biodiv, combined_grid_unique,cle_geo)
    
    # Pré-traitement des données
    
    # Lire uniquement les colonnes spécifiées du fichier dans un DataFrame
    df=df_biodiv_with_grid.copy()
    
    cle_date='year'
    
    colonnes_obligatoires=[cle_ID]
    # Supprimer les lignes ou une donnée cruciale manque
    df_cleaned = df.dropna(subset=colonnes_obligatoires)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    #valid_ranks = ['SPECIES', 'SUBSPECIES', 'VARIETY']
    #df_cleaned = df_cleaned[df_cleaned['taxonRank'].isin(valid_ranks)].reset_index(drop=True)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    df_cleaned = df_cleaned[df_cleaned['occurrenceStatus']=='PRESENT'].reset_index(drop=True)
    
    df_cleaned[cle_ID] = df_cleaned[cle_ID].astype(int)
    
    df_cleaned.rename(columns={'grid_name': cle_geo}, inplace=True)
    
    df_cleaned['individualCount'] = df_cleaned['individualCount'].fillna(1)
    
    # Convertir 'individualCount' en numérique, en forçant les erreurs à NaN
    df_cleaned['individualCount'] = pd.to_numeric(df_cleaned['individualCount'], errors='coerce')
    
    # Remplacer les NaN par 1
    df_cleaned['individualCount'].fillna(1, inplace=True)
    
    # Grouper les données par maille et par période 
    
    df_maille_espece=formater_maille_espece_GBIF(df_cleaned,cle_geo,cle_ID,annee_mini,bornes_temporelles)
    
    dico_noms_vernaculaires_merged = pd.read_csv(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\TAXO_GBIF\dico_noms_vernaculaires_merged.csv")
    
    df_maille_espece=pd.merge(df_maille_espece,dico_noms_vernaculaires_merged,on=cle_ID,how="left")
    
    # Sauvegarder les données groupées
    
    colonnes_a_conserver = ['kingdom','phylum','class','order','family','genus','species',
                         'nombreObs',cle_ID,cle_geo,'periode','taxonRank','vernacularName_fr','vernacularName_en','occurrenceID']
    df_to_save=df_maille_espece[colonnes_a_conserver]
    df_to_save.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                      +"/data_GBIF_"+country_name.replace(' ', '_')+'_'+cle_geo+"_periodes.csv", index=False)

  country_grid_terrestre.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_terrestre_"+str(grid_size_km)+"km.geojson",
  r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_" + country_name.replace(' ', '_') + "\SIG\country_grid_maritime_" + str(grid_size_km) + "km.geojson",
  combined_grid_unique.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_combined_"+str(grid_size_km)+"km.geojson",


In [16]:
grid_size_km=[20]

for size_km in grid_size_km:
    countries=['India','Sri Lanka','Bangladesh','Nepal','Bhutan','Japan','Republic of Korea',
             "Lao People's Democratic Republic","Myanmar","Thailand","Malaysia","Vietnam","Cambodia","Philippines",
            "Turkey","Tunisia","Algeria","Morocco",'Ethiopia','Kenya','Uganda','Tanzania','Rwanda','Burundi','South Sudan'] #20

    for country in countries:
        print(country)
        formater_data_gbif_all_in_one(country,size_km,annee_mini=0,bornes_temporelles=[1800, 1990,2010, 2024])

India


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    48808767
Période 2: 1991 à 2010     1027517
Période 1: 1801 à 1990      313298
Name: count, dtype: int64
Sri Lanka


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    1910471
Période 2: 1991 à 2010      99506
Période 1: 1801 à 1990      33458
Name: count, dtype: int64
Bangladesh


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    549015
Période 2: 1991 à 2010     11758
Période 1: 1801 à 1990      8512
Name: count, dtype: int64
Nepal


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    1033927
Période 2: 1991 à 2010     123886
Période 1: 1801 à 1990      64349
Name: count, dtype: int64
Bhutan


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    501943
Période 2: 1991 à 2010     32481
Période 1: 1801 à 1990      3735
Name: count, dtype: int64
Japan


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    3024415
Période 1: 1801 à 1990    2206418
Période 2: 1991 à 2010    1474457
Name: count, dtype: int64
Republic of Korea


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    2441108
Période 2: 1991 à 2010     665322
Période 1: 1801 à 1990     180629
Name: count, dtype: int64
Lao People's Democratic Republic


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    160530
Période 2: 1991 à 2010     33337
Période 1: 1801 à 1990     10250
Name: count, dtype: int64
Myanmar


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    232874
Période 2: 1991 à 2010     47559
Période 1: 1801 à 1990     14865
Name: count, dtype: int64
Thailand


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    4962881
Période 2: 1991 à 2010     357160
Période 1: 1801 à 1990     188408
Name: count, dtype: int64
Malaysia


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    2337233
Période 2: 1991 à 2010     378877
Période 1: 1801 à 1990     238657
Name: count, dtype: int64
Vietnam


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    466486
Période 2: 1991 à 2010    131275
Période 1: 1801 à 1990     28315
Name: count, dtype: int64
Cambodia


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    876095
Période 2: 1991 à 2010     42343
Période 1: 1801 à 1990      2836
Name: count, dtype: int64
Philippines


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    1227415
Période 1: 1801 à 1990     248853
Période 2: 1991 à 2010     200311
Name: count, dtype: int64
Turkey


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    2490790
Période 2: 1991 à 2010     499753
Période 1: 1801 à 1990     132569
Name: count, dtype: int64
Tunisia


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    70913
Période 2: 1991 à 2010    28783
Période 1: 1801 à 1990    21006
Name: count, dtype: int64
Algeria


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    191317
Période 1: 1801 à 1990     41096
Période 2: 1991 à 2010      8713
Name: count, dtype: int64
Morocco


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    1032734
Période 2: 1991 à 2010     137737
Période 1: 1801 à 1990      92423
Name: count, dtype: int64
Ethiopia


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    594299
Période 2: 1991 à 2010    132087
Période 1: 1801 à 1990     88602
Name: count, dtype: int64
Kenya


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    3106662
Période 2: 1991 à 2010     578725
Période 1: 1801 à 1990     319668
Name: count, dtype: int64
Uganda


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    1587128
Période 2: 1991 à 2010     312022
Période 1: 1801 à 1990      61710
Name: count, dtype: int64
Tanzania


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  df_biodiv = pd.read_csv(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    1596623
Période 2: 1991 à 2010     321202
Période 1: 1801 à 1990     191696
Name: count, dtype: int64
Rwanda


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    272273
Période 1: 1801 à 1990     33870
Période 2: 1991 à 2010     15294
Name: count, dtype: int64
Burundi


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 1: 1801 à 1990    51827
Période 2: 1991 à 2010     7743
Période 3: 2011 à 2024     7207
Name: count, dtype: int64
South Sudan


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    16332
Période 1: 1801 à 1990     5377
Période 2: 1991 à 2010     1219
Name: count, dtype: int64


In [8]:
combined_grid_unique

Unnamed: 0,geometry,min_lon,min_lat,max_lon,max_lat,codeMaille20Km,country_code
32,"POLYGON ((-9.70245 46.89184, -9.70245 47.07151...",-9.882125,46.891843,-9.702450,47.071506,20kmW979N4698,FRA
33,"POLYGON ((-9.70245 47.07151, -9.70245 47.25117...",-9.882125,47.071506,-9.702450,47.251168,20kmW979N4716,FRA
34,"POLYGON ((-9.70245 47.25117, -9.70245 47.43083...",-9.882125,47.251168,-9.702450,47.430830,20kmW979N4734,FRA
35,"POLYGON ((-9.70245 47.43083, -9.70245 47.61049...",-9.882125,47.430830,-9.702450,47.610492,20kmW979N4752,FRA
36,"POLYGON ((-9.70245 47.61049, -9.70245 47.79015...",-9.882125,47.610492,-9.702450,47.790155,20kmW979N4770,FRA
...,...,...,...,...,...,...,...
999392,"POLYGON ((167.99613 -21.73913, 167.99613 -21.5...",167.816453,-21.739130,167.996128,-21.559468,20kmE16790S2164,FRA
999393,"POLYGON ((167.99613 -21.55947, 167.99613 -21.3...",167.816453,-21.559468,167.996128,-21.379806,20kmE16790S2146,FRA
999394,"POLYGON ((167.99613 -21.37981, 167.99613 -21.2...",167.816453,-21.379806,167.996128,-21.200144,20kmE16790S2128,FRA
999954,"POLYGON ((168.1758 -21.73913, 168.1758 -21.559...",167.996128,-21.739130,168.175803,-21.559468,20kmE16808S2164,FRA


In [None]:
cle_ID='speciesKey'
grid_size_km=20
cle_geo="codeMaille"+str(grid_size_km)+'Km'
country_name="France"
fichier=r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\Raw\extractGBIF_France_12112024.csv"
# Taille de chaque paquet
lines_per_chunk = 10000000

colonnes_a_importer=['kingdom','phylum','class','order','family','genus','species','verbatimScientificName',
                     'taxonRank','countryCode','occurrenceStatus', 'individualCount', 'decimalLongitude','decimalLatitude','eventDate','taxonKey','speciesKey','occurrenceID', 'year']
# Lecture et découpage du fichier
chunk_number = 0

for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):
    chunk_number += 1
    # Assurez-vous que 'decimalLongitude' et 'decimalLatitude' sont des colonnes numériques
    df_biodiv['decimalLongitude'] = pd.to_numeric(df_biodiv['decimalLongitude'], errors='coerce')
    df_biodiv['decimalLatitude'] = pd.to_numeric(df_biodiv['decimalLatitude'], errors='coerce')
    df_biodiv[cle_ID] = pd.to_numeric(df_biodiv[cle_ID], errors='coerce')
    # Filtrer les lignes où les coordonnées sont NaN
    df_biodiv = df_biodiv.dropna(subset=['decimalLongitude', 'decimalLatitude',cle_ID]).reset_index(drop=True)
    
    # Ajouter le grid aux points
    df_biodiv_with_grid = add_grid_to_country(df_biodiv, combined_grid_unique,cle_geo)
    
    # Afficher les premières lignes
    df_biodiv_with_grid.head()

    # Pré-traitement des données
    # Lire uniquement les colonnes spécifiées du fichier dans un DataFrame
    df=df_biodiv_with_grid.copy()
    
    cle_date='year'

    n_especes_entrée=len(df[cle_ID].unique())
    n_obs_entrée=len(df)
    print("En entrée : nombre d'espèces observées :",n_especes_entrée)
    print("En entrée : nombre d'obs :",n_obs_entrée)
    
    colonnes_obligatoires=[cle_ID]
    # Supprimer les lignes ou une donnée cruciale manque
    df_cleaned = df.dropna(subset=colonnes_obligatoires)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    #valid_ranks = ['SPECIES', 'SUBSPECIES', 'VARIETY']
    #df_cleaned = df_cleaned[df_cleaned['taxonRank'].isin(valid_ranks)].reset_index(drop=True)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    df_cleaned = df_cleaned[df_cleaned['occurrenceStatus']=='PRESENT'].reset_index(drop=True)
    
    df_cleaned[cle_ID] = df_cleaned[cle_ID].astype(int)
    
    df_cleaned.rename(columns={'grid_name': cle_geo}, inplace=True)
    
    df_cleaned['individualCount'] = df_cleaned['individualCount'].fillna(1)
    
    # Ajouter des colonnes year, month, day et day_of_year
    #df_cleaned[cle_date] = pd.to_datetime(df_cleaned[cle_date], utc=True, errors='coerce')
    
    # Now safely access .dt.year, .dt.month, etc., from df_cleaned
    #df_cleaned['year'] = df_cleaned[cle_date].dt.year
    
    # Convertir 'individualCount' en numérique, en forçant les erreurs à NaN
    df_cleaned['individualCount'] = pd.to_numeric(df_cleaned['individualCount'], errors='coerce')
    
    # Remplacer les NaN par 1
    df_cleaned['individualCount'].fillna(1, inplace=True)
    
    
    print(f"En sortie : nombre d'espèces observées :{len(df_cleaned[cle_ID].unique())} soit une perte de {100-(round(len(df_cleaned[cle_ID].unique())/n_especes_entrée*100))}%")
    print(f"En sortie : nombre d'obs :{len(df_cleaned)} soit une perte de {100-round(len(df_cleaned)/n_obs_entrée*100)} %") 
    
    # Grouper les données par maille et par période 
    annee_mini=0
    bornes_temporelles=[1800, 1990,2010, 2024] 
    
    df_maille_espece=formater_maille_espece_GBIF(df_cleaned,cle_geo,cle_ID,annee_mini,bornes_temporelles)
    
    print(f"En sortie : nombre d'obs :{len(df_cleaned[df_cleaned['year']>annee_mini])} soit une perte de {100-round(len(df_cleaned[df_cleaned['year']>annee_mini])/n_obs_entrée*100)} %") 
    
    dico_noms_vernaculaires_merged = pd.read_csv(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\TAXO_GBIF\dico_noms_vernaculaires_merged.csv")
    
    df_maille_espece=pd.merge(df_maille_espece,dico_noms_vernaculaires_merged,on=cle_ID,how="left")
    
    # Sauvegarder les données groupées
    
    colonnes_a_conserver = ['kingdom','phylum','class','order','family','genus','species',
                         'nombreObs',cle_ID,cle_geo,'periode','taxonRank','vernacularName_fr','vernacularName_en','occurrenceID']
    df_to_save=df_maille_espece[colonnes_a_conserver]
    df_to_save.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                      +"/data_GBIF_"+country_name.replace(' ', '_')+'_'+cle_geo+"_periodes_"+str(chunk_number)+".csv", index=False)

  for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):


En entrée : nombre d'espèces observées : 33382
En entrée : nombre d'obs : 9620658


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :33296 soit une perte de 0%
En sortie : nombre d'obs :9579437 soit une perte de 0 %
periode
Période 3: 2011 à 2024    4817352
Période 2: 1991 à 2010    3381945
Période 1: 1801 à 1990    1079513
Name: count, dtype: int64
En sortie : nombre d'obs :9292567 soit une perte de 3 %


  for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):


En entrée : nombre d'espèces observées : 32484
En entrée : nombre d'obs : 9725239


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :32478 soit une perte de 0%
En sortie : nombre d'obs :9702451 soit une perte de 0 %
periode
Période 3: 2011 à 2024    6042104
Période 2: 1991 à 2010    3153014
Période 1: 1801 à 1990     363454
Name: count, dtype: int64
En sortie : nombre d'obs :9561913 soit une perte de 2 %


  for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):


En entrée : nombre d'espèces observées : 33632
En entrée : nombre d'obs : 9728614


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :33549 soit une perte de 0%
En sortie : nombre d'obs :9537684 soit une perte de 2 %
periode
Période 3: 2011 à 2024    5004086
Période 2: 1991 à 2010    3841380
Période 1: 1801 à 1990     542498
Name: count, dtype: int64
En sortie : nombre d'obs :9397276 soit une perte de 3 %


  for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):


En entrée : nombre d'espèces observées : 37635
En entrée : nombre d'obs : 9570295


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :37623 soit une perte de 0%
En sortie : nombre d'obs :9556789 soit une perte de 0 %
periode
Période 3: 2011 à 2024    4951479
Période 2: 1991 à 2010    3830643
Période 1: 1801 à 1990     526268
Name: count, dtype: int64
En sortie : nombre d'obs :9309443 soit une perte de 3 %
En entrée : nombre d'espèces observées : 28031
En entrée : nombre d'obs : 9368101


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :28025 soit une perte de 0%
En sortie : nombre d'obs :9295440 soit une perte de 1 %
periode
Période 3: 2011 à 2024    4323520
Période 2: 1991 à 2010    3694905
Période 1: 1801 à 1990    1155283
Name: count, dtype: int64
En sortie : nombre d'obs :9193114 soit une perte de 2 %


  for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):


En entrée : nombre d'espèces observées : 33150
En entrée : nombre d'obs : 8938522


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :33131 soit une perte de 0%
En sortie : nombre d'obs :8921039 soit une perte de 0 %
periode
Période 3: 2011 à 2024    4154114
Période 2: 1991 à 2010    3490363
Période 1: 1801 à 1990    1020871
Name: count, dtype: int64
En sortie : nombre d'obs :8686821 soit une perte de 3 %


  for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):


En entrée : nombre d'espèces observées : 30956
En entrée : nombre d'obs : 9621743


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :30954 soit une perte de 0%
En sortie : nombre d'obs :9616817 soit une perte de 0 %
periode
Période 3: 2011 à 2024    6590746
Période 2: 1991 à 2010    2251183
Période 1: 1801 à 1990     505254
Name: count, dtype: int64
En sortie : nombre d'obs :9352128 soit une perte de 3 %


  for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):


En entrée : nombre d'espèces observées : 26012
En entrée : nombre d'obs : 9727107


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :26007 soit une perte de 0%
En sortie : nombre d'obs :9716045 soit une perte de 0 %
periode
Période 3: 2011 à 2024    4811133
Période 2: 1991 à 2010    3806608
Période 1: 1801 à 1990     621504
Name: count, dtype: int64
En sortie : nombre d'obs :9260399 soit une perte de 5 %


  for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):


En entrée : nombre d'espèces observées : 39720
En entrée : nombre d'obs : 9308409


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


En sortie : nombre d'espèces observées :39600 soit une perte de 0%
En sortie : nombre d'obs :9254364 soit une perte de 1 %
periode
Période 3: 2011 à 2024    4614766
Période 2: 1991 à 2010    3923662
Période 1: 1801 à 1990     471813
Name: count, dtype: int64
En sortie : nombre d'obs :9028419 soit une perte de 3 %


In [7]:
cle_ID='speciesKey'
grid_size_km=20
cle_geo="codeMaille"+str(grid_size_km)+'Km'
country_name="France"

path=r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France"
df=pd.DataFrame()
for i in range(1,21):
    print(str(i))
    df_temp=pd.read_csv(path+"\data_GBIF_France_"+cle_geo+"_periodes_"+str(i)+".csv")
    df=pd.concat([df,df_temp],ignore_index=True)
                            

df_maille_espece = df.groupby([cle_geo, cle_ID,'periode'], observed=True)['nombreObs'].sum().reset_index(name='nombreObs')

df_dico=generer_dictionnaire_taxonomie(df,cle_ID)

df_maille_espece=pd.merge(df_maille_espece,df_dico,on=cle_ID)

df_maille_espece.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                  +"/data_GBIF_"+country_name.replace(' ', '_')+'_'+cle_geo+"_periodes.csv", index=False)

  df_temp=pd.read_csv(path+"\data_GBIF_France_"+cle_geo+"_periodes_"+str(i)+".csv")


1


  df_temp=pd.read_csv(path+"\data_GBIF_France_"+cle_geo+"_periodes_"+str(i)+".csv")


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\anormand\\Documents\\Projet Python\\Biodiv\\Data\\GBIF_France\\data_GBIF_France_codeMaille20Km_periodes_1.csv'

In [None]:
cle_ID='speciesKey'
grid_size_km=20
cle_geo="codeMaille"+str(grid_size_km)+'Km'
country_name="France"

path=r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France"
df=pd.DataFrame()
for i in range(1,21):
    print(str(i))
    df_temp=pd.read_csv(path+"\data_GBIF_France_"+cle_geo+"_periodes_"+str(i)+".csv")
    df=pd.concat([df,df_temp],ignore_index=True)
                            

df_maille_espece = df.groupby([cle_geo, cle_ID,'periode'], observed=True)['nombreObs'].sum().reset_index(name='nombreObs')

df_dico=generer_dictionnaire_taxonomie(df,cle_ID)

df_maille_espece=pd.merge(df_maille_espece,df_dico,on=cle_ID)

df_maille_espece.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                  +"/data_GBIF_"+country_name.replace(' ', '_')+'_'+cle_geo+"_periodes.csv", index=False)