In [1]:
import geopandas as gpd
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import box
import matplotlib.pyplot as plt
from pyproj import CRS
import time
import math


import importlib
import fonctions_annexes_biodiv
importlib.reload(fonctions_annexes_biodiv)
from fonctions_annexes_biodiv import generer_dictionnaire_taxonomie

%matplotlib qt
path = 'C:/Users/anormand/Documents/Projet Python/Biodiv/Data'  # chemin vers le fichier


In [2]:
def degrees_per_km(latitude):
    """
    Calculate the degree equivalent of 1 km for both latitude and longitude
    at a specific latitude.

    :param latitude: Latitude in degrees
    :return: Tuple (lat_deg_per_km, lon_deg_per_km)
    """
    # 1 degree of latitude is ~111.32 km everywhere
    lat_deg_per_km = 1 / 111.32

    # 1 degree of longitude varies with latitude
    lon_deg_per_km = 1 / 111.32 / math.cos(math.radians(latitude))


    return lat_deg_per_km, lon_deg_per_km

def create_country_grid_WGS84(gdf, country_code,col_code="color_code", grid_size_km=10,midpoint_lat=None,display=False):
    """
    Generates a grid within the specified country's boundaries with cell names based on 
    coordinates and grid size using WGS 84 reference system (EPSG:4326).
    
    Parameters:
    - gdf: GeoDataFrame containing global boundaries with a column for the country's unique code (e.g., "color_code").
    - country_name: Name of the country for which to generate the grid.
    - grid_size_km: Size of the grid cells in kilometers (default is 10 km x 10 km).
    
    Returns:
    - grid: GeoDataFrame with grid cells and a 'name' column for each cell's unique identifier.
    """
    # Convert grid size from kilometers to degrees (approximation for WGS 84)
    # 1 degree of latitude ≈ 111 km, but longitude degrees vary by latitude
    # Filter to get the specified country
    country= gdf[gdf[col_code] == country_code]

    if country.empty:
        raise ValueError(f"Country '{country_code}' not found in the GeoDataFrame.")

    # Define a function to create a grid of specified resolution (in degrees) with alignment to a reference point
    def create_grid(country, grid_size=10, reference_point=(0, 0),midpoint_lat=None):
        # Unpack reference point
        ref_x, ref_y = reference_point
        
        country_geometry = country.geometry.unary_union  # Combine all geometries into one
    
        # Get the bounds of the input GeoDataFrame
        minx, miny, maxx, maxy = country_geometry.bounds


            # Step 3: Adjust bounding box to align with (0, 0)
        # Use approximate degrees per km for the middle of the country
        if midpoint_lat is None:
            midpoint_lat = (miny + maxy) / 2
        lat_deg_per_km, lon_deg_per_km = degrees_per_km(midpoint_lat)
        dy = grid_size * lat_deg_per_km
        dx = grid_size * lon_deg_per_km
    
        # Align the starting point to the reference point
        start_x = ref_x + ((minx - ref_x) // dx) * dx
        start_y = ref_y + ((miny - ref_y) // dy) * dy
    
        grid_cells = []
        x = start_x
        while x < maxx:
            y = start_y
            while y < maxy:
                cell = box(x, y, x + dx, y + dy)
    
                min_lon, min_lat, max_lon, max_lat = cell.bounds
    
                # Append the grid cell along with its bounds
                grid_cells.append({
                    "geometry": cell,
                    "min_lon": min_lon,
                    "min_lat": min_lat,
                    "max_lon": max_lon,
                    "max_lat": max_lat
                })
                y += dy
            x += dx
    
        # Create a GeoDataFrame from the grid cells with the additional columns
        grid_gdf = gpd.GeoDataFrame(
            grid_cells, 
            columns=["geometry", "min_lon", "min_lat", "max_lon", "max_lat"], 
            crs=country.crs
        )
        
        return grid_gdf

    
    # Create the grid and clip it to the country's boundary

    grid = create_grid(country, grid_size=grid_size_km,midpoint_lat=midpoint_lat)
    grid = grid[grid.intersects(country.unary_union)]
    
    # Generate names based on cell centroid latitude and longitude
    def generate_grid_name(cell, grid_size_km):
        centroid = cell.geometry.centroid
        lon, lat = centroid.x, centroid.y
        lon_label = f"E{int(abs(lon) * 100):03d}" if lon >= 0 else f"W{int(abs(lon) * 100):03d}"
        lat_label = f"N{int(abs(lat) * 100):03d}" if lat >= 0 else f"S{int(abs(lat) * 100):03d}"
        return f"{grid_size_km}km{lon_label}{lat_label}"

    # Apply naming to each grid cell
    grid["cell_name"] = grid.apply(lambda cell: generate_grid_name(cell, grid_size_km), axis=1)
    grid["country_code"] = country_code

    if display is True:
        # Plot the grid and the country boundary for reference
        fig, ax = plt.subplots(figsize=(10, 10))
        gdf.plot(ax=ax, edgecolor="black", linewidth=0.5)
        country.plot(ax=ax, edgecolor="red", linewidth=2, facecolor="none")
        grid.plot(ax=ax, color="lightblue", edgecolor="grey", alpha=0.6)
    
        plt.show()
    
    return grid

def add_grid_to_country(df_country, grid,cle_geo):
    """
    Optimized version of adding the corresponding grid cell to each row in df_country based on latitude and longitude.
    
    Parameters:
    - df_country: DataFrame containing the columns 'decimalLatitude' and 'decimalLongitude'.
    - grid: DataFrame containing the grid cells with 'name', 'min_lon', 'min_lat', 'max_lon', and 'max_lat' columns.
    
    Returns:
    - df_country: Updated DataFrame with an additional 'grid_name' column indicating the grid cell for each point.
    """
    # Convert grid bounds to NumPy arrays for efficient vectorized comparison
    min_lons = grid['min_lon'].values
    max_lons = grid['max_lon'].values
    min_lats = grid['min_lat'].values
    max_lats = grid['max_lat'].values
    grid_names = grid[cle_geo].values
    
    # Initialize an array to store the grid names
    grid_names_for_points = []
    
    # Iterate over each point in df_country and apply vectorized comparison
    for lon, lat in zip(df_country['decimalLongitude'], df_country['decimalLatitude']):
        # Find the grid cell by comparing the point coordinates with grid bounds
        matching_grid = np.where((min_lons <= lon) & (lon <= max_lons) & (min_lats <= lat) & (lat <= max_lats))[0]
        
        if matching_grid.size > 0:
            grid_names_for_points.append(grid_names[matching_grid[0]])  # Take the first matching grid cell
        else:
            grid_names_for_points.append(None)  # No matching grid
    
    # Add the grid names to the DataFrame
    df_country['grid_name'] = grid_names_for_points
    
    return df_country

def formater_maille_espece_GBIF(df,cle_geo='codeMaille10Km',cle_ID='cdRef',annee_min=None,bornes_temporelles=None):
    df_dico=generer_dictionnaire_taxonomie(df,cle_ID)
    # Convertir la colonne 'year' en int
    df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(0).astype(int)

    # Convertir la colonne 'cdNom' en int
    df[cle_ID] = df[cle_ID].astype(int)

    if annee_min is not None:
        df=df[df['year']>=annee_min]
    
    # Choisir des bornes temporelles et assigner une période aux données
    if bornes_temporelles is not None:
        df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles, 
                       labels=[f'Période {i+1}: {bornes_temporelles[i]+1} à {bornes_temporelles[i+1]}' for i in range(len(bornes_temporelles) - 1)],
                       include_lowest=False)  # include_lowest=True inclut la borne inférieureure
        # Compter le nombre de données dans chaque intervalle
        compte_par_periode = df['periode'].value_counts()
        print(compte_par_periode)
         # Compter les occurrences d'observation de chaque taxon pour chaque code et période
        df_maille_espece = df.groupby([cle_geo, cle_ID,'periode'], observed=True).size().reset_index(name='nombreObs')
        #df_maille_espece = df.groupby([cle_geo, cle_ID,'periode'], observed=True)['individualCount'].sum().reset_index(name='nombreObs')
        #eventuellement remplacer ['individualCount'].sum() par .size() 
       
    else:
        # Compter les occurrences d'observation de chaque taxon pour chaque code
        df_maille_espece = df.groupby([cle_geo, cle_ID], observed=True).size().reset_index(name='nombreObs')
        #eventuellement remplacer ['individualCount'].sum() par .size() 
        
    df_maille_espece=pd.merge(df_maille_espece,df_dico,on=cle_ID)
    
    return df_maille_espece

In [3]:
# Load the world boundaries

world_terrestre = gpd.read_file(path+"/SIG_global/world-administrative-boundaries.geojson")
world_maritime = gpd.read_file(path+"/SIG_global/eez_v11.gpkg")


In [None]:
departement_fichier = r'C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\SIG\carte_departements.geojson'  # Remplace par le chemin vers ton fichier
departement_gpd = gpd.read_file(departement_fichier)

PNR_fichier = r'C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\SIG\N_ENP_PNR_S_000.shx'  # Remplace par le chemin vers ton fichier
PNR_gpd = gpd.read_file(PNR_fichier)
PNR_gpd=PNR_gpd[['NOM_SITE','geometry']]

PN_fichier = r'C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\SIG\N_ENP_PN_S_000.shx'  # Remplace par le chemin vers ton fichier
PN_gpd = gpd.read_file(PN_fichier)
PN_gpd=PN_gpd[['NOM_SITE','geometry']]

In [None]:
PN_gpd

In [33]:
# Load the GeoJSON file
world_terrestre = gpd.read_file(path + "/SIG_global/world-administrative-boundaries.geojson")

# Update the name from "United Republic of Tanzania" to "Tanzania"
world_terrestre.loc[world_terrestre['color_code'] == "SRB", 'color_code'] = "KV"

# Save the updated GeoDataFrame back to a GeoJSON file if needed
world_terrestre.to_file(path + "/SIG_global/world-administrative-boundaries.geojson", driver="GeoJSON")


In [40]:
world_terrestre.to_file(path + "/SIG_global/world-administrative-boundaries.geojson", driver="GeoJSON")



In [39]:
world_terrestre[world_terrestre['name'].str.contains("Serbia",case=False)]

Unnamed: 0,geo_point_2d,iso3,status,color_code,name,continent,region,iso_3166_1_alpha_2_codes,french_short,geometry
256,"{ ""lon"": 20.805271723235375, ""lat"": 44.0314984...",SRB,Member State,SRB,Serbia,Europe,Southern Europe,RS,Serbie,"POLYGON ((20.26102 46.11485, 20.31403 46.06986..."


In [36]:
new_row = world_terrestre.loc[3].copy()
new_row["color_code"] = "SRB"  # Remplace par la valeur souhaitée
new_row["name"] = "Serbia"  # Remplace par la valeur souhaitée

# Ajouter la ligne dupliquée au DataFrame
world_terrestre = pd.concat([world_terrestre, new_row.to_frame().T], ignore_index=True)

In [10]:
# Choix du pays

country_name="Albania"
grid_size_km=20
cle_geo="codeMaille"+str(grid_size_km)+'Km'
country_code = world_terrestre[world_terrestre["name"] == country_name]["color_code"].iloc[0]

In [11]:
# Génération de la grille pour le pays choisi
country = world_terrestre[world_terrestre["color_code"] == country_code]
country_geometry = country.geometry.unary_union  # Combine all geometries into one

# Get the bounds of the input GeoDataFrame
minx, miny, maxx, maxy = country_geometry.bounds

# Step 3: Adjust bounding box to align with (0, 0)
# Use approximate degrees per km for the middle of the country
midpoint_lat = (miny + maxy) / 2

country_grid_terrestre = create_country_grid_WGS84(world_terrestre, country_code,"color_code",grid_size_km=grid_size_km,midpoint_lat=midpoint_lat,display=False)
nouveaux_noms_columns = {'cell_name': cle_geo}
country_grid_terrestre = country_grid_terrestre.rename(columns=nouveaux_noms_columns)

country_maritime = world_maritime[world_maritime["ISO_TER1"] == country_code]

country_grid_maritime = gpd.GeoDataFrame()
if not country_maritime.empty:
    country_grid_maritime = create_country_grid_WGS84(world_maritime, country_code,"ISO_TER1" ,grid_size_km=grid_size_km,midpoint_lat=midpoint_lat,display=False)
    nouveaux_noms_columns = {'cell_name': cle_geo}
    country_grid_maritime = country_grid_maritime.rename(columns=nouveaux_noms_columns)

# Concatenate the two DataFrames
combined_grid = pd.concat([country_grid_maritime, country_grid_terrestre])

# Check for duplicates based on all columns
duplicates = combined_grid[combined_grid.duplicated(subset=[cle_geo])]
combined_grid_unique = combined_grid.drop_duplicates(subset=[cle_geo])

# Display duplicates
print("Doublons trouvés :")
duplicates

Doublons trouvés :


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]


Unnamed: 0,geometry,min_lon,min_lat,max_lon,max_lat,codeMaille20Km,country_code
4,"POLYGON ((19.32727 40.24434, 19.32727 40.424, ...",19.088666,40.244341,19.327275,40.424003,20kmE1920N4033,ALB
5,"POLYGON ((19.32727 40.424, 19.32727 40.60367, ...",19.088666,40.424003,19.327275,40.603665,20kmE1920N4051,ALB
6,"POLYGON ((19.32727 40.60367, 19.32727 40.78333...",19.088666,40.603665,19.327275,40.783327,20kmE1920N4069,ALB
21,"POLYGON ((19.56588 40.06468, 19.56588 40.24434...",19.327275,40.064678,19.565883,40.244341,20kmE1944N4015,ALB
22,"POLYGON ((19.56588 40.24434, 19.56588 40.424, ...",19.327275,40.244341,19.565883,40.424003,20kmE1944N4033,ALB
23,"POLYGON ((19.56588 40.424, 19.56588 40.60367, ...",19.327275,40.424003,19.565883,40.603665,20kmE1944N4051,ALB
24,"POLYGON ((19.56588 40.60367, 19.56588 40.78333...",19.327275,40.603665,19.565883,40.783327,20kmE1944N4069,ALB
25,"POLYGON ((19.56588 40.78333, 19.56588 40.96299...",19.327275,40.783327,19.565883,40.96299,20kmE1944N4087,ALB
26,"POLYGON ((19.56588 40.96299, 19.56588 41.14265...",19.327275,40.96299,19.565883,41.142652,20kmE1944N4105,ALB
27,"POLYGON ((19.56588 41.14265, 19.56588 41.32231...",19.327275,41.142652,19.565883,41.322314,20kmE1944N4123,ALB


In [21]:
import os
country_name='Albania'
output_path = os.path.join(
    "C:\\Users\\anormand\\Documents\\Projet Python\\Biodiv\\Data\\GBIF_"+
    country_name.replace(' ', '_'),
    "SIG",
    f"country_grid_terrestre_{grid_size_km}km.geojson"
)

os.makedirs(os.path.dirname(output_path), exist_ok=True)  # Create missing directories

country_grid_terrestre.to_file(output_path, driver="GeoJSON")


In [12]:
# Sauvegarder le fichier grille
country_grid_terrestre.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_terrestre_"+str(grid_size_km)+"km.geojson", 
                             driver="GeoJSON")
if not country_grid_maritime.empty:
    country_grid_maritime.to_file(
        r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_" + country_name.replace(' ', '_') + "\SIG\country_grid_maritime_" + str(grid_size_km) + "km.geojson", 
        driver="GeoJSON"
    )
combined_grid_unique.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_combined_"+str(grid_size_km)+"km.geojson", 
                             driver="GeoJSON")

  country_grid_terrestre.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_terrestre_"+str(grid_size_km)+"km.geojson",
  r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_" + country_name.replace(' ', '_') + "\SIG\country_grid_maritime_" + str(grid_size_km) + "km.geojson",
  combined_grid_unique.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_combined_"+str(grid_size_km)+"km.geojson",
  country_grid_terrestre.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_terrestre_"+str(grid_size_km)+"km.geojson",
  r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_" + country_name.replace(' ', '_') + "\SIG\country_grid_maritime_" + str(grid_size_km) + "km.geojson",
  combined_grid_unique.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country

DataSourceError: Failed to create GeoJSON datasource: C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_Albania\SIG\country_grid_terrestre_20km.geojson: C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_Albania\SIG\country_grid_terrestre_20km.geojson: No such file or directory

In [22]:
### Importer les données de biodiversité du pays ###
cle_ID='speciesKey'
cle_geo="codeMaille"+str(grid_size_km)+'Km'
colonnes_a_importer=['kingdom','phylum','class','order','family','genus','species','verbatimScientificName',
                         'taxonRank','countryCode','occurrenceStatus', 'individualCount', 
                     'decimalLongitude','decimalLatitude','eventDate','taxonKey','speciesKey','occurrenceID', 'year']
fichier = "C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')+"/Raw/extractGBIF_"+country_name.replace(' ', '_')+"_12112024.csv"  # Remplace par le chemin vers ton fichier

df_biodiv = pd.read_csv(fichier, sep='\t',on_bad_lines='skip',usecols=colonnes_a_importer,nrows=4572978 )

In [23]:
# Compter le nombre de NaN par colonne
df_biodiv['eventDate'] = pd.to_datetime(df_biodiv['eventDate'], errors='coerce', utc=True)

df_biodiv['year'] = df_biodiv['year'].fillna(df_biodiv['eventDate'].dt.year)
# Compter le nombre de NaN par colonne
nan_counts = df_biodiv.isna().sum()

# Afficher le résultat
print(nan_counts)

occurrenceID               9740
kingdom                       0
phylum                      726
class                      2079
order                      3420
family                      990
genus                      1864
species                    5765
taxonRank                     0
verbatimScientificName     1055
countryCode                   0
occurrenceStatus              0
individualCount           72315
decimalLatitude           19732
decimalLongitude          19732
eventDate                 53470
year                      15915
taxonKey                      0
speciesKey                 5767
dtype: int64


In [24]:
# Associer chaque observation à une maille de la grille

# Assurez-vous que 'decimalLongitude' et 'decimalLatitude' sont des colonnes numériques
df_biodiv['decimalLongitude'] = pd.to_numeric(df_biodiv['decimalLongitude'], errors='coerce')
df_biodiv['decimalLatitude'] = pd.to_numeric(df_biodiv['decimalLatitude'], errors='coerce')

# Filtrer les lignes où les coordonnées sont NaN
df_biodiv = df_biodiv.dropna(subset=['decimalLongitude', 'decimalLatitude',cle_ID]).reset_index(drop=True)

# Ajouter le grid aux points
df_biodiv_with_grid = add_grid_to_country(df_biodiv, combined_grid_unique,cle_geo)

# Afficher les premières lignes
df_biodiv_with_grid.head()

Unnamed: 0,occurrenceID,kingdom,phylum,class,order,family,genus,species,taxonRank,verbatimScientificName,countryCode,occurrenceStatus,individualCount,decimalLatitude,decimalLongitude,eventDate,year,taxonKey,speciesKey,grid_name
0,URN:catalog:CLO:EBIRD:OBS1687445869,Animalia,Chordata,Aves,Passeriformes,Hirundinidae,Hirundo,Hirundo rustica,SPECIES,Hirundo rustica,AL,PRESENT,12.0,40.96384,19.469254,2023-04-16 00:00:00+00:00,2023.0,9515886,9515886.0,20kmE1944N4105
1,URN:catalog:CLO:EBIRD:OBS998242125,Animalia,Chordata,Aves,Suliformes,Phalacrocoracidae,Phalacrocorax,Phalacrocorax carbo,SPECIES,Phalacrocorax carbo,AL,PRESENT,8.0,40.92713,19.496225,2020-10-18 00:00:00+00:00,2020.0,2481890,2481890.0,20kmE1944N4087
2,URN:catalog:CLO:EBIRD:OBS1149393329,Animalia,Chordata,Aves,Passeriformes,Motacillidae,Motacilla,Motacilla cinerea,SPECIES,Motacilla cinerea,AL,PRESENT,1.0,41.313248,19.824556,2021-05-08 00:00:00+00:00,2021.0,2490310,2490310.0,20kmE1992N4123
3,URN:catalog:CLO:EBIRD:OBS861391501,Animalia,Chordata,Aves,Anseriformes,Anatidae,Mareca,Mareca penelope,SPECIES,Mareca penelope,AL,PRESENT,4.0,40.882507,19.446424,2020-02-08 00:00:00+00:00,2020.0,8000602,8000602.0,20kmE1944N4087
4,URN:catalog:CLO:EBIRD:OBS1492950810,Animalia,Chordata,Aves,Passeriformes,Paridae,Parus,Parus major,SPECIES,Parus major,AL,PRESENT,2.0,42.45885,19.874212,2021-08-17 00:00:00+00:00,2021.0,9705453,9705453.0,20kmE1992N4249


In [25]:
# Pré-traitement des données

# Lire uniquement les colonnes spécifiées du fichier dans un DataFrame
df=df_biodiv_with_grid.copy()

cle_ID='speciesKey'
cle_date='year'
cle_geo="codeMaille"+str(grid_size_km)+'Km'

n_especes_entrée=len(df[cle_ID].unique())
n_obs_entrée=len(df)
print("En entrée : nombre d'espèces observées :",n_especes_entrée)
print("En entrée : nombre d'obs :",n_obs_entrée)

colonnes_obligatoires=[cle_ID]
# Supprimer les lignes ou une donnée cruciale manque
df_cleaned = df.dropna(subset=colonnes_obligatoires)

# Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
#valid_ranks = ['SPECIES', 'SUBSPECIES', 'VARIETY']
#df_cleaned = df_cleaned[df_cleaned['taxonRank'].isin(valid_ranks)].reset_index(drop=True)

# Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
df_cleaned = df_cleaned[df_cleaned['occurrenceStatus']=='PRESENT'].reset_index(drop=True)

df_cleaned[cle_ID] = df_cleaned[cle_ID].astype(int)

df_cleaned.rename(columns={'grid_name': cle_geo}, inplace=True)

# Ajouter des colonnes year, month, day et day_of_year
#df_cleaned[cle_date] = pd.to_datetime(df_cleaned[cle_date], utc=True, errors='coerce')

# Now safely access .dt.year, .dt.month, etc., from df_cleaned
#df_cleaned['year'] = df_cleaned[cle_date].dt.year

# Convertir 'individualCount' en numérique, en forçant les erreurs à NaN
df_cleaned['individualCount'] = pd.to_numeric(df_cleaned['individualCount'], errors='coerce')

# Remplacer les NaN par 1
df_cleaned['individualCount'].fillna(1, inplace=True)


print(f"En sortie : nombre d'espèces observées :{len(df_cleaned[cle_ID].unique())} soit une perte de {100-(round(len(df_cleaned[cle_ID].unique())/n_especes_entrée*100))}%")
print(f"En sortie : nombre d'obs :{len(df_cleaned)} soit une perte de {100-round(len(df_cleaned)/n_obs_entrée*100)} %") 



En entrée : nombre d'espèces observées : 7184
En entrée : nombre d'obs : 103579
En sortie : nombre d'espèces observées :7004 soit une perte de 3%
En sortie : nombre d'obs :102406 soit une perte de 1 %


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


In [None]:
# Sauvegarde du fichier pré-traité sous le format 1 ligne = 1 observation
"""
df_cleaned.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+
                  country_name.replace(' ', '_')+
                  "/Filtered"+
                  "/data_GBIF_"+country_name.replace(' ', '_')+"_"+str(grid_size_km)+"Km_filtered.csv",
                  index=False)
"""

In [26]:
# Grouper les données par maille et par période 
annee_mini=1
bornes_temporelles=[1800, 1990,2010, 2024] 

df_maille_espece=formater_maille_espece_GBIF(df_cleaned,cle_geo,cle_ID,annee_mini,bornes_temporelles)

print(f"En sortie : nombre d'obs :{len(df_cleaned[df_cleaned['year']>annee_mini])} soit une perte de {100-round(len(df_cleaned[df_cleaned['year']>annee_mini])/n_obs_entrée*100)} %") 

periode
Période 3: 2011 à 2024    91506
Période 2: 1991 à 2010     4075
Période 1: 1801 à 1990     1623
Name: count, dtype: int64
En sortie : nombre d'obs :97245 soit une perte de 6 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'periode'] = pd.cut(df['year'], bins=bornes_temporelles,


In [27]:
dico_noms_vernaculaires_merged = pd.read_csv(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\TAXO_GBIF\dico_noms_vernaculaires_merged.csv")

df_maille_espece=pd.merge(df_maille_espece,dico_noms_vernaculaires_merged,on=cle_ID,how="left")

In [28]:
# Sauvegarder les données groupées

colonnes_a_conserver = ['kingdom','phylum','class','order','family','genus','species',
                     'nombreObs',cle_ID,cle_geo,'periode','taxonRank','vernacularName_fr','vernacularName_en','occurrenceID']
df_to_save=df_maille_espece[colonnes_a_conserver]
df_to_save.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                  +"/data_GBIF_"+country_name.replace(' ', '_')+'_'+cle_geo+"_periodes.csv", index=False)

In [5]:
def formater_data_gbif_all_in_one(country_name,grid_size_km,cle_ID='speciesKey',annee_mini=0,bornes_temporelles=[1800, 1990,2010, 2024]):
    
    world_terrestre = gpd.read_file(path+"/SIG_global/world-administrative-boundaries.geojson")
    world_maritime = gpd.read_file(path+"/SIG_global/eez_v11.gpkg")
    
    cle_geo="codeMaille"+str(grid_size_km)+'Km'
    country_code = world_terrestre[world_terrestre["name"] == country_name]["color_code"].iloc[0]
    
    # Génération de la grille pour le pays choisi
    country = world_terrestre[world_terrestre["color_code"] == country_code]
    country_geometry = country.geometry.unary_union  # Combine all geometries into one
    
    # Get the bounds of the input GeoDataFrame
    minx, miny, maxx, maxy = country_geometry.bounds
    
    # Step 3: Adjust bounding box to align with (0, 0)
    # Use approximate degrees per km for the middle of the country
    midpoint_lat = (miny + maxy) / 2
    
    country_grid_terrestre = create_country_grid_WGS84(world_terrestre, country_code,"color_code",grid_size_km=grid_size_km,midpoint_lat=midpoint_lat)
    nouveaux_noms_columns = {'cell_name': cle_geo}
    country_grid_terrestre = country_grid_terrestre.rename(columns=nouveaux_noms_columns)
    
    country_maritime = world_maritime[world_maritime["ISO_TER1"] == country_code]
    
    country_grid_maritime = gpd.GeoDataFrame()
    if not country_maritime.empty:
        country_grid_maritime = create_country_grid_WGS84(world_maritime, country_code,"ISO_TER1" ,grid_size_km=grid_size_km,midpoint_lat=midpoint_lat)
        nouveaux_noms_columns = {'cell_name': cle_geo}
        country_grid_maritime = country_grid_maritime.rename(columns=nouveaux_noms_columns)
    
    # Concatenate the two DataFrames
    combined_grid = pd.concat([country_grid_maritime, country_grid_terrestre])
    
    # Check for duplicates based on all columns
    duplicates = combined_grid[combined_grid.duplicated(subset=[cle_geo])]
    combined_grid_unique = combined_grid.drop_duplicates(subset=[cle_geo])

    # Sauvegarder le fichier grille
    country_grid_terrestre.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_terrestre_"+str(grid_size_km)+"km.geojson", 
                                 driver="GeoJSON")
    if not country_grid_maritime.empty:
        country_grid_maritime.to_file(
            r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_" + country_name.replace(' ', '_') + "\SIG\country_grid_maritime_" + str(grid_size_km) + "km.geojson", 
            driver="GeoJSON"
        )
    combined_grid_unique.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_combined_"+str(grid_size_km)+"km.geojson", 
                                 driver="GeoJSON")
    
    ### Importer les données de biodiversité du pays ###
    
    cle_geo="codeMaille"+str(grid_size_km)+'Km'
    
    fichier = "C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')+"/Raw/extractGBIF_"+country_name.replace(' ', '_')+"_12112024.csv"  # Remplace par le chemin vers ton fichier
    
    colonnes_a_importer=['kingdom','phylum','class','order','family','genus','species','verbatimScientificName',
                         'taxonRank','countryCode','occurrenceStatus', 'individualCount', 'decimalLongitude','decimalLatitude','eventDate','taxonKey','speciesKey','occurrenceID', 'year'
                         ]
    
    df_biodiv = pd.read_csv(
        fichier, 
        sep='\t', 
        on_bad_lines='skip',  # Skip problematic lines
        quoting=3,            # Ignore quotes
        usecols=colonnes_a_importer
        )
    
    df_biodiv[cle_ID] = pd.to_numeric(df_biodiv[cle_ID], errors='coerce')
    
    # Associer chaque observation à une maille de la grille
    
    # Assurez-vous que 'decimalLongitude' et 'decimalLatitude' sont des colonnes numériques
    df_biodiv['decimalLongitude'] = pd.to_numeric(df_biodiv['decimalLongitude'], errors='coerce')
    df_biodiv['decimalLatitude'] = pd.to_numeric(df_biodiv['decimalLatitude'], errors='coerce')
    
    # Filtrer les lignes où les coordonnées sont NaN
    df_biodiv = df_biodiv.dropna(subset=['decimalLongitude', 'decimalLatitude',cle_ID]).reset_index(drop=True)
    
    # Ajouter le grid aux points
    df_biodiv_with_grid = add_grid_to_country(df_biodiv, combined_grid_unique,cle_geo)
    
    # Pré-traitement des données
    
    # Lire uniquement les colonnes spécifiées du fichier dans un DataFrame
    df=df_biodiv_with_grid.copy()
    
    cle_date='year'
    
    colonnes_obligatoires=[cle_ID]
    # Supprimer les lignes ou une donnée cruciale manque
    df_cleaned = df.dropna(subset=colonnes_obligatoires)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    #valid_ranks = ['SPECIES', 'SUBSPECIES', 'VARIETY']
    #df_cleaned = df_cleaned[df_cleaned['taxonRank'].isin(valid_ranks)].reset_index(drop=True)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    df_cleaned = df_cleaned[df_cleaned['occurrenceStatus']=='PRESENT'].reset_index(drop=True)
    
    df_cleaned[cle_ID] = df_cleaned[cle_ID].astype(int)
    
    df_cleaned.rename(columns={'grid_name': cle_geo}, inplace=True)
    
    df_cleaned['individualCount'] = df_cleaned['individualCount'].fillna(1)
    
    # Convertir 'individualCount' en numérique, en forçant les erreurs à NaN
    df_cleaned['individualCount'] = pd.to_numeric(df_cleaned['individualCount'], errors='coerce')
    
    # Remplacer les NaN par 1
    df_cleaned['individualCount'].fillna(1, inplace=True)
    
    # Grouper les données par maille et par période 
    
    df_maille_espece=formater_maille_espece_GBIF(df_cleaned,cle_geo,cle_ID,annee_mini,bornes_temporelles)
    
    dico_noms_vernaculaires_merged = pd.read_csv(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\TAXO_GBIF\dico_noms_vernaculaires_merged.csv")
    
    df_maille_espece=pd.merge(df_maille_espece,dico_noms_vernaculaires_merged,on=cle_ID,how="left")
    
    # Sauvegarder les données groupées
    
    colonnes_a_conserver = ['kingdom','phylum','class','order','family','genus','species',
                         'nombreObs',cle_ID,cle_geo,'periode','taxonRank','vernacularName_fr','vernacularName_en','occurrenceID']
    df_to_save=df_maille_espece[colonnes_a_conserver]
    df_to_save.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                      +"/data_GBIF_"+country_name.replace(' ', '_')+'_'+cle_geo+"_periodes.csv", index=False)

  country_grid_terrestre.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_terrestre_"+str(grid_size_km)+"km.geojson",
  r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_" + country_name.replace(' ', '_') + "\SIG\country_grid_maritime_" + str(grid_size_km) + "km.geojson",
  combined_grid_unique.to_file(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_"+country_name.replace(' ', '_')+"\SIG\country_grid_combined_"+str(grid_size_km)+"km.geojson",


In [44]:
grid_size_km=[10,20]

for size_km in grid_size_km:
    countries=['Madagascar'] #20

    for country in countries:
        print(country)
        formater_data_gbif_all_in_one(country,size_km,annee_mini=0,bornes_temporelles=[1800, 1990,2010, 2024])

Madagascar


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    653149
Période 2: 1991 à 2010    535246
Période 1: 1801 à 1990    222796
Name: count, dtype: int64
Madagascar


  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
  country_geometry = country.geometry.unary_union  # Combine all geometries into one
  grid = grid[grid.intersects(country.unary_union)]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['individualCount'].fillna(1, inplace=True)


periode
Période 3: 2011 à 2024    653149
Période 2: 1991 à 2010    535246
Période 1: 1801 à 1990    222796
Name: count, dtype: int64


In [None]:
cle_ID='speciesKey'
grid_size_km=20
cle_geo="codeMaille"+str(grid_size_km)+'Km'
country_name="France"
fichier=r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France\Raw\extractGBIF_France_12112024.csv"
# Taille de chaque paquet
lines_per_chunk = 10000000

colonnes_a_importer=['kingdom','phylum','class','order','family','genus','species','verbatimScientificName',
                     'taxonRank','countryCode','occurrenceStatus', 'individualCount', 'decimalLongitude','decimalLatitude','eventDate','taxonKey','speciesKey','occurrenceID', 'year']
# Lecture et découpage du fichier
chunk_number = 0

for df_biodiv in pd.read_csv(fichier, sep='\t', chunksize=lines_per_chunk, on_bad_lines='skip', usecols=colonnes_a_importer):
    chunk_number += 1
    # Assurez-vous que 'decimalLongitude' et 'decimalLatitude' sont des colonnes numériques
    df_biodiv['decimalLongitude'] = pd.to_numeric(df_biodiv['decimalLongitude'], errors='coerce')
    df_biodiv['decimalLatitude'] = pd.to_numeric(df_biodiv['decimalLatitude'], errors='coerce')
    df_biodiv[cle_ID] = pd.to_numeric(df_biodiv[cle_ID], errors='coerce')
    # Filtrer les lignes où les coordonnées sont NaN
    df_biodiv = df_biodiv.dropna(subset=['decimalLongitude', 'decimalLatitude',cle_ID]).reset_index(drop=True)
    
    # Ajouter le grid aux points
    df_biodiv_with_grid = add_grid_to_country(df_biodiv, combined_grid_unique,cle_geo)
    
    # Afficher les premières lignes
    df_biodiv_with_grid.head()

    # Pré-traitement des données
    # Lire uniquement les colonnes spécifiées du fichier dans un DataFrame
    df=df_biodiv_with_grid.copy()
    
    cle_date='year'

    n_especes_entrée=len(df[cle_ID].unique())
    n_obs_entrée=len(df)
    print("En entrée : nombre d'espèces observées :",n_especes_entrée)
    print("En entrée : nombre d'obs :",n_obs_entrée)
    
    colonnes_obligatoires=[cle_ID]
    # Supprimer les lignes ou une donnée cruciale manque
    df_cleaned = df.dropna(subset=colonnes_obligatoires)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    #valid_ranks = ['SPECIES', 'SUBSPECIES', 'VARIETY']
    #df_cleaned = df_cleaned[df_cleaned['taxonRank'].isin(valid_ranks)].reset_index(drop=True)
    
    # Filtrer les lignes où 'taxonRank' est SPECIES, SUBSPECIES ou VARIETY
    df_cleaned = df_cleaned[df_cleaned['occurrenceStatus']=='PRESENT'].reset_index(drop=True)
    
    df_cleaned[cle_ID] = df_cleaned[cle_ID].astype(int)
    
    df_cleaned.rename(columns={'grid_name': cle_geo}, inplace=True)
    
    df_cleaned['individualCount'] = df_cleaned['individualCount'].fillna(1)
    
    # Ajouter des colonnes year, month, day et day_of_year
    #df_cleaned[cle_date] = pd.to_datetime(df_cleaned[cle_date], utc=True, errors='coerce')
    
    # Now safely access .dt.year, .dt.month, etc., from df_cleaned
    #df_cleaned['year'] = df_cleaned[cle_date].dt.year
    
    # Convertir 'individualCount' en numérique, en forçant les erreurs à NaN
    df_cleaned['individualCount'] = pd.to_numeric(df_cleaned['individualCount'], errors='coerce')
    
    # Remplacer les NaN par 1
    df_cleaned['individualCount'].fillna(1, inplace=True)
    
    
    print(f"En sortie : nombre d'espèces observées :{len(df_cleaned[cle_ID].unique())} soit une perte de {100-(round(len(df_cleaned[cle_ID].unique())/n_especes_entrée*100))}%")
    print(f"En sortie : nombre d'obs :{len(df_cleaned)} soit une perte de {100-round(len(df_cleaned)/n_obs_entrée*100)} %") 
    
    # Grouper les données par maille et par période 
    annee_mini=0
    bornes_temporelles=[1800, 1990,2010, 2024] 
    
    df_maille_espece=formater_maille_espece_GBIF(df_cleaned,cle_geo,cle_ID,annee_mini,bornes_temporelles)
    
    print(f"En sortie : nombre d'obs :{len(df_cleaned[df_cleaned['year']>annee_mini])} soit une perte de {100-round(len(df_cleaned[df_cleaned['year']>annee_mini])/n_obs_entrée*100)} %") 
    
    dico_noms_vernaculaires_merged = pd.read_csv(r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\TAXO_GBIF\dico_noms_vernaculaires_merged.csv")
    
    df_maille_espece=pd.merge(df_maille_espece,dico_noms_vernaculaires_merged,on=cle_ID,how="left")
    
    # Sauvegarder les données groupées
    
    colonnes_a_conserver = ['kingdom','phylum','class','order','family','genus','species',
                         'nombreObs',cle_ID,cle_geo,'periode','taxonRank','vernacularName_fr','vernacularName_en','occurrenceID']
    df_to_save=df_maille_espece[colonnes_a_conserver]
    df_to_save.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                      +"/data_GBIF_"+country_name.replace(' ', '_')+'_'+cle_geo+"_periodes_"+str(chunk_number)+".csv", index=False)

In [None]:
cle_ID='speciesKey'
grid_size_km=20
cle_geo="codeMaille"+str(grid_size_km)+'Km'
country_name="France"

path=r"C:\Users\anormand\Documents\Projet Python\Biodiv\Data\GBIF_France"
df=pd.DataFrame()
for i in range(1,21):
    print(str(i))
    df_temp=pd.read_csv(path+"\data_GBIF_France_"+cle_geo+"_periodes_"+str(i)+".csv")
    df=pd.concat([df,df_temp],ignore_index=True)
                            

df_maille_espece = df.groupby([cle_geo, cle_ID,'periode'], observed=True)['nombreObs'].sum().reset_index(name='nombreObs')

df_dico=generer_dictionnaire_taxonomie(df,cle_ID)

df_maille_espece=pd.merge(df_maille_espece,df_dico,on=cle_ID)

df_maille_espece.to_csv("C:/Users/anormand/Documents/Projet Python/Biodiv/Data/GBIF_"+country_name.replace(' ', '_')
                  +"/data_GBIF_"+country_name.replace(' ', '_')+'_'+cle_geo+"_periodes.csv", index=False)