Objectif : utiliser des forêts aléatoires (random forests) afin de prédire l'évolution de la population de macareux moines (Atlantic puffins) en fonction du changement climatique, dans le temps et dans l'espace.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import s3fs 

1. Retrieving data from the MinIO object storage (SSP Cloud)

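Data on Atlantic puffins: CSV files stored under `diffusion/puffin_data` in the bucket  
Data on climate change: Copernicus Climate Data Store (CDS) files stored under `diffusion/cds_data` in the bucket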

In [8]:
# Connect to the SSP Cloud MinIO endpoint through its S3-compatible API.
# No credentials are passed explicitly; presumably s3fs resolves them from the
# environment — TODO confirm for the target runtime.
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"})
BUCKET = "esam"


def read_s3_csv(path):
    """Read a single CSV object from the MinIO filesystem into a DataFrame.

    Parameters
    ----------
    path : str
        Object path of the form "<bucket>/<key>".

    Returns
    -------
    pandas.DataFrame
        The parsed content of the CSV object.
    """
    with fs.open(path, "r") as f:
        return pd.read_csv(f)


# Puffin data
colonies_path = f"{BUCKET}/diffusion/puffin_data/puffin-data1_Colonies.csv"
colonies_data = read_s3_csv(colonies_path)

# NOTE(review): data_path is identical to colonies_path, so puffin_data is just
# a second copy of colonies_data. This looks like a copy-paste slip; point
# data_path at the intended puffin observation file once its name is confirmed.
data_path = f'{BUCKET}/diffusion/puffin_data/puffin-data1_Colonies.csv'
puffin_data = read_s3_csv(data_path)

# Copernicus data
files_in_folder = fs.ls(f'{BUCKET}/diffusion/cds_data')

# Only parse real CSV objects. The folder listing is not guaranteed to contain
# data files only: feeding a non-CSV object (for example an API configuration
# file) to read_csv silently yields a meaningless one-column frame and can
# expose its content in the cell output.
csv_files = [path for path in files_in_folder if path.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(
        f"No .csv file found under {BUCKET}/diffusion/cds_data "
        f"(objects listed: {files_in_folder})"
    )

dataframes = [read_s3_csv(path) for path in csv_files]
combined_cds = pd.concat(dataframes, ignore_index=True)

print(combined_cds.head())
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
combined_cds.info()




      url: https://cds.climate.copernicus.eu/api
0      key: 4a3c4e19-b15f-41a9-8374-adf4f8ee3fb3
1      key: 4a3c4e19-b15f-41a9-8374-adf4f8ee3fb3
2      key: 4a3c4e19-b15f-41a9-8374-adf4f8ee3fb3
3      key: 4a3c4e19-b15f-41a9-8374-adf4f8ee3fb3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column                                          Non-Null Count  Dtype 
---  ------                                          --------------  ----- 
 0       url: https://cds.climate.copernicus.eu/api  4 non-null      object
dtypes: object(1)
memory usage: 164.0+ bytes
None


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# 1. Load the data
# NOTE(review): this reads a local file rather than the frames fetched from
# MinIO in the earlier cell — confirm which source is intended.
data = pd.read_csv('puffins_data.csv')

# 2. Define the explanatory variables and the target
FEATURE_COLUMNS = ['Latitude', 'Longitude', 'SST_Janvier', 'SST_Juin', 'Année']
TARGET_COLUMN = 'Observations_Puffins'

# Fail early with an explicit message if the input file lacks a required column.
missing_columns = [
    column for column in FEATURE_COLUMNS + [TARGET_COLUMN]
    if column not in data.columns
]
if missing_columns:
    raise KeyError(f"Missing columns in puffins_data.csv: {missing_columns}")

X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

# 3. Split the data into train and test sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Initialise and train the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 5. Predict on the held-out set and evaluate the model
y_pred = rf_model.predict(X_test)
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE gives
# the same RMSE on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse:.2f}')
print(f'R²: {r2:.2f}')

# 6. Inspect the feature importances
importances = rf_model.feature_importances_
features = X.columns

plt.figure(figsize=(8, 6))
plt.barh(features, importances)
plt.xlabel('Importance')
plt.ylabel('Variables')
plt.title('Importance des variables dans le modèle Random Forest')
plt.show()

# 7. Predictions on new inputs
new_data = pd.DataFrame({
    'Latitude': [65.0, 66.5],
    'Longitude': [-18.0, -17.5],
    'SST_Janvier': [5.2, 4.8],
    'SST_Juin': [8.1, 7.9],
    'Année': [2015, 2016]
})

# Select the columns through FEATURE_COLUMNS so the prediction input always has
# the same columns, in the same order, as the training matrix.
predictions = rf_model.predict(new_data[FEATURE_COLUMNS])
print(f'Prédictions : {predictions}')
