In [57]:
import pandas as pd
import numpy as np
from config import SOM_DATA_URL, AWC_DATA_URL
from functools import reduce

In [58]:
departements = { #set a dict of departement name as keys and dep code as value for missing dep values in datasets
    'Ain': 1,
    'Aisne': 2,
    'Allier': 3,
    'Alpes-de-Haute-Provence': 4,
    'Hautes-Alpes': 5,
    'Alpes-Maritimes': 6,
    'Ardèche': 7,
    'Ardennes': 8,
    'Ariège': 9,
    'Aube': 10,
    'Aude': 11,
    'Aveyron': 12,
    'Bouches-du-Rhône': 13,
    'Calvados': 14,
    'Cantal': 15,
    'Charente': 16,
    'Charente-Maritime': 17,
    'Cher': 18,
    'Corrèze': 19,
    'Corse': 20,
    "Côte-d'Or": 21,
    "Côtes-d'Armor": 22,
    'Creuse': 23,
    'Dordogne': 24,
    'Doubs': 25,
    'Drôme': 26,
    'Eure': 27,
    'Eure-et-Loir': 28,
    'Finistère': 29,
    'Gard': 30,
    'Haute-Garonne': 31,
    'Gers': 32,
    'Gironde': 33,
    'Hérault': 34,
    'Ille-et-Vilaine': 35,
    'Indre': 36,
    'Indre-et-Loire': 37,
    'Isère': 38,
    'Jura': 39,
    'Landes': 40,
    'Loir-et-Cher': 41,
    'Loire': 42,
    'Haute-Loire': 43,
    'Loire-Atlantique': 44,
    'Loiret': 45,
    'Lot': 46,
    'Lot-et-Garonne': 47,
    'Lozère': 48,
    'Maine-et-Loire': 49,
    'Manche': 50,
    'Marne': 51,
    'Haute-Marne': 52,
    'Mayenne': 53,
    'Meurthe-et-Moselle': 54,
    'Meuse': 55,
    'Morbihan': 56,
    'Moselle': 57,
    'Nièvre': 58,
    'Nord': 59,
    'Oise': 60,
    'Orne': 61,
    'Pas-de-Calais': 62,
    'Puy-de-Dôme': 63,
    'Pyrénées-Atlantiques': 64,
    'Hautes-Pyrénées': 65,
    'Pyrénées-Orientales': 66,
    'Bas-Rhin': 67,
    'Haut-Rhin': 68,
    'Rhône': 69,
    'Haute-Saône': 70,
    'Saône-et-Loire': 71,
    'Sarthe': 72,
    'Savoie': 73,
    'Haute-Savoie': 74,
    'Paris': 75,
    'Seine-Maritime': 76,
    'Seine-et-Marne': 77,
    'Yvelines': 78,
    'Deux-Sèvres': 79,
    'Somme': 80,
    'Tarn': 81,
    'Tarn-et-Garonne': 82,
    'Var': 83,
    'Vaucluse': 84,
    'Vendée': 85,
    'Vienne': 86,
    'Haute-Vienne': 87,
    'Vosges': 88,
    'Yonne': 89,
    'Territoire de Belfort': 90,
    'Essonne': 91,
    'Hauts-de-Seine': 92,
    'Seine-Saint-Denis': 93,
    'Val-de-Marne': 94,
    "Val-d'Oise": 95
}

In [59]:
#load historical mean of data for month we can't predict (we cant predict EVI data, we cant predict weather for 9 month in future etc) so we remplace prediction with mean historical values
hist_temp = pd.read_csv("temp_precip_mean_historical.csv")
hist_evi = pd.read_csv("evi_mean_historical.csv")
hist_vpd = pd.read_csv("vpd_mean_historical.csv")

#load current market year data, without data from current month
current_weather_vpd = pd.read_csv("temp_precip_vpd_mean_current.csv")
current_evi = pd.read_csv("evi_mean_current.csv")

#load forecast data
forecast_weather_vpd = pd.read_csv("weather_forecast.csv")

#load available water capacity and soil organic matter
som_pom = pd.read_json(f"{SOM_DATA_URL}/pom.json") #read json
som_pom['DEP'] = som_pom['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
som_pom = som_pom.drop("nom", axis=1) #remove unwanted dep name -> final som pom data

maom_pom = pd.read_json(f"{SOM_DATA_URL}/maom.json") #read json
maom_pom['DEP'] = maom_pom['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
maom_pom = maom_pom.drop("nom", axis=1) #remove unwanted dep name -> final som maom data

awc = pd.read_json(f"{AWC_DATA_URL}/AWC.json") #read json
awc['DEP'] = awc['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
awc = awc.drop("nom", axis=1) #remove unwanted dep name -> final AWC data

In [60]:
hist_evi_month = np.setxor1d(hist_evi["MONTH"].unique(), current_evi["MONTH"].unique()) #get month that we do not have in current data 
hist_evi = hist_evi[hist_evi["MONTH"].isin(hist_evi_month)] #only keep month were we do not have the current data

forecast_weather_vpd_month = np.setxor1d(hist_temp["MONTH"].unique(), current_weather_vpd["MONTH"].unique()) #same for forecast data
forecast_weather_vpd = forecast_weather_vpd[forecast_weather_vpd["MONTH"].isin(forecast_weather_vpd_month)]

In [61]:
print(current_weather_vpd["MONTH"].unique()) #check current month we have
print(forecast_weather_vpd["MONTH"].unique()) #check forecast we have
print(hist_evi["MONTH"].unique()) #check monthly mean evi we need

[ 1  2  9 10 11 12]
[3 4 5 6 7 8]
[3 4 5 6 7 8]


In [62]:
# PIVOT CURRENT WEATHER AND VPD DATA
current_weather_vpd_pivot = current_weather_vpd.pivot_table( # Pivot table for month datas as columns and not rows
    index=['DEP'],
    columns='MONTH',
    values=['RR', 'TN', 'TX', 'TM', 'vpd_mean', 'vpd_min', 'vpd_max']
)

current_weather_vpd_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in current_weather_vpd_pivot.columns
]
current_weather_vpd_pivot = current_weather_vpd_pivot.reset_index()


# PIVOT FORECAST WEATHER AND VPD DATA
forecast_weather_vpd_pivot = forecast_weather_vpd.pivot_table( # Pivot table for month datas as columns and not rows
    index=['DEP'],
    columns='MONTH',
    values=['RR', 'TN', 'TX', 'TM', 'vpd_mean', 'vpd_min', 'vpd_max']
)

forecast_weather_vpd_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in forecast_weather_vpd_pivot.columns
]
forecast_weather_vpd_pivot = forecast_weather_vpd_pivot.reset_index()


# PIVOT CURRENT EVI DATA
current_evi_pivot = current_evi.pivot_table( # Pivot table for month datas as columns and not rows
    index=['DEP'],
    columns='MONTH',
    values=['evi']
)

current_evi_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in current_evi_pivot.columns
]
current_evi_pivot = current_evi_pivot.reset_index()


# PIVOT HISTORICAL EVI DATA
hist_evi_pivot = hist_evi.pivot_table( # Pivot table for month datas as columns and not rows
    index=['DEP'],
    columns='MONTH',
    values=['evi']
)

hist_evi_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in hist_evi_pivot.columns
]
hist_evi_pivot = hist_evi_pivot.reset_index()

In [63]:
dfs = [current_weather_vpd_pivot, forecast_weather_vpd_pivot, current_evi_pivot, hist_evi_pivot]

In [64]:
merged = reduce(lambda left, right: pd.merge(left, right, on="DEP", how="inner"), dfs) #use reduce to merge df1 and df2, then the result with df3, then df4... until there is no more df in the dfs list

#merge awc and som datas
merged = merged.merge(som_pom, on="DEP", how="left")
merged = merged.merge(maom_pom, on="DEP", how="left")
merged = merged.merge(awc, on="DEP", how="left")

In [65]:
merged.columns

Index(['DEP', 'RR1', 'RR2', 'RR9', 'RR10', 'RR11', 'RR12', 'TM1', 'TM2', 'TM9',
       'TM10', 'TM11', 'TM12', 'TN1', 'TN2', 'TN9', 'TN10', 'TN11', 'TN12',
       'TX1', 'TX2', 'TX9', 'TX10', 'TX11', 'TX12', 'vpd_max1', 'vpd_max2',
       'vpd_max9', 'vpd_max10', 'vpd_max11', 'vpd_max12', 'vpd_mean1',
       'vpd_mean2', 'vpd_mean9', 'vpd_mean10', 'vpd_mean11', 'vpd_mean12',
       'vpd_min1', 'vpd_min2', 'vpd_min9', 'vpd_min10', 'vpd_min11',
       'vpd_min12', 'RR3', 'RR4', 'RR5', 'RR6', 'RR7', 'RR8', 'TM3', 'TM4',
       'TM5', 'TM6', 'TM7', 'TM8', 'TN3', 'TN4', 'TN5', 'TN6', 'TN7', 'TN8',
       'TX3', 'TX4', 'TX5', 'TX6', 'TX7', 'TX8', 'vpd_max3', 'vpd_max4',
       'vpd_max5', 'vpd_max6', 'vpd_max7', 'vpd_max8', 'vpd_mean3',
       'vpd_mean4', 'vpd_mean5', 'vpd_mean6', 'vpd_mean7', 'vpd_mean8',
       'vpd_min3', 'vpd_min4', 'vpd_min5', 'vpd_min6', 'vpd_min7', 'vpd_min8',
       'evi1', 'evi2', 'evi9', 'evi10', 'evi11', 'evi12', 'evi3', 'evi4',
       'evi5', 'evi6', 'evi7', 'ev

In [66]:
#match training data column order
# match training data column order
merged = merged[["DEP","RR1", "RR2", "RR3", "RR4", "RR5", "RR6", "RR7", "RR8", "RR9", "RR10", "RR11", "RR12", 
                 "TM1", "TM2", "TM3", "TM4", "TM5", "TM6", "TM7", "TM8", "TM9", "TM10", "TM11", "TM12", 
                 "TN1", "TN2", "TN3", "TN4", "TN5", "TN6", "TN7", "TN8", "TN9", "TN10", "TN11", "TN12", 
                 "TX1", "TX2", "TX3", "TX4", "TX5", "TX6", "TX7", "TX8", "TX9", "TX10", "TX11", "TX12", 
                 "vpd_max1", "vpd_max2", "vpd_max3", "vpd_max4", "vpd_max5", "vpd_max6", "vpd_max7", "vpd_max8", "vpd_max9", "vpd_max10", "vpd_max11", "vpd_max12", 
                 "vpd_mean1", "vpd_mean2", "vpd_mean3", "vpd_mean4", "vpd_mean5", "vpd_mean6", "vpd_mean7", "vpd_mean8", "vpd_mean9", "vpd_mean10", "vpd_mean11", "vpd_mean12", 
                 "vpd_min1", "vpd_min2", "vpd_min3", "vpd_min4", "vpd_min5", "vpd_min6", "vpd_min7", "vpd_min8", "vpd_min9", "vpd_min10", "vpd_min11", "vpd_min12", 
                 "evi1", "evi2", "evi3", "evi4", "evi5", "evi6", "evi7", "evi8", "evi9", "evi10", "evi11", "evi12","pom","maom","awc"]]

In [73]:
merged['year'] = 2025
merged['yield'] = 0

In [74]:
merged = merged.dropna() #we have nan pom an maom for 75 Paris, there is no fields in Paris so remove them ok

In [75]:
dataset = pd.read_csv("../../wheat_model_dataset_1980_2018.csv")

In [76]:
dataset = pd.concat([dataset, merged])

In [77]:
dataset

Unnamed: 0,department,year,yield,area,production,DEP,MY,RR1,RR2,RR3,...,evi6,evi7,evi8,evi9,evi10,evi11,evi12,pom,maom,awc
0,Ain,1980,4.39805,36001.0,158334.3,1,1979/1980,3.672457,3.484615,4.661456,...,,,,,,,,16.789692,26.925089,0.107088
1,Ain,1981,4.30319,36075.0,155237.5,1,1980/1981,5.279117,2.298026,4.151613,...,,,,,,,,16.789692,26.925089,0.107088
2,Ain,1982,5.00000,38050.0,190250.0,1,1981/1982,3.686452,0.559464,3.402339,...,,,,,,,,16.789692,26.925089,0.107088
3,Ain,1983,3.29500,37924.0,124959.6,1,1982/1983,3.015110,3.857753,3.601241,...,,,,,,,,16.789692,26.925089,0.107088
4,Ain,1984,5.90000,40000.0,236000.0,1,1983/1984,5.722266,3.942845,3.163952,...,,,,,,,,16.789692,26.925089,0.107088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,,2025,0.00000,,,88,,138.311111,70.233333,41.246545,...,0.549811,0.508733,0.483523,0.496765,0.423578,0.366557,0.312296,19.340321,29.435806,0.112435
88,,2025,0.00000,,,89,,79.933333,45.300000,41.762829,...,0.552851,0.422868,0.382229,0.414454,0.374777,0.323499,0.261036,5.579002,19.594700,0.102762
89,,2025,0.00000,,,90,,102.700000,49.200000,46.007624,...,0.553085,0.525909,0.504568,0.503770,0.383430,0.314863,0.298320,16.129911,28.980818,0.105598
90,,2025,0.00000,,,91,,96.000000,42.200000,38.307286,...,0.526223,0.398204,0.348727,0.385912,0.351164,0.175606,0.257704,4.623960,16.732876,0.088218


In [78]:
dataset.to_csv("../../wheat_model_current.csv")