In [1]:
import pandas as pd
import numpy as np
from config import *

In [2]:
def get_market_year(row): #make a market year row in datas
    year = row['DATE'].year
    month = row['DATE'].month
    if month >= 9:
        market_year = f"{year}/{year + 1}"
    else:
        market_year = f"{year - 1}/{year}"
    return market_year

In [3]:
departements = { #set a dict of departement name as keys and dep code as value for missing dep values in datasets
    'Ain': 1,
    'Aisne': 2,
    'Allier': 3,
    'Alpes-de-Haute-Provence': 4,
    'Hautes-Alpes': 5,
    'Alpes-Maritimes': 6,
    'Ardèche': 7,
    'Ardennes': 8,
    'Ariège': 9,
    'Aube': 10,
    'Aude': 11,
    'Aveyron': 12,
    'Bouches-du-Rhône': 13,
    'Calvados': 14,
    'Cantal': 15,
    'Charente': 16,
    'Charente-Maritime': 17,
    'Cher': 18,
    'Corrèze': 19,
    'Corse': 20,
    "Côte-d'Or": 21,
    "Côtes-d'Armor": 22,
    'Creuse': 23,
    'Dordogne': 24,
    'Doubs': 25,
    'Drôme': 26,
    'Eure': 27,
    'Eure-et-Loir': 28,
    'Finistère': 29,
    'Gard': 30,
    'Haute-Garonne': 31,
    'Gers': 32,
    'Gironde': 33,
    'Hérault': 34,
    'Ille-et-Vilaine': 35,
    'Indre': 36,
    'Indre-et-Loire': 37,
    'Isère': 38,
    'Jura': 39,
    'Landes': 40,
    'Loir-et-Cher': 41,
    'Loire': 42,
    'Haute-Loire': 43,
    'Loire-Atlantique': 44,
    'Loiret': 45,
    'Lot': 46,
    'Lot-et-Garonne': 47,
    'Lozère': 48,
    'Maine-et-Loire': 49,
    'Manche': 50,
    'Marne': 51,
    'Haute-Marne': 52,
    'Mayenne': 53,
    'Meurthe-et-Moselle': 54,
    'Meuse': 55,
    'Morbihan': 56,
    'Moselle': 57,
    'Nièvre': 58,
    'Nord': 59,
    'Oise': 60,
    'Orne': 61,
    'Pas-de-Calais': 62,
    'Puy-de-Dôme': 63,
    'Pyrénées-Atlantiques': 64,
    'Hautes-Pyrénées': 65,
    'Pyrénées-Orientales': 66,
    'Bas-Rhin': 67,
    'Haut-Rhin': 68,
    'Rhône': 69,
    'Haute-Saône': 70,
    'Saône-et-Loire': 71,
    'Sarthe': 72,
    'Savoie': 73,
    'Haute-Savoie': 74,
    'Paris': 75,
    'Seine-Maritime': 76,
    'Seine-et-Marne': 77,
    'Yvelines': 78,
    'Deux-Sèvres': 79,
    'Somme': 80,
    'Tarn': 81,
    'Tarn-et-Garonne': 82,
    'Var': 83,
    'Vaucluse': 84,
    'Vendée': 85,
    'Vienne': 86,
    'Haute-Vienne': 87,
    'Vosges': 88,
    'Yonne': 89,
    'Territoire de Belfort': 90,
    'Essonne': 91,
    'Hauts-de-Seine': 92,
    'Seine-Saint-Denis': 93,
    'Val-de-Marne': 94,
    "Val-d'Oise": 95
}

In [4]:
hist_weather_raw = pd.read_csv(f"{WEATHER_DATA_URL}/raw_rr_tn_tx_tm-1950-2023.csv").drop(["Unnamed: 0"], axis=1) #read raw weather data
current_weather_raw = pd.read_csv(f"{WEATHER_DATA_URL}/raw_rr_tn_tx_tm-current.csv").drop(["Unnamed: 0"], axis=1)
raw_weather = pd.concat([hist_weather_raw, current_weather_raw])

raw_weather = raw_weather.groupby(["DATE","DEP"]).mean().reset_index()
raw_weather["DATE"] = pd.to_datetime(raw_weather["DATE"])
raw_weather = raw_weather[raw_weather["DATE"] < raw_weather["DATE"].iloc[-1]] #remove last date of data, missing values
weather_temp = raw_weather[["DATE", "DEP", "TN", "TX", "TM"]].groupby('DEP').resample(rule="ME", on="DATE").mean().drop("DEP", axis=1).reset_index()
weather_precip = raw_weather[["DATE", "DEP", "RR"]].groupby('DEP').resample(rule="ME", on="DATE").sum().drop("DEP", axis=1).reset_index()
weather = pd.merge(weather_temp, weather_precip, on=["DATE", "DEP"], how="inner").reset_index(drop=True)
weather["MY"] = weather.apply(get_market_year, axis=1)
weather["MONTH"] = weather["DATE"].dt.month
#remove years where there is no full data for a market year
weather = weather[weather["DATE"] >= "1981-09-30"]
weather = weather[weather["MY"] != "2024/2025"] #remove current MY
weather = weather[weather["DEP"] != 92] # we remove this dep (next to paris so its ok) because we have NaNs data for temp
weather.loc[weather["TM"].isna(), "TM"] = (weather["TX"] + weather["TN"]) / 2 #we calculute a mean value of min temp and max temp for missing values of mean temp

In [5]:
weather_pivot = weather.pivot_table( # Pivot table for month datas as columns and not rows
    index=['MY', 'DEP'],
    columns="MONTH",
    values=['RR', 'TN', 'TX', 'TM']
)
weather_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in weather_pivot.columns
]
weather_pivot = weather_pivot.reset_index()

In [6]:
raw_vpd = pd.read_csv(f"{VPD_DATA_URL}/ERA5/1940-2025_vpd.csv").drop(["Unnamed: 0", "Unnamed: 0.1", "departement"], axis=1)
raw_vpd = raw_vpd.rename(columns={"date":"DATE", "dep":"DEP"})
raw_vpd["DATE"] = pd.to_datetime(raw_vpd["DATE"])
raw_vpd = raw_vpd.groupby('DEP').resample(rule="ME", on="DATE").mean().drop("DEP", axis=1).reset_index()
raw_vpd["MY"] = raw_vpd.apply(get_market_year, axis=1)
raw_vpd["MONTH"] = raw_vpd["DATE"].dt.month
#remove years where there is no full data for a market year
raw_vpd = raw_vpd[(raw_vpd["MY"] != "1939/1940") & (raw_vpd["MY"] != "2024/2025")]
raw_vpd = raw_vpd[~raw_vpd["DEP"].isin([75, 92, 93])] #remove dep without data (to small of a departement)

In [7]:
vpd_pivot = raw_vpd.pivot_table( # Pivot table for month datas as columns and not rows
    index=['MY', 'DEP'],
    columns="MONTH",
    values=['vpd_max', 'vpd_mean', 'vpd_min']
)
vpd_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in vpd_pivot.columns
]
vpd_pivot = vpd_pivot.reset_index()

In [8]:
raw_ndvi = pd.read_csv(f"{NDVI_DATA_URL}/ndvi_processed.csv").drop(["Unnamed: 0", "departement"], axis=1)
raw_ndvi["date"] = pd.to_datetime(raw_ndvi["date"])
raw_ndvi = raw_ndvi.groupby('dep').resample(rule="ME", on="date").mean().drop("dep", axis=1)
raw_ndvi = raw_ndvi.reset_index()
raw_ndvi = raw_ndvi.rename(columns={"date":"DATE", "dep":"DEP"})
raw_ndvi["MY"] = raw_ndvi.apply(get_market_year, axis=1)
#remove years where there is no full data for a market year
raw_ndvi = raw_ndvi[(raw_ndvi["MY"] != "1980/1981") & (raw_ndvi["MY"] != "2024/2025")] 
raw_ndvi["MONTH"] = raw_ndvi["DATE"].dt.month
raw_ndvi = raw_ndvi[~raw_ndvi["DEP"].isin([75,93])] #remove dep 75 and 93 as we have no data for these dep
raw_ndvi = raw_ndvi.ffill() #we still have nan values for only the last 3 month of 1994. I decided to fill them with the last value, meaning the ndvi value of september

In [9]:
ndvi_pivot = raw_ndvi.pivot_table( # Pivot table for month datas as columns and not rows
    index=['MY', 'DEP'],
    columns="MONTH",
    values=['ndvi_mean']
)
ndvi_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in ndvi_pivot.columns
]
ndvi_pivot = ndvi_pivot.reset_index()

1st dataset (1900 - 2018)

In [10]:
yields_raw = pd.read_csv(f"{YIELDS_DATA_URL}/2021-001_Schauberger-et-al_Data_FILTERED/wheat_total_data_1900-2018_FILTERED.txt", sep=';')

#aggregate Corse du sud and Haute Corse into one Corse department :
corseSud = yields_raw[yields_raw['department'] == "Corse-du-sud"].fillna(0)
corseHaute = yields_raw[yields_raw['department'] == "Haute-Corse"].fillna(0)
corseArea = corseSud["area"].reset_index(drop=True) + corseHaute["area"].reset_index(drop=True)
corseProd = corseSud["production"].reset_index(drop=True) + corseHaute["production"].reset_index(drop=True)
corseYield = (corseSud["production"].reset_index(drop=True) + corseHaute["production"].reset_index(drop=True)) / (corseSud["area"].reset_index(drop=True) + corseHaute["area"].reset_index(drop=True))
corseYears = pd.Series(range(1900, 2019))
corse = pd.DataFrame({"department": "Corse", "year": corseYears, "yield": corseYield, "area": corseArea, "production": corseProd})
yields_raw = pd.concat([yields_raw, corse])
yields_raw = yields_raw[(yields_raw['department'] != "Corse-du-sud") & (yields_raw['department'] != "Haute-Corse")]

yields_raw['DEP'] = yields_raw['department'].map(departements).replace('NA', np.nan) #map dep name to dep code
yields_raw = yields_raw.drop("department", axis=1)
yields_raw["MY"] = (yields_raw['year'] - 1).astype(str) + '/' + yields_raw['year'].astype(str) 
oldYields = yields_raw.copy()

2nd dataset (2000 - 2024)

In [11]:
yields_raw = pd.read_csv(f"{YIELDS_DATA_URL}/SCR-GRC-hist_dep_surface_prod_cult_cer-A25.csv", encoding='utf-8')
yields_raw["ESPECES"] = yields_raw["ESPECES"].str.strip() #remove left and white spaces
yields_raw = yields_raw[yields_raw["ESPECES"] == "Blé tendre"].reset_index(drop=True) #filter soft wheat

#Corse
corseSud = yields_raw[yields_raw["DEP"] == "2A"]
corseSud = pd.concat([pd.DataFrame({'ANNEE': list(range(2000, 2016))}), corseSud], ignore_index=True).fillna(0)
corseHaute = yields_raw[yields_raw['DEP'] == "2B"].fillna(0)
corseArea = corseSud["CULT_SURF"].reset_index(drop=True) + corseHaute["CULT_SURF"].reset_index(drop=True)
corseProd = corseSud["CULT_PROD"].reset_index(drop=True) + corseHaute["CULT_PROD"].reset_index(drop=True)
corseYield = (corseSud["CULT_PROD"].reset_index(drop=True) + corseHaute["CULT_PROD"].reset_index(drop=True)) / (corseSud["CULT_SURF"].reset_index(drop=True) + corseHaute["CULT_SURF"].reset_index(drop=True))
corseYears = pd.Series(range(2000, 2026))
corse = pd.DataFrame({"DEPARTEMENT": "Corse", "DEP": 20, "ANNEE": corseYears, "CULT_REND": corseYield, "CULT_SURF": corseArea, "CULT_PROD": corseProd})
yields_raw = pd.concat([yields_raw, corse])
yields_raw = yields_raw[(yields_raw['DEP'] != "2A") & (yields_raw['DEP'] != "2B")]

yields_raw["DEP"] = yields_raw["DEP"].astype(int)
yields = yields_raw[["ANNEE", "DEP", "CULT_SURF", "CULT_REND", "CULT_PROD"]] #keep only wanted data
yields["MY"] = (yields['ANNEE'] - 1).astype(str) + '/' + yields['ANNEE'].astype(str) 
yields = yields.rename(columns={"ANNEE": "year", "CULT_SURF": "area", "CULT_REND": "yield", "CULT_PROD": "production"})
yields['yield'] = yields['yield'] / 10 #convert to kg/ha
newYields = yields[yields["year"] <= 2024] #-> final yield data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yields["MY"] = (yields['ANNEE'] - 1).astype(str) + '/' + yields['ANNEE'].astype(str)


Concat both yields datasets

In [12]:
newYields = newYields[newYields['year'] >= 2019]
yields = pd.concat([oldYields, newYields])
yields = yields.dropna().reset_index(drop=True)
yields

Unnamed: 0,year,yield,area,production,DEP,MY
0,1900,1.44401,95230.0,137513.0,1,1899/1900
1,1901,1.02637,92300.0,94734.0,1,1900/1901
2,1902,0.99449,93150.0,92636.4,1,1901/1902
3,1903,1.20303,92714.0,111537.7,1,1902/1903
4,1904,0.98300,89094.0,87579.4,1,1903/1904
...,...,...,...,...,...,...
10835,2020,0.35000,72.0,252.0,20,2019/2020
10836,2021,0.65000,99.0,643.5,20,2020/2021
10837,2022,0.65000,83.0,539.5,20,2021/2022
10838,2023,0.40000,63.0,252.0,20,2022/2023


In [13]:
som_pom = pd.read_json(f"{SOM_DATA_URL}/pom.json") #read json
som_pom['DEP'] = som_pom['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
som_pom = som_pom.drop("nom", axis=1) #remove unwanted dep name -> final som pom data

In [14]:
maom_pom = pd.read_json(f"{SOM_DATA_URL}/maom.json") #read json
maom_pom['DEP'] = maom_pom['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
maom_pom = maom_pom.drop("nom", axis=1) #remove unwanted dep name -> final som maom data

In [15]:
awc = pd.read_json(f"{AWC_DATA_URL}/AWC.json") #read json
awc['DEP'] = awc['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
awc = awc.drop("nom", axis=1) #remove unwanted dep name -> final AWC data

In [25]:
weather_vpd = pd.merge(weather_pivot, vpd_pivot, on=['DEP', 'MY'], how="inner")
weather_vpd_ndvi = pd.merge(weather_vpd, ndvi_pivot, on=['DEP', 'MY'], how="inner")
weather_vpd_ndvi_yields = pd.merge(weather_vpd_ndvi, yields, on=['DEP', 'MY'], how="inner")

som_maom = pd.merge(som_pom, maom_pom, on="DEP", how='inner')
som_maom_awc = pd.merge(som_maom, awc, on="DEP", how='inner')

result = pd.merge(weather_vpd_ndvi_yields, som_maom_awc, on="DEP", how="inner")

In [28]:
cols_to_front = ['year', 'yield', 'area', 'production']
remaining_cols = [col for col in result.columns if col not in cols_to_front]
new_order = cols_to_front + remaining_cols
result = result[new_order]

In [29]:
result

Unnamed: 0,year,yield,area,production,MY,DEP,RR1,RR2,RR3,RR4,...,ndvi_mean6,ndvi_mean7,ndvi_mean8,ndvi_mean9,ndvi_mean10,ndvi_mean11,ndvi_mean12,pom,maom,awc
0,1982,5.00000,38050.0,190250.0,1981/1982,1,114.280000,15.665000,105.472500,20.574359,...,0.296783,0.361438,0.318044,0.196142,0.176611,0.180859,0.023504,16.789692,26.925089,0.107088
1,1982,6.44000,175000.0,1127000.0,1981/1982,2,73.117742,18.958065,66.470968,24.843548,...,0.275896,0.296582,0.167389,0.224620,0.101299,0.091185,0.029490,5.252623,18.222063,0.095255
2,1982,4.51238,45564.0,205602.0,1981/1982,3,86.126087,18.006522,92.515217,12.595652,...,0.339788,0.344192,0.289400,0.229686,0.148436,0.260774,0.028602,6.453841,21.526911,0.089040
3,1982,2.62370,13517.0,35464.5,1981/1982,4,19.454286,26.475000,34.014286,26.823529,...,0.249519,0.292760,0.269123,0.190389,0.252101,0.235063,0.077243,16.788250,28.354742,0.118781
4,1982,3.80000,5500.0,20900.0,1981/1982,5,52.214372,13.919872,44.737244,8.650611,...,0.221391,0.303296,0.260093,0.172809,0.198091,0.199589,0.029373,29.324879,31.984188,0.119110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3802,2024,5.30000,106900.0,566570.0,2023/2024,89,63.940000,89.925000,123.140000,55.930000,...,0.306480,0.293551,0.411035,0.375566,0.263226,0.165845,0.097976,5.579002,19.594700,0.102762
3803,2024,5.30000,2600.0,13780.0,2023/2024,90,142.257143,120.200000,109.228571,106.085714,...,0.333060,0.508806,0.717539,0.620807,0.326133,0.148563,0.138120,16.129911,28.980818,0.105598
3804,2024,6.20000,24714.0,153226.8,2023/2024,91,57.166667,76.000000,86.633333,52.966667,...,0.318178,0.317884,0.460187,0.390205,0.316535,0.189023,0.115690,4.623960,16.732876,0.088218
3805,2024,6.20000,227.0,1407.4,2023/2024,94,66.700000,92.800000,110.700000,72.900000,...,0.253583,0.262074,0.399626,0.340669,0.297460,0.139963,0.079293,5.895328,20.545574,0.084148


In [30]:
result.to_csv("wheat_model_dataset_1980-2024.csv")