In [2]:
import pandas as pd
from config import *
import os
import numpy as np

In [3]:
def get_market_year(row): #make a market year row in datas
    year = row['YEAR']
    month = row['MONTH']
    if month >= 9:
        market_year = f"{year}/{year + 1}"
    else:
        market_year = f"{year - 1}/{year}"
    return market_year

In [4]:
departements = { #set a dict of departement name as keys and dep code as value for missing dep values in datasets
    'Ain': 1,
    'Aisne': 2,
    'Allier': 3,
    'Alpes-de-Haute-Provence': 4,
    'Hautes-Alpes': 5,
    'Alpes-Maritimes': 6,
    'Ardèche': 7,
    'Ardennes': 8,
    'Ariège': 9,
    'Aube': 10,
    'Aude': 11,
    'Aveyron': 12,
    'Bouches-du-Rhône': 13,
    'Calvados': 14,
    'Cantal': 15,
    'Charente': 16,
    'Charente-Maritime': 17,
    'Cher': 18,
    'Corrèze': 19,
    'Corse': 20,
    "Côte-d'Or": 21,
    "Côtes-d'Armor": 22,
    'Creuse': 23,
    'Dordogne': 24,
    'Doubs': 25,
    'Drôme': 26,
    'Eure': 27,
    'Eure-et-Loir': 28,
    'Finistère': 29,
    'Gard': 30,
    'Haute-Garonne': 31,
    'Gers': 32,
    'Gironde': 33,
    'Hérault': 34,
    'Ille-et-Vilaine': 35,
    'Indre': 36,
    'Indre-et-Loire': 37,
    'Isère': 38,
    'Jura': 39,
    'Landes': 40,
    'Loir-et-Cher': 41,
    'Loire': 42,
    'Haute-Loire': 43,
    'Loire-Atlantique': 44,
    'Loiret': 45,
    'Lot': 46,
    'Lot-et-Garonne': 47,
    'Lozère': 48,
    'Maine-et-Loire': 49,
    'Manche': 50,
    'Marne': 51,
    'Haute-Marne': 52,
    'Mayenne': 53,
    'Meurthe-et-Moselle': 54,
    'Meuse': 55,
    'Morbihan': 56,
    'Moselle': 57,
    'Nièvre': 58,
    'Nord': 59,
    'Oise': 60,
    'Orne': 61,
    'Pas-de-Calais': 62,
    'Puy-de-Dôme': 63,
    'Pyrénées-Atlantiques': 64,
    'Hautes-Pyrénées': 65,
    'Pyrénées-Orientales': 66,
    'Bas-Rhin': 67,
    'Haut-Rhin': 68,
    'Rhône': 69,
    'Haute-Saône': 70,
    'Saône-et-Loire': 71,
    'Sarthe': 72,
    'Savoie': 73,
    'Haute-Savoie': 74,
    'Paris': 75,
    'Seine-Maritime': 76,
    'Seine-et-Marne': 77,
    'Yvelines': 78,
    'Deux-Sèvres': 79,
    'Somme': 80,
    'Tarn': 81,
    'Tarn-et-Garonne': 82,
    'Var': 83,
    'Vaucluse': 84,
    'Vendée': 85,
    'Vienne': 86,
    'Haute-Vienne': 87,
    'Vosges': 88,
    'Yonne': 89,
    'Territoire de Belfort': 90,
    'Essonne': 91,
    'Hauts-de-Seine': 92,
    'Seine-Saint-Denis': 93,
    'Val-de-Marne': 94,
    "Val-d'Oise": 95
}

Note : We will pivot data with month number in column name. Datas starts from 1 (Jan) to 12 (Dec), but it is understood and applicated that months from 9 to 12 are harvest year - 1 and months from 1 to 8 are harvest year.

-> Market Year for EU wheat is September to Aug (for 2020 harvest, we plant in september 2019 and harvest july/aug 2020)

Weather data

In [5]:
weather_raw = pd.read_csv(f"{WEATHER_DATA_URL}/rr_tn_tx_tm-1950-2023.csv").drop(["Unnamed: 0"], axis=1) #read raw weather data
weather_raw["YEAR-MONTH"] = pd.to_datetime(weather_raw["DATE"]).dt.to_period('M') #new column with YYYY-MM format
weatherMeanGroupedDepMonth = weather_raw[["YEAR-MONTH", "DEP", "RR", "TN", "TX", "TM"]].groupby(["YEAR-MONTH", "DEP"]).mean() #group by year-month and dep then mean the values
weather = weatherMeanGroupedDepMonth.reset_index() #remove multi indexing 
weather["YEAR"] = weather["YEAR-MONTH"].dt.year
weather["MONTH"] = weather["YEAR-MONTH"].dt.month
weather["MY"] = weather.apply(get_market_year, axis=1) #add year, month and market year for merging 
weather = weather[(weather["MY"] >= "1979/1980") & (weather["MY"] <= "2022/2023")] #-> final weather data

In [6]:
weather_pivot = weather.pivot_table( # Pivot table for month datas as columns and not rows
    index=['DEP', 'MY'],
    columns='MONTH',
    values=['RR', 'TN', 'TX', 'TM']
)

weather_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in weather_pivot.columns
]
weather_pivot = weather_pivot.reset_index()

Yields data

1st dataset (1900 - 2018)

In [9]:
yields_raw = pd.read_csv(f"{YIELDS_DATA_URL}/2021-001_Schauberger-et-al_Data_FILTERED/wheat_total_data_1900-2018_FILTERED.txt", sep=';')

#aggregate Corse du sud and Haute Corse into one Corse department :
corseSud = yields_raw[yields_raw['department'] == "Corse-du-sud"].fillna(0)
corseHaute = yields_raw[yields_raw['department'] == "Haute-Corse"].fillna(0)
corseArea = corseSud["area"].reset_index(drop=True) + corseHaute["area"].reset_index(drop=True)
corseProd = corseSud["production"].reset_index(drop=True) + corseHaute["production"].reset_index(drop=True)
corseYield = (corseSud["production"].reset_index(drop=True) + corseHaute["production"].reset_index(drop=True)) / (corseSud["area"].reset_index(drop=True) + corseHaute["area"].reset_index(drop=True))
corseYears = pd.Series(range(1900, 2019))
corse = pd.DataFrame({"department": "Corse", "year": corseYears, "yield": corseYield, "area": corseArea, "production": corseProd})
yields_raw = pd.concat([yields_raw, corse])
yields_raw = yields_raw[(yields_raw['department'] != "Corse-du-sud") & (yields_raw['department'] != "Haute-Corse")]

yields_raw['dep'] = yields_raw['department'].map(departements).replace('NA', np.nan) #map dep name to dep code
yields_raw["MY"] = (yields_raw['year'] - 1).astype(str) + '/' + yields_raw['year'].astype(str) 
yields_raw = yields_raw.rename(columns={"dep": "DEP"})
yields = yields_raw[yields_raw["year"] > 1979] #-> final yields data

2nd dataset (2000 - 2024)

In [107]:
yields_raw = pd.read_csv(f"{YIELDS_DATA_URL}/SCR-GRC-hist_dep_surface_prod_cult_cer-A25.csv", sep=';', encoding='latin1')
yields_raw["ESPECES"] = yields_raw["ESPECES"].str.strip() #remove left and white spaces
yields_raw = yields_raw[yields_raw["ESPECES"] == "Blé tendre"].reset_index(drop=True) #filter soft wheat
yields_raw["DEP"] = yields_raw["DEP"].str.strip().replace({"2A": "20", "2B": "20"}) #strip white spaces an replace string values to dep number
yields_raw["DEP"] = yields_raw["DEP"].astype(int)
yields = yields_raw[["ANNEE", "DEP", "CULT_SURF", "CULT_REND"]] #keep only wanted data
yields["MY"] = (yields['ANNEE'] - 1).astype(str) + '/' + yields['ANNEE'].astype(str) 
yields = yields.rename(columns={"ANNEE": "year", "CULT_SURF": "area", "CULT_REND": "yield"})
yields['yield'] = yields['yield'].str.replace(",", '.').astype(float) / 10 #convert to kg/ha
yields = yields[(yields["year"] >= 2001) & (yields["year"] < 2024)] #-> final weather data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yields["MY"] = (yields['ANNEE'] - 1).astype(str) + '/' + yields['ANNEE'].astype(str)


In [10]:
yields

Unnamed: 0,department,year,yield,area,production,DEP,MY
80,Ain,1980,4.39805,36001.0,158334.3,1,1979/1980
81,Ain,1981,4.30319,36075.0,155237.5,1,1980/1981
82,Ain,1982,5.00000,38050.0,190250.0,1,1981/1982
83,Ain,1983,3.29500,37924.0,124959.6,1,1982/1983
84,Ain,1984,5.90000,40000.0,236000.0,1,1983/1984
...,...,...,...,...,...,...,...
114,Corse,2014,3.50000,130.0,455.0,20,2013/2014
115,Corse,2015,3.50000,105.0,367.5,20,2014/2015
116,Corse,2016,3.00000,78.0,234.0,20,2015/2016
117,Corse,2017,3.50000,73.0,255.5,20,2016/2017


Vapor pressure deficit data

In [11]:
vpd_raw = pd.DataFrame(columns=["dep", "date", "vpd_max", "vpd_min", "vpd_mean"])
for vpd_file in os.listdir(f"{VPD_DATA_URL}/dailyDepDatas/"): #loop throught files
    vpd_dep = pd.read_json(f"{VPD_DATA_URL}/dailyDepDatas/{vpd_file}") #read json
    vpd_raw = pd.concat([vpd_raw, vpd_dep]) #concat data
vpd_raw['DEP'] = vpd_raw['dep'].map(departements) #map dep name to dep code
vpd_raw["YEAR-MONTH"] = pd.to_datetime(vpd_raw["date"]).dt.to_period('M') #new column with YYYY-MM format
vpdMeanGroupedDepMonth = vpd_raw[["vpd_max", "vpd_min", "vpd_mean", "YEAR-MONTH", "DEP"]].groupby(["YEAR-MONTH", "DEP"]).mean() #group by year-month and dep then mean the values
vpd = vpdMeanGroupedDepMonth.reset_index() #remove multi indexing 
vpd = vpd.dropna() #removes nan values (when nan value, there is no data for the dep)
vpd["YEAR"] = vpd["YEAR-MONTH"].dt.year
vpd["MONTH"] = vpd["YEAR-MONTH"].dt.month
vpd["MY"] = vpd.apply(get_market_year, axis=1) #add year, month and market year for merging 
vpd = vpd[vpd["MY"] >= "1979/1980"] #-> final vpd data

  vpd_raw = pd.concat([vpd_raw, vpd_dep]) #concat data


In [12]:
vpd_pivot = vpd.pivot_table( # pivot table for month datas as columns and not rows
    index=['DEP', 'MY'],
    columns='MONTH',
    values=['vpd_mean', 'vpd_min', 'vpd_max']
)

vpd_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in vpd_pivot.columns
]
vpd_pivot = vpd_pivot.reset_index()

Enhanced Vegetation Index data

In [13]:
evi = pd.DataFrame(columns=["name", "code", "date", "mean_data"]) 
for evi_file in os.listdir(f"{EVI_DATA_URL}/monthlyDepMean/"): #loop throught files
    evi_dep = pd.read_json(f"{EVI_DATA_URL}/monthlyDepMean/{evi_file}") #read json
    evi = pd.concat([evi, evi_dep]) #concat data
evi["YEAR-MONTH"] = pd.to_datetime(evi["date"]).dt.to_period('M') #set period (already to monthly data but we keep same format for every dataset (Year-Month))
evi = evi.rename(columns={"code": "DEP", "mean_data": "evi"}) #rename for same format
evi = evi[["YEAR-MONTH", "DEP", "evi"]].sort_values(by="YEAR-MONTH") #keep wanted data 
evi["YEAR"] = evi["YEAR-MONTH"].dt.year
evi["MONTH"] = evi["YEAR-MONTH"].dt.month
evi["MY"] = evi.apply(get_market_year, axis=1) #add year, month and market year for merging 
evi = evi[evi["MY"] >= "2000/2001"] #-> final evi data

  evi = pd.concat([evi, evi_dep]) #concat data


In [14]:
evi_pivot = evi.pivot_table( # pivot table for month datas as columns and not rows
    index=['DEP', 'MY'],
    columns='MONTH',
    values=['evi']
)

evi_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in evi_pivot.columns
]
evi_pivot = evi_pivot.reset_index()

Soil Organic Matter data (Particulate organic matter (POM) and Mineral-associated organic matter (MAOM))

In [15]:
som_pom = pd.read_json(f"{SOM_DATA_URL}/pom.json") #read json
som_pom['DEP'] = som_pom['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
som_pom = som_pom.drop("nom", axis=1) #remove unwanted dep name -> final som pom data

In [16]:
maom_pom = pd.read_json(f"{SOM_DATA_URL}/maom.json") #read json
maom_pom['DEP'] = maom_pom['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
maom_pom = maom_pom.drop("nom", axis=1) #remove unwanted dep name -> final som maom data

In [17]:
awc = pd.read_json(f"{AWC_DATA_URL}/AWC.json") #read json
awc['DEP'] = awc['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
awc = awc.drop("nom", axis=1) #remove unwanted dep name -> final AWC data

In [18]:
awc

Unnamed: 0,awc,DEP
0,0.095255,2
1,0.106082,10
2,0.098906,14
3,0.104202,15
4,0.091842,28
...,...,...
90,0.094073,53
91,0.114648,54
92,0.097066,79
93,0.105598,90


Merging datasets

In [19]:
result = yields.merge(weather_pivot, on=['DEP', 'MY'], how='left')
result = result.merge(vpd_pivot, on=['DEP', 'MY'], how='left')
result = result.merge(evi_pivot, on=['DEP', 'MY'], how='left')

result = result.merge(som_pom, on="DEP", how="left")
result = result.merge(maom_pom, on="DEP", how="left")
result = result.merge(awc, on="DEP", how="left")

In [20]:
result = result.dropna(subset=['yield'])

In [21]:
result #show results

Unnamed: 0,department,year,yield,area,production,DEP,MY,RR1,RR2,RR3,...,evi6,evi7,evi8,evi9,evi10,evi11,evi12,pom,maom,awc
0,Ain,1980,4.39805,36001.0,158334.3,1,1979/1980,3.672457,3.484615,4.661456,...,,,,,,,,16.789692,26.925089,0.107088
1,Ain,1981,4.30319,36075.0,155237.5,1,1980/1981,5.279117,2.298026,4.151613,...,,,,,,,,16.789692,26.925089,0.107088
2,Ain,1982,5.00000,38050.0,190250.0,1,1981/1982,3.686452,0.559464,3.402339,...,,,,,,,,16.789692,26.925089,0.107088
3,Ain,1983,3.29500,37924.0,124959.6,1,1982/1983,3.015110,3.857753,3.601241,...,,,,,,,,16.789692,26.925089,0.107088
4,Ain,1984,5.90000,40000.0,236000.0,1,1983/1984,5.722266,3.942845,3.163952,...,,,,,,,,16.789692,26.925089,0.107088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3697,Corse,2014,3.50000,130.0,455.0,20,2013/2014,5.060050,5.395923,3.773239,...,0.423447,0.392998,0.388455,0.353366,0.332301,0.319865,0.305374,15.657990,28.322897,0.109897
3698,Corse,2015,3.50000,105.0,367.5,20,2014/2015,1.708821,5.654291,5.265742,...,0.420115,0.405022,0.384232,0.343211,0.331971,0.307984,0.290167,15.657990,28.322897,0.109897
3699,Corse,2016,3.00000,78.0,234.0,20,2015/2016,3.607827,6.636231,2.000909,...,0.396195,0.399334,0.364752,0.354355,0.356517,0.342911,0.325609,15.657990,28.322897,0.109897
3700,Corse,2017,3.50000,73.0,255.5,20,2016/2017,5.908224,3.860211,1.166214,...,0.388713,0.382783,0.351085,0.347126,0.335812,0.328511,0.315143,15.657990,28.322897,0.109897


In [22]:
result.to_csv("wheat_model_dataset_1980_2018.csv", index=False) #dataset to csv ! 