In [103]:
import pandas as pd
from config import *
import os
import numpy as np

In [104]:
def get_market_year(row): #make a market year row in datas
    year = row['YEAR']
    month = row['MONTH']
    if month >= 9:
        market_year = f"{year}/{year + 1}"
    else:
        market_year = f"{year - 1}/{year}"
    return market_year

In [105]:
departements = { #set a dict of departement name as keys and dep code as value for missing dep values in datasets
    'Ain': 1,
    'Aisne': 2,
    'Allier': 3,
    'Alpes-de-Haute-Provence': 4,
    'Hautes-Alpes': 5,
    'Alpes-Maritimes': 6,
    'Ardèche': 7,
    'Ardennes': 8,
    'Ariège': 9,
    'Aube': 10,
    'Aude': 11,
    'Aveyron': 12,
    'Bouches-du-Rhône': 13,
    'Calvados': 14,
    'Cantal': 15,
    'Charente': 16,
    'Charente-Maritime': 17,
    'Cher': 18,
    'Corrèze': 19,
    'Corse': 20,
    "Côte-d'Or": 21,
    "Côtes-d'Armor": 22,
    'Creuse': 23,
    'Dordogne': 24,
    'Doubs': 25,
    'Drôme': 26,
    'Eure': 27,
    'Eure-et-Loir': 28,
    'Finistère': 29,
    'Gard': 30,
    'Haute-Garonne': 31,
    'Gers': 32,
    'Gironde': 33,
    'Hérault': 34,
    'Ille-et-Vilaine': 35,
    'Indre': 36,
    'Indre-et-Loire': 37,
    'Isère': 38,
    'Jura': 39,
    'Landes': 40,
    'Loir-et-Cher': 41,
    'Loire': 42,
    'Haute-Loire': 43,
    'Loire-Atlantique': 44,
    'Loiret': 45,
    'Lot': 46,
    'Lot-et-Garonne': 47,
    'Lozère': 48,
    'Maine-et-Loire': 49,
    'Manche': 50,
    'Marne': 51,
    'Haute-Marne': 52,
    'Mayenne': 53,
    'Meurthe-et-Moselle': 54,
    'Meuse': 55,
    'Morbihan': 56,
    'Moselle': 57,
    'Nièvre': 58,
    'Nord': 59,
    'Oise': 60,
    'Orne': 61,
    'Pas-de-Calais': 62,
    'Puy-de-Dôme': 63,
    'Pyrénées-Atlantiques': 64,
    'Hautes-Pyrénées': 65,
    'Pyrénées-Orientales': 66,
    'Bas-Rhin': 67,
    'Haut-Rhin': 68,
    'Rhône': 69,
    'Haute-Saône': 70,
    'Saône-et-Loire': 71,
    'Sarthe': 72,
    'Savoie': 73,
    'Haute-Savoie': 74,
    'Paris': 75,
    'Seine-Maritime': 76,
    'Seine-et-Marne': 77,
    'Yvelines': 78,
    'Deux-Sèvres': 79,
    'Somme': 80,
    'Tarn': 81,
    'Tarn-et-Garonne': 82,
    'Var': 83,
    'Vaucluse': 84,
    'Vendée': 85,
    'Vienne': 86,
    'Haute-Vienne': 87,
    'Vosges': 88,
    'Yonne': 89,
    'Territoire de Belfort': 90,
    'Essonne': 91,
    'Hauts-de-Seine': 92,
    'Seine-Saint-Denis': 93,
    'Val-de-Marne': 94,
    "Val-d'Oise": 95
}

Note : We will pivot data with month number in column name. Datas starts from 1 (Jan) to 12 (Dec), but it is understood and applicated that months from 9 to 12 are harvest year - 1 and months from 1 to 8 are harvest year.

-> Market Year for EU wheat is September to Aug (for 2020 harvest, we plant in september 2019 and harvest july/aug 2020)

Weather data

In [129]:
weather_raw = pd.read_csv(f"{WEATHER_DATA_URL}/rr_tn_tx_tm-1950-2023.csv").drop(["Unnamed: 0"], axis=1) #read raw weather data
weather_raw["YEAR-MONTH"] = pd.to_datetime(weather_raw["DATE"]).dt.to_period('M') #new column with YYYY-MM format
weatherMeanGroupedDepMonth = weather_raw[["YEAR-MONTH", "DEP", "RR", "TN", "TX", "TM"]].groupby(["YEAR-MONTH", "DEP"]).mean() #group by year-month and dep then mean the values
weather = weatherMeanGroupedDepMonth.reset_index() #remove multi indexing 
weather["YEAR"] = weather["YEAR-MONTH"].dt.year
weather["MONTH"] = weather["YEAR-MONTH"].dt.month
weather["MY"] = weather.apply(get_market_year, axis=1) #add year, month and market year for merging 
weather = weather[(weather["MY"] >= "1979/1980") & (weather["MY"] <= "2022/2023")] #-> final weather data

KeyboardInterrupt: 

In [None]:
weather_pivot = weather.pivot_table( # Pivot table for month datas as columns and not rows
    index=['DEP', 'MY'],
    columns='MONTH',
    values=['RR', 'TN', 'TX', 'TM']
)

weather_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in weather_pivot.columns
]
weather_pivot = weather_pivot.reset_index()

Yields data

1st dataset (1900 - 2018)

In [113]:
yields_raw = pd.read_csv(f"{YIELDS_DATA_URL}/2021-001_Schauberger-et-al_Data_FILTERED/wheat_total_data_1900-2018_FILTERED.txt", sep=';')

#aggregate Corse du sud and Haute Corse into one Corse department :
corseSud = yields_raw[yields_raw['department'] == "Corse-du-sud"].fillna(0)
corseHaute = yields_raw[yields_raw['department'] == "Haute-Corse"].fillna(0)
corseArea = corseSud["area"].reset_index(drop=True) + corseHaute["area"].reset_index(drop=True)
corseProd = corseSud["production"].reset_index(drop=True) + corseHaute["production"].reset_index(drop=True)
corseYield = corseSud["yield"].reset_index(drop=True) + corseHaute["yield"].reset_index(drop=True)
corseYears = pd.Series(range(1900, 2019))
corse = pd.DataFrame({"department": "Corse", "year": corseYears, "yield": corseYield, "area": corseArea, "production": corseProd})
yields_raw = pd.concat([yields_raw, corse])
yields_raw = yields_raw[(yields_raw['department'] != "Corse-du-sud") & (yields_raw['department'] != "Haute-Corse")]

yields_raw['dep'] = yields_raw['department'].map(departements).replace('NA', np.nan) #map dep name to dep code
yields_raw["MY"] = (yields_raw['year'] - 1).astype(str) + '/' + yields_raw['year'].astype(str) 
yields_raw = yields_raw.rename(columns={"dep": "DEP"})
yields = yields_raw[yields_raw["year"] > 1979] #-> final yields data

2nd dataset (2000 - 2024)

In [130]:
yields_raw = pd.read_csv(f"{YIELDS_DATA_URL}/SCR-GRC-hist_dep_surface_prod_cult_cer-A25.csv", sep=';', encoding='latin1')
yields_raw["ESPECES"] = yields_raw["ESPECES"].str.strip() #remove left and white spaces
yields_raw = yields_raw[yields_raw["ESPECES"] == "Blé tendre"].reset_index(drop=True) #filter soft wheat
yields_raw["DEP"] = yields_raw["DEP"].str.strip().replace({"2A": "20", "2B": "20"}) #strip white spaces an replace string values to dep number
yields_raw["DEP"] = yields_raw["DEP"].astype(int)
yields = yields_raw[["ANNEE", "DEP", "CULT_SURF", "CULT_REND"]] #keep only wanted data
yields["MY"] = (yields['ANNEE'] - 1).astype(str) + '/' + yields['ANNEE'].astype(str) 
yields = yields[(yields["ANNEE"] >= 2001) & (yields["ANNEE"] < 2024)] #-> final weather data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yields["MY"] = (yields['ANNEE'] - 1).astype(str) + '/' + yields['ANNEE'].astype(str)


In [131]:
yields

Unnamed: 0,ANNEE,DEP,CULT_SURF,CULT_REND,MY
1,2001,77,13493900,7500,2000/2001
2,2002,77,13937500,8500,2001/2002
3,2003,77,13737500,6898,2002/2003
4,2004,77,14144600,9000,2003/2004
5,2005,77,14367700,8000,2004/2005
...,...,...,...,...,...
2424,2019,20,9600,4000,2018/2019
2425,2020,20,7000,3500,2019/2020
2426,2021,20,9700,6500,2020/2021
2427,2022,20,8000,6500,2021/2022


Vapor pressure deficit data

In [115]:
vpd_raw = pd.DataFrame(columns=["dep", "date", "vpd_max", "vpd_min", "vpd_mean"])
for vpd_file in os.listdir(f"{VPD_DATA_URL}/dailyDepDatas/"): #loop throught files
    vpd_dep = pd.read_json(f"{VPD_DATA_URL}/dailyDepDatas/{vpd_file}") #read json
    vpd_raw = pd.concat([vpd_raw, vpd_dep]) #concat data
vpd_raw['DEP'] = vpd_raw['dep'].map(departements) #map dep name to dep code
vpd_raw["YEAR-MONTH"] = pd.to_datetime(vpd_raw["date"]).dt.to_period('M') #new column with YYYY-MM format
vpdMeanGroupedDepMonth = vpd_raw[["vpd_max", "vpd_min", "vpd_mean", "YEAR-MONTH", "DEP"]].groupby(["YEAR-MONTH", "DEP"]).mean() #group by year-month and dep then mean the values
vpd = vpdMeanGroupedDepMonth.reset_index() #remove multi indexing 
vpd = vpd.dropna() #removes nan values (when nan value, there is no data for the dep)
vpd["YEAR"] = vpd["YEAR-MONTH"].dt.year
vpd["MONTH"] = vpd["YEAR-MONTH"].dt.month
vpd["MY"] = vpd.apply(get_market_year, axis=1) #add year, month and market year for merging 
vpd = vpd[vpd["MY"] >= "1979/1980"] #-> final vpd data

  vpd_raw = pd.concat([vpd_raw, vpd_dep]) #concat data


In [116]:
vpd_pivot = vpd.pivot_table( # pivot table for month datas as columns and not rows
    index=['DEP', 'MY'],
    columns='MONTH',
    values=['vpd_mean', 'vpd_min', 'vpd_max']
)

vpd_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in vpd_pivot.columns
]
vpd_pivot = vpd_pivot.reset_index()

Enhanced Vegetation Index data

In [117]:
evi = pd.DataFrame(columns=["name", "code", "date", "mean_data"]) 
for evi_file in os.listdir(f"{EVI_DATA_URL}/monthlyDepMean/"): #loop throught files
    evi_dep = pd.read_json(f"{EVI_DATA_URL}/monthlyDepMean/{evi_file}") #read json
    evi = pd.concat([evi, evi_dep]) #concat data
evi["YEAR-MONTH"] = pd.to_datetime(evi["date"]).dt.to_period('M') #set period (already to monthly data but we keep same format for every dataset (Year-Month))
evi = evi.rename(columns={"code": "DEP", "mean_data": "evi"}) #rename for same format
evi = evi[["YEAR-MONTH", "DEP", "evi"]].sort_values(by="YEAR-MONTH") #keep wanted data 
evi["YEAR"] = evi["YEAR-MONTH"].dt.year
evi["MONTH"] = evi["YEAR-MONTH"].dt.month
evi["MY"] = evi.apply(get_market_year, axis=1) #add year, month and market year for merging 
evi = evi[evi["MY"] >= "2000/2001"] #-> final evi data

  evi = pd.concat([evi, evi_dep]) #concat data


In [118]:
evi_pivot = evi.pivot_table( # pivot table for month datas as columns and not rows
    index=['DEP', 'MY'],
    columns='MONTH',
    values=['evi']
)

evi_pivot.columns = [ # rename columns with month number
    f"{col[0]}{col[1]}" if isinstance(col, tuple) and col[1] != "" 
    else col for col in evi_pivot.columns
]
evi_pivot = evi_pivot.reset_index()

Soil Organic Matter data (Particulate organic matter (POM) and Mineral-associated organic matter (MAOM))

In [119]:
som_pom = pd.read_json(f"{SOM_DATA_URL}/pom.json") #read json
som_pom['DEP'] = som_pom['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
som_pom = som_pom.drop("nom", axis=1) #remove unwanted dep name -> final som pom data

In [120]:
maom_pom = pd.read_json(f"{SOM_DATA_URL}/maom.json") #read json
maom_pom['DEP'] = maom_pom['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
maom_pom = maom_pom.drop("nom", axis=1) #remove unwanted dep name -> final som maom data

In [121]:
awc = pd.read_json(f"{AWC_DATA_URL}/AWC.json") #read json
awc['DEP'] = awc['nom'].map(departements) #map dep name to dep codevpd['DEP'] = vpd['dep'].map(departements) #map dep name to dep code
awc = awc.drop("nom", axis=1) #remove unwanted dep name -> final AWC data

Merging datasets

In [132]:
result = yields.merge(weather_pivot, on=['DEP', 'MY'], how='left')
result = result.merge(vpd_pivot, on=['DEP', 'MY'], how='left')
result = result.merge(evi_pivot, on=['DEP', 'MY'], how='left')

result = result.merge(som_pom, on="DEP", how="left")
result = result.merge(maom_pom, on="DEP", how="left")
result = result.merge(awc, on="DEP", how="left")

In [133]:
result #show results

Unnamed: 0,ANNEE,DEP,CULT_SURF,CULT_REND,MY,RR1,RR2,RR3,RR4,RR5,...,evi6,evi7,evi8,evi9,evi10,evi11,evi12,pom,maom,vpd
0,2001,77,13493900,7500,2000/2001,2.467359,1.665315,4.451066,3.182373,1.444451,...,0.514345,0.409490,0.408823,0.339984,0.291674,0.235035,0.212440,4.415719,15.990197,0.089862
1,2002,77,13937500,8500,2001/2002,1.109455,3.514409,1.819132,0.517874,1.712458,...,0.538276,0.448189,0.357645,0.332362,0.295205,0.288137,0.233184,4.415719,15.990197,0.089862
2,2003,77,13737500,6898,2002/2003,2.282796,0.926441,0.567346,1.439708,2.846067,...,0.574613,0.407363,0.342762,0.336154,0.278275,0.217687,0.226608,4.415719,15.990197,0.089862
3,2004,77,14144600,9000,2003/2004,3.744067,0.405788,1.242569,2.203571,1.870232,...,0.575383,0.458860,0.353065,0.313528,0.289102,0.234515,0.204609,4.415719,15.990197,0.089862
4,2005,77,14367700,8000,2004/2005,1.648856,1.444091,1.541701,1.838061,1.716598,...,0.580532,0.428030,0.373774,0.340361,0.277447,0.248891,0.204195,4.415719,15.990197,0.089862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,2019,20,9600,4000,2018/2019,1.825853,2.234225,0.470323,2.643190,3.537650,...,0.399499,0.388429,0.377462,0.375772,0.352461,0.336898,0.322346,15.657990,28.322897,0.109897
2145,2020,20,7000,3500,2019/2020,1.687465,0.514343,2.962535,3.276762,2.581244,...,0.435666,0.431090,0.388402,0.355846,0.336295,0.313358,0.300604,15.657990,28.322897,0.109897
2146,2021,20,9700,6500,2020/2021,5.630784,2.620412,1.022414,1.969150,2.378431,...,0.405275,0.392680,0.360204,0.371781,0.375880,0.340750,0.330526,15.657990,28.322897,0.109897
2147,2022,20,8000,6500,2021/2022,0.921650,0.828915,1.524876,3.056282,1.225617,...,0.403204,0.379189,0.366802,0.343552,0.334657,0.288043,0.291791,15.657990,28.322897,0.109897


In [134]:
result.to_csv("wheat_model_dataset_2001_2023.csv", index=False) #dataset to csv ! 