In [1]:
import pandas as pd
from config import *

In [2]:
def get_market_year(row):
    """Return the market-year label of a row, formatted as "YYYY/YYYY+1".

    The market year rolls over in September: dates in September-December
    belong to the market year that starts in their own calendar year, while
    dates in January-August belong to the one started the previous year.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        ``row['DATE']`` must expose ``.year`` and ``.month``.

    Returns
    -------
    str
        Label such as ``"2023/2024"``.
    """
    date = row['DATE']
    # Before September the date still belongs to the market year opened the previous year.
    start_year = date.year if date.month >= 9 else date.year - 1
    return f"{start_year}/{start_year + 1}"

In [78]:
# Read the historical (1950-2023) and current raw weather extracts and stack them.
# "Unnamed: 0" is dropped from both files (presumably a saved row index -- TODO confirm).
hist_weather_raw = pd.read_csv(f"{WEATHER_DATA_URL}/raw_rr_tn_tx_tm-1950-2023.csv").drop(columns=["Unnamed: 0"])
current_weather_raw = pd.read_csv(f"{WEATHER_DATA_URL}/raw_rr_tn_tx_tm-current.csv").drop(columns=["Unnamed: 0"])
raw_weather = pd.concat([hist_weather_raw, current_weather_raw])

# Collapse all rows sharing the same (DATE, DEP) pair into their mean.
raw_weather = raw_weather.groupby(["DATE", "DEP"]).mean().reset_index()
raw_weather["DATE"] = pd.to_datetime(raw_weather["DATE"])

# Derive the helper columns on the full frame, *before* any row filtering, so
# that no column is ever assigned onto a filtered slice (which raises
# SettingWithCopyWarning on pandas versions without copy-on-write).
raw_weather["MY"] = raw_weather.apply(get_market_year, axis=1)
raw_weather["MONTH-DAY"] = raw_weather["DATE"].dt.strftime("%m-%d")

# Use .max() rather than .iloc[-1]: the latest date is found explicitly
# instead of relying on the row order left behind by the groupby.
weather_last_date = raw_weather["DATE"].max()
weather_rows_to_keep = (
    (raw_weather["DATE"] < weather_last_date)  # the last date of data has missing values
    & ~raw_weather["MY"].isin(["1949/1950", "2024/2025"])  # market years without full data
    & (raw_weather["MONTH-DAY"] != "02-29")  # drop leap days (Feb 29)
)

# Rows with NaN can't be used for training, so drop them as early as possible.
raw_weather = raw_weather[weather_rows_to_keep].dropna()

In [84]:
# Spot check: look up department 1 on 1950-12-19 in the cleaned weather frame.
raw_weather.loc[(raw_weather["DATE"] == "1950-12-19") & (raw_weather["DEP"] == 1)]

Unnamed: 0,DATE,DEP,RR,TN,TX,TM,MY,MONTH-DAY


In [85]:
# Reshape to one row per (MY, DEP) pair, with one column per
# (weather variable, month-day) combination.
weather_pivot = raw_weather.pivot_table(
    values=["RR", "TN", "TX", "TM"],
    index=["MY", "DEP"],
    columns="MONTH-DAY",
)
# Flatten the two-level column labels into single strings such as "RR01-01"
# (variable name immediately followed by the month-day).
weather_pivot.columns = [
    label if not isinstance(label, tuple) or label[1] == ""
    else f"{label[0]}{label[1]}"
    for label in weather_pivot.columns
]
# Bring MY / DEP back as regular columns and keep only rows without any NaN.
weather_pivot = weather_pivot.reset_index().dropna()

In [86]:
# Display the pivoted weather table: one row per (MY, DEP) pair.
weather_pivot

Unnamed: 0,MY,DEP,RR01-01,RR01-02,RR01-03,RR01-04,RR01-05,RR01-06,RR01-07,RR01-08,...,TX12-22,TX12-23,TX12-24,TX12-25,TX12-26,TX12-27,TX12-28,TX12-29,TX12-30,TX12-31
1,1950/1951,2,2.400000,1.563158,0.478947,8.526316,0.700000,0.852632,0.605263,6.926316,...,-2.400000,-0.800000,-0.500000,-0.200000,-1.700000,-1.200000,-0.800000,-6.500000,-5.000000,1.700000
2,1950/1951,3,6.125641,0.677500,0.057500,0.087500,0.010000,0.082500,0.222500,0.347500,...,7.100000,1.600000,-0.925000,-3.425000,-1.600000,-1.250000,-0.925000,-2.600000,-3.550000,3.675000
3,1950/1951,5,4.121053,13.650000,0.831579,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.725000,-0.700000,3.225000,-0.100000,0.325000,0.400000,1.125000,-0.150000,-1.125000,-3.475000
4,1950/1951,6,4.821622,31.932432,12.394595,0.000000,0.000000,0.000000,0.000000,0.000000,...,7.616667,8.116667,9.716667,9.816667,8.300000,7.066667,6.666667,9.466667,7.216667,7.933333
5,1950/1951,10,3.810000,4.870000,0.260000,2.230000,0.020000,0.830000,0.600000,3.550000,...,-0.266667,1.266667,-1.800000,-0.733333,-0.933333,0.233333,-0.400000,-4.000000,-3.166667,3.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6268,2023/2024,91,7.683333,6.733333,7.650000,0.033333,0.200000,0.333333,0.100000,0.933333,...,11.400000,10.750000,11.583333,11.950000,10.583333,10.566667,10.650000,11.216667,11.350000,10.450000
6269,2023/2024,92,8.600000,6.700000,4.800000,0.200000,0.200000,0.800000,0.000000,1.000000,...,11.800000,12.700000,13.600000,12.900000,11.600000,12.000000,12.400000,12.100000,11.900000,11.800000
6270,2023/2024,93,8.200000,5.700000,4.400000,0.000000,0.000000,0.600000,0.000000,0.000000,...,11.800000,12.400000,12.900000,12.600000,11.400000,11.500000,11.600000,11.700000,11.800000,11.800000
6271,2023/2024,94,9.100000,9.650000,6.400000,0.000000,0.000000,0.400000,0.100000,0.450000,...,11.800000,11.700000,12.800000,12.600000,11.500000,11.500000,11.550000,11.800000,11.650000,11.900000


In [96]:
# Load the VPD table, drop the "Unnamed: 0" and "departement" columns, remove
# rows with missing values, and align the key column names with the weather
# data (DATE / DEP).
raw_vpd = (
    pd.read_csv(f"{VPD_DATA_URL}/1940-2025_vpd.csv")
    .drop(columns=["Unnamed: 0", "departement"])
    .dropna()
    .rename(columns={"date": "DATE", "dep": "DEP"})
)
raw_vpd["DATE"] = pd.to_datetime(raw_vpd["DATE"])

# Derive both helper columns on the full frame, *before* any row filtering, so
# that no column is assigned onto a filtered slice (which raises
# SettingWithCopyWarning on pandas versions without copy-on-write).
raw_vpd["MY"] = raw_vpd.apply(get_market_year, axis=1)
raw_vpd["MONTH-DAY"] = raw_vpd["DATE"].dt.strftime("%m-%d")

vpd_rows_to_keep = (
    ~raw_vpd["MY"].isin(["1939/1940", "2024/2025"])  # market years without full data
    & (raw_vpd["MONTH-DAY"] != "02-29")  # drop leap days (Feb 29)
)
raw_vpd = raw_vpd[vpd_rows_to_keep]

In [97]:
# Reshape to one row per (MY, DEP) pair, with one column per
# (VPD statistic, month-day) combination.
vpd_pivot = raw_vpd.pivot_table(
    values=["vpd_max", "vpd_mean", "vpd_min"],
    index=["MY", "DEP"],
    columns="MONTH-DAY",
)
# Flatten the two-level column labels into single strings such as
# "vpd_max01-01" (statistic name immediately followed by the month-day).
vpd_pivot.columns = [
    label if not isinstance(label, tuple) or label[1] == ""
    else f"{label[0]}{label[1]}"
    for label in vpd_pivot.columns
]
# Bring MY / DEP back as regular columns.
vpd_pivot = vpd_pivot.reset_index()

In [98]:
# Display the pivoted VPD table: one row per (MY, DEP) pair.
vpd_pivot

Unnamed: 0,MY,DEP,vpd_max01-01,vpd_max01-02,vpd_max01-03,vpd_max01-04,vpd_max01-05,vpd_max01-06,vpd_max01-07,vpd_max01-08,...,vpd_min12-22,vpd_min12-23,vpd_min12-24,vpd_min12-25,vpd_min12-26,vpd_min12-27,vpd_min12-28,vpd_min12-29,vpd_min12-30,vpd_min12-31
0,1940/1941,1,0.132354,0.093340,0.109083,0.089246,0.083568,0.074628,0.098343,0.094229,...,0.030227,0.053570,0.039393,0.025857,0.016992,0.019505,0.027384,0.030492,0.038333,0.055430
1,1940/1941,2,0.165009,0.214080,0.198853,0.163985,0.147468,0.134596,0.129717,0.153086,...,0.099755,0.076699,0.054817,0.058276,0.029522,0.022392,0.029557,0.023802,0.005602,0.002414
2,1940/1941,3,0.148820,0.142861,0.107388,0.113023,0.101335,0.099834,0.116278,0.077015,...,0.033520,0.095137,0.071986,0.056618,0.038118,0.019662,0.015799,0.061468,0.037086,0.047029
3,1940/1941,4,0.140340,0.080095,0.116702,0.146266,0.191767,0.065463,0.062133,0.186698,...,0.044506,0.043099,0.030873,0.023123,0.019892,0.022255,0.026260,0.046517,0.057726,0.016833
4,1940/1941,5,0.111647,0.057454,0.072901,0.095202,0.110487,0.059091,0.076325,0.143007,...,0.021719,0.032656,0.025410,0.014698,0.012184,0.016012,0.016404,0.037558,0.038403,0.012515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7723,2023/2024,89,0.283134,0.245179,0.259011,0.282445,0.294730,0.153310,0.100484,0.164398,...,0.084235,0.082826,0.110970,0.097442,0.112299,0.075556,0.116377,0.090418,0.063831,0.146829
7724,2023/2024,90,0.199950,0.211431,0.399459,0.244395,0.133245,0.110667,0.121237,0.109534,...,0.037877,0.075001,0.120394,0.129238,0.056776,0.028053,0.082429,0.063444,0.034981,0.021536
7725,2023/2024,91,0.302723,0.209739,0.334163,0.241236,0.336526,0.242142,0.143298,0.164703,...,0.127071,0.126732,0.165520,0.144169,0.083230,0.082129,0.090718,0.079024,0.047687,0.140487
7726,2023/2024,94,0.308881,0.215436,0.321815,0.261003,0.335451,0.217595,0.155188,0.163629,...,0.104290,0.150739,0.182253,0.152527,0.073675,0.080210,0.093177,0.077897,0.053399,0.162159


In [107]:
# Load the processed NDVI table, drop the "Unnamed: 0" and "departement"
# columns, remove rows with missing values, and align the key column names
# with the weather data (DATE / DEP).
raw_ndvi = (
    pd.read_csv(f"{NDVI_DATA_URL}/ndvi_processed.csv")
    .drop(columns=["Unnamed: 0", "departement"])
    .dropna()
    .rename(columns={"date": "DATE", "dep": "DEP"})
)
raw_ndvi["DATE"] = pd.to_datetime(raw_ndvi["DATE"])

# Derive both helper columns on the full frame, *before* any row filtering, so
# that no column is assigned onto a filtered slice (which raises
# SettingWithCopyWarning on pandas versions without copy-on-write).
raw_ndvi["MY"] = raw_ndvi.apply(get_market_year, axis=1)
raw_ndvi["MONTH-DAY"] = raw_ndvi["DATE"].dt.strftime("%m-%d")

ndvi_rows_to_keep = (
    ~raw_ndvi["MY"].isin(["1980/1981", "2024/2025"])  # market years without full data
    & (raw_ndvi["MONTH-DAY"] != "02-29")  # drop leap days (Feb 29)
)
raw_ndvi = raw_ndvi[ndvi_rows_to_keep]

In [110]:
# Reshape to one row per (MY, DEP) pair, with one ndvi_mean column per month-day.
ndvi_pivot = raw_ndvi.pivot_table(
    values=["ndvi_mean"],
    index=["MY", "DEP"],
    columns="MONTH-DAY",
)
# Flatten the two-level column labels into single strings such as
# "ndvi_mean01-01" (value name immediately followed by the month-day).
ndvi_pivot.columns = [
    label if not isinstance(label, tuple) or label[1] == ""
    else f"{label[0]}{label[1]}"
    for label in ndvi_pivot.columns
]
# Bring MY / DEP back as regular columns.
ndvi_pivot = ndvi_pivot.reset_index()

In [111]:
# Display the pivoted NDVI table: one row per (MY, DEP) pair.
ndvi_pivot

Unnamed: 0,MY,DEP,ndvi_mean01-01,ndvi_mean01-02,ndvi_mean01-03,ndvi_mean01-04,ndvi_mean01-05,ndvi_mean01-06,ndvi_mean01-07,ndvi_mean01-08,...,ndvi_mean12-22,ndvi_mean12-23,ndvi_mean12-24,ndvi_mean12-25,ndvi_mean12-26,ndvi_mean12-27,ndvi_mean12-28,ndvi_mean12-29,ndvi_mean12-30,ndvi_mean12-31
0,1981/1982,1,-0.000560,-0.032711,0.162730,-0.018870,0.006850,0.224560,0.021330,,...,0.006700,-0.022760,-0.002756,-0.005350,0.086970,-0.011220,-0.013830,0.085314,,0.019700
1,1981/1982,2,0.111329,0.011386,0.150171,-0.002614,0.000000,0.225729,0.118893,,...,0.009107,-0.028221,-0.013964,0.000964,0.011569,-0.023729,0.278446,0.023807,,0.057607
2,1981/1982,3,0.144750,-0.020607,0.236000,0.211233,-0.007847,0.209067,-0.035529,,...,0.007167,0.008173,-0.037907,0.089620,0.257560,-0.010427,-0.021360,0.135707,,-0.017167
3,1981/1982,4,0.030920,0.111020,0.136664,0.158282,0.114620,,0.159173,,...,-0.005660,0.000000,0.012636,0.111282,0.089127,-0.002636,0.003527,0.019327,,-0.001700
4,1981/1982,5,0.010200,0.023113,0.002830,0.001310,0.006000,,0.058600,,...,-0.014711,0.006382,0.021964,-0.009280,-0.021091,-0.001418,0.009136,0.052855,,0.013036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3994,2023/2024,90,0.220500,-0.028700,0.390300,0.066500,-0.018700,-0.003100,-0.011300,-0.0105,...,-0.017100,0.044200,-0.004700,0.682900,0.029400,0.268800,0.015400,0.015200,0.4527,-0.019100
3995,2023/2024,91,0.200600,-0.022250,0.005400,-0.001400,0.515500,0.316250,-0.003500,0.0355,...,0.003450,0.007250,0.241950,0.007750,0.012200,0.124400,0.003250,-0.003500,0.0219,0.417300
3996,2023/2024,92,0.174700,-0.023600,-0.021900,0.023800,0.081900,0.030500,-0.016600,0.0025,...,-0.000800,0.015300,0.155700,0.003900,-0.011700,0.093300,0.011200,0.000000,-0.0092,0.374300
3997,2023/2024,94,0.222100,-0.024900,-0.012900,0.002600,0.345500,0.004500,-0.022200,-0.0099,...,-0.018300,0.001200,0.272700,-0.013800,-0.021300,0.117300,0.000000,-0.005400,-0.0246,0.406500


In [112]:
# Total number of missing cells in the NDVI pivot: count NaNs per column,
# then add the per-column counts together.
ndvi_missing_per_column = ndvi_pivot.isna().sum()
ndvi_missing_per_column.sum()

np.int64(39083)