In [1]:
import pandas as pd
import requests
import os
import xarray as xr
import numpy as np
import gzip
import shutil
import glob
from config import BASE_URL

In [2]:
def dl_weather_history(departement):
    url = f'https://object.files.data.gouv.fr/meteofrance/data/synchro_ftp/BASE/HOR/H_{departement}_latest-2023-2024.csv.gz'
    try:
        r = requests.get(url)
        r.raise_for_status()
        with open(f"{BASE_URL}Meteo/AromeAccuracy/data/MeteoFrance/H_{departement}_latest-2023-2024.csv.gz", "wb") as f:
            f.write(r.content)
        with gzip.open(f'{BASE_URL}Meteo/AromeAccuracy/data/MeteoFrance/H_{departement}_latest-2023-2024.csv.gz', 'rb') as f_in:
            with open(f'{BASE_URL}Meteo/AromeAccuracy/data/MeteoFrance/H_{departement}_latest-2023-2024.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
departements = [f"{dep:02}" for dep in range(1, 96)]
for dep in departements:
    dl_weather_history(dep)

In [3]:
def make_df(url):
    dfList = []
    os.chdir(str(url))
    files = glob.glob('*.{}'.format('csv'))
    for file in files:
        #We accept a small loss of data in Latitude and Longitude by specifiying float32 for memory purpose
        tmp = pd.read_csv(f'{url}/{file}', sep=';', usecols=['LAT', 'LON', 'AAAAMMJJHH', 'RR1', 'T'], dtype={'LAT':'float32', 'LON':'float32', 'AAAAMMJJHH':'int32', 'RR1':'float32', 'T':'float32'})
        tmp['AAAAMMJJHH'] = pd.to_datetime(tmp['AAAAMMJJHH'], format='%Y%m%d%H')
        datemax = pd.Timestamp.now().date() - pd.Timedelta(14, "d")
        tmp = tmp[tmp['AAAAMMJJHH'].dt.date >= datemax].reset_index(drop=True)
        dfList.append(tmp)
    df = pd.concat(dfList)
    return df

In [4]:
df = make_df(f'{BASE_URL}Meteo/AromeAccuracy/data/MeteoFrance/')

In [5]:
def download_grib_025(date): #date format : 2024-08-26T06:00:00Z
    prevision_list = ['00H06H','07H12H','13H18H','19H24H','25H30H','31H36H','37H42H','43H48H']
    if os.path.isdir(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}"): #remove dir if exists
        shutil.rmtree(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}")
    if not os.path.isdir(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}"): #create dir if doesnt exists
        os.mkdir(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}")
    for i in prevision_list:
        url = f"https://object.data.gouv.fr/meteofrance-pnt/pnt/{date}/arome/0025/SP1/arome__0025__SP1__{i}__{date}.grib2"
        try:
            r = requests.get(url)
            with open(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}/arome__0025__SP1__{i}__{date.replace(':', '-')}.grib2", "wb") as f:
                f.write(r.content)
        except Exception as e:
                print(f"An error occurred: {e}")

In [6]:
for date in df['AAAAMMJJHH'].dt.date.unique():
    download_grib_025(str(date)+"T00:00:00Z")

In [6]:
def grib_to_dataframe(date, base_path):
    directory = f"{base_path}/{date.replace(':', '-')}"
    file_list = [os.path.join(directory, file.replace(':', '-')) for file in os.listdir(directory)]

    backend_kwargs = [
        {'filter_by_keys': {'paramId': 167, 'level': 2}}, #temperature
        {'filter_by_keys': {'paramId': 228228}} #precipitations
    ]

    datasets = []
    for file in file_list:
        for bk in backend_kwargs:
            ds = xr.open_dataset(file, engine='cfgrib', backend_kwargs=bk)
            datasets.append(ds)

    ds = xr.merge(datasets)
    return ds

In [9]:
arome_path = f"{BASE_URL}Meteo/AromeAccuracy/data/arome"
#date = str(df['AAAAMMJJHH'].dt.date.unique()[0]) + "T00:00:00Z"
datasets_list = []

for i in df['AAAAMMJJHH'].dt.date.unique()[:-3]: #remove the 3 last days of arome because we do not have full weather station data (saving memory)
    tmp = grib_to_dataframe(str(i)+"T00:00:00Z", arome_path)
    datasets_list.append(tmp)

In [10]:
#Make a custom dataframe since I cant make xarray.Dataset.to_dataframe() keep the step and timedelta into the dataframe
superdflist = []
for dataset in datasets_list:
    #set to float32 to save memory even if we lose some precision, we will find the nearest arome point next. 
    dataset['latitude'] = dataset['latitude'].astype('float32')
    dataset['longitude'] = dataset['longitude'].astype('float32')
    
    step = dataset['step'].values #values of each step (48 times one hour)
    valid_date = np.datetime64(dataset['valid_time'].values[0]) #first date value as np datetime to work with np timedelta
    lst = []
    for i in range(49): #number of steps
        lat_grid, lon_grid = np.meshgrid(dataset['latitude'].values.round(3), dataset['longitude'].values.round(3), indexing='ij')
        # flatten array
        flat_temp = dataset['t2m'][i].values.flatten()
        flat_precip = dataset['tp'][i].values.flatten()
        flat_lat = lat_grid.flatten()
        flat_lon = lon_grid.flatten()

        # create dataframe
        tmp = pd.DataFrame({
            'Latitude': flat_lat,
            'Longitude': flat_lon,
            't2m': flat_temp,
            'tp': flat_precip, 
            'DateTimePrevision': dataset['time'].values,
            'DateTimeEffective': valid_date + step[i]
        })
        lst.append(tmp)
    data = pd.concat(lst).reset_index(drop=True)
    superdflist.append(data)
superdf = pd.concat(superdflist)
superdf = superdf.dropna()

In [11]:
#find the nearest Arome LAT, LON from each station
lat_lon_unique = [df['LAT'].unique(), df['LON'].unique()]
nearest_lat_lon = []
for lat, lon in zip(lat_lon_unique[0], lat_lon_unique[1]):    
    nearest_lat_idx = (data['Latitude'] - lat).abs().idxmin()
    nearest_lon_idx = (data['Longitude'] - lon).abs().idxmin()
    nearest_lat_lon.append([lat, lon, data.iloc[nearest_lat_idx]['Latitude'], data.iloc[nearest_lon_idx]['Longitude']])
nearest_lat_lon_df = pd.DataFrame(nearest_lat_lon, columns=['LAT', 'LON', 'Latitude', 'Longitude'])

In [28]:
dfMerge = pd.merge(df, nearest_lat_lon_df, on=['LAT', 'LON'], how='inner')
dfMerge = dfMerge.rename(columns={'AAAAMMJJHH':'DateTimeEffective'})
end = pd.merge(dfMerge, superdf, on=['Latitude', 'Longitude', 'DateTimeEffective'], how='inner')
end = end.sort_values(by=['DateTimePrevision', 'Latitude']).reset_index(drop=True) #we sort the df for the RR1 cumsum next
end['t2m'] = end['t2m'] - 273.15 #temp is set to kelvin so we convert to °C

In [15]:
precipcumsum = end.groupby(['DateTimePrevision', 'Latitude', 'Longitude'])['RR1'].cumsum()

In [29]:
end['RR1Cumul'] = precipcumsum
rr1Rain = end['RR1'] > 0
tpRain = end['tp'].diff() > 0
end['ForecastRight'] = rr1Rain == tpRain #if it rained or it did not rained for this hour

In [30]:
end

Unnamed: 0,LAT,LON,DateTimeEffective,RR1,T,Latitude,Longitude,t2m,tp,DateTimePrevision,RR1Cumul,ForecastRight
0,43.523666,6.898667,2024-10-11 01:00:00,0.0,14.3,43.525002,6.90,14.912506,0.000000,2024-10-11,0.0,True
1,43.523666,6.898667,2024-10-11 02:00:00,0.0,14.2,43.525002,6.90,14.209351,0.000000,2024-10-11,0.0,True
2,43.523666,6.898667,2024-10-11 03:00:00,0.0,13.3,43.525002,6.90,13.833008,0.000000,2024-10-11,0.0,True
3,43.523666,6.898667,2024-10-11 04:00:00,0.0,12.0,43.525002,6.90,13.119293,0.000000,2024-10-11,0.0,True
4,43.523666,6.898667,2024-10-11 05:00:00,0.0,12.0,43.525002,6.90,12.976257,0.000000,2024-10-11,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
133469,50.959831,1.956167,2024-10-23 20:00:00,0.0,10.1,50.950001,1.95,8.827545,6.703125,2024-10-22,4.5,True
133470,50.959831,1.956167,2024-10-23 21:00:00,0.0,9.5,50.950001,1.95,7.868225,6.703125,2024-10-22,4.5,True
133471,50.959831,1.956167,2024-10-23 22:00:00,0.0,8.0,50.950001,1.95,6.937195,6.703125,2024-10-22,4.5,True
133472,50.959831,1.956167,2024-10-23 23:00:00,0.0,7.8,50.950001,1.95,6.389404,6.718750,2024-10-22,4.5,False


In [31]:
rmsetmp = np.sqrt(((end['t2m'] - end['T']) ** 2).mean())
rmsetmp

np.float32(1.553764)

In [32]:
biastmp = (end['t2m'] - end['T']).mean()
biastmp

np.float32(-0.3958679)

In [33]:
rmseprecip = np.sqrt(((end['tp'] - end['RR1Cumul']) ** 2).mean())
rmseprecip

np.float32(8.126516)

In [34]:
biasprecip = (end['tp'] - end['RR1Cumul']).mean()
biasprecip

np.float32(0.7222519)

In [47]:
end['ForecastRight'].value_counts(normalize=True)[True]

np.float64(0.7648306037130826)