In [3]:
import pandas as pd
import requests
import os
import xarray as xr
import numpy as np
import gzip
import shutil
import glob
from config import BASE_URL

In [2]:
def dl_weather_history(departement):
    url = f'https://object.files.data.gouv.fr/meteofrance/data/synchro_ftp/BASE/HOR/H_{departement}_latest-2023-2024.csv.gz'
    try:
        r = requests.get(url)
        r.raise_for_status()
        with open(f"{BASE_URL}Meteo/AromeAccuracy/data/MeteoFrance/H_{departement}_latest-2023-2024.csv.gz", "wb") as f:
            f.write(r.content)
        with gzip.open(f'{BASE_URL}Meteo/AromeAccuracy/data/MeteoFrance/H_{departement}_latest-2023-2024.csv.gz', 'rb') as f_in:
            with open(f'{BASE_URL}Meteo/AromeAccuracy/data/MeteoFrance/H_{departement}_latest-2023-2024.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
departements = [f"{dep:02}" for dep in range(1, 96)]
for dep in departements:
    dl_weather_history(dep)

In [4]:
def make_df(url):
    dfList = []
    os.chdir(str(url))
    files = glob.glob('*.{}'.format('csv'))
    for file in files:
        #We accept a small loss of data in Latitude and Longitude by specifiying float32 for memory purpose
        tmp = pd.read_csv(f'{url}/{file}', sep=';', usecols=['LAT', 'LON', 'AAAAMMJJHH', 'RR1', 'T'], dtype={'LAT':'float32', 'LON':'float32', 'AAAAMMJJHH':'int32', 'RR1':'float32', 'T':'float32'})
        tmp['AAAAMMJJHH'] = pd.to_datetime(tmp['AAAAMMJJHH'], format='%Y%m%d%H')
        datemax = pd.Timestamp.now().date() - pd.Timedelta(14, "d")
        tmp = tmp[tmp['AAAAMMJJHH'].dt.date >= datemax].reset_index(drop=True)
        dfList.append(tmp)
    df = pd.concat(dfList)
    return df


In [6]:
df = make_df(f'{BASE_URL}Meteo/AromeAccuracy/data/MeteoFrance/')

In [7]:
def download_grib_025(date): #date format : 2024-08-26T06:00:00Z
    prevision_list = ['00H06H','07H12H','13H18H','19H24H','25H30H','31H36H','37H42H','43H48H']
    if os.path.isdir(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}"): #remove dir if exists
        shutil.rmtree(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}")
    if not os.path.isdir(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}"): #create dir if doesnt exists
        os.mkdir(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}")
    for i in prevision_list:
        url = f"https://object.data.gouv.fr/meteofrance-pnt/pnt/{date}/arome/0025/SP1/arome__0025__SP1__{i}__{date}.grib2"
        try:
            r = requests.get(url)
            with open(f"{BASE_URL}Meteo/AromeAccuracy/data/arome/{date.replace(':', '-')}/arome__0025__SP1__{i}__{date.replace(':', '-')}.grib2", "wb") as f:
                f.write(r.content)
        except Exception as e:
                print(f"An error occurred: {e}")

In [8]:
for date in df['AAAAMMJJHH'].dt.date.unique():
    download_grib_025(str(date)+"T00:00:00Z")

In [10]:
def grib_to_dataframe(date, base_path):
    directory = f"{base_path}/{date.replace(':', '-')}"
    file_list = [os.path.join(directory, file.replace(':', '-')) for file in os.listdir(directory)]

    backend_kwargs = [
        {'filter_by_keys': {'paramId': 167, 'level': 2}}, #temperature
        {'filter_by_keys': {'paramId': 228228}} #precipitations
    ]

    datasets = []
    for file in file_list:
        for bk in backend_kwargs:
            ds = xr.open_dataset(file, engine='cfgrib', backend_kwargs=bk)
            datasets.append(ds)

    ds = xr.merge(datasets)
    return ds

In [11]:
arome_path = f"{BASE_URL}Meteo/AromeAccuracy/data/arome"
#date = str(df['AAAAMMJJHH'].dt.date.unique()[0]) + "T00:00:00Z"
datasets_list = []

for i in df['AAAAMMJJHH'].dt.date.unique()[:-3]: #remove the 3 last days of arome because we do not have full weather station data (saving memory)
    tmp = grib_to_dataframe(str(i)+"T00:00:00Z", arome_path)
    datasets_list.append(tmp)

In [9]:
#Make a custom dataframe since I cant make xarray.Dataset.to_dataframe() keep the step and timedelta into the dataframe
superdflist = []
for dataset in datasets_list:
    #set to float32 to save memory even if we lose some precision, we will find the nearest arome point next. 
    dataset['latitude'] = dataset['latitude'].astype('float32')
    dataset['longitude'] = dataset['longitude'].astype('float32')
    
    step = dataset['step'].values #values of each step (48 times one hour)
    valid_date = np.datetime64(dataset['valid_time'].values[0]) #first date value as np datetime to work with np timedelta
    lst = []
    for i in range(49): #number of steps
        lat_grid, lon_grid = np.meshgrid(dataset['latitude'].values.round(3), dataset['longitude'].values.round(3), indexing='ij')
        # flatten array
        flat_temp = dataset['t2m'][i].values.flatten()
        flat_precip = dataset['tp'][i].values.flatten()
        flat_lat = lat_grid.flatten()
        flat_lon = lon_grid.flatten()
        
        # create dataframe
        tmp = pd.DataFrame({
            'Latitude': flat_lat,
            'Longitude': flat_lon,
            't2m': flat_temp,
            'tp': flat_precip, 
            'DateTime': valid_date + step[i]
        })
        lst.append(tmp)
    data = pd.concat(lst).reset_index(drop=True)
    superdflist.append(data)
superdf = pd.concat(superdflist)
superdf = superdf.dropna()

In [10]:
#find the nearest Arome LAT, LON from each station
lat_lon_unique = [df['LAT'].unique(), df['LON'].unique()]
nearest_lat_lon = []
for lat, lon in zip(lat_lon_unique[0], lat_lon_unique[1]):    
    nearest_lat_idx = (data['Latitude'] - lat).abs().idxmin()
    nearest_lon_idx = (data['Longitude'] - lon).abs().idxmin()
    nearest_lat_lon.append([lat, lon, data.iloc[nearest_lat_idx]['Latitude'], data.iloc[nearest_lon_idx]['Longitude']])
nearest_lat_lon_df = pd.DataFrame(nearest_lat_lon, columns=['LAT', 'LON', 'Latitude', 'Longitude'])

In [12]:
dfMerge = pd.merge(df, nearest_lat_lon_df, on=['LAT', 'LON'], how='inner')
dfMerge = dfMerge.rename(columns={'AAAAMMJJHH':'DateTime'})
end = pd.merge(dfMerge, superdf, on=['Latitude', 'Longitude', 'DateTime'], how='inner')
end['t2m'] = end['t2m'] - 273.15 #temp is set to kelvin so we convert to °C

In [13]:
end

Unnamed: 0,LAT,LON,DateTime,RR1,T,Latitude,Longitude,t2m,tp
0,48.717999,2.397000,2024-10-07 01:00:00,0.0,15.2,48.724998,2.400,14.788910,0.000000
1,48.717999,2.397000,2024-10-07 02:00:00,0.0,15.0,48.724998,2.400,14.155914,0.000000
2,48.717999,2.397000,2024-10-07 03:00:00,0.0,14.8,48.724998,2.400,14.114746,0.000000
3,48.717999,2.397000,2024-10-07 04:00:00,0.0,14.4,48.724998,2.400,14.249634,0.000000
4,48.717999,2.397000,2024-10-07 05:00:00,0.0,14.2,48.724998,2.400,14.367920,0.000000
...,...,...,...,...,...,...,...,...,...
133627,46.685333,0.678833,2024-10-19 20:00:00,0.2,12.6,46.674999,0.675,11.877350,5.328125
133628,46.685333,0.678833,2024-10-19 21:00:00,0.0,12.3,46.674999,0.675,12.191437,5.328125
133629,46.685333,0.678833,2024-10-19 22:00:00,0.0,11.7,46.674999,0.675,12.153809,5.328125
133630,46.685333,0.678833,2024-10-19 23:00:00,0.0,11.4,46.674999,0.675,11.867401,5.328125


In [46]:
end[(end['LAT'] == 46.685333) & (end['LON'] == 0.678833) & (end['DateTime'].dt.date == pd.to_datetime('2024-10-18').date())]

Unnamed: 0,LAT,LON,DateTime,RR1,T,Latitude,Longitude,t2m,tp
133558,46.685333,0.678833,2024-10-18 00:00:00,1.0,13.1,46.674999,0.675,13.304932,18.179688
133559,46.685333,0.678833,2024-10-18 00:00:00,1.0,13.1,46.674999,0.675,13.174255,9.1875
133560,46.685333,0.678833,2024-10-18 01:00:00,0.2,13.1,46.674999,0.675,13.068268,11.773438
133561,46.685333,0.678833,2024-10-18 01:00:00,0.2,13.1,46.674999,0.675,13.189758,0.013672
133562,46.685333,0.678833,2024-10-18 02:00:00,0.0,13.1,46.674999,0.675,12.913147,12.890625
133563,46.685333,0.678833,2024-10-18 02:00:00,0.0,13.1,46.674999,0.675,13.090393,0.50293
133564,46.685333,0.678833,2024-10-18 03:00:00,0.0,13.1,46.674999,0.675,12.717224,14.359375
133565,46.685333,0.678833,2024-10-18 03:00:00,0.0,13.1,46.674999,0.675,12.921051,1.185547
133566,46.685333,0.678833,2024-10-18 04:00:00,0.6,12.7,46.674999,0.675,12.260559,14.921875
133567,46.685333,0.678833,2024-10-18 04:00:00,0.6,12.7,46.674999,0.675,12.399719,2.857422


In [14]:
rmse = np.sqrt(((end['t2m'] - end['T']) ** 2).mean())
rmse

np.float32(1.4840714)

In [15]:
bias = (end['t2m'] - end['T']).mean()
bias

np.float32(-0.20810568)

In [16]:
superdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 383431392 entries, 805338 to 39383940
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   Latitude   float32       
 1   Longitude  float32       
 2   t2m        float32       
 3   tp         float32       
 4   DateTime   datetime64[ns]
dtypes: datetime64[ns](1), float32(4)
memory usage: 11.4 GB
