In [1]:
import pandas as pd
import requests
import os
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import gzip
import shutil

In [2]:
def dl_weather_history():
    """Download the hourly station archive and decompress it into ``data/``.

    The gzip archive is fetched from data.gouv.fr, written to
    ``data/H_79_latest-2023-2024.csv.gz`` and then decompressed next to it as
    ``data/H_79_latest-2023-2024.csv``.

    Errors are reported with ``print`` and not re-raised (best-effort
    download), so the function always returns ``None``.
    """
    url = 'https://object.files.data.gouv.fr/meteofrance/data/synchro_ftp/BASE/HOR/H_79_latest-2023-2024.csv.gz'
    archive_path = "data/H_79_latest-2023-2024.csv.gz"
    csv_path = "data/H_79_latest-2023-2024.csv"
    try:
        # open() does not create missing folders: make sure "data/" exists so
        # the download also works on a fresh checkout.
        os.makedirs("data", exist_ok=True)
        # A timeout prevents the cell from hanging forever on a stalled
        # connection; streaming avoids holding the whole archive in memory.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(archive_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        with gzip.open(archive_path, 'rb') as f_in:
            with open(csv_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
dl_weather_history()

In [3]:
# LAT/LON are read as float32 on purpose: the small precision loss is accepted to save memory.
station_dtypes = {
    'LAT': 'float32',
    'LON': 'float32',
    'AAAAMMJJHH': 'int32',
    'RR1': 'float32',
    'T': 'float32',
}
df = pd.read_csv(
    'data/H_79_latest-2023-2024.csv',
    sep=';',
    usecols=list(station_dtypes),
    dtype=station_dtypes,
)
# AAAAMMJJHH is parsed with the year/month/day/hour format below.
df['AAAAMMJJHH'] = pd.to_datetime(df['AAAAMMJJHH'], format='%Y%m%d%H')
# Keep only the observations dated within the 14 days before today.
datemax = pd.Timestamp.now().date() - pd.Timedelta(14, "d")
df = df.loc[df['AAAAMMJJHH'].dt.date >= datemax].reset_index(drop=True)

In [4]:
def download_grib_025(date, base_path="C:/Users/alexl/Documents/GitHub/Meteo/AromeAccuracy/data/arome"): #date format : 2024-08-26T06:00:00Z
    """Download the eight SP1 GRIB2 files of one AROME 0.025 run.

    Parameters
    ----------
    date : str
        Run date as used in the remote URL, e.g. ``2024-08-26T06:00:00Z``.
    base_path : str, optional
        Folder under which one sub-folder per run is created. Defaults to the
        location that used to be hard-coded in this function.

    Notes
    -----
    The run folder (``date`` with ``:`` replaced by ``-``) is deleted and
    re-created on every call. A failed download is printed and skipped;
    nothing is raised for it and no file is written for that time range.
    """
    prevision_list = ['00H06H','07H12H','13H18H','19H24H','25H30H','31H36H','37H42H','43H48H']
    safe_date = date.replace(':', '-')  # ':' cannot be used in the local file names
    run_dir = f"{base_path}/{safe_date}"
    if os.path.isdir(run_dir): #remove dir if exists
        shutil.rmtree(run_dir)
    # makedirs (instead of mkdir) also creates missing parent folders.
    os.makedirs(run_dir)
    for i in prevision_list:
        url = f"https://object.data.gouv.fr/meteofrance-pnt/pnt/{date}/arome/0025/SP1/arome__0025__SP1__{i}__{date}.grib2"
        target = f"{run_dir}/arome__0025__SP1__{i}__{safe_date}.grib2"
        try:
            r = requests.get(url, timeout=120)
            # Without this check an HTTP error body (e.g. a 404 page) would be
            # saved as a .grib2 file and only fail later when it is opened.
            r.raise_for_status()
            with open(target, "wb") as f:
                f.write(r.content)
        except Exception as e:
            print(f"An error occurred: {e}")

In [5]:
# Download the T00:00:00Z AROME run of every day present in the station observations.
for date in df['AAAAMMJJHH'].dt.date.unique():
    download_grib_025(f"{date}T00:00:00Z")

In [6]:
def grib_to_dataframe(date, base_path):
    """Open every GRIB2 file of one run and merge them into one xarray Dataset.

    Parameters
    ----------
    date : str
        Run date, e.g. ``2024-08-26T06:00:00Z``; ``:`` is replaced by ``-`` to
        build the run folder name.
    base_path : str
        Folder that contains one sub-folder per run.

    Returns
    -------
    xarray.Dataset
        Merge of the datasets opened with the two ``filter_by_keys`` filters
        below (temperature and precipitations), for every file of the run.

    Raises
    ------
    FileNotFoundError
        If the run folder does not exist or holds no ``.grib2`` file.
    """
    directory = f"{base_path}/{date.replace(':', '-')}"
    # Only keep real GRIB files: cfgrib stores "<file>.<hash>.idx" index files
    # next to the data, and opening those as GRIB fails on a second run.
    # Sorting makes the processing order independent of os.listdir().
    file_list = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.grib2')
    )
    if not file_list:
        raise FileNotFoundError(f"No .grib2 file found in {directory}")

    backend_kwargs = [
        {'filter_by_keys': {'paramId': 167, 'level': 2}}, #temperature
        {'filter_by_keys': {'paramId': 228228}} #precipitations
    ]

    # One dataset per (file, filter) pair.
    datasets = [
        xr.open_dataset(file, engine='cfgrib', backend_kwargs=bk)
        for file in file_list
        for bk in backend_kwargs
    ]

    return xr.merge(datasets)

In [7]:
base_path = "C:/Users/alexl/Documents/GitHub/Meteo/AromeAccuracy/data/arome"
# The 3 last days of arome are left out because we do not have full weather station data (saving memory).
run_days = df['AAAAMMJJHH'].dt.date.unique()[:-3]
datasets_list = [grib_to_dataframe(f"{day}T00:00:00Z", base_path) for day in run_days]
xrArome = datasets_list[0]

In [27]:
# Store both coordinates as float32 to save memory; the precision loss is accepted
# because the nearest arome point is looked up afterwards.
for coord_name in ('latitude', 'longitude'):
    xrArome[coord_name] = xrArome[coord_name].astype('float32')

In [28]:
# Build the long-format frame by hand: xarray.Dataset.to_dataframe() did not keep
# the step / timedelta information in the dataframe.
superdflist = []
for dataset in datasets_list:
    # set to float32 to save memory even if we lose some precision, we will find the nearest arome point next.
    dataset['latitude'] = dataset['latitude'].astype('float32')
    dataset['longitude'] = dataset['longitude'].astype('float32')

    step = dataset['step'].values  # forecast offsets, added to valid_date below
    valid_date = np.datetime64(dataset['valid_time'].values[0])  # first date value as np datetime to work with np timedelta

    # The grid is the same for every step of a run: build and flatten it once
    # instead of once per step.
    lat_grid, lon_grid = np.meshgrid(
        dataset['latitude'].values.round(3),
        dataset['longitude'].values.round(3),
        indexing='ij',
    )
    flat_lat = lat_grid.flatten()
    flat_lon = lon_grid.flatten()

    lst = []
    # Iterate over the steps actually present rather than a hard-coded 49,
    # which raised IndexError for a run with fewer steps and ignored extras.
    for i in range(len(step)):
        tmp = pd.DataFrame({
            'Latitude': flat_lat,
            'Longitude': flat_lon,
            't2m': dataset['t2m'][i].values.flatten(),
            'tp': dataset['tp'][i].values.flatten(),
            'DateTime': valid_date + step[i]
        })
        lst.append(tmp)
    # NOTE: `data` keeps the frame of the last run only after the loop ends.
    data = pd.concat(lst).reset_index(drop=True)
    superdflist.append(data)
superdf = pd.concat(superdflist)

[           Latitude  Longitude  t2m  tp   DateTime
 0         55.400002    -12.000  NaN NaN 2024-10-03
 1         55.400002    -11.975  NaN NaN 2024-10-03
 2         55.400002    -11.950  NaN NaN 2024-10-03
 3         55.400002    -11.925  NaN NaN 2024-10-03
 4         55.400002    -11.900  NaN NaN 2024-10-03
 ...             ...        ...  ...  ..        ...
 39384088  37.500000     15.900  NaN NaN 2024-10-05
 39384089  37.500000     15.925  NaN NaN 2024-10-05
 39384090  37.500000     15.950  NaN NaN 2024-10-05
 39384091  37.500000     15.975  NaN NaN 2024-10-05
 39384092  37.500000     16.000  NaN NaN 2024-10-05
 
 [39384093 rows x 5 columns],
            Latitude  Longitude  t2m  tp   DateTime
 0         55.400002    -12.000  NaN NaN 2024-10-04
 1         55.400002    -11.975  NaN NaN 2024-10-04
 2         55.400002    -11.950  NaN NaN 2024-10-04
 3         55.400002    -11.925  NaN NaN 2024-10-04
 4         55.400002    -11.900  NaN NaN 2024-10-04
 ...             ...        ... 

In [23]:
#find the nearest Arome LAT, LON from each station
# One row per distinct station position. Taking LAT.unique() and LON.unique()
# separately and zipping them mis-pairs (or drops) stations as soon as two
# stations share a latitude or a longitude.
stations = df[['LAT', 'LON']].dropna().drop_duplicates().reset_index(drop=True)

# Distinct grid coordinates, in order of first appearance in `data`, so that a
# tie is resolved like a first-match search over the full frame, without
# scanning every grid row for every station.
grid_lat = pd.unique(data['Latitude'])
grid_lon = pd.unique(data['Longitude'])

# Latitude and longitude are matched independently: for each station, take the
# position of the closest grid value on each axis.
station_lat = stations['LAT'].to_numpy()[:, np.newaxis]
station_lon = stations['LON'].to_numpy()[:, np.newaxis]
nearest_lat_pos = np.abs(grid_lat[np.newaxis, :] - station_lat).argmin(axis=1)
nearest_lon_pos = np.abs(grid_lon[np.newaxis, :] - station_lon).argmin(axis=1)

# Columns: LAT, LON (station) and Latitude, Longitude (nearest grid point).
nearest_lat_lon_df = stations.assign(
    Latitude=grid_lat[nearest_lat_pos],
    Longitude=grid_lon[nearest_lon_pos],
)

In [24]:
# Attach to each observation the grid point found for its station, then join the
# forecast values on that grid point and the same DateTime.
dfMerge = (
    df.merge(nearest_lat_lon_df, on=['LAT', 'LON'], how='inner')
      .rename(columns={'AAAAMMJJHH': 'DateTime'})
)
end = dfMerge.merge(data, on=['Latitude', 'Longitude', 'DateTime'], how='inner')
# Shift t2m by 273.15 so it can be compared with T (presumably Kelvin -> Celsius; confirm units).
end['t2m'] = end['t2m'] - 273.15

In [25]:
# Display the frame that pairs station columns (RR1, T) with forecast columns (t2m, tp).
end

Unnamed: 0,LAT,LON,DateTime,RR1,T,Latitude,Longitude,t2m,tp
0,46.941166,-0.584167,2024-10-03 00:00:00,0.0,11.6,46.950001,-0.575,10.364258,
1,46.941166,-0.584167,2024-10-03 01:00:00,0.0,11.4,46.950001,-0.575,11.462219,0.000000
2,46.941166,-0.584167,2024-10-03 02:00:00,0.0,11.0,46.950001,-0.575,10.736298,0.000000
3,46.941166,-0.584167,2024-10-03 03:00:00,0.0,10.5,46.950001,-0.575,10.447205,0.000000
4,46.941166,-0.584167,2024-10-03 04:00:00,0.0,10.1,46.950001,-0.575,10.383636,0.000000
...,...,...,...,...,...,...,...,...,...
828,46.989166,-0.207000,2024-10-04 20:00:00,0.0,9.7,47.000000,-0.200,10.164948,0.011719
829,46.989166,-0.207000,2024-10-04 21:00:00,0.0,9.0,47.000000,-0.200,9.424835,0.011719
830,46.989166,-0.207000,2024-10-04 22:00:00,0.0,7.9,47.000000,-0.200,8.730225,0.011719
831,46.989166,-0.207000,2024-10-04 23:00:00,0.0,7.4,47.000000,-0.200,8.146790,0.011719


In [12]:
# Root-mean-square difference between the forecast column t2m and the station column T.
temp_error = end['t2m'] - end['T']
rmse = np.sqrt(temp_error.pow(2).mean())
rmse

1.3321052

In [13]:
# Mean signed difference t2m - T: a negative value means t2m is lower than T on average.
bias = end['t2m'].sub(end['T']).mean()
bias

-0.49231303

In [14]:
# Print the column dtypes and memory usage of `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39384093 entries, 0 to 39384092
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   Latitude   float64       
 1   Longitude  float64       
 2   t2m        float32       
 3   tp         float32       
 4   DateTime   datetime64[ns]
dtypes: datetime64[ns](1), float32(2), float64(2)
memory usage: 1.2 GB
