This notebook builds the final dataset we will use for forecasting. It is based on the dataset "Hourly Weather Data in Ireland (from 24 stations)" by Daria Vasileva, available at https://www.kaggle.com/datasets/dariasvasileva/hourly-weather-data-in-ireland-from-24-stations.

We apply several transformations to obtain a final dataset containing temp, wetb, dewpt, vappr and rhum for each station at a daily frequency (we keep only the reading taken at hour 12 of each day, and only the 1000 most recent values per station). We also add the series_idx and time_idx columns so that the dataset is ready for DeepAR forecasting with pytorch_forecasting.

In [161]:
import numpy as np
import pandas as pd

In [162]:
def import_df(name):
    """
    Load a CSV file into a DataFrame and discard incomplete rows.

    Parameters
    ----------
    name : str, path-like or file-like
        Source of the CSV data, handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file with every row containing at least one missing
        value removed. Index labels of the surviving rows are unchanged.
    """
    raw_df = pd.read_csv(name)
    complete_rows = raw_df.dropna()
    return complete_rows

In [163]:
def convert_to_dt(df):
    """
    Return a copy of ``df`` whose ``date`` column is parsed as datetimes.

    The input frame is left untouched: the conversion is done on a new
    DataFrame so that re-running a cell, or reusing the raw frame
    elsewhere, never sees a silently modified ``date`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, where ``date``
        holds the values produced by ``pd.to_datetime``.
    """
    # assign() builds a new frame instead of overwriting the caller's column.
    return df.assign(date=pd.to_datetime(df["date"]))

In [164]:
def features_df(df):
    """
    Keep only the columns needed for forecasting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the columns ``date``, ``temp``,
        ``wetb``, ``dewpt``, ``vappr`` and ``rhum``.

    Returns
    -------
    pandas.DataFrame
        Those six columns, in that order; every other column is dropped.
    """
    wanted_columns = ["date", "temp", "wetb", "dewpt", "vappr", "rhum"]
    return df[wanted_columns]

In [165]:
def hourly_df(df):
    """
    Keep only the rows whose timestamp falls in hour 12.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` where ``date`` has an hour component of 12,
        with their original index labels.
    """
    is_hour_12 = df["date"].dt.hour.eq(12)
    return df[is_hour_12]

In [166]:
def dataframe_creation(name, n_last=1000):
    """
    Build the cleaned dataframe for a single station.

    The CSV is loaded with incomplete rows dropped, its ``date`` column is
    converted to datetimes, only the forecasting features are kept, and the
    rows are filtered to the hour-12 readings. Finally the last ``n_last``
    rows are retained.

    Parameters
    ----------
    name : str
        Path of the station CSV file.
    n_last : int, default 1000
        Number of trailing rows to keep. The default reproduces the
        original fixed window of 1000 rows.

    Returns
    -------
    pandas.DataFrame
        An independent copy (not a slice of an intermediate frame), so
        callers can add columns to it without a SettingWithCopyWarning.
    """
    station_df = hourly_df(features_df(convert_to_dt(import_df(name))))
    # tail() rather than [-n_last:] so that n_last == 0 yields an empty frame
    # instead of the whole frame; copy() detaches the result from its parent.
    return station_df.tail(n_last).copy()

In [167]:
# Station name -> station id. Together they form the CSV file name
# "stations/<id>_<name>.csv". Insertion order matters: it determines the
# series_idx given to each station.
stations = {
    "phoenix_park": "175",
    "mace_head": "275",
    "oak_park": "375",
    "shannon_airport": "518",
    "dublin_airport": "532",
    "moore_park": "575",
    "ballyhaise": "675",
    "sherkinisland": "775",
    "mullingar": "875",
    "roches_point": "1075",
    "newport": "1175",
    "markree": "1275",
    "dunsany": "1375",
    "gurteen": "1475",
    "malin_head": "1575",
    "johnstownii": "1775",
    "mt_dillon": "1975",
    "finner": "2075",
    "claremorris": "2175",
    "valentia_observatory": "2275",
    "belmullet": "2375",
    "casement": "3723",
    "cork_airport": "3904",
    "knock_airport": "4935",
}

In [168]:
def big_df(list_of_stations):
    """
    Build the final multi-station dataframe.

    Each station's CSV ("stations/<id>_<name>.csv") is turned into a
    dataframe with ``dataframe_creation`` and tagged with three columns:

    * ``station``    - the station name,
    * ``series_idx`` - position of the station in ``list_of_stations``
      (0, 1, 2, ...),
    * ``time_idx``   - 0..n-1 row counter within that station.

    Parameters
    ----------
    list_of_stations : dict
        Mapping of station name -> station id (both strings), e.g. the
        module-level ``stations`` dict. Despite the parameter name, a
        mapping is expected, not a list.

    Returns
    -------
    pandas.DataFrame
        All station frames stacked in mapping order with a fresh 0..N-1
        index. An empty DataFrame is returned for an empty mapping.
    """
    station_frames = []
    # Iterate the argument itself (not the global `stations`) so the function
    # honours whatever subset of stations the caller passes in.
    for series_idx, (station_name, station_id) in enumerate(list_of_stations.items()):
        station_df = dataframe_creation("stations/" + station_id + "_" + station_name + ".csv")
        # NOTE(review): time_idx counts the rows that were kept, not calendar
        # days. The displayed output shows consecutive time_idx values for
        # dates two days apart, so gaps in the data are not reflected here -
        # confirm this is acceptable for the forecasting model.
        station_frames.append(
            # assign() returns a new frame, so nothing is written onto a slice.
            station_df.assign(
                station=station_name,
                series_idx=series_idx,
                time_idx=np.arange(len(station_df)),
            )
        )
    if not station_frames:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()
    # A single concat at the end avoids re-copying the growing frame each loop.
    return pd.concat(station_frames).reset_index(drop=True)

In [158]:
# Build the combined dataset for every station listed in `stations`.
data = big_df(stations)
# Bare last expression: the notebook renders the frame as the cell output.
data

Unnamed: 0,date,temp,wetb,dewpt,vappr,rhum,station,series_idx,time_idx
0,2019-05-08 12:00:00,7.9,6.7,5.3,8.9,83.0,phoenix_park,0,0
1,2019-05-09 12:00:00,8.3,7.4,6.3,9.6,87.0,phoenix_park,0,1
2,2019-05-10 12:00:00,12.6,8.1,2.1,7.1,48.0,phoenix_park,0,2
3,2019-05-11 12:00:00,12.7,7.9,1.6,6.9,46.0,phoenix_park,0,3
4,2019-05-12 12:00:00,14.6,10.6,6.4,9.6,57.0,phoenix_park,0,4
...,...,...,...,...,...,...,...,...,...
23995,2022-01-24 12:00:00,5.4,5.2,4.9,8.7,97.0,knock_airport,23,995
23996,2022-01-26 12:00:00,7.9,7.6,7.3,10.2,96.0,knock_airport,23,996
23997,2022-01-28 12:00:00,9.6,9.1,8.6,11.2,93.0,knock_airport,23,997
23998,2022-01-30 12:00:00,5.3,5.3,5.3,8.9,100.0,knock_airport,23,998


In [159]:
# Save the combined dataset to disk. With to_csv's default settings the row
# index is written as well, as an unnamed first column.
data.to_csv("data.csv")

In [170]:
# Spot check: export the processed frame of a single station (Mace Head)
# using the same pipeline that big_df applies to every station.
dataframe_creation("stations/275_mace_head.csv").to_csv("test.csv")