This notebook outlines how the training data is obtained. It requires the Python modules located in the same folder as this notebook.

### Weather Data

There are several points where intermediate data frames are written to disk in order to provide incremental checkpoints, since this dataset takes a while to download and build.

In [1]:
from weather_data import get_station_df, cast_df
from os.path import isfile
from datetime import timedelta, timezone, datetime
from multiprocessing import cpu_count
from joblib import Parallel, delayed
import pandas as pd
import numpy as np

# joblib runner configured with one job per CPU reported by multiprocessing.cpu_count();
# the same object is reused by the parallel download and build steps further down.
parallel = Parallel(n_jobs=cpu_count())

# Station codes to collect, grouped by the state key used in the checkpoint file name.
# NOTE(review): 'SPI' is normally the code for Springfield, Illinois — confirm it
# belongs under 'IN' (moving it would change its checkpoint file name).
stations_dict = {
    'IA': ['DSM', 'CID'],
    'MN': ['DLH', 'JKJ', 'LYV', 'MSP', 'RST'],
    'WI': ['MSN', 'MKE', 'EAU', 'GRB'],
    'MI': ['ANJ', 'GRR', 'LAN', 'DET', 'ARB'],
    'IN': ['EVV', 'FWA', 'GYY', 'IND', 'SBN', 'SPI'],
    'IL': ['BMI', 'CMI', 'ARR', 'PIA'],
    'MO': ['STL', 'COU', 'SGF', 'MKC'],
    'MS': ['HKS', 'MJD', 'TUP', 'MEI'],
    'LA': ['BTR', 'LFT', 'LCH', 'SHV', 'AEX'],
    'TX': ['LFK'],
}

# Flatten to (state, station) pairs, preserving the dictionary's insertion order.
stations = [(state, code) for state, codes in stations_dict.items() for code in codes]

def download_file_path(state, station):
    """Return the parquet checkpoint path for a station, creating ./data/ if it is missing.

    Parameters
    ----------
    state : str
        State key used as the file-name prefix.
    station : str
        Station code used as the file-name suffix.

    Returns
    -------
    str
        Relative path of the form './data/<state>_<station>.parquet'.
    """
    from pathlib import Path

    data_dir = Path("./data/")
    # exist_ok makes repeated (and concurrent) calls harmless.
    data_dir.mkdir(exist_ok=True)

    checkpoint = f'./data/{state}_{station}.parquet'
    return checkpoint

def est(yyyy, mm, dd, hh):
    """Build a timezone-aware datetime at a fixed UTC-5 offset.

    The offset is constant (no daylight-saving adjustment is applied).

    Parameters
    ----------
    yyyy, mm, dd, hh : int
        Year, month, day and hour of the wall-clock time at UTC-5.

    Returns
    -------
    datetime.datetime
        Aware datetime whose tzinfo is a fixed -05:00 offset.
    """
    fixed_offset = timezone(timedelta(hours=-5))
    return datetime(yyyy, mm, dd, hh, tzinfo=fixed_offset)

In [2]:
# Observation window, expressed at the fixed UTC-5 offset produced by est().
first_hour = est(2015, 2, 1, 0)
last_hour = est(2021, 12, 31, 23)

# date_range uses its default daily step here, so this yields one timestamp per day
# at the hour of first_hour.
observation_dates = pd.date_range(start=first_hour, end=last_hour)

# Expand every day into its 24 hourly timestamps (hours 0..23), in chronological order.
observation_hours = [
    day.replace(hour=hour_of_day)
    for day in observation_dates
    for hour_of_day in range(24)
]

def normalize_station(station):
    """Clean one raw station frame: parse timestamps, drop rows without a temperature, cast to floats.

    Parameters
    ----------
    station : pandas.DataFrame
        Raw observations with at least the columns 'valid', 'tmpf', 'lat',
        'lon' and 'feel'. The string 'M' marks a missing 'tmpf' or 'feel' value.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) that keeps the original index
        labels of the surviving rows, where:
        * 'valid' is parsed to UTC-aware timestamps,
        * rows whose 'tmpf' is 'M' are removed,
        * 'tmpf', 'lat', 'lon' and 'feel' are float64,
        * a 'feel' of 'M' is replaced by the row's 'tmpf'.

    Raises
    ------
    ValueError
        If a remaining value in a numeric column cannot be parsed as a number.
    """
    # assign() returns a new frame, so the caller's 'valid' column is left untouched.
    station = station.assign(valid=pd.to_datetime(station['valid'], utc=True))

    # Take an explicit copy of the filtered rows: writing columns onto a bare
    # boolean-mask slice is what raises SettingWithCopyWarning.
    station = station.loc[station['tmpf'] != 'M'].copy()

    # Convert whole columns at once; a row-wise apply(axis=1) performs one
    # conversion per row and is far slower on large frames.
    for col in ('tmpf', 'lat', 'lon'):
        station[col] = pd.to_numeric(station[col]).astype('float64')

    # Fall back to the measured temperature wherever 'feel' is missing.
    feel_missing = (station['feel'] == 'M').to_numpy()
    feel_values = np.where(
        feel_missing,
        station['tmpf'].to_numpy(),
        station['feel'].to_numpy(),
    )
    station['feel'] = pd.to_numeric(
        pd.Series(feel_values, index=station.index)
    ).astype('float64')
    return station

def download_station(state, station):
    """Return the raw observations for one station, using the parquet checkpoint when present.

    If the checkpoint file for (state, station) already exists it is read and
    returned. Otherwise the data is fetched with get_station_df for the
    first_hour..last_hour window, written to the checkpoint path, and returned.

    Parameters
    ----------
    state : str
        State key; only used to build the checkpoint file name.
    station : str
        Station code passed to get_station_df.

    Returns
    -------
    pandas.DataFrame or None
        The station frame, or None when get_station_df returned None.
    """
    path = download_file_path(state, station)
    if isfile(path):
        return pd.read_parquet(path)

    # Keep the fetched frame in its own name: reusing `station` would overwrite
    # the station code and make the failure message print "None".
    station_df = get_station_df(station, first_hour, last_hour)
    if station_df is None:
        print(f'Retrieve {station} failed')
        return None

    # to_parquet(path) returns None, so return the frame explicitly to match
    # what the cached branch above returns.
    station_df.to_parquet(path)
    return station_df

# Download (or load from checkpoint) every station in parallel. The returned
# frames are not needed here because each one is persisted to its parquet file.
download_jobs = (
    delayed(download_station)(state, station)
    for state, station in stations
)
_ = parallel(download_jobs)

In [15]:
def build_hourly_df(state, station, observation_hours):
    """Load one station's parquet checkpoint and pass it through cast_df.

    Parameters
    ----------
    state : str
        State key used to locate the checkpoint file.
    station : str
        Station code used to locate the checkpoint file.
    observation_hours : sequence of timestamps
        Forwarded unchanged to cast_df.
        # NOTE(review): presumably the hourly grid the observations are aligned
        # to — confirm against weather_data.cast_df.

    Returns
    -------
    Whatever cast_df returns for this station (the results are later combined
    with pd.concat).
    """
    # Resolve the checkpoint path once and reuse it for the read.
    path = download_file_path(state, station)
    raw_observations = pd.read_parquet(path)
    return cast_df(raw_observations, observation_hours)

# Build the per-station hourly frames in parallel, in the order given by `stations`.
hourly_jobs = (
    delayed(build_hourly_df)(state, station, observation_hours)
    for state, station in stations
)
dfs = parallel(hourly_jobs)

# Stack all stations into one frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['valid'] = pd.to_datetime(df['valid'], utc = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['valid'] = pd.to_datetime(df['valid'], utc = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['valid'] = pd.to_datetime(df['valid'], utc = True)
A value is trying to be set on a copy of a 

In [16]:
# Show the combined per-station hourly frame (pandas abbreviates the middle rows).
df

Unnamed: 0,station,valid,tmpf,lat,lon
0,DSM,2015-02-01 05:00:00+00:00,33.98,41.5339,-93.6531
1,DSM,2015-02-01 06:00:00+00:00,33.08,41.5339,-93.6531
2,DSM,2015-02-01 07:00:00+00:00,33.08,41.5339,-93.6531
3,DSM,2015-02-01 08:00:00+00:00,33.08,41.5339,-93.6531
4,DSM,2015-02-01 09:00:00+00:00,33.08,41.5339,-93.6531
...,...,...,...,...,...
2347356,LFK,2021-12-31 19:00:00+00:00,78.10,31.2340,-94.7500
2347357,LFK,2021-12-31 20:00:00+00:00,81.00,31.2340,-94.7500
2347358,LFK,2021-12-31 21:00:00+00:00,81.00,31.2340,-94.7500
2347359,LFK,2021-12-31 22:00:00+00:00,81.00,31.2340,-94.7500


### Obtain the Regional MTLF and Actual Load for each Observation Hour

In [None]:
from rf_al_data import get_daily_rf_al_df

# Directory handed to get_daily_rf_al_df and used for the combined checkpoint below.
# NOTE(review): the weather checkpoints are written under './data/' while this
# points at '../data/mtlf' — confirm the different base directory is intentional.
forecast_output_dir = '../data/mtlf'

# Regional forecast and actual load over the same first_hour..last_hour window
# used for the weather data.
rf_al_df = get_daily_rf_al_df(first_hour, last_hour, forecast_output_dir)

# Persist the combined frame as a single parquet checkpoint.
rf_al_checkpoint = f'{forecast_output_dir}/rf_al_all.parquet'
rf_al_df.to_parquet(rf_al_checkpoint)

Fetching hourly Regional Forecasts and Actual Load for 2524 days


### Validate Data 

In [None]:
# Sanity check: the weather frame and the load frame must have the same number of rows.
# NOTE(review): `merged` is not defined in any cell visible here — confirm which
# cell builds it, otherwise this fails with NameError on a fresh kernel.
num_weather_observations = merged.shape[0]
num_load_observations = rf_al_df.shape[0]

# Report both counts in the failure message so a mismatch is diagnosable from
# this cell alone.
assert num_weather_observations == num_load_observations, (
    f'Row count mismatch: {num_weather_observations} weather observations '
    f'vs {num_load_observations} load observations'
)

AssertionError: 

In [None]:
# Row count of rf_al_df (the regional forecast / actual load frame).
num_load_observations

60576

In [None]:
# Row count of `merged` (the weather side of the comparison).
num_weather_observations

58374