In [12]:
import pandas as pd 
import os
import pathlib 
import joblib

from tsl.utils.utils import seed_everything, reduce_mem_usage

# %%
def dump(df, name, path):
    df = reduce_mem_usage(df)
    save_dir = pathlib.Path(path)
    if not save_dir.exists():
        save_dir.mkdir(parents=True)
    joblib.dump(df, save_dir / f'{name}.joblib', compress=True)

dump_dir = os.path.join('./electricity_price', 'price')


## Load Data

In [6]:
data_path = './electricity_price'
data_file = 'da_price.csv'

data = pd.read_csv(os.path.join(data_path, data_file), parse_dates=True, index_col=0)
data.index.name = 'date'
data.columns

Index(['central_mtlf_fc', 'north_mtlf_fc', 'south_mtlf_fc', 'system_mtlf_fc',
       'ia_ksux_ws_mph_fc', 'il_kmdw_ws_mph_fc', 'mi_kanj_ws_mph_fc',
       'mi_karb_ws_mph_fc', 'mi_kazo_ws_mph_fc', 'mi_klan_ws_mph_fc',
       'mn_kinl_ws_mph_fc', 'mn_krst_ws_mph_fc', 'mo_kstl_ws_mph_fc',
       'oh_kakr_ws_mph_fc', 'ok_kclk_ws_mph_fc', 'ok_kokc_ws_mph_fc',
       'ok_ktul_ws_mph_fc', 'ok_kwdg_ws_mph_fc', 'sc_kchs_ws_mph_fc',
       'ar_kjbr_td_f_fc', 'ia_kdsm_td_f_fc', 'in_kfwa_td_f_fc',
       'ky_ksdf_td_f_fc', 'la_kbtr_td_f_fc', 'mi_klan_td_f_fc',
       'mn_kinl_td_f_fc', 'mo_kstl_td_f_fc', 'oh_kluk_td_f_fc',
       'ok_klaw_td_f_fc', 'hour', 'weekday', 'month',
       'da_energy_aeci_lmpexpost_ac_log',
       'da_energy_michigan_hub_lmpexpost_ac_log',
       'da_energy_minn_hub_lmpexpost_ac_log'],
      dtype='object')

Drop time columns as they will be reconstructed from the date column.

In [7]:
drop_cols=['hour', 'weekday', 'month']
data = data.drop(drop_cols, axis=1)
data.columns

Index(['central_mtlf_fc', 'north_mtlf_fc', 'south_mtlf_fc', 'system_mtlf_fc',
       'ia_ksux_ws_mph_fc', 'il_kmdw_ws_mph_fc', 'mi_kanj_ws_mph_fc',
       'mi_karb_ws_mph_fc', 'mi_kazo_ws_mph_fc', 'mi_klan_ws_mph_fc',
       'mn_kinl_ws_mph_fc', 'mn_krst_ws_mph_fc', 'mo_kstl_ws_mph_fc',
       'oh_kakr_ws_mph_fc', 'ok_kclk_ws_mph_fc', 'ok_kokc_ws_mph_fc',
       'ok_ktul_ws_mph_fc', 'ok_kwdg_ws_mph_fc', 'sc_kchs_ws_mph_fc',
       'ar_kjbr_td_f_fc', 'ia_kdsm_td_f_fc', 'in_kfwa_td_f_fc',
       'ky_ksdf_td_f_fc', 'la_kbtr_td_f_fc', 'mi_klan_td_f_fc',
       'mn_kinl_td_f_fc', 'mo_kstl_td_f_fc', 'oh_kluk_td_f_fc',
       'ok_klaw_td_f_fc', 'da_energy_aeci_lmpexpost_ac_log',
       'da_energy_michigan_hub_lmpexpost_ac_log',
       'da_energy_minn_hub_lmpexpost_ac_log'],
      dtype='object')

## Time Series
The prediction targets in this dataset are the energy prices at three locations.  
The other features are treated as global features that affect all three locations.
although we can also process so that each location has its own set of local features.


In [8]:
# log price of the day-ahead market log(original_price + 50) because the original price can be negative
ts_cols = ['da_energy_aeci_lmpexpost_ac_log',
       'da_energy_michigan_hub_lmpexpost_ac_log',
       'da_energy_minn_hub_lmpexpost_ac_log']

ts = data[ts_cols]


Unnamed: 0_level_0,da_energy_aeci_lmpexpost_ac_log,da_energy_michigan_hub_lmpexpost_ac_log,da_energy_minn_hub_lmpexpost_ac_log
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-01 00:00:00,3.849083,3.908015,3.853334
2019-09-01 01:00:00,3.823629,3.8865,3.829728
2019-09-01 02:00:00,3.824721,3.857989,3.817712
2019-09-01 03:00:00,3.77115,3.813528,3.793239
2019-09-01 04:00:00,3.783735,3.803769,3.749739


In [13]:
dump(ts, 'ts', dump_dir)

Mem. usage decreased to  0.23 Mb (0.0% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float16)


## Global Features
The global features are the same for all locations.

Note it might be better to generate location-specific features, but we will start with global features.

In [14]:
num_global_cols = ['central_mtlf_fc', 'north_mtlf_fc', 'south_mtlf_fc', 'system_mtlf_fc',
       'ia_ksux_ws_mph_fc', 'il_kmdw_ws_mph_fc', 'mi_kanj_ws_mph_fc',
       'mi_karb_ws_mph_fc', 'mi_kazo_ws_mph_fc', 'mi_klan_ws_mph_fc',
       'mn_kinl_ws_mph_fc', 'mn_krst_ws_mph_fc', 'mo_kstl_ws_mph_fc',
       'oh_kakr_ws_mph_fc', 'ok_kclk_ws_mph_fc', 'ok_kokc_ws_mph_fc',
       'ok_ktul_ws_mph_fc', 'ok_kwdg_ws_mph_fc', 'sc_kchs_ws_mph_fc',
       'ar_kjbr_td_f_fc', 'ia_kdsm_td_f_fc', 'in_kfwa_td_f_fc',
       'ky_ksdf_td_f_fc', 'la_kbtr_td_f_fc', 'mi_klan_td_f_fc',
       'mn_kinl_td_f_fc', 'mo_kstl_td_f_fc', 'oh_kluk_td_f_fc',
       'ok_klaw_td_f_fc']

num_global = data[num_global_cols]
num_global.shape

(17497, 29)

In [15]:
dump(num_global, 'global_num', dump_dir)

Mem. usage decreased to  1.17 Mb (70.8% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

## Time Features

We will not generate time features for now as the data loader will do this based on given date time index.

## Local Features

Local features should be generated for each location.
