Download historical data from EIA and NOAA/GHCN-d to the local filesystem

In [None]:
import download_historical_data as dl
import os 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Importing some stuff from the FastAI book
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
import fastai.tabular.all as aiTab
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz
import dtreeviz.trees as dtrees
from IPython.display import Image, display_svg, SVG

# Keep DataFrame output compact when it is displayed in the notebook
pd.options.display.max_rows = 20
pd.options.display.max_columns = 8

# Local directory layout: downloaded source data lives under historical_data/,
# artifacts produced by this analysis go under analysis_data/
HISTORICAL_DATA_DIR = os.path.abspath("./historical_data")
ANALYSIS_DATA_DIR = os.path.abspath("./analysis_data/")
ELECTRIC_DATA_DIR = os.path.join(HISTORICAL_DATA_DIR, "electric_data")
WEATHER_DATA_DIR = os.path.join(HISTORICAL_DATA_DIR, "weather_station_data")

# Create any directory that is missing. exist_ok=True makes this safe to re-run and avoids the
# check-then-create race. The loop variable is deliberately not called `dir`, which would
# shadow the builtin dir() for every later cell.
for data_dir in (HISTORICAL_DATA_DIR, ANALYSIS_DATA_DIR, ELECTRIC_DATA_DIR, WEATHER_DATA_DIR):
    os.makedirs(data_dir, exist_ok=True)


# Weather stations (GHCN-d station ids) whose records are downloaded and analysed.
# Every id is 11 characters long; the loader relies on that when parsing file names.
WEATHER_STATION_IDS = [
    "USW00023066",  # Grand Junction, Walker Field
    "USC00053553",  # Greeley, UNC
    "USC00053005",  # Ft Collins
    "USC00050848",  # Boulder
    "USC00055984",  # Northglenn
    "USC00058995",  # Wheat Ridge
    "USW00023061",  # Alamosa
]

# Uncomment to force re-download of source data
# Otherwise can also run the download script manually via: python download_historical_data.py
# Data files are saved locally so you only need to re-download to get new/different data
#dl.download_eia_historical_data(ELECTRIC_DATA_DIR, eia_respondent="PSCO")
#dl.download_ghcnd_historical_data(WEATHER_DATA_DIR, WEATHER_STATION_IDS)

In [None]:
import glob

plt.style.use("default")  # alternative: "ggplot"

## Load up temperature data for each weather station, into their own columns
# Build the pattern with os.path.join so it works on every OS: a hard-coded "\*.json" only
# matches on Windows and is an invalid string escape. The matches are sorted because glob
# returns files in arbitrary order, which would make the column order of temp_df vary.
station_files = sorted(glob.glob(os.path.join(WEATHER_DATA_DIR, "*.json")))
if not station_files:
    raise FileNotFoundError(
        f"No weather station *.json files found in {WEATHER_DATA_DIR}; "
        "run download_historical_data.py (or the download cell) first"
    )

temp_df = None
for df_file in station_files:
    # The file name is expected to start with the 11-character station id
    station_id = os.path.basename(df_file)[0:11]

    # Only the read needs the file handle open
    with open(df_file, "r", encoding="utf-8") as f:
        station_df = pd.read_json(f)
    station_df.index.rename("date", inplace=True)

    # TODO: Prefixing column names with the station id is a stand-in for a MultiIndex or tuple index.
    # Left as-is for now, as it's not clear what will be easiest when trying to train an ML model
    col_renames = {col: f"{station_id}_{col}" for col in station_df.columns}
    station_df.rename(col_renames, axis="columns", inplace=True)

    if temp_df is None:
        temp_df = station_df
    else:
        # Outer join on the date index keeps dates that only some stations report
        temp_df = pd.merge(left=temp_df, right=station_df, how="outer", left_index=True, right_index=True)

# station_df["tmp_date"] = station_df.index
# station_df["day_of_year"] = station_df["tmp_date"].dt.day_of_year
# station_df.drop("tmp_date", axis=1, inplace=True)

len(temp_df)

Load electric demand data

In [None]:
# Location of the daily demand frame for PSCO (presumably written by the download script)
psco_demand_data_file = os.path.join(ELECTRIC_DATA_DIR, "psco-daily-dataframe.json")

with open(psco_demand_data_file, mode="r", encoding="utf-8") as demand_file:
    demand_df = pd.read_json(demand_file)

# Cell output: number of rows loaded
len(demand_df)

In [None]:
# Combine demand and temperature on their indexes. The outer join keeps every index value
# from either frame; dropna() then removes each row that has a missing value in any column.
joined_df = pd.merge(
    left=demand_df,
    right=temp_df,
    how="outer",
    left_index=True,
    right_index=True,
).dropna()
len(joined_df), joined_df.columns

In [None]:
## Augment data
# Work on a copy so joined_df is left untouched. The index is explicitly named "index",
# which is the name a reset_index()/set_index("index") round trip gives an unnamed index.
augmented_df = joined_df.copy().rename_axis("index")

# Mirror the date index into a regular column for add_datepart. Reading it straight from the
# index works whether or not the merged index carries a name; looking up a column called
# "index" after reset_index() raises KeyError as soon as the index is named.
augmented_df["date"] = augmented_df.index

# add_datepart replaces the "date" column (drop=True) with derived date-part columns;
# later cells use the resulting Year column
augmented_df = aiTab.add_datepart(augmented_df, "date", drop=True)
## A lot of the augmented parts are not that applicable in our case
augmented_df.drop(['Elapsed', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
                   'Is_year_end', 'Is_year_start'], axis=1, inplace=True)
augmented_df.columns


In [None]:
## Split out training, validation and test sets by calendar year
# Label-based alternative, kept for reference:
# train = augmented_df.loc[:"2021-01-01"]
# validation = augmented_df.loc["2021-01-01":"2022-01-01"]
# test = augmented_df.loc["2022-01-01":]
year_col = augmented_df["Year"]

# Positional row numbers for each subset: before 2021 / during 2021 / 2022 onwards
train_idx = np.flatnonzero((year_col < 2021).to_numpy())
valid_idx = np.flatnonzero(((year_col >= 2021) & (year_col < 2022)).to_numpy())
test_idx = np.flatnonzero((year_col >= 2022).to_numpy())
print(f"trainSize={len(train_idx)}, validationSize={len(valid_idx)}, testSize={len(test_idx)}")

# Only train and validation go into the splits; the test rows are held back
splits = (list(train_idx), list(valid_idx))


In [None]:
## Split out categorical vs continuous data
# The second argument of cont_cat_split is the cardinality threshold (max_card), passed here as 1.
# The dependent variable is excluded from both lists.
cont, cat = aiTab.cont_cat_split(augmented_df, max_card=1, dep_var="daily_demand")
for column_names in (cont, cat):
    print(column_names)

In [None]:
## Create TabularPandas
dep_var = "daily_demand"  # column the models will predict
procs = [aiTab.Categorify, aiTab.FillMissing]  # preprocessing steps handed to TabularPandas
to = aiTab.TabularPandas(
    augmented_df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    splits=splits,
)
len(to.train), len(to.valid)

In [None]:
## Display a small sample (3 requested) of the processed TabularPandas object as a sanity check
to.show(3)

In [None]:
## Persist the processed TabularPandas object under ANALYSIS_DATA_DIR so it can be reloaded later
aiTab.save_pickle(os.path.join(ANALYSIS_DATA_DIR, 'tabular.pkl'), to)

In [None]:
## Reload from pickle
# NOTE(review): presumably pickle-based, so only load a tabular.pkl that this notebook wrote itself;
# unpickling a file from an untrusted source can execute arbitrary code.
to = aiTab.load_pickle(os.path.join(ANALYSIS_DATA_DIR, 'tabular.pkl'))

In [None]:
## Pull the feature matrix and target out of the training and validation subsets
xs = to.train.xs
y = to.train.y
valid_xs = to.valid.xs
valid_y = to.valid.y
xs, y, valid_xs, valid_y

In [None]:
## Create a decision tree
# Capped at 4 leaves so the resulting tree is small enough to draw and read in the next cells
m = DecisionTreeRegressor(max_leaf_nodes=4)
m.fit(xs, y)

In [None]:
from sklearn.tree import export_graphviz
import graphviz
import re

def draw_tree(t, df, size=10, ratio=0.6, precision=0, **kwargs):
    """Render a fitted decision tree as a ``graphviz.Source`` object.

    Args:
        t: Fitted tree estimator, passed to ``export_graphviz``.
        df: DataFrame whose columns supply the feature names shown in the nodes.
        size: Value written into the graph's ``size`` attribute.
        ratio: Value written into the graph's ``ratio`` attribute.
        precision: Forwarded to ``export_graphviz`` as ``precision``.
        **kwargs: Any further keyword arguments for ``export_graphviz``.

    Returns:
        A ``graphviz.Source`` built from the adjusted DOT text.
    """
    dot_source = export_graphviz(
        t,
        out_file=None,
        feature_names=df.columns,
        filled=True,
        rounded=True,
        special_characters=True,
        rotate=False,
        precision=precision,
        **kwargs,
    )
    # Inject the size/ratio attributes directly after the "Tree {" opener of the DOT text
    sized_source = re.sub('Tree {', f'Tree {{ size={size}; ratio={ratio}', dot_source)
    return graphviz.Source(sized_source)


# Draw the fitted tree m, using the training columns as feature names.
# leaves_parallel=True is not a draw_tree parameter; it reaches export_graphviz through **kwargs.
draw_tree(m, xs, size=10, leaves_parallel=True, precision=2)

In [None]:
## Let's see that in DTreeViz
# Visualise against a random sample of at most 500 training rows rather than the full set.
# The permutation is unseeded, so the sample differs from run to run.
samp_idx = np.random.permutation(len(y))[:500]
dtrees.dtreeviz(m, xs.iloc[samp_idx], y.iloc[samp_idx], xs.columns, dep_var,
         fontname='DejaVu Sans', scale=1.6, label_fontsize=10,
         orientation='LR')


In [None]:
## MOAR LEAVES
# Refit without the 4-leaf cap; the only limit now is min_samples_leaf=15.
# This rebinds m, so later cells evaluate this larger tree.
m = DecisionTreeRegressor(min_samples_leaf=15)
m.fit(xs, y)

In [None]:
## Functions to check root-mean-squared error for the model
import math
def r_mse(pred, y):
    """Return the root-mean-squared error between ``pred`` and ``y``, rounded to 6 decimals.

    Both arguments must support element-wise subtraction and expose ``.mean()``
    (for example NumPy arrays).
    """
    residuals = pred - y
    mean_squared_error = (residuals ** 2).mean()
    return round(math.sqrt(mean_squared_error), 6)
def m_rmse(m, xs, y):
    """Return the root-mean-squared error of model ``m``'s predictions for ``xs`` against ``y``.

    ``m`` only needs a ``predict(xs)`` method; the error is computed by ``r_mse``.
    """
    predictions = m.predict(xs)
    return r_mse(predictions, y)

In [None]:
## Check error in the training set
# This scores the tree on the same rows it was fit on, so expect it to look better than the
# validation error.
m_rmse(m, xs, y)

In [None]:
## Check RMS error against validation set
# valid_xs / valid_y come from the validation split, i.e. rows the tree was not fit on
m_rmse(m, valid_xs, valid_y)

In [None]:
## How many leaves do we have, compared with the number of training rows?
# NOTE: if m is the tree fit with min_samples_leaf=15, every leaf holds at least 15 training rows,
# so there cannot be one leaf per measurement. That would only happen for a tree grown with no
# leaf-size or leaf-count limit.
m.get_n_leaves(), len(xs)

In [None]:
# Draw the tree (use carefully, it's pretty big)
# samp_idx = np.random.permutation(len(y))[:500]
# dtrees.dtreeviz(m, xs.iloc[samp_idx], y.iloc[samp_idx], xs.columns, dep_var,
#                 fontname='DejaVu Sans', scale=1.6, label_fontsize=10,
#                 orientation='LR')

In [None]:
## Function to grow a random forest with some default parameters chosen

## n_estimators -> number of trees in the forest
num_estimators = 200


def grow_random_forest(xs, y, n_estimators=num_estimators, max_samples=0.8,
                       max_features=0.5, min_samples_leaf=4, **kwargs):
    """Fit and return a ``RandomForestRegressor`` on ``xs`` / ``y``.

    Args:
        xs: Training features.
        y: Training target.
        n_estimators: Number of trees in the forest.
        max_samples: Passed through to ``RandomForestRegressor``.
        max_features: Passed through to ``RandomForestRegressor``.
        min_samples_leaf: Passed through to ``RandomForestRegressor``.
        **kwargs: Any other ``RandomForestRegressor`` keyword arguments. These used to be
            accepted and silently ignored; they are now forwarded, and may override the
            ``n_jobs=-1`` / ``oob_score=True`` defaults.

    Returns:
        The fitted forest.
    """
    # Defaults first so an explicit n_jobs / oob_score in kwargs wins instead of raising
    # a duplicate-keyword TypeError
    extra_params = {"n_jobs": -1, "oob_score": True, **kwargs}
    return RandomForestRegressor(n_estimators=n_estimators,
                                 max_samples=max_samples, max_features=max_features,
                                 min_samples_leaf=min_samples_leaf,
                                 **extra_params).fit(xs, y)

In [None]:
## Fit a forest on the training split using grow_random_forest's default settings
forest = grow_random_forest(xs, y)

In [None]:
## RMSE of the forest on the training split vs. the validation split
m_rmse(forest, xs, y), m_rmse(forest, valid_xs, valid_y)

In [None]:
## Get predictions from each individual tree in the forest (one row of preds per tree)
preds = np.stack([t.predict(valid_xs) for t in forest.estimators_])

## Plot the validation RMSE of the averaged prediction for a given number of estimators used
# The range comes from the trees actually in the forest, not the num_estimators global, so the
# curve stays correct when the forest was grown with a different n_estimators.
plt.plot([r_mse(preds[:n_trees].mean(0), valid_y) for n_trees in range(1, len(preds) + 1)])


In [None]:
# Out-of-bag errors for the forest
# oob_prediction_ is available because the forest is created with oob_score=True; it is compared
# against the training target y, so no validation rows are involved.
r_mse(forest.oob_prediction_, y)