Download historical data from EIA and NOAA/GHCN-d to the local filesystem

In [None]:
import download_historical_data as dl
import os 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Importing some stuff from the FastAI book
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
import fastai.tabular.all as aiTab
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz
import dtreeviz.trees as dtrees
from IPython.display import Image, display_svg, SVG

# Keep rendered DataFrames compact: at most 10 rows and 6 columns are shown.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 6)

# Local data directories, resolved to absolute paths from the current working directory.
HISTORICAL_DATA_DIR = os.path.abspath("./historical_data")
ANALYSIS_DATA_DIR = os.path.abspath("./analysis_data/")
ELECTRIC_DATA_DIR = os.path.join(HISTORICAL_DATA_DIR, "electric_data")
WEATHER_DATA_DIR = os.path.join(HISTORICAL_DATA_DIR, "weather_station_data")

# Create any missing directory. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race; the loop variable is deliberately not
# called `dir`, which would shadow the builtin for the rest of the notebook.
for data_dir in (HISTORICAL_DATA_DIR, ANALYSIS_DATA_DIR, ELECTRIC_DATA_DIR, WEATHER_DATA_DIR):
    os.makedirs(data_dir, exist_ok=True)


# GHCN-d weather stations used in this notebook, keyed by a readable name.
_STATION_IDS_BY_NAME = {
    "Grand Junction Walker Field": "USW00023066",
    "Greeley UNC": "USC00053553",
    "Ft Collins": "USC00053005",
    "Boulder": "USC00050848",
    "Northglenn": "USC00055984",
    "Wheat Ridge": "USC00058995",
    "Alamosa": "USW00023061",
}

# Station ids in the order listed above (dicts preserve insertion order).
WEATHER_STATION_IDS = list(_STATION_IDS_BY_NAME.values())

# Uncomment following lines to force re-download of source data
# Otherwise can also run the download script manually via: python download_historical_data.py
# Data files are saved locally so you only need to re-download to get new/different data

#dl.download_eia_historical_data(ELECTRIC_DATA_DIR, eia_respondent="PSCO")
#dl.download_ghcnd_historical_data(WEATHER_DATA_DIR, WEATHER_STATION_IDS)

In [None]:
# Load the weather data for the configured stations from the local weather directory.
# earliest_date=None is passed explicitly; NOTE(review): presumably this means
# "no lower date bound" -- confirm against download_historical_data.read_weather_data.
temp_df = dl.read_weather_data(
    WEATHER_DATA_DIR,
    WEATHER_STATION_IDS,
    earliest_date=None,
)
len(temp_df)

Load PSCO electric demand data from EIA

In [None]:
# Location of the PSCO demand JSON inside the electric data directory
psco_demand_data_file = os.path.join(ELECTRIC_DATA_DIR, "psco-daily-dataframe.json")

# Parse the JSON file into a DataFrame; the file is opened explicitly as UTF-8 text
with open(psco_demand_data_file, mode="r", encoding="utf-8") as demand_file:
    demand_df = pd.read_json(demand_file)

len(demand_df)

Merge demand and temperature data

In [None]:
# Align demand and weather rows on their indexes, then keep only the rows
# where every column has a value (the outer join leaves NaN where one side is missing).
joined_df = (
    demand_df
    .merge(temp_df, how="outer", left_index=True, right_index=True)
    .dropna()
)
len(joined_df), joined_df.columns

Augment the data with date-part features (year, month, day of week, ...) and lagged demand values

In [None]:
## Augment data
# Number of lagged demand columns to add (demand_lag_1 .. demand_lag_14)
N_DEMAND_LAGS = 14

# Work on a copy so joined_df is left untouched and this cell can be re-run safely
augmented_df = joined_df.copy()

# Expose the index values as a regular "date" column for add_datepart.
# Reading the index directly (instead of reset_index()/set_index("index"))
# does not depend on the index being unnamed.
augmented_df["date"] = augmented_df.index

## Adds date parts (the "date" column itself is dropped by drop=True)
augmented_df = aiTab.add_datepart(augmented_df, "date", drop=True)
## But a lot of the augmented parts are not that applicable in our case
augmented_df = augmented_df.drop(
    columns=['Elapsed', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
             'Is_year_end', 'Is_year_start']
)

## Add lagged values
# NOTE(review): shift() lags by row position, so "lag k" equals "k days earlier"
# only if the rows are consecutive days -- confirm there are no date gaps after
# the dropna() on the joined data.
for lag in range(1, N_DEMAND_LAGS + 1):
    augmented_df[f"demand_lag_{lag}"] = augmented_df["daily_demand"].shift(lag)
augmented_df = augmented_df.dropna()  ## The first rows have NaN in the lag columns


augmented_df.columns

In [None]:
# Preview the first five rows of the augmented frame
augmented_df.head(n=5)

In [None]:
# Year boundaries of the chronological split:
#   training   : Year <  VALIDATION_START_YEAR
#   validation : VALIDATION_START_YEAR <= Year < TEST_START_YEAR
#   test       : Year >= TEST_START_YEAR
VALIDATION_START_YEAR = 2021
TEST_START_YEAR = 2022

# Create masks for data sets
train_mask = augmented_df.Year < VALIDATION_START_YEAR
validation_mask = (augmented_df.Year >= VALIDATION_START_YEAR) & (augmented_df.Year < TEST_START_YEAR)
test_mask = augmented_df.Year >= TEST_START_YEAR

## Split out training, test and validation sets
# Boolean row selection keeps the original column dtypes; the previous
# where(mask).dropna() NaN-filled the unselected rows first, which upcasts
# integer columns to float before the rows are dropped again.
train_df = augmented_df.loc[train_mask].copy()
validation_df = augmented_df.loc[validation_mask].copy()
test_df = augmented_df.loc[test_mask].copy()

In [None]:
## Turn each boolean mask into the positional row numbers it selects
train_idx, valid_idx, test_idx = (
    np.flatnonzero(mask) for mask in (train_mask, validation_mask, test_mask)
)
print(f"trainSize={len(train_idx)}, validationSize={len(valid_idx)}, testSize={len(test_idx)}")

# (train, validation) row positions, handed to TabularPandas as `splits`
splits = (list(train_idx), list(valid_idx))

In [None]:
## Partition the feature columns into continuous and categorical names.
## The target column is excluded via dep_var.
# NOTE(review): with max_card=1 presumably almost every numeric column lands in
# `cont` -- confirm against the fastai cont_cat_split documentation.
cont, cat = aiTab.cont_cat_split(augmented_df, max_card=1, dep_var="daily_demand")
print(cont, cat, sep="\n")

In [None]:
## Wrap the augmented frame in a fastai TabularPandas with the train/validation splits
dep_var = "daily_demand"
procs = [aiTab.Categorify, aiTab.FillMissing]
to = aiTab.TabularPandas(
    augmented_df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    splits=splits,
)
len(to.train), len(to.valid)

Pick out the training and validation inputs and targets from the TabularPandas object, and build the equivalent inputs and targets for the test set and for the full dataset

In [None]:
# Training / validation features and targets as prepared by TabularPandas
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

# Select the test / full-dataset features by the training column list so the
# columns (and their order) are exactly the ones the models are fitted on;
# dropping only the target left the column order to chance.
# NOTE(review): these frames are taken from the raw data rather than passed
# through `procs`; they match `xs` only if Categorify/FillMissing leave the
# values unchanged for this data -- confirm.
feature_cols = list(xs.columns)
test_xs = test_df[feature_cols]
test_y = test_df["daily_demand"]
full_xs = augmented_df[feature_cols]
full_y = augmented_df["daily_demand"]

In [None]:
## Create a decision tree
# A deliberately tiny tree (at most 4 leaves) that is easy to visualise.
# random_state is pinned so that ties between equally good splits are broken
# the same way on every run.
tree = DecisionTreeRegressor(max_leaf_nodes=4, random_state=42)
_ = tree.fit(xs, y)

In [None]:
from sklearn.tree import export_graphviz
import graphviz
import re

def draw_tree(t, df, size=10, ratio=0.6, precision=0, **kwargs):
    """Render a fitted tree as a `graphviz.Source` object.

    t         -- fitted tree estimator handed to sklearn's `export_graphviz`
    df        -- frame whose columns are used as the feature names
    size      -- value written into the graph's `size` attribute
    ratio     -- value written into the graph's `ratio` attribute
    precision -- forwarded to `export_graphviz`
    kwargs    -- any further keyword arguments for `export_graphviz`
    """
    dot_source = export_graphviz(
        t, out_file=None, feature_names=df.columns, filled=True, rounded=True,
        special_characters=True, rotate=False, precision=precision, **kwargs
    )
    # Inject the size/ratio attributes right after the opening of the graph body
    sized_header = f'Tree {{ size={size}; ratio={ratio}'
    return graphviz.Source(re.sub('Tree {', sized_header, dot_source))


# Render the small tree, using the training feature names for the node labels
draw_tree(tree, xs, size=10, precision=2, leaves_parallel=True)

In [None]:
## Let's see that in DTreeViz
# Plot on a random sample of at most 500 training rows; the generator is
# seeded so the same sample (and therefore the same figure) is produced on
# every run.
sample_rng = np.random.default_rng(42)
samp_idx = sample_rng.permutation(len(y))[:500]
dtrees.dtreeviz(tree, xs.iloc[samp_idx], y.iloc[samp_idx], xs.columns, dep_var,
         fontname='DejaVu Sans', scale=1.6, label_fontsize=10,
         orientation='LR')

In [None]:
## MOAR LEAVES
# Refit `tree` without a leaf-count cap, requiring at least 25 training rows
# per leaf instead. random_state is pinned so that ties between equally good
# splits are broken the same way on every run.
tree = DecisionTreeRegressor(min_samples_leaf=25, random_state=42)
_ = tree.fit(xs, y)

In [None]:
## Functions to check root-mean-squared error for the model
import math


def r_mse(pred, y):
    """Return the root-mean-squared error between `pred` and `y`, rounded to 6 decimals."""
    squared_errors = (pred - y) ** 2
    return round(math.sqrt(squared_errors.mean()), 6)


def m_rmse(m, xs, y):
    """Return the rounded RMSE of model `m`'s predictions for `xs` against the targets `y`."""
    predictions = m.predict(xs)
    return r_mse(predictions, y)

In [None]:
## Compare the RMSE on the training rows with the RMSE on the validation rows
train_rmse = m_rmse(tree, xs, y)
valid_rmse = m_rmse(tree, valid_xs, valid_y)
train_rmse, valid_rmse

In [None]:
## Overfitting check: number of leaves in the tree next to the number of training rows
n_leaves, n_train_rows = tree.get_n_leaves(), len(xs)
n_leaves, n_train_rows

In [None]:
# Draw the full tree (use carefully, it's pretty big)
# samp_idx = np.random.permutation(len(y))[:500]
# dtrees.dtreeviz(tree, xs.iloc[samp_idx], y.iloc[samp_idx], xs.columns, dep_var,
#                 fontname='DejaVu Sans', scale=1.6, label_fontsize=10,
#                 orientation='LR')

In [None]:
## Plot the model's test-set predictions against the actual demand
y_pred = tree.predict(test_xs)
pred_df = (
    pd.DataFrame(data=y_pred, index=list(test_xs.index))  # single column labelled 0
    .join(test_y, how="inner")                            # adds the "daily_demand" column
    .rename(columns={0: "predicted_demand", "daily_demand": "actual_demand"})
)
pred_df.plot()

In [None]:
# Graph full dataset predictions vs actuals
y_pred = tree.predict(full_xs)
pred_df = (
    pd.DataFrame(data=y_pred, index=list(full_xs.index))  # single column labelled 0
    .join(full_y, how="inner")
    .rename(columns={0: "predicted_demand", "daily_demand": "actual_demand"})
)
# Residuals: positive where the model under-predicts demand
error_df = pred_df["actual_demand"] - pred_df["predicted_demand"]

# Three panels side by side: the two series, the residual per row, and the
# residual distribution. Each panel is titled so the figure stands on its own,
# and the first panel gets a legend identifying the two lines.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 4))
pred_df.plot(ax=ax1, title="Predicted vs actual demand")
error_df.plot(ax=ax2, title="Error (actual - predicted)")
error_df.plot.hist(bins=25, ax=ax3, title="Error distribution")
fig.tight_layout()
plt.show()  # replaces the trailing "" expression that was used to hide the repr