In [None]:
import warnings
import download_historical_data as dl
import os 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Importing some stuff from the FastAI book
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
import fastai.tabular.all as aiTab
import sklearn as sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz
import dtreeviz.trees as dtrees
from IPython.display import Image, display_svg, SVG

import math

# Helper functions to compute root-mean-squared error (RMSE) and mean absolute percentage error (MAPE) for the model
def r_mse(pred, y):
    """Return the root-mean-squared error between `pred` and `y`, rounded to 6 decimals.

    Both arguments must support element-wise subtraction and the result must
    expose `.mean()` (e.g. numpy arrays or pandas Series).
    """
    squared_error = (pred - y) ** 2
    return round(math.sqrt(squared_error.mean()), 6)
def m_rmse(m, xs, y):
    """Return the rounded RMSE of model `m`'s predictions for `xs` against the targets `y`.

    `m` only needs a `predict(xs)` method; the scoring itself is delegated to `r_mse`.
    """
    predictions = m.predict(xs)
    return r_mse(predictions, y)
def r_mape(pred, actual):
    """Return the mean absolute percentage error of `pred` against `actual`, rounded to 5 decimals.

    The value is whatever scikit-learn's `mean_absolute_percentage_error`
    returns for `(y_true=actual, y_pred=pred)`.
    """
    # Import the metric explicitly: a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule is loaded, so `sklearn.metrics.<name>` only worked when some
    # other import happened to pull it in first.
    from sklearn.metrics import mean_absolute_percentage_error

    return round(mean_absolute_percentage_error(actual, pred), 5)
def m_mape(m, xs, y):
    """Return the rounded MAPE of model `m`'s predictions for `xs` against the targets `y`.

    `m` only needs a `predict(xs)` method; the scoring itself is delegated to `r_mape`.
    """
    predictions = m.predict(xs)
    return r_mape(predictions, y)

# Keep rendered DataFrames compact: show at most 10 rows and 6 columns
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 6)

# Silence every warning for the rest of the session
warnings.filterwarnings(action="ignore")


# Folders for the downloaded source data and the derived analysis output, resolved against
# the current working directory
HISTORICAL_DATA_DIR = os.path.abspath("./historical_data")
ANALYSIS_DATA_DIR = os.path.abspath("./analysis_data/")
ELECTRIC_DATA_DIR = os.path.join(HISTORICAL_DATA_DIR, "electric_data")
WEATHER_DATA_DIR = os.path.join(HISTORICAL_DATA_DIR, "weather_station_data")

# Create any folder that is missing. exist_ok=True makes the cell safe to re-run without a
# separate exists() check, and the loop variable no longer shadows the builtin `dir`.
for data_dir in (HISTORICAL_DATA_DIR, ANALYSIS_DATA_DIR, ELECTRIC_DATA_DIR, WEATHER_DATA_DIR):
    os.makedirs(data_dir, exist_ok=True)


# Weather stations whose data is used in this notebook. Each id is 11 characters long; the
# loading cell recovers it from the first 11 characters of each data file name.
WEATHER_STATION_IDS = [
    "USW00023066",  # Grand Junction Walker Field
    "USC00053553",  # Greeley UNC
    "USC00053005",  # Ft Collins
    "USC00050848",  # Boulder
    "USC00055984",  # Northglenn
    "USC00058995",  # Wheat Ridge
    "USW00023061",  # Alamosa
]

# Uncomment following lines to force re-download of source data
# Otherwise can also run the download script manually via: python download_historical_data.py
# Data files are saved locally so you only need to re-download to get new/different data

#dl.download_eia_historical_data(ELECTRIC_DATA_DIR, eia_respondent="PSCO")
#dl.download_ghcnd_historical_data(WEATHER_DATA_DIR, WEATHER_STATION_IDS)

In [None]:
import glob

plt.style.use("default")  # alternative: "ggplot"

temp_df = None

## Load up temperature data for each weather station, into their own columns
# os.path.join keeps the pattern portable: the previous WEATHER_DATA_DIR + "\*.json" used a
# Windows-only separator and matched nothing elsewhere. sorted() makes the merge (and hence
# column) order the same on every run, since glob returns files in arbitrary order.
for df_file in sorted(glob.glob(os.path.join(WEATHER_DATA_DIR, "*.json"))):
    # File names start with the 11-character station id
    station_id = os.path.basename(df_file)[0:11]
    with open(df_file, "r", encoding="utf-8") as f:
        station_df = pd.read_json(f)
    station_df.index.rename("date", inplace=True)

    # TODO: This name-mangling is a stand-in for a MultiIndex or a tuple-index.
    # Leaving it for now as it is not clear what will be easiest when training an ML model.
    col_renames = {col: f"{station_id}_{col}" for col in station_df.columns}
    station_df = station_df.rename(columns=col_renames)

    if temp_df is None:
        temp_df = station_df
    else:
        # Outer join on the index so rows present for only some stations are kept
        temp_df = pd.merge(left=temp_df, right=station_df, how="outer", left_index=True, right_index=True)

# Fail with an actionable message instead of a TypeError from len(None) below
if temp_df is None:
    raise FileNotFoundError(
        f"No weather station JSON files found in {WEATHER_DATA_DIR}; "
        "run download_historical_data.py to download them first"
    )

len(temp_df)

Load PSCO electric demand data from EIA

In [None]:
# Read the PSCO daily demand DataFrame stored as JSON under ELECTRIC_DATA_DIR
psco_demand_data_file = os.path.join(ELECTRIC_DATA_DIR, "psco-daily-dataframe.json")
with open(psco_demand_data_file, mode="r", encoding="utf-8") as demand_file:
    demand_df = pd.read_json(demand_file)

# Row count of the loaded demand data
len(demand_df)

Merge demand and temperature data

In [None]:
# Outer-join demand and temperature on their indexes, then keep only the rows that have a
# value in every column
joined_df = pd.merge(
    left=demand_df,
    right=temp_df,
    how="outer",
    left_index=True,
    right_index=True,
).dropna()
# Quick sanity check, disabled: len(joined_df), joined_df.columns

Augment the data with date-part features and lagged demand values

In [None]:
## Augment data
# reset_index() hands back a new frame, so joined_df itself is never modified
augmented_df = joined_df.reset_index()

# Duplicate the former index into a "date" column, then put the index back in place
augmented_df["date"] = augmented_df["index"]
augmented_df = augmented_df.set_index("index")

## Expand "date" into date-part columns, then discard the parts that are not that
## applicable in our case
augmented_df = aiTab.add_datepart(augmented_df, "date", drop=True).drop(
    columns=['Elapsed', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
             'Is_year_end', 'Is_year_start']
)

## Add lagged copies of daily_demand for lags of 1 through 14 rows, then drop the rows
## whose lag columns are NaN
# NOTE(review): shift() moves values by row position, so this presumably relies on one row
# per consecutive day in date order -- confirm
augmented_df = augmented_df.assign(
    **{f"demand_lag_{lag}": augmented_df["daily_demand"].shift(lag) for lag in range(1, 15)}
).dropna()

# Create masks for the data sets. This has to happen before the non-predictive columns are
# dropped, because the "Year" column is one of them
train_mask = augmented_df["Year"].lt(2021)
validation_mask = augmented_df["Year"].ge(2021) & augmented_df["Year"].lt(2022)
test_mask = augmented_df["Year"].ge(2022)

# Columns with more predictive value, extracted from random_forest.ipynb
cols_to_keep = [
    'daily_demand', 'demand_lag_1', 'demand_lag_7', 'demand_lag_3', 'Dayofweek',
    'USC00050848_tmax', 'USC00055984_tmax', 'demand_lag_4',
    'USC00053553_tmax', 'demand_lag_14', 'USC00053553_tmin', 'demand_lag_8',
    'USW00023061_tmax',
]

# Keep only those columns
augmented_df = augmented_df.filter(items=cols_to_keep, axis=1)

augmented_df.columns

In [None]:
## Split out training, validation and test sets
# where() blanks out the rows outside each mask and dropna() then removes them
train_df, validation_df, test_df = (
    augmented_df.where(mask).dropna()
    for mask in (train_mask, validation_mask, test_mask)
)

# Positional row numbers of each set within augmented_df
train_idx, valid_idx, test_idx = (
    np.where(mask)[0]
    for mask in (train_mask, validation_mask, test_mask)
)
print(f"trainSize={len(train_idx)}, validationSize={len(valid_idx)}, testSize={len(test_idx)}")

Separate the model inputs (xs) from the target (y) for the training, validation, test, and full data sets

In [None]:
# For each data set: inputs are every column except "daily_demand", the target is "daily_demand"
xs, y = train_df.drop(columns="daily_demand"), train_df["daily_demand"]
valid_xs, valid_y = validation_df.drop(columns="daily_demand"), validation_df["daily_demand"]
test_xs, test_y = test_df.drop(columns="daily_demand"), test_df["daily_demand"]
full_xs, full_y = augmented_df.drop(columns="daily_demand"), augmented_df["daily_demand"]

In [None]:
# Pre-processing steps applied by TabularPandas; aiTab.Categorify from the book is not used
# (cat_names is empty, so every input column is passed as continuous)
procs_nn = [aiTab.FillMissing, aiTab.Normalize]

# Row positions of the (training, validation) sets within augmented_df
splits = (list(train_idx), list(valid_idx))

tab_panda = aiTab.TabularPandas(
    augmented_df,
    procs=procs_nn,
    cat_names=[],
    cont_names=list(xs.columns),
    y_names="daily_demand",
    splits=splits,
)

# Batch size of 1024
dataloader = tab_panda.dataloaders(bs=1024)

In [None]:
# Find the dependent variable's min/max on the training split.
# Kept under its own name so the `y` training target assigned in an earlier cell is not
# silently overwritten when this cell runs.
train_target = tab_panda.train.y
train_target.min(), train_target.max()

In [None]:
# Tabular learner with layers [500, 250], a single output and mean-squared-error loss.
# NOTE(review): y_range=(90000, 200000) is hard-coded -- presumably chosen to bracket the
# observed daily_demand min/max; confirm it still holds when the data changes.
learn = aiTab.tabular_learner(
    dataloader,
    layers=[500, 250],
    n_out=1,
    y_range=(90000, 200000),
    loss_func=aiTab.F.mse_loss,
)

In [None]:
# Run the learning-rate finder on `learn`; its output is used to choose the learning rate
# passed to the training call
learn.lr_find()

In [None]:
# Train for 25 epochs at a learning rate of 0.00039
# (alternative that was tried: learn.fit_one_cycle(5, 0.00039))
learn.fit(n_epoch=25, lr=0.00039)

# Score the predictions returned by get_preds() against their targets using MAPE
preds, targets = learn.get_preds()
r_mape(pred=preds, actual=targets)

In [None]:
# Check errors on our test set
# NOTE: not implemented yet -- the attempts below are commented out, so this cell only
# displays the first row of the test inputs.
#prediction, bias, contributions = treeinterpreter.predict(forest, row.values)
# m_mape(learn.predict(test_xs[0:0]), test_xs, test_y)
# learn.predict(test_xs[0])
test_xs[0:1]

# Analysis!

In [None]:
## Plot predicted vs actual
# pred_df and error_df were used here without ever being defined in the notebook, so this
# cell raised NameError on a fresh kernel. Build them from the learner's predictions.
# NOTE(review): get_preds() is called with its defaults, which per the fastai docs scores the
# validation split -- confirm that is the set these plots are meant to show.
val_preds, val_targets = learn.get_preds()
pred_df = pd.DataFrame(
    {
        "actual_demand": val_targets.flatten().numpy(),
        "predicted_demand": val_preds.flatten().numpy(),
    },
    index=tab_panda.valid.y.index,
)
# Signed error: positive where the model over-predicts
# NOTE(review): the original definition of error_df is not visible; predicted - actual is assumed
error_df = pred_df["predicted_demand"] - pred_df["actual_demand"]

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 4))

ax1.plot(pred_df.index, pred_df["actual_demand"], label="actual")
ax1.plot(pred_df.index, pred_df["predicted_demand"], label="predicted")
ax1.set(title="Actual vs predicted demand", xlabel="date", ylabel="daily demand")
ax1.legend()

ax2.plot(error_df.index, error_df)
ax2.set(title="Prediction error over time", xlabel="date", ylabel="predicted - actual")

ax3.hist(error_df, bins=25)
ax3.set(title="Prediction error distribution", xlabel="predicted - actual", ylabel="days")

fig.tight_layout()
plt.show()

In [None]:
# Scatter predicted against actual demand; expects pred_df to already exist with the
# "actual_demand" and "predicted_demand" columns
pred_df.plot.scatter(x="actual_demand", y="predicted_demand")