In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import seaborn as sns
import custom_module as cm
#import optimizer_module as om

In [2]:
#############################################################################
####################### DATA PREPROCESSING #################################
###########################################################################

In [3]:
# Loading the dataset
data = pd.read_csv("combined_data.csv", parse_dates=True)
#data = data["2016":]

In [4]:
# Load this file for saving time
# Saving the data file so we can reload with the features made again to reduce time
# data.to_csv('final.csv') 
data = pd.read_csv("final.csv", parse_dates=True, index_col=0)

In [5]:
data

Unnamed: 0_level_0,RRP5MIN,RESIDUAL_DEMAND,AVG_PRICE,DIFF_PRICE,hour,weekday,month,business hour,public holiday
SETTLEMENTDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-01-01 00:00:00,4.096091,1593.93,5.046193,-0.486409,0,0,1,0,1
2016-01-01 00:05:00,5.257498,1557.06,5.102443,1.161408,0,0,1,0,1
2016-01-01 00:10:00,5.415838,1510.10,5.087579,0.158340,0,0,1,0,1
2016-01-01 00:15:00,5.415838,1474.70,5.087579,0.000000,0,0,1,0,1
2016-01-01 00:20:00,3.609315,1464.90,4.937838,-1.806523,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
2019-06-30 23:40:00,5.998145,546.95,4.556324,-0.136612,23,0,6,0,0
2019-06-30 23:45:00,5.961220,560.19,4.778101,-0.036925,23,0,6,0,0
2019-06-30 23:50:00,5.066510,559.87,4.907920,-0.894710,23,0,6,0,0
2019-06-30 23:55:00,5.002401,547.07,5.023500,-0.064109,23,0,6,0,0


In [6]:
# Setting the data to index
data["SETTLEMENTDATE"] = pd.to_datetime(data["SETTLEMENTDATE"])
data.index = data["SETTLEMENTDATE"]
data.drop(columns="SETTLEMENTDATE", inplace=True)

KeyError: 'SETTLEMENTDATE'

In [None]:
# replace outliers by outlier threshold
data = cm.replace_outliers(data, 'RRP5MIN', 4)

In [None]:
# Seperating the test dataset for testing purposes in evaluation
X_test = data["2019-01-01":"2019-06-30"].copy()
X_test = X_test["RRP5MIN"]

In [None]:
####################### SOME CHARTS OF THE DATA IN HAND ####################
###########################################################################

In [None]:
# Plotting the curve Price
cm.plot_chart(data["RRP5MIN"].loc["2018-02-20":"2018-02-21"], legend=True)

In [None]:
# Plotting the curve Residual Demand
# Looks stationary
cm.plot_chart(data["RESIDUAL_DEMAND"].loc["2019"], legend=True)

In [None]:
# ADD MORE IF YOU WANT

In [None]:
# ADD MORE IF YOU WANT

In [None]:
#############################################################################
####################### FEATURES PREPROCESSING #############################
###########################################################################

In [None]:
# Avg Price of last 1 hour i.e. 12 data points at 5 minutes granularity
data["AVG_PRICE"] = pd.DataFrame(cm.average_hours(data["RRP5MIN"]))

In [None]:
# Differencing the average price and creating a differenced price variable
data["AVG_PRICE"] = cm.period_difference(data["AVG_PRICE"])
data["DIFF_PRICE"] = cm.period_difference(data["RRP5MIN"])

In [None]:
############################################################################

In [None]:
# Generate 'hour', 'weekday' and 'month' features
data['hour'] = 0
data['weekday'] = 0
data['month'] = 0
for i in range(len(data)):
    position = data.index[i]
    data['hour'][i] = position.hour
    data['weekday'][i] = position.weekday()
    data['month'][i] = position.month

In [None]:
# MAKING FEATURES
# Generate 'business hour' feature. 7am-7pm business hours
data["business hour"] = 0
for i in range(len(data)):
    position = data.index[i]
    hour = position.hour
    if ((hour > 7 and hour < 12) or (hour > 14 and hour < 19)):
        data["business hour"][i] = 2
    elif (hour >= 12 and hour <= 14):
        data["business hour"][i] = 1
    else:
        data["business hour"][i] = 0

In [None]:
# Generate 'weekend' feature
for i in range(len(data)):
    position = data.index[i]
    weekday = position.weekday()
    if (weekday == 6):
        data['weekday'][i] = 2
    elif (weekday == 5):
        data['weekday'][i] = 1
    else:
        data['weekday'][i] = 0

In [None]:
from datetime import date
#pip install holidays
import holidays

In [None]:
aus_holidays = holidays.CountryHoliday('AUS', prov='NSW')

In [None]:
data["public holiday"] = 0
for i in range(len(data)):
    if (data.index[i] in aus_holidays):
        data["public holiday"][i] = 1

In [None]:
##### SAVE FILE HERE #####

In [None]:
#############################################################################
########### PREPARING DATA FOR KERAS TO PROCESS PREPROCESSING ##############
###########################################################################

In [None]:
################################## 2 MODELS #######################################################
######### 1st for processing Categorical Data for Regression via Multi-Layer Perceptron #########
########################### 2nd for processing Time Series via LSTM ##############################
################################################################################################

In [None]:
# Scaling the RRP between 0 and 1 as required by the NN
features = ['RESIDUAL_DEMAND', 'AVG_PRICE', 'DIFF_PRICE']
feature_scaler = MinMaxScaler()
for i in features:
    data[i] = feature_scaler.fit_transform(pd.DataFrame(data[i]))

In [None]:
# scale price data to 0-1 range
label_scaler = MinMaxScaler()
data['RRP5MIN'] = label_scaler.fit_transform(data['RRP5MIN'].values.reshape(-1, 1))

In [None]:
train = data['2016-12-25 00:00:00':].copy()

In [None]:
train

In [None]:
# include time lags of timeseries data for last day i.e. 288 data points at 5 minutes granularity
# Also 80 lags of same day previous week

# Creating Daily lags
for i in range(1,201):
    train["price_l_{}".format(i)] = train["DIFF_PRICE"].shift(i)
    train["demand_l_{}".format(i)] = train["RESIDUAL_DEMAND"].shift(i)
    train["avgPrice_l_{}".format(i)] = train["AVG_PRICE"].shift(i)
    

# Creating Week ago lags
j = 1
size = 2016
for i in range(size, size-80, -1):
    train["w_price_l_{}".format(j)] = train["DIFF_PRICE"].shift(i)
    train["w_demand_l_{}".format(j)] = train["RESIDUAL_DEMAND"].shift(i)
    train["w_avgPrice_l_{}".format(j)] = train["AVG_PRICE"].shift(i)
    j+=1


# # Creating Month ago lags
# j = 1
# for i in range(1728,2016):
#     train["w_l_{}".format(j)] = train["RRP5MIN"].shift(i)
#     j+=1
        
#Adjustment for leap year required Here!!!!!!!!!!
#Creating year ago lags
# j = 1
# size = data['2017'].shape[0]
# for i in range(size, size-50, -1):
#     train["y_price_l_{}".format(j)] = train["DIFF_PRICE"].shift(i)
#     train["y_demand_l_{}".format(j)] = train["RESIDUAL_DEMAND"].shift(i)
#     train["y_avgPrice_l_{}".format(j)] = train["AVG_PRICE"].shift(i)
#     j+=1


In [None]:
# Drop NANS
train.dropna(inplace=True)
train.head(5)
train

In [None]:
#################### PROCESSING THE DATA FOR MLP NETWORK ###################

In [None]:
########### THIS IS FOR MULTILAYER PERCEPTRON PURPOSES
train1 = data[['hour', 'weekday', 'month', 'business hour', 'public holiday', 'RRP5MIN']]
train1 = train1["2017":]

In [None]:
# Scaling the categorical variables using the same scaler used for LSTM variables
cont = ['hour', 'weekday', 'month', 'business hour', 'public holiday']
for i in cont:
    train1[i] = feature_scaler.transform(pd.DataFrame(train1[i]))

In [None]:
features1 = train1[train1.index.minute == 0]
features1 = features1[features1.index.hour == 0]

# Seperating training and test data for Multi-Layer Perceptron Network
features_train1 = features1[:'2018']
features_test1 = features1['2019':'2019-06-30']

# Reshaping the features and test data to NP-Array as per Keras input requirement
features_train1 = features_train1.to_numpy().reshape(features_train1.shape[0], features_train1.shape[1])
features_test1 = features_test1.to_numpy().reshape(features_test1.shape[0], features_test1.shape[1])

In [None]:
#################### PROCESSING THE DATA FOR LSTM NETWORK ###################

In [None]:
# create feature and label dataframes
prelim_features = train.drop(['RRP5MIN', 'RESIDUAL_DEMAND', 'AVG_PRICE', 'DIFF_PRICE', 'hour', 'weekday', 'month', 'business hour', 'public holiday'], axis=1)
prelim_labels = pd.DataFrame(train[['RRP5MIN']])

In [None]:
# format labels to 24 hour output range
for i in range(0, 288):
    prelim_labels['t_{}'.format(i)] = prelim_labels['RRP5MIN'].shift(-i)
prelim_labels.drop(['RRP5MIN'], axis=1, inplace=True)

# apply one-day discretization to the data
labels = prelim_labels[prelim_labels.index.minute == 0]
labels = labels[labels.index.hour == 0]
features = prelim_features[prelim_features.index.minute == 0]
features = features[features.index.hour == 0]

features_train = features[:'2018']
features_test = features['2019':'2019-06-30']
labels_train = labels[:'2018']

samples_train = len(features_train)
samples_test = len(features_test)
timesteps = 280

# convert pandas data frames to numpy ndarrays
features_train = features_train.to_numpy().reshape(samples_train, timesteps, 3)
features_test = features_test.to_numpy().reshape(samples_test, timesteps, 3)
labels_train = labels_train.to_numpy()

# check for correct data shape
features_train.shape, labels_train.shape

In [None]:
from keras.models import Model, load_model
from keras.layers.convolutional import Conv1D
from keras.callbacks import ModelCheckpoint
from keras.regularizers import l1_l2

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import json

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Input
import tensorflow as tf

In [None]:
# split into training and validation data
# X_train, X_valid, y_train, y_valid = train_test_split(features_train, labels_train, test_size=0.2, random_state=7)

In [None]:
# #model.compile(loss='mae', optimizer='adam')
# checkpoint = ModelCheckpoint('./models/multidim_timeseries_testing.hdf5', save_best_only=True)

# hist = rnn.fit(X_train, y_train,
#                  validation_data=(X_valid, y_valid),
#                  callbacks=[checkpoint], 
#                  verbose=1, batch_size=16, epochs=160)

In [None]:
# rnn = cm.RNN_model(64, 64, 128, input_s=(X_train.shape[1],1))

In [None]:
# model.compile(loss='mae', optimizer='adam')
# # checkpoint = ModelCheckpoint('./models/multidim_timeseries_testing.hdf5', save_best_only=True)

# hist = model.fit(X_train, y_train,
#                  validation_data=(X_valid, y_valid),
#                  callbacks=[checkpoint], 
#                  verbose=1, batch_size=50, epochs=160)

In [None]:
# # results, hist = cm.train_predict_evaluate(rnn, X_train, X_valid, y_train, y_valid, features_test,
#                                        X_test.to_numpy().flatten(), X_test.index, label_scaler, 32, 160, 
#                                        'multidim_timeseries_testing.hdf5', verbose=0)

In [None]:
# best = load_model('./models/multidim_timeseries_testing.hdf5')
# # pred = best.predict([input_test, input_test[:, :, 3]])
# pred = best.predict(features_test)
# #pred = scaler.inverse_transform(pred.flatten().reshape(-1, 1))

In [None]:
#############################################################################
########### CONCATENATE THE 2 NN & COMPILE THEM TO FORM BIGGER NN ##########
###########################################################################

In [None]:
from keras.layers import concatenate

In [None]:
# Creating the 2 models
mlp = cm.create_mlp((features_train1.shape[1],))
lstm = cm.create_conv_lstm((None, features_train.shape[1], 3))                  

In [None]:
# Merging the 2 networks into a bigger network 
combinedInput = concatenate([mlp.output, lstm.output])

In [None]:
# Mapping the bigger Network to the output layer to predict one-day ahead i.e. 288 intervals
x = Dense(32, activation="relu")(combinedInput)
x = Dense(288, activation="sigmoid")(x)
model = Model(inputs=[mlp.input, lstm.input], outputs=x)

In [None]:
# Compiling the model with Mean Absolute Error as loss function and Adam as optimizer
model.compile(loss='mae', optimizer='adam')
#checkpoint = ModelCheckpoint('./models/multidim_timeseries_testing.hdf5', save_best_only=True)

In [None]:
# Training the Model
hist = model.fit(x=[features_train1, features_train], y=labels_train, 
                 verbose=1, batch_size=50, epochs=160)

In [None]:
# Making Predictions on the Testing Data
pred = model.predict([features_test1, features_test])

In [None]:
# Inverse scaling the predictions and re-shaping it to 1D output vector
pred = label_scaler.inverse_transform(pred.flatten().reshape(-1, 1))

In [None]:
# Combining Predictions and True Values in results dataframe
results = pd.DataFrame({'prediction':pred.flatten(), 'true values':X_test}, index=X_test.index)

In [None]:
# Plot of predictions against Actuals
cm.plot_chart(results["2019-01-01":"2019-01-10"], legend=True)

In [None]:
# Training loss comparision plot
cm.plot_chart(pd.DataFrame(hist.history), xlab='Training Epoch', ylab='Mean Squared Error', title='Training and Validation Error over the Course of Training', legend=True)

In [None]:
# Quantifying Performance using MAE, MSE, RMSE
cm.quantify_performance(results)

In [None]:
%reload_ext autoreload

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
################################################################################################

In [None]:
######## CHECKING MODELS ##########
models_list = []
models_list.append(cm.RNN_model(64, 64, 128, input_s=(X_train.shape[1],1)))
models_list.append(cm.RNN_model(128, 32, 64, input_s=(X_train.shape[1],1)))
models_list.append(cm.RNN_model(64, 32, 64, input_s=(X_train.shape[1],1)))
models_list.append(cm.RNN_model(32, 16, 32, input_s=(X_train.shape[1],1)))
models_list.append(cm.RNN_model(128, 32, 64, input_s=(X_train.shape[1],1)))
models_list.append(cm.RNN_model(128, 64, 128, input_s=(X_train.shape[1],1)))
# train all archtitectures and evaluate performance on the test set
for i, rnn in enumerate(models_list):

    results, hist = cm.train_predict_evaluate(rnn, X_train, X_valid, y_train, y_valid, features_test,
                                       X_test.to_numpy().flatten(), X_test.index, label_scaler, 32, 160, 
                                       'timeseries_architecture_tests.hdf5', verbose=0)
    print("Model {}".format(i))
    cm.quantify_performance(results)
    print("--------------------------------------------")
    print("--------------------------------------------")

In [None]:
# split = train_test_split(features_train, features_train1, test_size=0.25, random_state=42)
# (features_training, features_testing, features_training1, features_testing1) = split

In [None]:
# ### SEPERATING Y TRAIN AND Y TEST FOR VALIDATION PURPOSES
# trainY = features_training1["RRP5MIN"]
# testY = features_testing1["RRP5MIN"]

In [None]:
# features_training1.drop(columns=["RRP5MIN"], inplace=True)
# features_testing1.drop(columns=["RRP5MIN"], inplace=True)

In [None]:
# features_training1 = features_training1.to_numpy().reshape(features_training1.shape[0], features_training1.shape[1])

In [None]:
# features_testing1 = features_testing1.to_numpy().reshape(features_testing1.shape[0], features_testing1.shape[1])

In [None]:
#################################
# Hour of the day
# Day of the week
# Flag of Business Hour
# Public Holiday Flag
# Forecast Demand

In [None]:
data["2019"]

In [None]:
# Input prices into optimizer
import optimizer_module as om
numDays = 100 # Number of days to run model
start = 0 # Starting time interval from price data
bStorage0 = 0 # Starting battery charge

predPrices = results.iloc[start:start+(numDays*288)]["prediction"].tolist()
realPrices = results.iloc[start:start+(numDays*288)]["true values"].tolist()
outputResults = 1
outputActions = 1

print("Optimizer results for real prices")
realNxtAction, realNxtBatCharge, realActions = om.optimize(realPrices, bStorage0, outputResults, outputActions)

print("\nOptimizer results for predicted prices\nProfit is incorrect as it is calculating predicted profit not actual profit")
predNxtAction, predNxtBatCharge, predActions = om.optimize(predPrices, bStorage0, outputResults, outputActions)

maxProfit = sum([realActions[i]*realPrices[i]/12 for i in range(numDays*288)])
actualProfit = sum([predActions[i]*realPrices[i]/12 for i in range(numDays*288)])

print("\n----------RESULTS----------")
print("Max profit possible: $%.4g" % (maxProfit))
print("Actual profit: $%.4g -> %.4g%% of max profit possible" % (actualProfit,actualProfit/maxProfit*100))    