In [122]:
# univariate multi-step lstm
import os
import boto3
import json
import re
from kerastuner import Objective
import kerastuner as kt
import pandas as pd
import numpy as np
import glob
from datetime import datetime
from math import sqrt
from numpy import (split, array)
from pandas import read_csv
from sklearn.metrics import (mean_squared_error, mean_absolute_error)
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers import (LSTM, Dense, Flatten)

In [3]:
# define client, folder and files for AWS.
s3_client = boto3.client("s3")
s3_bucket_name = 'thesis-hydroponic-project'
s3_bucket_prefix = 'TemperatureAndHumidity2021/03'

In [4]:
# Fetch existing data or load fresh data.
# Remove redundancy and null.
#files = s3_client.list_objects(Bucket=s3_bucket_name, Prefix=s3_bucket_prefix)['Contents']
files = [{'Key': '/'.join(f.split('\\')[1:])} for f in glob.glob('.cache/**/**/**/***/*')]
parsed_sensor_values = []
for file in files:
    if False: #not os.path.isfile('.cache/' + file['Key']) or (datetime.now() - datetime.fromtimestamp(os.path.getctime('.cache/' + file['Key']))).days > 31:
        print("Fetching fresh file from S3:", file['Key'])
        if os.path.isfile('.cache/' + file['Key']):
            os.unlink('.cache/' + file['Key'])
        s3_object = s3_client.get_object(Bucket=s3_bucket_name, Key= file['Key'])
        raw_sensor_values = s3_object['Body'].read().decode()
        clean_sensor_values = re.sub(",+", ",", raw_sensor_values)
        sensor_values = json.loads("[" + clean_sensor_values.strip(',') + "]")
    else:
        print("Load from cache:", file['Key'])
        with open('.cache/' + file['Key']) as f:
            sensor_values = json.load(f)
    parsed_sensor_values += sensor_values
    os.makedirs('.cache/' + os.path.dirname(file['Key']), exist_ok=True)
    with open('.cache/' + file['Key'], 'w') as f:
        json.dump(sensor_values, f)

2d33c-944e-46de-8d37-28a6bf992123
Load from cache: TemperatureAndHumidity2021/03/29/02/ThesisHydroponicSensorDeliveryStream-1-2021-03-29-02-55-47-7016dacb-4449-46ce-b8e7-1cbab7995d41
Load from cache: TemperatureAndHumidity2021/03/29/03/ThesisHydroponicSensorDeliveryStream-1-2021-03-29-03-10-52-bb30d9e6-e9c3-4303-879e-362feecd838c
Load from cache: TemperatureAndHumidity2021/03/29/03/ThesisHydroponicSensorDeliveryStream-1-2021-03-29-03-25-58-2209f8f2-fcb6-4653-bd0c-6856ad0429c0
Load from cache: TemperatureAndHumidity2021/03/29/03/ThesisHydroponicSensorDeliveryStream-1-2021-03-29-03-41-06-1bb94952-fbe2-4851-a004-e3a4b2fef1ac
Load from cache: TemperatureAndHumidity2021/03/29/03/ThesisHydroponicSensorDeliveryStream-1-2021-03-29-03-56-08-55391883-814a-44ed-899f-945b41f7138c
Load from cache: TemperatureAndHumidity2021/03/29/04/ThesisHydroponicSensorDeliveryStream-1-2021-03-29-04-11-14-d6528ed4-e4af-4d9f-9ec4-30cb3deda1c5
Load from cache: TemperatureAndHumidity2021/03/30/00/ThesisHydroponicSen

## Data preparation

In [5]:
df = pd.DataFrame(parsed_sensor_values)
df['temperature'] = df['temperature'].astype(float)
df['humidity'] = df['humidity'].astype(float)
df['datetime'] = pd.to_datetime(df['datetime'])
df.sort_values('datetime', ascending=True, inplace=True)

In [6]:
print('Number of rows and columns:', df.shape)
df.head(5)

Number of rows and columns: (91774, 4)


Unnamed: 0,sensor_type,temperature,humidity,datetime
0,hydroponic sensor,14.0,38.0,2021-02-26 08:51:37
1,hydroponic sensor,15.0,36.0,2021-02-26 08:51:43
2,hydroponic sensor,15.0,36.0,2021-02-26 08:51:48
3,hydroponic sensor,15.0,36.0,2021-02-26 08:51:53
4,hydroponic sensor,15.0,35.0,2021-02-26 08:51:58


In [7]:
# resample data to hourly
hourly_groups = df.set_index('datetime').resample('1H')
# Calulating the mean will simply eliminate the data outliers
hourly_data = hourly_groups.mean()
hourly_data.dropna(inplace=True)
# summarize
print(hourly_data.shape)
print(hourly_data)
# save
hourly_data.to_csv('temperature_humidity_hourly.csv')

(775, 2)
                     temperature   humidity
datetime                                   
2021-02-26 08:00:00    20.978495  27.376344
2021-02-26 09:00:00    23.768421  24.040602
2021-02-26 10:00:00    25.008850  30.794690
2021-02-26 11:00:00    22.839286  31.142857
2021-02-26 12:00:00    23.854545  29.054545
...                          ...        ...
2021-03-31 06:00:00    24.000000  20.067039
2021-03-31 07:00:00    24.000000  20.352273
2021-03-31 08:00:00    24.331461  20.067416
2021-03-31 09:00:00    25.084746  20.084746
2021-03-31 10:00:00    25.666667  20.666667

[775 rows x 2 columns]


In [8]:
# split a univariate dataset into train/test sets
def split_dataset(data, interval_size):
    temp_a = int((len(data)*0.8)/interval_size)
    split_index = temp_a*interval_size
    temp_b = int(len(data)/interval_size)
    end_index = temp_b*interval_size
    # split into standard weeks(i.e. divisible by interval_size)
    train, test = data[:split_index], data[split_index:end_index]
    # restructure into windows of weekly data
    train = array(split(train, len(train)/interval_size))
    test = array(split(test, len(test)/interval_size))
    return train, test

In [9]:
# evaluate one or more weekly forecasts against expected values
def evaluate_forecasts(actual, predicted):
    scores = list()
    # calculate an RMSE score for each day
    for i in range(actual.shape[1]):
        # calculate mse
        mse = mean_squared_error(actual[:, i], predicted[:, i])
        # calculate rmse
        rmse = sqrt(mse)
        # store
        scores.append(rmse)
    # calculate overall RMSE
    s = 0
    for row in range(actual.shape[0]):
        for col in range(actual.shape[1]):
            s += (actual[row, col] - predicted[row, col])**2
    score = sqrt(s / (actual.shape[0] * actual.shape[1]))
    return score, scores

In [10]:
# summarize scores
def summarize_scores(name, score, scores):
    s_scores = ', '.join(['%.1f' % s for s in scores])
    print('%s: [%.3f] %s' % (name, score, s_scores))

In [11]:
# convert history into inputs and outputs
def to_supervised(train, n_input, n_out):
	# flatten data
	data = train.reshape((train.shape[0]*train.shape[1], train.shape[2]))
	X, y = list(), list()
	in_start = 0
	# step over the entire history one time step at a time
	for _ in range(len(data)):
		# define the end of the input sequence
		in_end = in_start + n_input
		out_end = in_end + n_out
		# ensure we have enough data for this instance
		if out_end <= len(data):
			x_input = data[in_start:in_end, 0]
			x_input = x_input.reshape((len(x_input), 1))
			X.append(x_input)
			y.append(data[in_end:out_end, 0])
		# move along one time step
		in_start += 1
	return array(X), array(y)

In [104]:
# train the model
def build_model(hp):
	# define hyperparameters parameters
	interval_size = hp.Int('interval_size', min_value=3, max_value=12, step=2)
	num_layers = hp.Int('num_layers', 1, 7, step=2)
	activation = hp.Choice("activation", ["sigmoid", "relu", "tanh"])
	hiddent_units = hp.Choice('hiddent_units', [50, 100, 150, 200])
	
	# define model
	model = Sequential()
	# we want sigmoid in the first layer so that we convert the data between 0 and 1
	model.add(LSTM(hiddent_units, activation='sigmoid', input_shape=(interval_size, 1), return_sequences=num_layers > 1))
	for i in range(num_layers - 2):
		model.add(LSTM(hiddent_units, activation=activation, return_sequences=True, dropout=hp.Float("dropout", 0.0 , 0.8 , 0.2)))
	if num_layers > 1:
		model.add(LSTM(hiddent_units, activation=activation))
	model.add(Dense(interval_size))
	model.compile(loss='mse', optimizer='adam')
	return model

In [105]:
# make a forecast
def forecast(model, history, n_input):
	# flatten data
	data = array(history)
	data = data.reshape((data.shape[0]*data.shape[1], data.shape[2]))
	# retrieve last observations for input data
	input_x = data[-n_input:, 0]
	# reshape into [1, n_input, 1]
	input_x = input_x.reshape((1, len(input_x), 1))
	# forecast the next week
	yhat = model.predict(input_x, verbose=0)
	# we only want the vector forecast
	yhat = yhat[0]
	return yhat

In [106]:
# evaluate a single model
def evaluate_model(train, test, n_input, model):
	# history is a list of weekly data
	history = [x for x in train]
	# walk-forward validation over each week
	predictions = list()
	for i in range(len(test)):
		# predict the week
		yhat_sequence = forecast(model, history, n_input)
		# store the predictions
		predictions.append(yhat_sequence)
		# get real observation and add to history for predicting the next week
		history.append(test[i, :])
	# evaluate predictions days for each week
	predictions = array(predictions)
	score, scores = evaluate_forecasts(test[:, :, 0], predictions)
	return score, scores, predictions

In [143]:
# load the new file
dataset = read_csv('temperature_humidity_hourly.csv', header=0, infer_datetime_format=True, parse_dates=['datetime'], index_col=['datetime'])

def tuning_build_model(hp):
    interval_size = hp.Int('interval_size', min_value=3, max_value=12, step=2)
    return build_model(hp)

# When training with hyperparameters change the epoch size to 50 and the max_trails to min 72000
class MyTuner(kt.Tuner):
    
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def run_trial(self, trial, train_ds, **kwargs):
        hp = trial.hyperparameters
        batch_size = hp.Choice("batch_size", [0, 32, 64, 128, 256, 512])
        epochs = hp.Choice("epochs", [10])
        interval_size = hp.Int('interval_size', min_value=3, max_value=12, step=2)
        train, test = split_dataset(train_ds, interval_size)
        model = self.hypermodel.build(trial.hyperparameters)
        x,y=to_supervised(train, interval_size, interval_size)
        model.fit(x, y, epochs=epochs, batch_size=batch_size, verbose=False)
        score, scores, predictions = evaluate_model(train, test, interval_size, model)
        self.oracle.update_trial(trial.trial_id, {'score': score})

tuner = MyTuner(
    oracle=kt.oracles.BayesianOptimization(
          objective=kt.Objective('score', 'min'),
          max_trials=20),
    hypermodel=tuning_build_model,
    directory='forecasting_evaluation',
    project_name='lstm', overwrite=True)
tuner.search_space_summary()


Search space summary
Default search space size: 4
interval_size (Int)
{'default': None, 'conditions': [], 'min_value': 3, 'max_value': 12, 'step': 2, 'sampling': None}
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 7, 'step': 2, 'sampling': None}
activation (Choice)
{'default': 'sigmoid', 'conditions': [], 'values': ['sigmoid', 'relu', 'tanh'], 'ordered': False}
hiddent_units (Choice)
{'default': 50, 'conditions': [], 'values': [50, 100, 150, 200], 'ordered': True}


In [144]:
tuner.search(train_ds=dataset.values)
tuner.results_summary()

Trial 20 Complete [00h 00m 09s]
score: 10.250206729003853

Best score So Far: 0.6584623685297628
Total elapsed time: 00h 03m 29s
INFO:tensorflow:Oracle triggered exit
Results summary
Results in forecasting_evaluation\lstm
Showing 10 best trials
Objective(name='score', direction='min')
Trial summary
Hyperparameters:
interval_size: 11
num_layers: 3
activation: tanh
hiddent_units: 200
batch_size: 0
epochs: 10
dropout: 0.0
Score: 0.6584623685297628
Trial summary
Hyperparameters:
interval_size: 7
num_layers: 1
activation: tanh
hiddent_units: 200
batch_size: 0
epochs: 10
dropout: 0.0
Score: 1.2740476040851663
Trial summary
Hyperparameters:
interval_size: 3
num_layers: 5
activation: relu
hiddent_units: 50
batch_size: 32
epochs: 10
dropout: 0.2
Score: 2.6121612958940776
Trial summary
Hyperparameters:
interval_size: 5
num_layers: 5
activation: relu
hiddent_units: 150
batch_size: 32
epochs: 10
dropout: 0.6000000000000001
Score: 3.507735483915803
Trial summary
Hyperparameters:
interval_size: 11
n

In [146]:
from collections import defaultdict
hyperparameter_results = defaultdict(list)
for file in glob.glob("forecasting_evaluation/lstm/trial*/trial.json"):
    trial = json.load(open(file))
    hyperparameter_results[str(trial["hyperparameters"]["values"])].append(trial["metrics"]["metrics"]["score"]["observations"][0]["value"][0])

for param, rmse_scores in hyperparameter_results.items():
    mean_rmse_score = sum(rmse_scores)/len(rmse_scores)
    max_rmse_score = max(rmse_scores)
    min_rmse_score = min(rmse_scores)
    variance_rmse_score = (sum([(mean_rmse_score-current_score)**2 for current_score in rmse_scores]))/len(rmse_scores)
    std_dev_rmse_score = sqrt(variance_rmse_score)
    # summarize scores
    print("-----------------------------------------")
    print("hyperparameter tune",param)
    print("mean_rmse_score",mean_rmse_score)
    print("max_rmse_score",max_rmse_score)
    print("min_rmse_score",min_rmse_score)
    print("variance_rmse_score",variance_rmse_score)
    print("std_dev_rmse_score",std_dev_rmse_score)

-----------------------------------------
hyperparameter tune {'interval_size': 3, 'num_layers': 5, 'activation': 'sigmoid', 'hiddent_units': 50, 'batch_size': 64, 'epochs': 10, 'dropout': 0.4}
mean_rmse_score 19.38674145879552
max_rmse_score 19.38674145879552
min_rmse_score 19.38674145879552
variance_rmse_score 0.0
std_dev_rmse_score 0.0
-----------------------------------------
hyperparameter tune {'interval_size': 7, 'num_layers': 5, 'activation': 'sigmoid', 'hiddent_units': 50, 'batch_size': 64, 'epochs': 10, 'dropout': 0.0}
mean_rmse_score 19.342389176607455
max_rmse_score 19.342389176607455
min_rmse_score 19.342389176607455
variance_rmse_score 0.0
std_dev_rmse_score 0.0
-----------------------------------------
hyperparameter tune {'interval_size': 9, 'num_layers': 3, 'activation': 'sigmoid', 'hiddent_units': 50, 'batch_size': 128, 'epochs': 10, 'dropout': 0.2}
mean_rmse_score 20.355038189784477
max_rmse_score 20.355038189784477
min_rmse_score 20.355038189784477
variance_rmse_sco

# plot scores
hrs = ['1', '2', '3', '4', '5', '6', '7']
pyplot.plot(hrs, scores, marker='o', label='lstm')
pyplot.show()

In [147]:
df_test = pd.DataFrame(test.reshape(test.shape[0]* test.shape[1], 2), columns=["temperature","humidity"])
df_pred = pd.DataFrame(predictions.reshape(predictions.shape[0]* predictions.shape[1],),columns=["pred_temperature"])
predicted_and_test = df_pred.join(df_test)
predicted_and_test.to_csv('predicted_temperature_and_test_values.csv',index=False)

NameError: name 'predictions' is not defined