In [21]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import pickle 
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import TimeseriesGenerator
from keras import layers

import keras_tuner as kt
from keras_tuner.tuners import RandomSearch

In [2]:
df = pd.read_csv('historic_readings_no_missing_data.csv'
        ).drop(columns=['Unnamed: 0']
        ).sort_values(by='timestamp')

In [3]:
df.sort_values(by='timestamp', inplace=True)

In [4]:
df_temp =  df[ df['timestamp'] > str(pd.Timestamp('2023-04-28')) ].drop(columns=['humidity'])
df_temp.set_index('timestamp',inplace=True)
df_temp.dropna(inplace=True)

In [28]:
# Split the data, 60% in training and 20% in validation, 20% in test, and not randomly selected since this is a time series

last_train_index = int(np.round(df_temp['temperature'].shape[0]*0.6))
last_val_index = int(np.round(df_temp['temperature'].shape[0]*0.8))

train_data = df_temp.iloc[:last_train_index,]
validation_data = df_temp.iloc[last_train_index:last_val_index]
test_data = df_temp.iloc[last_val_index:]

# Fit to the train set, then transform the train set and the test set

mean = train_data.mean(axis=0)
train_data -= mean
std = train_data.std(axis=0)
scaled_train = train_data / std

scaled_validation = (validation_data - mean)/std
scaled_test = (test_data - mean)/std


print(f'Mean of training data: {mean},'
      f'\nStandard deviation of training data: {std}')



Mean of training data: temperature    22.981637
dtype: float64,
Standard deviation of training data: temperature    2.30707
dtype: float64


In [6]:
train_data

Unnamed: 0_level_0,temperature
timestamp,Unnamed: 1_level_1
2023-05-01 00:10:00,-2.311637
2023-05-01 00:20:00,-2.271637
2023-05-01 00:30:00,-2.281637
2023-05-01 00:40:00,-2.351637
2023-05-01 00:50:00,-2.341637
...,...
2023-07-12 06:10:00,1.948363
2023-07-12 06:20:00,1.958363
2023-07-12 06:30:00,1.998363
2023-07-12 06:40:00,1.928363


Creating the generators. The task will be to take in one hour of readings, spaced 10 minutes apart, and predict the temperature in two hours. 

For example, there will be readings at 3:00pm, 3:10pm, ..., 4:00pm, and the task will be to predict the temperature at 6pm. 

In [8]:
delay = 24
sequence_length = 12

This demonstrates how the generators will work, though they'll be randomly shuffled. 

In [9]:
train = keras.preprocessing.timeseries_dataset_from_array(scaled_train[:-delay], scaled_train[sequence_length+delay:], 
                            sequence_length=sequence_length, batch_size=1, shuffle=True)

validation = keras.preprocessing.timeseries_dataset_from_array(scaled_validation[:-delay], scaled_validation[sequence_length+delay:], 
                            sequence_length=sequence_length, batch_size=1, shuffle=True)

test = keras.preprocessing.timeseries_dataset_from_array(scaled_test[:-delay], scaled_test[sequence_length+delay:], 
                            sequence_length=sequence_length, batch_size=1, shuffle=True)

### Hyperparameter tuning using Keras Tuner

Pretty helpful page from the Tensorflow folks [here](https://www.tensorflow.org/tutorials/keras/keras_tuner). 

In [17]:
def model_builder(hp):

    inputs = keras.Input(shape=(12, 1))

    # Tune the number of units in the first Dense layer
    hp_units = hp.Int('units', min_value=8, max_value=64, step=8)

    x = layers.LSTM(hp_units, recurrent_dropout=0.25)(inputs)
    outputs = layers.Dense(1)(x)

    model = keras.Model(inputs, outputs)

    model.compile(optimizer="adam",
                loss="mse",
                metrics=['mae'])

    return model

In [19]:
tuner = kt.Hyperband(model_builder,
                     objective='val_mae',
                     max_epochs=10,
                     factor=3,
                     directory='tuning_outputs/20230831',
                     project_name='intro_to_kt')

In [22]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_mae', patience=5)

In [27]:
tuner.search(train, epochs=50, validation_data=validation, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first LSTM
layer is {best_hps.get('units')}.
""")

Trial 8 Complete [00h 00m 42s]
val_mae: 0.18314144015312195

Best val_mae So Far: 0.17289999127388
Total elapsed time: 00h 05m 43s

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 32.

