In [None]:
import os 

# Location of the Jena climate CSV on the local machine
data_dir = 'C:/Users/write/OneDrive/Deep Learning Data/jena_climate_2009_2016'
fname = os.path.join(data_dir, 'jena_climate_2009_2016.csv')

# Read the whole file as text; the context manager closes the handle even if read() raises
with open(fname) as f:
    data = f.read()

lines = data.split('\n')
# First line holds the comma-separated column names
header = lines[0].split(',')
# Remaining lines are the records; blank lines (e.g. the empty string left by a
# trailing newline) are dropped so they cannot break the float parsing later on
samples = [line for line in lines[1:] if line.strip()]
#print('Header:\n', header)
#print('Number of samples: ', len(samples))

In [None]:
import numpy as np

# One numeric column per header entry, minus the leading "Date Time" column
n_features = len(header) - 1
float_records = np.zeros((len(samples), n_features))
for row_idx, raw_line in enumerate(samples):
    fields = raw_line.split(',')
    # fields[0] is the "Date Time" entry, so only the remaining fields are parsed
    float_records[row_idx, : ] = [float(field) for field in fields[1:]]

# Standardize every record in place, using statistics of the first 200000 rows only
mean = float_records[: 200000].mean(axis = 0)
float_records -= mean
# std is taken after centering, again over the first 200000 rows
std = float_records[: 200000].std(axis = 0)
float_records /= std

#print(float_records.shape[-1])

# Question: given the past 10 days of data, can we predict the temperature 24 hrs ahead?
# lookback = 1440 observations (6 * 24 * 10), i.e. 10 days at one record per 10 mins
# step = 6, i.e. the lookback window is sampled at one data point per hour
# delay = 144, i.e. the target is 24 hrs in the future
# The first 200,000 records are used as training data
# At this point every record in float_records has been standardized with the mean and std of the first 200,000 (training) records

# Create Data Generator
>It yields a tuple (samples, targets), where samples is one batch of input data
>and targets is the corresponding array of target temperatures

> data - The original array of floating-point data (float_records)
> lookback - Defines how many timesteps back the input data should go (720 means 5 days, 1440 means 10 days, etc.)
> delay - Defines how many timesteps in the future target should be (144 means 24 hrs given 1 record / 10 mins)
> min_index and max_index - Indices in the data array (float_records) that delimit which timesteps to draw from. 
This helps segment the data into training, validation and test sets.
Note: all records were normalized using the mean and standard deviation of the first 200,000 (training) records
> shuffle - Whether to shuffle the samples or draw them in chronological order. Usually validation and test data are not shuffled in Timeseries problem
> batch_size - The number of samples per batch
> step - The period, in timesteps, at which data is sampled. The raw data has one record every 10 mins, so step = 6 draws one data point per hour

In [None]:
def generator(data, lookback, delay, min_index, max_index, shuffle = False, batch_size = 128, step = 6):
    """Endlessly yield (samples, targets) batches cut from a 2-D timeseries array.

    Parameters
    ----------
    data : array indexed as data[timestep][feature]; feature 1 is read as the target.
    lookback : number of timesteps each input window reaches back from its row.
    delay : number of timesteps after the row at which the target is read.
    min_index, max_index : rows are drawn from [min_index + lookback, max_index).
        When max_index is None it defaults to len(data) - delay - 1, the last
        row whose target still lies inside data.
    shuffle : if True, rows are drawn at random (with replacement); otherwise
        they are taken in chronological order, wrapping around at the end.
    batch_size : number of rows per batch.
    step : stride, in timesteps, used when subsampling the lookback window.

    Yields
    ------
    samples : array of shape (len(rows), ceil(lookback / step), data.shape[-1])
    targets : array of shape (len(rows),)

    Raises
    ------
    ValueError : if no row fits between min_index + lookback and max_index.
    """
    if max_index is None:
        max_index = ((len(data) - 1) - delay) #last valid index is len(data) - 1; leave room for the target

    first_row = min_index + lookback
    if max_index <= first_row:
        # Without this check the chronological branch would yield empty batches forever
        raise ValueError('max_index must be greater than min_index + lookback')

    # range(row - lookback, row, step) holds ceil(lookback / step) items, which differs
    # from lookback // step whenever lookback is not a multiple of step
    window_len = len(range(0, lookback, step))

    lower_index = first_row

    while True:
        if shuffle:
            rows = np.random.randint(first_row, max_index, size = batch_size)
        else:
            # Wrap around once a full batch no longer fits before max_index
            if lower_index + batch_size >= max_index:
                lower_index = first_row
            rows = np.arange(lower_index, min(lower_index + batch_size, max_index))
            lower_index += len(rows)

        samples = np.zeros((len(rows), window_len, data.shape[-1]))
        targets = np.zeros((len(rows),))

        for j, row in enumerate(rows):
            indices = np.arange(row - lookback, row, step)
            samples[j] = data[indices]
            targets[j] = data[row + delay][1]  #At index 1 we have "T (degC)"

        yield samples, targets

In [None]:
# Windowing parameters (the raw data holds one record every 10 minutes)
lookback = 1440  # 10 days of history: 6 * 24 * 10 records
step = 6         # keep every 6th record, i.e. one observation per hour
delay = 144      # target lies 24 hrs ahead: 6 * 24 records
batch_size = 128

# Keyword arguments common to all three generators
shared_gen_kwargs = dict(lookback = lookback, delay = delay, batch_size = batch_size, step = step)

# Training: first 200000 timesteps, drawn in random order
train_gen = generator(float_records, min_index = 0, max_index = 200000, shuffle = True, **shared_gen_kwargs)

# Validation and test generators are not shuffled (shuffle is left at its default)
validation_gen = generator(float_records, min_index = 200001, max_index = 300000, **shared_gen_kwargs)

# Test: everything after the validation range; max_index = None lets the generator pick the upper bound
test_gen = generator(float_records, min_index = 300001, max_index = None, **shared_gen_kwargs)

In [None]:
#np.mean(batch_maes) = 0.28969941979609765
#celsius_mae = 2.5645638478601653

from keras.models import Sequential
from keras import layers
from keras import optimizers
from keras import losses

# validation_steps counts batches, not samples: the validation range offers
# (300000 - 200001) - lookback rows and every batch consumes batch_size of them,
# so this many batches cover the validation range once
val_steps = ((300000 - 200001) - lookback) // batch_size

In [None]:
# Single GRU layer (32 units) followed by a one-unit dense output.
# The time dimension is left as None; the feature dimension comes from the data.
n_features = float_records.shape[-1]
model_gru = Sequential([
    layers.GRU(32, input_shape = (None, n_features)),
    layers.Dense(1),
])

# Mean absolute error loss, RMSprop with its default settings
model_gru.compile(loss = losses.MAE, optimizer = optimizers.RMSprop())

# 20 epochs of 500 training batches each, validating with val_steps batches per epoch
gru_fit_kwargs = dict(steps_per_epoch = 500,
                      epochs = 20,
                      validation_data = validation_gen,
                      validation_steps = val_steps)
history_gru = model_gru.fit_generator(train_gen, **gru_fit_kwargs)

In [None]:
# Single LSTM layer (32 units) followed by a one-unit dense output.
# input_shape must be passed as a keyword argument: (timesteps, features), with
# timesteps left as None and features taken from the data.
model_lstm = Sequential()
model_lstm.add(layers.LSTM(32, input_shape = (None, float_records.shape[-1])))
model_lstm.add(layers.Dense(1))

# Mean absolute error loss, RMSprop with its default settings
model_lstm.compile(optimizer = optimizers.RMSprop(),
                  loss = losses.MAE)

# Same schedule as the GRU run (20 epochs of 500 batches) so the two loss
# curves can be compared epoch by epoch
history_lstm = model_lstm.fit_generator(train_gen,
                                       steps_per_epoch = 500,
                                       epochs = 20,
                                       validation_data = validation_gen,
                                       validation_steps = val_steps)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training and validation loss recorded for the GRU run
gru_loss = history_gru.history['loss']
gru_val_loss = history_gru.history['val_loss']

epochs = range(1, len(gru_loss) + 1)

# label= (not title=) names each curve so that plt.legend() has entries to show
plt.plot(epochs, gru_loss, 'bo', label = 'Training Loss')
plt.plot(epochs, gru_val_loss, 'b', label = 'Validation Loss')
plt.title('GRU Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MAE)')
plt.legend()
plt.show()

# Per-epoch training and validation loss recorded for the LSTM run
lstm_loss = history_lstm.history['loss']
lstm_val_loss = history_lstm.history['val_loss']

epochs = range(1, len(lstm_loss) + 1)

# 'c^' = cyan triangles, 'r' = red line (format strings must be quoted)
plt.plot(epochs, lstm_loss, 'c^', label = 'Training Loss')
plt.plot(epochs, lstm_val_loss, 'r', label = 'Validation Loss')
plt.title('LSTM Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MAE)')
plt.legend()
plt.show()