## Feature Generation Using seq2seq/autoencoder models

    - use a seq2seq (encoder-decoder) architecture to generate embeddings for multi-dimensional time series
    - set target = input for the autoencoder setup
    - use the next day's history as the target to obtain a different embedding

In [None]:
import pickle
import matplotlib.pyplot as plt
from datetime import datetime

import numpy as np
import random
import tensorflow as tf
import tensorflow_addons as tfa
import time

from sklearn.model_selection import train_test_split

In [None]:
# from the alarm classification data
# SECURITY: pickle.load can execute code embedded in the file; only load
# pickles that come from a trusted source.
with open('inverter-data-cnn-v01.pkl', 'rb') as handle:
    x_dict, y_dict, label_df = pickle.load(handle)

# Stack the per-inverter arrays along axis 0, visiting keys in x_dict order
# so the x and y rows stay paired.
x_all = np.concatenate([x_dict[inv] for inv in x_dict], axis=0)
y_all = np.concatenate([y_dict[inv] for inv in x_dict], axis=0)
x_all.shape, y_all.shape

In [None]:
# from 04-c file
# Loads a pre-built (x_all, y_all) pair. This rebinds the x_all / y_all names
# that the inverter-data cell above assigns, if that cell was also run.
# SECURITY: pickle.load can execute code embedded in the file; only load
# pickles that come from a trusted source.
with open('autoencoder-data-v01.pkl', 'rb') as handle:
    x_all, y_all = pickle.load(handle)
x_all.shape, y_all.shape

In [None]:
# Plot last-axis index 2 of the first sample along axis 1.
plt.plot(x_all[0, :, 2])

In [None]:
# Maximum of x_all taken over axes 0 and 1, i.e. one value per last-axis entry.
xmax = np.amax(x_all, axis=(0, 1))
print(xmax)

In [None]:
# Maximum of y_all taken over axes 0 and 1, i.e. one value per last-axis entry.
ymax = np.amax(y_all, axis=(0, 1))
print(ymax)

In [None]:
# Scale every feature by its maximum. A feature whose maximum is exactly 0
# would turn the division into NaN/inf, so such divisors are replaced by 1,
# which leaves that feature's values unchanged.
x_norm = x_all / np.where(xmax == 0, np.ones_like(xmax), xmax)
y_norm = y_all / np.where(ymax == 0, np.ones_like(ymax), ymax)

### Check Normalization

In [None]:
# Per-feature maxima of the scaled arrays, displayed as a sanity check.
np.amax(x_norm, axis=(0, 1)), np.amax(y_norm, axis=(0, 1))

In [None]:
# 2 x 3 grid for the first sample: the top row shows the raw features (with
# titles), the bottom row shows the same features after scaling.
feature_titles = ("Power", "Temperature", "Irradiance")
for col, title in enumerate(feature_titles):
    plt.subplot(2, 3, col + 1)
    plt.plot(x_all[0, :, col])
    plt.title(title)

for col in range(len(feature_titles)):
    plt.subplot(2, 3, col + 4)
    plt.plot(x_norm[0, :, col])


In [None]:
# Same raw-vs-scaled comparison for the first sample, drawn on a larger
# explicit figure: axes 1-3 hold the raw features, axes 4-6 the scaled ones.
f = plt.figure(figsize=(15, 12))
ax, ax2, ax3 = (f.add_subplot(2, 3, pos) for pos in (1, 2, 3))
ax4, ax5, ax6 = (f.add_subplot(2, 3, pos) for pos in (4, 5, 6))

for col, raw_axis in enumerate((ax, ax2, ax3)):
    raw_axis.plot(x_all[0, :, col])

for col, norm_axis in enumerate((ax4, ax5, ax6)):
    norm_axis.plot(x_norm[0, :, col])


In [None]:
# Display the scaled values of last-axis index 2 for the first sample.
x_norm[0, :, 2]

In [None]:
# Manual re-computation of the scaling for last-axis index 2, using the
# computed maximum instead of a pasted-in constant so the check stays valid
# when the data file changes.
# NOTE(review): assumes the former literal 18674.02929688 was xmax[2] - confirm.
x_all[0, :, 2] / xmax[2]

In [None]:
# Raw maximum of the first sample for last-axis index 2, and the same value
# scaled by the computed feature maximum rather than a pasted-in constant.
# NOTE(review): assumes the former literal 18674.02929688 was xmax[2] - confirm.
x_all[0, :, 2].max(), x_all[0, :, 2].max() / xmax[2]

In [None]:
# Choose the training target:
#   - autoencoder setup: target = input (use the commented-out split below)
#   - next-day setup:    target = y_norm (the active split)
# X_train,  X_test, Y_train, Y_test = train_test_split(x_norm, x_norm, test_size=0.2)

# NOTE(review): no random_state is passed, so the 80/20 split differs between runs.
X_train,  X_test, Y_train, Y_test = train_test_split(x_norm, y_norm, test_size=0.2)
X_train.shape, X_test.shape

## Encoder Decoder Architecture 

In [None]:
# Model Parameters
# NOTE(review): `embedding_dims` is not read by any other visible cell.
# NOTE(review): the name `loss_function` is rebound by `def loss_function(...)`
# in a later cell, so the string assigned here is no longer reachable once
# that cell has run - the setting has no effect on training as written.
embedding_dims = 256 #128
rnn_units = 256 #128
dense_units = 512 # 256
Dtype = tf.float32   #used to initialize DecoderCell Zero state
bidirectional = True
loss_function = 'mse' # 'mae'
BATCH_SIZE = 32

In [None]:
# Size the shuffle buffer and the epoch length from the training split.
# Using the full data set (x_norm) here over-counted the batches per epoch,
# because only the X_train rows are fed to the dataset below.
BUFFER_SIZE = X_train.shape[0]
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# Shuffle across the whole training split, then cut fixed-size batches; the
# trailing partial batch is dropped so every batch has BATCH_SIZE rows.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, Y_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
example_X, example_Y = next(iter(train_dataset))
print(example_X.shape)
print(example_Y.shape)

In [None]:
import sys

#ENCODER
class EncoderNetwork(tf.keras.Model):
    """Container for the encoder layers; the forward pass is driven by the caller.

    Layers created:
      * ``encoder_embedding`` - tanh Dense projection to ``rnn_units``.
      * ``encoder_rnnlayer``  - LSTM returning sequences and states, wrapped in
        a concat-merged Bidirectional layer when the module-level
        ``bidirectional`` flag is truthy.
      * ``state_converter``   - tanh Dense to ``rnn_units``; only created in the
        bidirectional case.

    Args:
        rnn_units: width used for the Dense layers and the LSTM.
    """

    def __init__(self, rnn_units):
        super().__init__()
        layers = tf.keras.layers
        self.encoder_embedding = layers.Dense(rnn_units, activation='tanh')
        lstm = layers.LSTM(rnn_units, return_sequences=True, return_state=True)
        if not bidirectional:
            self.encoder_rnnlayer = lstm
            return
        self.encoder_rnnlayer = layers.Bidirectional(lstm, merge_mode='concat')
        self.state_converter = layers.Dense(rnn_units, activation='tanh')
            

#DECODER-No attention        
class DecoderNetwork2(tf.keras.Model):
    """Container for the attention-free decoder layers.

    Layers created:
      * ``decoder_embedding`` - tanh Dense projection to ``rnn_units``.
      * ``decoder_rnncell``   - LSTMCell of width ``rnn_units``.
      * ``decoder``           - RNN wrapper around the cell that returns both
        the output sequence and the final states.
      * ``dense_layer``       - relu Dense projecting to ``num_features``.

    Args:
        rnn_units: width of the embedding and the LSTM cell.
        num_features: number of output units of the final Dense layer.
    """

    def __init__(self, rnn_units, num_features):
        super().__init__()
        layers = tf.keras.layers
        self.decoder_embedding = layers.Dense(rnn_units, activation='tanh')
        # An LSTMCell wrapped in RNN (rather than layers.LSTM) because
        # inference runs the decoder one step at a time.
        # https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTMCell
        # https://www.tensorflow.org/guide/keras/rnn
        self.decoder_rnncell = layers.LSTMCell(rnn_units)
        self.decoder = layers.RNN(
            self.decoder_rnncell,
            return_sequences=True,
            return_state=True,
        )
        self.dense_layer = layers.Dense(num_features, activation='relu')

#DECODER
class DecoderNetwork(tf.keras.Model):
    """Attention-based decoder assembled from tensorflow_addons seq2seq parts.

    Construction reads the module-level names ``dense_units``, ``BATCH_SIZE``
    and ``Tx``.

    NOTE(review): ``Tx`` is not assigned in any visible cell, so instantiating
    this class presumably raises NameError unless it is defined elsewhere -
    confirm. The visible cells instantiate ``DecoderNetwork2`` instead.
    """

    def __init__(self, rnn_units):
        super().__init__()
        # Output projection with a single unit.
        self.dense_layer = tf.keras.layers.Dense(1)
        self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        # Create attention mechanism with memory = None
        self.attention_mechanism = self.build_attention_mechanism(dense_units,None,BATCH_SIZE*[Tx])
        self.rnn_cell = self.build_rnn_cell(BATCH_SIZE)
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler= self.sampler,
                                                output_layer=self.dense_layer)

    def build_attention_mechanism(self, units,memory, memory_sequence_length):
        """Return a LuongAttention built from the given units, memory and lengths."""
        return tfa.seq2seq.LuongAttention(units, memory = memory, 
                                          memory_sequence_length=memory_sequence_length)
        #return tfa.seq2seq.BahdanauAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)

    # wrap decodernn cell  
    def build_rnn_cell(self, batch_size ):
        """Wrap the LSTM cell and attention mechanism in an AttentionWrapper.

        ``batch_size`` is accepted but not used in the body.
        """
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism,
                                                attention_layer_size=dense_units)
        return rnn_cell
    
    def build_decoder_initial_state(self, batch_size, encoder_state,Dtype):
        """Return the wrapper's initial state with ``cell_state`` set to ``encoder_state``."""
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size, 
                                                                dtype = Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state) 
        return decoder_initial_state


def loss_function(y_pred, y, loss_type='mse'):
    """Return the scalar loss between predictions and targets.

    The loss kind is an explicit argument because, inside this function, the
    name ``loss_function`` refers to the function itself; comparing it with
    the string ``'mae'`` is always False, so the MAE branch could never run.
    The default keeps the behaviour existing two-argument callers get (MSE).

    Args:
        y_pred: predicted values; the call site passes
            [batch_size, seq_len, num_features].
        y: target values with the same shape as ``y_pred``.
        loss_type: ``'mse'`` (default) or ``'mae'``.

    Returns:
        Scalar tensor holding the mean loss.

    Raises:
        ValueError: if ``loss_type`` is neither ``'mse'`` nor ``'mae'``.
    """
    if loss_type == 'mae':
        criterion = tf.keras.losses.MeanAbsoluteError()
    elif loss_type == 'mse':
        criterion = tf.keras.losses.MeanSquaredError()
    else:
        raise ValueError(f"unsupported loss_type: {loss_type!r}")

    loss = criterion(y_true=y, y_pred=y_pred)
    return tf.reduce_mean(loss)

def train_step(input_batch, output_batch, encoder_initial_cell_state):
    """Run one teacher-forced optimisation step and return its loss.

    Uses the module-level ``encoderNetwork``, ``decoderNetwork``, ``optimizer``,
    ``bidirectional`` flag and ``loss_function``.

    Args:
        input_batch: encoder input batch.
        output_batch: target batch; steps ``[:-1]`` feed the decoder and steps
            ``[1:]`` are what the decoder output is scored against.
        encoder_initial_cell_state: initial state list for the encoder RNN.

    Returns:
        The loss computed for this batch.
    """
    with tf.GradientTape() as tape:
        # Encoder: dense projection followed by the (bi)LSTM.
        embedded_inputs = encoderNetwork.encoder_embedding(input_batch)
        encoder_result = encoderNetwork.encoder_rnnlayer(
            embedded_inputs, initial_state=encoder_initial_cell_state)

        if bidirectional:
            # NOTE(review): Keras documents the return order of
            # Bidirectional(LSTM(return_state=True)) as
            # (output, forward_h, forward_c, backward_h, backward_c). If that
            # holds, the names below pair forward h with forward c rather than
            # the two h states. The inference cell unpacks the same way, so
            # change both together or neither - confirm.
            _, state_hf, state_hb, state_cf, state_cb = encoder_result
            state_h = encoderNetwork.state_converter(
                tf.concat([state_hf, state_hb], axis=-1))
            state_c = encoderNetwork.state_converter(
                tf.concat([state_cf, state_cb], axis=-1))
        else:
            _, state_h, state_c = encoder_result

        # Teacher forcing: feed steps 0..T-2, score against steps 1..T-1.
        teacher_inputs = output_batch[:, :-1]
        targets = output_batch[:, 1:]

        # Decoder: embed, run the RNN from the encoder state, project back.
        decoder_emb_inp = decoderNetwork.decoder_embedding(teacher_inputs)
        rnn_outputs, _, _ = decoderNetwork.decoder(
            decoder_emb_inp, initial_state=[state_h, state_c])
        logits = decoderNetwork.dense_layer(rnn_outputs)

        loss = loss_function(logits, targets)

    # Update the encoder and decoder weights together.
    variables = (encoderNetwork.trainable_variables
                 + decoderNetwork.trainable_variables)
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

#RNN LSTM hidden and memory state initializer
def initialize_initial_state():
    """Return the zero initial state list for the encoder RNN.

    The list holds four ``(BATCH_SIZE, rnn_units)`` zero tensors when the
    module-level ``bidirectional`` flag is truthy, otherwise two.
    """
    state_count = 4 if bidirectional else 2
    return [tf.zeros((BATCH_SIZE, rnn_units)) for _ in range(state_count)]


In [None]:
# The decoder's output width is taken from the last axis of the scaled input.
num_features = x_norm.shape[-1]
encoderNetwork = EncoderNetwork(rnn_units)
# The attention-free decoder is the one used by train_step and the inference cell.
decoderNetwork = DecoderNetwork2(rnn_units, num_features)
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()

In [None]:
# Display the configured steps per epoch next to the training-split shapes and
# the (fractional) number of batches the training split actually holds.
steps_per_epoch, X_train.shape, X_train.shape[0]/BATCH_SIZE, Y_train.shape

In [None]:
epochs = 10
# Print roughly ten progress lines per epoch. The floor of 1 keeps the modulo
# below from dividing by zero when an epoch has fewer than ten steps.
batch_print = max(1, steps_per_epoch // 10)
for i in range(1, epochs + 1):

    encoder_initial_cell_state = initialize_initial_state()
    total_loss = 0.0
    num_batches = 0

    for batch, (input_batch, output_batch) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
        total_loss += batch_loss
        num_batches += 1
        if (batch + 1) % batch_print == 0:
            dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            # The value shown is the loss of the current batch, so label it as such.
            print(f"batch loss: {batch_loss.numpy()} epoch-{i}, batch-{batch+1}, time: {dt_string}")

    # Report the accumulated loss, which was previously summed but never shown.
    if num_batches:
        print(f"epoch-{i} mean loss: {float(total_loss) / num_batches}")

In [None]:
# Batch the held-out split in its stored order; the final partial batch is kept.
dataset_test = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(BATCH_SIZE, drop_remainder=False)
# Take the first test batch as the example used by the inference and plotting cells.
example_X, example_Y = next(iter(dataset_test))
print(example_X.shape) 
print(example_Y.shape) 

In [None]:
# example_X, example_Y = next(iter(dataset))
# print(example_X.shape) 
# print(example_Y.shape) 

In [None]:
# ---- Encode the example batch --------------------------------------------
input_sequences = example_X
inference_batch_size = input_sequences.shape[0]

# Zero initial state sized for this batch: four tensors when bidirectional,
# otherwise two.
num_state_tensors = 4 if bidirectional else 2
encoder_initial_cell_state = [tf.zeros((inference_batch_size, rnn_units))
                              for _ in range(num_state_tensors)]

encoder_emb_inp = encoderNetwork.encoder_embedding(input_sequences)

if bidirectional:
    # NOTE(review): Keras documents the return order of
    # Bidirectional(LSTM(return_state=True)) as
    # (output, forward_h, forward_c, backward_h, backward_c). If that holds,
    # the names below pair forward h with forward c rather than the two h
    # states. train_step unpacks the same way, so change both together or
    # neither - confirm.
    _, state_hf, state_hb, state_cf, state_cb = encoderNetwork.encoder_rnnlayer(
        encoder_emb_inp, initial_state=encoder_initial_cell_state)
    state_h = encoderNetwork.state_converter(tf.concat([state_hf, state_hb], axis=-1))
    state_c = encoderNetwork.state_converter(tf.concat([state_cf, state_cb], axis=-1))
else:
    _, state_h, state_c = encoderNetwork.encoder_rnnlayer(
        encoder_emb_inp, initial_state=encoder_initial_cell_state)

# The converted encoder states seed the decoder.
encoder_state = [state_h, state_c]

# ---- Decode step by step -------------------------------------------------
# Seed the decoder with the first target step, re-inserting the time axis so
# the RNN receives a length-1 sequence.
decoder_input = example_Y[:, 0, :]
seed_step = tf.expand_dims(decoder_input, axis=-2)
decoder_emb_inp = decoderNetwork.decoder_embedding(seed_step)

Ty = example_Y.shape[1]
maximum_iterations = Ty
inputs = decoder_emb_inp
state = encoder_state

# train_step feeds step t and scores the output against step t+1, so the first
# decoder output corresponds to target step 1. Position 0 is filled with the
# seed step and the loop runs Ty - 1 times; `predictions` then lines up
# index-for-index with example_Y. Steps are collected in a list and joined
# once, with the feature count taken from the data instead of a fixed 3.
step_predictions = [np.asarray(seed_step)]
for jj in range(Ty - 1):
    outputs, state_h, state_c = decoderNetwork.decoder(inputs, state)
    outputs = decoderNetwork.dense_layer(outputs)

    step_predictions.append(np.asarray(outputs))
    # Feed the model's own prediction back in as the next input.
    inputs = decoderNetwork.decoder_embedding(outputs)
    state = [state_h, state_c]

predictions = np.concatenate(step_predictions, axis=-2)

In [None]:
# Display the shape of the predictions array built by the inference cell.
predictions.shape

In [None]:
import random

def plot_sample(example_X, example_Y, num_samples=1):
    """Plot randomly chosen samples: input, target and prediction per feature.

    One figure is drawn per sample, with one subplot for each of the three
    features. The input history is drawn first; the target and the prediction
    are drawn on x positions that continue after the input. Predictions are
    read from the module-level ``predictions`` array.

    Args:
        example_X: input batch indexed as [sample, step, feature].
        example_Y: target batch indexed as [sample, step, feature].
        num_samples: number of figures (random samples) to draw.
    """
    len_x, len_y = example_X.shape[1], example_Y.shape[1]
    # x positions for the target / prediction curves, placed after the input.
    shifted_x = list(range(len_x, len_x + len_y))
    feature_titles = ('Power', 'Temperature', 'Irradiance')

    for ii in range(num_samples):
        # randrange excludes the upper bound; randint(0, n) could return n and
        # index one past the last sample.
        cnum = random.randrange(len(example_X))
        f = plt.figure(ii + 1, figsize=(15, 4))

        for col, title in enumerate(feature_titles):
            ax = f.add_subplot(1, 3, col + 1)
            ax.plot(example_X[cnum, :, col])
            ax.plot(shifted_x, example_Y[cnum, :, col])
            ax.plot(shifted_x, predictions[cnum, :, col])
            ax.title.set_text(title)

In [None]:
# Run label: 256 - 256 - 512 - mse
# (presumably embedding_dims - rnn_units - dense_units - loss; confirm)
plot_sample(example_X, example_Y, 5)

In [None]:
# Run label: 128 - 128 - 256 - mae
# (presumably embedding_dims - rnn_units - dense_units - loss; confirm)
# NOTE(review): loss_function as defined in this notebook compares its own
# name with 'mae', so the MSE branch appears to run regardless of the
# configured string - confirm which loss actually produced this plot.
plot_sample(example_X, example_Y, 5)