In [None]:
from transformer_models_ts import (create_padding_mask,
                                   create_look_ahead_mask,
                                   Transformer)

import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import pickle
import sys
import tensorflow as tf
import time
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tqdm import trange

In [None]:
def create_masks(inp, tar):
    """Build the three masks handed to the transformer.

    Args:
        inp: Encoder input batch; forwarded unchanged to
            ``create_padding_mask``.
        tar: Decoder input batch; only the size of its axis 1 is used.

    Returns:
        Tuple ``(enc_padding_mask, look_ahead_mask, dec_padding_mask)``.
    """
    # Mask for the decoder's first attention block: hides future positions
    # of the decoder input. It is sized by axis 1 of `tar`.
    target_len = tf.shape(tar)[1]
    causal_mask = create_look_ahead_mask(target_len)

    # Padding mask for the encoder's own input.
    encoder_mask = create_padding_mask(inp)

    # Padding mask for the decoder's second attention block. That block masks
    # the encoder outputs, which is why it is also derived from `inp`.
    decoder_mask = create_padding_mask(inp)

    # No padding mask is built for `tar`, so the look-ahead mask is returned
    # on its own rather than merged with a target padding mask.
    return encoder_mask, causal_mask, decoder_mask


def evaluate_one_example(encoder_input, decoder_input, transformer, max_length=None):
    """Autoregressively decode one sequence for a single example.

    Starting from one seed step, the transformer is called repeatedly; after
    each call the last predicted step is appended to the decoder input.

    Args:
        encoder_input: One un-batched encoder sequence, ``(seq_len, features)``.
        decoder_input: The first decoder step, ``(features,)``.
        transformer: Callable invoked as ``transformer(inp, tar, training,
            enc_padding_mask, look_ahead_mask, dec_padding_mask)`` and
            returning ``(predictions, attention_weights)``.
        max_length: Total number of steps in the returned sequence, seed step
            included. Defaults to the encoder sequence length.

    Returns:
        Tuple ``(output, attention_weights)``. ``output`` is float32 with
        shape ``(1, max_length, features)``. ``attention_weights`` comes from
        the last transformer call, or is ``None`` if no decoding step ran.
    """
    encoder_input = tf.expand_dims(encoder_input, 0)  # (1, seq_len, features)
    output = tf.expand_dims(decoder_input, 0)  # (1, features)
    output = tf.expand_dims(output, 0)  # (1, 1, features)
    output = tf.cast(output, tf.float32)

    if max_length is None:
        max_length = encoder_input.shape[1]

    # Defined up front so the return statement is valid even when the loop
    # below does not execute (max_length <= 1).
    attention_weights = None

    for _ in range(max_length - 1):
        # train_step feeds a look-ahead mask to the decoder; use the same
        # kind of mask here so inference sees what training saw.
        look_ahead_mask = create_look_ahead_mask(tf.shape(output)[1])

        # predictions: (batch_size, current_len, features)
        predictions, attention_weights = transformer(encoder_input,
                                                     output,
                                                     False,
                                                     None,
                                                     look_ahead_mask,
                                                     None)

        # Keep only the newest time step: (batch_size, 1, features), float32.
        predictions = predictions[:, -1:, :]
        output = tf.concat([output, predictions], axis=-2)  # grow the sequence axis

    return output, attention_weights


def get_ae_features(encoder_input, transformer, embed_dim, batch_size,
                    batch_threshold=100):
    """Run the transformer's encoder over ``encoder_input``.

    Inputs with more than ``batch_threshold`` examples are encoded in
    mini-batches of ``batch_size`` and concatenated along axis 0; smaller
    inputs are encoded in a single call.

    Args:
        encoder_input: Array whose first axis indexes the examples.
        transformer: Object exposing ``encoder(x, training=..., mask=...)``.
        embed_dim: Unused; kept so existing callers keep working.
        batch_size: Mini-batch size used on the batched path.
        batch_threshold: Largest number of examples encoded in one call.

    Returns:
        The encoder output for every example, in input order.
    """
    num_points = encoder_input.shape[0]

    if num_points <= batch_threshold:
        return transformer.encoder(encoder_input, training=False, mask=None)

    dataset = tf.data.Dataset.from_tensor_slices(encoder_input).batch(batch_size)
    # Ceiling division: the last batch may be smaller than batch_size.
    num_batches = -(-num_points // batch_size)

    # Wrap the dataset itself (not enumerate(...)) and pass the total so the
    # progress bar can show completion and an ETA.
    encoded_batches = [
        transformer.encoder(inp, training=False, mask=None)
        for inp in tqdm(dataset, total=num_batches)
    ]
    return tf.concat(encoded_batches, axis=0)


def train(X_train, Y_train, X_test, Y_test, checkpoint_path, **kwargs):
    """Build a Transformer, restore its newest checkpoint, and run one action.

    Args:
        X_train: 3-D array ``(num_examples, inp_seq_len, inp_features)``. Used
            for training and, with ``action="predict_all"``, as encoder input.
        Y_train: 3-D array ``(num_examples, out_seq_len, out_features)``.
        X_test: Held-out inputs; only the first batch is used, and only by
            ``action="predict_batch"``.
        Y_test: Held-out targets; same usage as ``X_test``.
        checkpoint_path: Passed to ``tf.train.CheckpointManager`` as its
            directory. Plots from ``predict_batch`` are written to the parent
            directory of this path.
        **kwargs: Optional settings (defaults in parentheses):
            ``num_layers`` (2), ``d_model`` (128), ``num_heads`` (8),
            ``dff`` (128), ``BATCH_SIZE`` (64), ``epochs`` (100) and
            ``action`` ("train"; also "predict_all" or "predict_batch").
            ``padding_reqd``, ``normalize_flag`` and ``seed`` are accepted
            but have no effect.

    Returns:
        dict: ``{'history': {'mae': [...], 'mse': [...]}}`` after training,
        ``{'prediction': ...}`` for the two predict actions, and an empty
        dict for an unknown action.
    """
    num_layers = kwargs.get("num_layers", 2)
    d_model = kwargs.get("d_model", 128)
    num_heads = kwargs.get("num_heads", 8)
    dff = kwargs.get("dff", 128)
    BATCH_SIZE = kwargs.get("BATCH_SIZE", 64)
    epochs = kwargs.get("epochs", 100)
    action = kwargs.get("action", "train")

    _, inp_seq_len, inp_features = X_train.shape
    _, out_seq_len, out_features = Y_train.shape

    print("Train and Test data shape:", X_train.shape, X_test.shape)
    BUFFER_SIZE = len(X_train)

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train))

    # Smoke test: push random data through a throw-away model and print the
    # output shape before the real model is built.
    sample_transformer = Transformer(num_layers=num_layers,
                                     d_model=d_model,
                                     num_heads=num_heads, dff=dff,
                                     target_vocab_size=out_features,
                                     pe_input=inp_seq_len,
                                     pe_target=out_seq_len)

    temp_input = tf.random.uniform((BATCH_SIZE, inp_seq_len, inp_features),
                                   dtype=tf.float32, minval=0, maxval=1)
    temp_target = tf.random.uniform((BATCH_SIZE, out_seq_len, out_features),
                                    dtype=tf.float32, minval=0, maxval=1)

    fn_out, _ = sample_transformer(temp_input, temp_target, training=False,
                                   enc_padding_mask=None,
                                   look_ahead_mask=None,
                                   dec_padding_mask=None)
    print("Output shape:", fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size)

    print(temp_input.dtype, temp_target.dtype)

    # The model that is actually trained / restored.
    transformer = Transformer(num_layers, d_model, num_heads, dff,
                              out_features, inp_seq_len, out_seq_len, rate=0.1)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9,
                                         beta_2=0.999, epsilon=1e-7)
    loss_object = tf.keras.losses.MeanSquaredError(
                    reduction=tf.keras.losses.Reduction.NONE,
                    name='mean_squared_error')

    def loss_function(real, pred):
        """MSE summed over the sequence axis, then averaged over the batch."""
        loss_ = loss_object(real, pred)  # batch X seq_len
        loss_ = tf.reduce_sum(loss_, axis=-1)  # batch
        return tf.reduce_mean(loss_)

    def accuracy_function(real, pred):
        """MAE summed over the sequence axis, then averaged over the batch."""
        mae = tf.keras.losses.MeanAbsoluteError(
                reduction=tf.keras.losses.Reduction.NONE)
        loss_ = mae(real, pred)
        loss_ = tf.reduce_sum(loss_, axis=-1)
        return tf.reduce_mean(loss_)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    # Despite its name this metric tracks the MAE from accuracy_function.
    train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

    ckpt = tf.train.Checkpoint(transformer=transformer,
                               optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_path,
                                              max_to_keep=5)
    checkpoint_directory = os.path.dirname(checkpoint_path)
    # If a checkpoint exists, restore the latest one.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored!!')

    return_dict = {}

    if action == "train":
        # @tf.function trace-compiles train_step into a TF graph and
        # specializes it to the argument shapes. The input_signature leaves
        # batch size and sequence length open so that a smaller last batch
        # does not trigger a re-trace.
        train_step_signature = [
            tf.TensorSpec(shape=(None, None, inp_features), dtype=tf.float64),
            tf.TensorSpec(shape=(None, None, out_features), dtype=tf.float64),
        ]

        @tf.function(input_signature=train_step_signature)
        def train_step(inp, tar):
            # Teacher forcing: the decoder sees steps [0, T-1) and is scored
            # against steps [1, T).
            tar_inp = tar[:, :-1, :]
            tar_real = tar[:, 1:, :]

            # Only the look-ahead mask is used; both padding masks are
            # discarded and replaced by None.
            _, combined_mask, _ = create_masks(inp, tar_inp)
            enc_padding_mask = None
            dec_padding_mask = None

            with tf.GradientTape() as tape:
                predictions, _ = transformer(inp, tar_inp,
                                             True,
                                             enc_padding_mask,
                                             combined_mask,
                                             dec_padding_mask)
                loss = loss_function(tar_real, predictions)

            gradients = tape.gradient(loss, transformer.trainable_variables)
            optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

            train_loss(loss)
            train_accuracy(accuracy_function(tar_real, predictions))

        train_dataset = train_dataset.batch(BATCH_SIZE)
        print(f"Training on {BUFFER_SIZE} data points over {epochs} epochs")
        t = trange(epochs, desc='Epoch Desc', leave=True)

        loss_dict = {'mae': [], 'mse': []}
        for epoch in t:
            train_loss.reset_states()
            train_accuracy.reset_states()

            for inp, tar in train_dataset:
                train_step(inp, tar)

            mse_loss, mae_loss = train_loss.result(), train_accuracy.result()
            loss_dict['mse'].append(mse_loss)
            loss_dict['mae'].append(mae_loss)
            t.set_description('Epoch {} MSE-Loss {:.6f}  MAE-Loss {:.6f}'.format(epoch + 1,
                                                                mse_loss,
                                                                mae_loss))
            t.refresh()
            # Checkpoint every fifth epoch.
            if (epoch + 1) % 5 == 0:
                ckpt_manager.save()

        print("Training Complete!")
        return_dict['history'] = loss_dict

    elif action == "predict_all":
        # Encoder-only pass: per-time-step embeddings for every example.
        pred = get_ae_features(X_train, transformer, d_model, BATCH_SIZE)
        return_dict['prediction'] = pred

    elif action == "predict_batch":
        dataset_test = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(BATCH_SIZE, drop_remainder=False)
        example_X, example_Y = next(iter(dataset_test))
        print(example_X.shape, example_Y.shape)

        # With drop_remainder=False the first batch can hold fewer than
        # BATCH_SIZE examples, so size everything from the batch itself.
        num_examples = example_X.shape[0]
        predictions = np.zeros((num_examples, out_seq_len, out_features), dtype=np.float64)

        for index in trange(num_examples):
            # Decode from the first target step of each example.
            enc_inp, dec_inp0 = example_X[index, :, :], example_Y[index, 0, :]
            pred_ii, _ = evaluate_one_example(enc_inp, dec_inp0, transformer)
            predictions[index, :, :] = pred_ii

        return_dict['prediction'] = predictions
        nsample = 5
        len_x, len_y = example_X.shape[1], example_Y.shape[1]
        # Target and prediction are drawn to the right of the input window.
        shifted_x = [x+len_x for x in range(len_y)]
        # The same column index is used for input, target and prediction, so
        # it has to be valid for both feature axes.
        num_plot_features = min(inp_features, out_features)
        for ii in range(nsample):
            filename = "image_"+str(ii)+".png"
            plt.figure(ii+1)
            cnum = random.randint(0, len(example_X)-1)
            colnum = random.randint(0, num_plot_features-1)
            print(ii, cnum, colnum)
            # input
            plt.plot(example_X[cnum, :, colnum])

            # target
            plt.plot(shifted_x, example_Y[cnum, :, colnum])

            # prediction
            plt.plot(shifted_x, predictions[cnum, :, colnum])

            # Set the title before saving so it is part of the image file.
            plt.title("Feature-"+str(colnum))
            plt.savefig(os.path.join(checkpoint_directory, filename))
            print(f"saved {filename}")

    else:
        print("Unknown action type:", action)

    return return_dict


In [None]:
import random

def plot_sample(example_X, example_Y, predictions, num_samples=1,
                feature_names=('Power', 'Temperature', 'Irradiance')):
    """Plot input, target and prediction for randomly chosen examples.

    One figure is drawn per sample, with one panel per feature. The target
    and the prediction are drawn to the right of the input window.

    Args:
        example_X: Input batch indexed as ``[example, time, feature]``.
        example_Y: Target batch indexed as ``[example, time, feature]``.
        predictions: Predicted batch, indexed like ``example_Y``.
        num_samples: Number of figures to draw.
        feature_names: Panel titles; entry ``k`` labels feature column ``k``.
    """
    len_x, len_y = example_X.shape[1], example_Y.shape[1]
    shifted_x = list(range(len_x, len_x + len_y))
    num_panels = len(feature_names)

    for ii in range(num_samples):
        cnum = random.randint(0, len(example_X)-1)
        f = plt.figure(ii+1, figsize=(15, 4))

        for col, name in enumerate(feature_names):
            ax = f.add_subplot(1, num_panels, col + 1)
            ax.plot(example_X[cnum, :, col], label='input')
            ax.plot(shifted_x, example_Y[cnum, :, col], label='target')
            ax.plot(shifted_x, predictions[cnum, :, col], label='prediction')
            ax.title.set_text(name)
            # Same legend placement as plot_history_sample.
            ax.legend(loc="upper right")

In [None]:
# from 04-c file
# 'autoencoder-data-v01.pkl' - for past 1 day data with only operational hours
# normalizing parameters
# [ 539.6          70.15598297 6227.027832  ]
# [ 539.6          70.78488922 6227.027832  ]

# 'autoencoder-data-1d.pkl' - without removing non-operational hours
# normalizing parameters
# [ 539.6          70.15598297 6227.027832  ]
# [ 539.6          70.78488922 18674.02929688]
# Load the (inputs, targets) pair written by an earlier notebook.
# NOTE(review): pickle.load can execute arbitrary code -- only open files
# that come from a trusted source.
with open('autoencoder-data-1d.pkl', 'rb') as handle:
    x_all, y_all = pickle.load(handle)
x_all.shape, y_all.shape

In [None]:
# Maximum over axes 0 and 1, leaving one value per entry of the last axis.
# These values are the normalization scales applied in the next cell.
xmax = np.amax(x_all, axis=(0, 1))
ymax = np.amax(y_all, axis=(0, 1))
print(xmax)
print(ymax)

In [None]:
# Scale by the maxima computed above (broadcast along the last axis).
# NOTE(review): a maximum of 0 would cause a division by zero -- confirm this
# cannot happen for this data.
x_norm = x_all / xmax
y_norm = y_all / ymax

In [None]:
# Fix the random state so that the split is identical after every kernel
# restart. With a different split per session, a restored checkpoint could be
# evaluated on examples it was trained on.
X_train,  X_test, Y_train, Y_test = train_test_split(x_norm, y_norm, test_size=0.2,
                                                      random_state=100)
X_train.shape, X_test.shape

In [None]:
data_dir = os.getcwd()
ckpt_path = os.path.join(data_dir, "transformer_model_1d/cp.ckpt")

# Keep the returned dict (it holds the per-epoch loss history under
# 'history') instead of letting the cell echo it as output.
train_result = train(X_train,
                     Y_train,
                     X_test,
                     Y_test,
                     ckpt_path,
                     num_layers=2,
                     d_model=128,
                     epochs=500,
                     BATCH_SIZE=32,
                     action="train",
                     )


## Predict for some sample data

In [None]:
# train() with action="predict_batch" restores the latest checkpoint and
# decodes the first test batch. It does not train, so `epochs` has no effect.
# `predictions` is the returned dict; the array is under 'prediction'.
predictions = train(X_train, Y_train, X_test, Y_test, ckpt_path,
                      num_layers=2,
                      d_model=128,
                      epochs=500,
                      BATCH_SIZE=32,
                      action="predict_batch",
                      )


In [None]:
# Rebuild the first test batch the same way train(..., action="predict_batch")
# does, so example_X / example_Y line up with predictions['prediction'].
# The batch size (32) has to match the BATCH_SIZE passed to train() above.
dataset_test = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(32, drop_remainder=False)
example_X, example_Y = next(iter(dataset_test))
print(example_X.shape, example_Y.shape, predictions['prediction'].shape)

## Reconstruction of One Day Profile

In [None]:
# Draw five randomly chosen test examples: input, target and prediction.
plot_sample(example_X, example_Y, predictions['prediction'], num_samples=5)

## Reconstruction for Operational Hours

In [None]:
# NOTE(review): this call has the same arguments as the one under
# "Reconstruction of One Day Profile"; only the random sample choice differs.
# Presumably it was meant to run on the operational-hours data -- confirm.
plot_sample(example_X, example_Y, predictions['prediction'], num_samples=5)

## Apply the Transformer for Feature/Embedding Generation

In [None]:
# from the alarm classification data
# NOTE(review): pickle.load can execute arbitrary code -- only open files
# that come from a trusted source.
with open('inverter-data-cnn-daily.pkl', 'rb') as handle:
    x_dict, y_dict, label_df = pickle.load(handle)
    
# Stack the per-key arrays of x_dict / y_dict along axis 0.
# Note: this rebinds x_all and y_all, which held the autoencoder data earlier.
x_all, y_all = [], []
for inv in x_dict:
    x_ii, y_ii = x_dict[inv], y_dict[inv]
    x_all.append(x_ii)
    y_all.append(y_ii)

x_all = np.concatenate(x_all, axis=0)
y_all = np.concatenate(y_all, axis=0)
x_all.shape, y_all.shape

In [None]:
data_dir = os.getcwd()
# ckpt_path = os.path.join(data_dir, "transformer_model/cp.ckpt")
ckpt_path = os.path.join(data_dir, "transformer_model_1d/cp.ckpt")

# action="predict_all" runs only the encoder over the first argument; x_all is
# repeated to fill the remaining data arguments.
# NOTE(review): x_all is passed as loaded. Unlike the training data it is not
# divided by xmax in this notebook -- confirm it is already normalized.
predictions = train(x_all, x_all, x_all, x_all, ckpt_path,
                      num_layers=2,
                      d_model=128,
                      epochs=500,
                      BATCH_SIZE=32,
                      action="predict_all",
                      )


### The encoder produces one embedding per time step
    - shape of the embedding tensor = [num_samples, sequence_length, embedding_dimension]

In [None]:
# Encoder output returned by train(..., action="predict_all").
x_embed = predictions['prediction']
x_embed.shape

    - one can use either the first time step (similar to BERT's [CLS] token) or the last time step as the embedding vector

In [None]:
# Embedding taken at the last position of axis 1.
x_embed_final = x_embed[:, -1, :]
x_embed_final.shape

In [None]:
# Embedding taken at the first position of axis 1.
x_embed_initial = x_embed[:, 0, :]
x_embed_initial.shape

In [None]:
from sklearn.manifold import TSNE

# tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
# Project the last-step embeddings to 2-D. t-SNE is stochastic, so the random
# state is fixed to get the same layout after every kernel restart.
x_tsne = TSNE(n_components=2, random_state=100).fit_transform(x_embed_final)
x_tsne.shape

In [None]:
import seaborn as sns

In [None]:
# Scatter plot of the 2-D t-SNE projection of the last-step embeddings,
# colored by the label in y_all.
df_subset = pd.DataFrame()
df_subset['tsne-2d-one'] = x_tsne[:,0]
df_subset['tsne-2d-two'] = x_tsne[:,1]
df_subset["y"] = y_all

plt.figure(figsize=(16,10))
# NOTE(review): the palette has exactly two colors, so this presumably expects
# two distinct label values in y_all -- confirm.
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="y",
    palette=sns.color_palette("hls", 2),
    data=df_subset,
    legend="full",
    alpha=0.3
)

The t-SNE plot shows that the embeddings do not clearly discriminate between the classes.

In [None]:
# Persist the last-step embeddings together with the labels and label_df.
# NOTE(review): x_embed_final is a slice of the encoder output and is
# presumably a TensorFlow tensor, so reading this pickle may require
# TensorFlow -- confirm, or convert to a NumPy array before dumping.
data = (x_embed_final, y_all, label_df)
with open('inverter-data-daily-embedded-final.pkl', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Project the first-step embeddings to 2-D. t-SNE is stochastic, so the random
# state is fixed to get the same layout after every kernel restart.
x_tsne2 = TSNE(n_components=2, random_state=100).fit_transform(x_embed_initial)
x_tsne2.shape

In [None]:
# Scatter plot of the 2-D t-SNE projection of the first-step embeddings,
# colored by the label in y_all. Rebinds df_subset from the earlier plot.
df_subset = pd.DataFrame()
df_subset['tsne-2d-one'] = x_tsne2[:,0]
df_subset['tsne-2d-two'] = x_tsne2[:,1]
df_subset["y"] = y_all

plt.figure(figsize=(16,10))
# NOTE(review): the palette has exactly two colors, so this presumably expects
# two distinct label values in y_all -- confirm.
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="y",
    palette=sns.color_palette("hls", 2),
    data=df_subset,
    legend="full",
    alpha=0.3
)

In [None]:
# Persist the first-step embeddings together with the labels and label_df.
# NOTE(review): x_embed_initial is a slice of the encoder output and is
# presumably a TensorFlow tensor, so reading this pickle may require
# TensorFlow -- confirm, or convert to a NumPy array before dumping.
data = (x_embed_initial, y_all, label_df)
with open('inverter-data-daily-embedded-initial.pkl', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def plot_history_sample(x, y, num_samples=5,
                        feature_names=('Power', 'Temperature', 'Irradiance')):
    """Overlay one random positive and one random negative example per figure.

    Args:
        x: Data indexed as ``[example, time, feature]``.
        y: Labels; entries equal to 1 count as positive, entries equal to 0
            as negative.
        num_samples: Number of figures to draw.
        feature_names: Panel titles; entry ``k`` labels feature column ``k``.

    Raises:
        ValueError: If a figure is requested but ``y`` has no positive or no
            negative example.
    """
    pos_indices = np.where(y == 1)
    neg_indices = np.where(y == 0)
    # Only the first index array (example axis) is needed.
    pos_rows, neg_rows = pos_indices[0], neg_indices[0]

    x_pos = x[pos_rows, :, :]

    print(x_pos.shape)
    print(len(pos_rows), len(neg_rows))

    # Fail with a clear message instead of the opaque "empty range" error
    # that random.randint raises when one class is missing.
    if num_samples > 0 and (len(pos_rows) == 0 or len(neg_rows) == 0):
        raise ValueError(
            "plot_history_sample needs at least one positive (y == 1) and "
            "one negative (y == 0) example")

    num_panels = len(feature_names)
    for ii in range(num_samples):
        pindx = random.randint(0, len(pos_rows)-1)
        nindx = random.randint(0, len(neg_rows)-1)

        pnum = pos_rows[pindx]
        nnum = neg_rows[nindx]
        print(y[pnum], y[nnum])

        f = plt.figure(ii+1, figsize=(15, 4))
        for col, name in enumerate(feature_names):
            ax = f.add_subplot(1, num_panels, col + 1)
            ax.plot(x[pnum, :, col], label='pos')
            ax.plot(x[nnum, :, col], label='neg')
            ax.title.set_text(name)
            ax.legend(loc="upper right")

In [None]:
# Five figures, each overlaying one random positive (y == 1) and one random
# negative (y == 0) example.
plot_history_sample(x_all, y_all, 5)