# AUTOENCODER + CLUSTER ANOMALY DETECTION: TO DO

STEPS FOR SURE


### 1. implement k means clustering across different k's, minimizing loss
### 2. from clustering and users in cluster, run algorithm that would output "flag rules" (anomalous actions + time elapsed intervals in the cluster

btw clustering is similar to variational inference
(remember that VAEs perform variational inference within the features' latent space,
which is different from clustering)


To do:

-- so far, time = time elapsed, not date/time (so cyclicality of day/week/month/year is not taken into account)

- increase accuracy of uobs embeddings if possible (transformers)


- time-agnostic user-level embedding, with session delimitation, padding, and attention mask
    - my proposal for now is that we skip this, because adding time gates inherently separates sessions through the short-term memory (hidden state), while still keeping a minor note in the long-term memory (cell state)
- variational autoencoders and transformers... are they necessary?
- should we make it operation->session->user operation behaviour sequence?
    - session is delimited by min(time delimitation, operation delimitation)

Ok, here are some axioms:
- Sessions are delimited by a maximum time window (3 days)
    - dynamic padding by batch
- UOBS per user is a weighted addition of session vectors
    - because vector addition inherently scales up magnitude
    - cosine similarity between user embeddings is applied instead of euclidean distance (for operation embeddings)
- https://cmry.github.io/notes/euclidean-v-cosine




Then:
- then concatenate with an operation-agnostic timestamp sequence embedding 
    - compare normalized vs non-normalized (time-subtracted or raw)





Later:
- cluster users into groups based on cosine similarity
- assume users within the cluster should behave similarly across time
- pipeline group/cluster-level similarity score

Low priority considerations:
- nonlinearity
- peepholes
- gradient clipping

Done:
- operation-level embedding (checked empirically)
- time-agnostic user-level embedding, without session delimitation
- visual / cosine similarity clustering for empirical check (10-20 pairs)
- add time gates and do a comparison in reconstruction loss vs without time-gates (it was lower on avg)
- add time gates to the lstm

In [1]:
!pip install numpy

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
import torch
from torch.autograd import Variable

import s3fs
from pyarrow.parquet import ParquetDataset
import io
import re
import string
import tqdm
import numpy as np
import pandas as pd

import random
import os, errno
import sys
from tqdm import trange

import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [3]:
#with open('vectors.tsv') as f:
#    long_description = f.read()
#print(long_description)
# https://docs.python.org/3/library/io.html

with open('weights.npy', 'rb') as f:
    weights = np.load(f)

In [4]:
import pickle

with open('sequences', 'rb') as fp:  
    sequences = pickle.load(fp)
    
with open('timestamps', 'rb') as fp:  
    timestamps = pickle.load(fp)

In [5]:
def w2vembedding(sequences, weights):
    for seqIndex, sequence in enumerate(sequences):
        for opIndex, operation in enumerate(sequence):
            sequences[seqIndex][opIndex] = weights[operation - 1]
    return sequences

num_sequences = sequences
sequences = w2vembedding(sequences, weights)

In [6]:
def window(sequences, window_size = 20): 
    windowed_sequences = []
    for sequence in sequences:
        if len(sequence) >= window_size:
            for i in range(len(sequence) - window_size):
                windowed_sequences.append(sequence[i:window_size+i])
    return windowed_sequences

In [7]:
windowed_sequences = window(sequences)
torched_windowed_sequences = torch.from_numpy(np.array(windowed_sequences))
torched_windowed_sequences = torched_windowed_sequences.type(torch.FloatTensor)

windowed_timestamps = window(timestamps)
torched_windowed_timestamps = torch.from_numpy(np.array(windowed_timestamps))
torched_windowed_timestamps  = torched_windowed_timestamps.type(torch.FloatTensor)

In [None]:
from sklearn.model_selection import train_test_split

train_op, test_op, _, _ = train_test_split(np.array(windowed_sequences), np.array(windowed_sequences), test_size=0.15, random_state=42)
train_op = torch.from_numpy(train_op)
train_op = train_op.type(torch.FloatTensor)
test_op = torch.from_numpy(test_op)
test_op = test_op.type(torch.FloatTensor)

train_time, test_time, _, _ = train_test_split(np.array(windowed_timestamps), np.array(windowed_timestamps), test_size=0.15, random_state=42)
train_time = torch.from_numpy(train_time)
train_time = train_time.type(torch.FloatTensor)
test_time = torch.from_numpy(test_time)
test_time = test_time.type(torch.FloatTensor)

In [None]:
#train_op = train_op.permute(1,0,2)
#test_op = test_op.permute(1,0,2)
torched_windowed_sequences = torched_windowed_sequences.permute(1,0,2)
#train_time = train_time.permute(1,0)
#test_time = test_time.permute(1,0)
torched_windowed_timestamps= torched_windowed_timestamps.permute(1,0)

In [None]:
with open('train_op','rb') as f:
  train_op = torch.load(f)

with open('test_op','rb') as f:
  test_op = torch.load(f)

with open('train_time','rb') as f:
  train_time = torch.load(f)

with open('test_time','rb') as f:
  test_time = torch.load(f)

with open('torched_windowed_sequences','rb') as f:
  torched_windowed_sequences = torch.load(f)

with open('torched_windowed_timestamps','rb') as f:
  torched_windowed_timestamps = torch.load(f)

In [None]:
print(sequences[0])

In [8]:
print(torched_windowed_sequences.shape)
print(torched_windowed_timestamps.shape)
#print(train_op.shape)
#print(test_time.shape)

torch.Size([158859, 20, 128])
torch.Size([158859, 20])


# Time-LSTM Variational Autoencoder

Link on custom LSTM implementations: https://github.com/pytorch/pytorch/blob/master/benchmarks/fastrnns/custom_lstms.py

TorchScript and PyTorch JIT: https://www.youtube.com/watch?v=2awmrMRf0dA

Variational Autoencoder: https://github.com/AntixK/PyTorch-VAE/blob/8700d245a9735640dda458db4cf40708caf2e77f/models/vanilla_vae.py#L8

In [9]:
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.jit as jit
import warnings
from collections import namedtuple
from typing import List, Tuple
from torch import Tensor
import numbers

In [10]:
import math

### LSTM Cell

In [11]:
class LSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.i2h = nn.Linear(input_size, 4 * hidden_size, bias=bias) # previously 4, added another gate
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)

        self.t2h = nn.Linear(1, hidden_size, bias=bias)
        self.x2h = nn.Linear(input_size, hidden_size, bias=bias)
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, 
                t, # time modification here
                hidden):
        h, c = hidden

        preact = self.i2h(x) + self.h2h(h) 

        gates = preact[:, :, :3 * self.hidden_size].sigmoid()
        
        g_t = preact[:, :, 3 * self.hidden_size:].tanh()
        
        i_t = gates[:, :, :self.hidden_size]
        
        f_t = gates[:, :, self.hidden_size:2 * self.hidden_size]
        o_t = gates[:, :, -self.hidden_size:]
        
        c_t = torch.mul(c, f_t) + torch.mul(i_t, g_t)
        
        h_t = torch.mul(o_t, c_t.tanh())
        
        return h_t, (h_t, c_t)

### Time LSTM Cell 1

In [None]:
class LSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        
        self.i2h = nn.Linear(input_size, 4 * hidden_size, bias=bias) # previously 4, added another gate
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)
        
        # new gate
        self.t2h = nn.Linear(1, hidden_size, bias=bias)
        self.x2h = nn.Linear(input_size, hidden_size, bias=bias)
        
        # output gate addition
        self.Wto = nn.Linear(1, hidden_size, bias=False)
        
        self.wci = nn.Parameter(torch.zeros(1))
        self.wcf = nn.Parameter(torch.zeros(1))
        self.wco = nn.Parameter(torch.zeros(1))
        
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, 
                t, # time modification here
                hidden):
        h, c = hidden
        
        #print(c.shape)
        
        # Linear mappings
        preact = self.i2h(x) + self.h2h(h) 

        time = self.x2h(x) + self.t2h(t.unsqueeze(0).unsqueeze(2)) #.sigmoid()
        
        # activations
        gates = preact[:, :, :3 * self.hidden_size]
        # print(gates.shape) #1,5,48
        
        g_t = preact[:, :, 3 * self.hidden_size:].tanh()
        #print("g_t is " + str(g_t.shape))
        
        i_t = (gates[:, :, :self.hidden_size] 
               + self.wci * c
              ).sigmoid()
        #print("i_t is " + str(i_t.shape))
        
        f_t = (gates[:, :, self.hidden_size:2 * self.hidden_size] 
               +  self.wcf * c
              ).sigmoid()
        
        t_t = time.sigmoid()
        
        #MODIFICATION BELOW
        c_t = torch.mul(c, f_t) + torch.mul(torch.mul(i_t, g_t), t_t)
        
        o_t = (gates[:, :, -self.hidden_size:] 
               + self.Wto(t.unsqueeze(0).unsqueeze(2)) 
               + self.wco * c_t
              ).sigmoid()
        
        h_t = torch.mul(o_t, c_t.tanh())
        
        return h_t, (h_t, c_t)

### Parameter Constraints (Use Weight Clipper)

In [None]:
class weightConstraint(object):
    def __init__(self):
        pass
    
    def __call__(self,module):
        if hasattr(module,'weight'):
            w=module.weight.data
            w=w.clamp(0)
            module.weight.data=w

### Time LSTM Cell 2

In [None]:
class LSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        
        self.i2h = nn.Linear(input_size, 4 * hidden_size, bias=bias) # previously 4, added another gate
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)
        
        # TIME GATE 1
        self.t2h = nn.Linear(1, hidden_size, bias=bias)
        self.x2h = nn.Linear(input_size, hidden_size, bias=bias)
        
        # TIME GATE 2
        self.t2h2 = nn.Linear(1, hidden_size, bias=bias)
        self.x2h2 = nn.Linear(input_size, hidden_size, bias=bias) 
        
        # output gate addition
        self.Wto = nn.Linear(1, hidden_size, bias=False)
        
        self.wci = nn.Parameter(torch.zeros(1))
        self.wcf = nn.Parameter(torch.zeros(1))
        self.wco = nn.Parameter(torch.zeros(1))
        
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, 
                t, # time modification here
                hidden):
        h, c = hidden
        
        # Linear mappings
        preact = self.i2h(x) + self.h2h(h) 
        
        constraints=weightConstraint()
        self.t2h.apply(constraints)
        time1 = self.x2h(x) + self.t2h(t.unsqueeze(0).unsqueeze(2))
        
        time2 = self.x2h2(x) + self.t2h2(t.unsqueeze(0).unsqueeze(2))
        
        # activations
        gates = preact[:, :, :3 * self.hidden_size]
        # print(gates.shape) #1,5,48
        
        g_t = preact[:, :, 3 * self.hidden_size:].tanh()
        #print("g_t is " + str(g_t.shape))
        
        i_t = (gates[:, :, :self.hidden_size] 
               + self.wci * c
              ).sigmoid()
        #print("i_t is " + str(i_t.shape))
        
        f_t = (gates[:, :, self.hidden_size:2 * self.hidden_size] 
               +  self.wcf * c
              ).sigmoid()
        
        t1m = time1.sigmoid()
        t2m = time2.sigmoid()
        
        #MODIFICATION BELOW
        
        c_t_ = torch.mul(c, f_t) + torch.mul(torch.mul(i_t, g_t), t1m)
        c_t = torch.mul(c, f_t) + torch.mul(torch.mul(i_t, g_t), t2m)
        
        
        o_m = (gates[:, :, -self.hidden_size:] 
               + self.Wto(t.unsqueeze(0).unsqueeze(2)) 
               + self.wco * c_t_
              ).sigmoid()
        
        h_t = torch.mul(o_m, c_t_.tanh())
        
        return h_t, (h_t, c_t)

### Time LSTM Cell 3

In [None]:
class LSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        
        self.i2h = nn.Linear(input_size, 4 * hidden_size, bias=bias) # previously 4, added another gate
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)
        
        # TIME GATE 1
        self.t2h = nn.Linear(1, hidden_size, bias=bias)
        self.x2h = nn.Linear(input_size, hidden_size, bias=bias)
        
        # TIME GATE 2
        self.t2h2 = nn.Linear(1, hidden_size, bias=bias)
        self.x2h2 = nn.Linear(input_size, hidden_size, bias=bias) 
        
        # output gate addition
        self.Wto = nn.Linear(1, hidden_size, bias=False)
        
        self.wci = nn.Parameter(torch.zeros(1))
        self.wcf = nn.Parameter(torch.zeros(1))
        self.wco = nn.Parameter(torch.zeros(1))
        
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, 
                t, # time modification here
                hidden):
        h, c = hidden
        
        # Linear mappings
        preact = self.i2h(x) + self.h2h(h) 
        
        constraints=weightConstraint()
        self.t2h.apply(constraints)
        time1 = self.x2h(x) + self.t2h(t.unsqueeze(0).unsqueeze(2))
        
        time2 = self.x2h2(x) + self.t2h2(t.unsqueeze(0).unsqueeze(2))
        
        # activations
        gates = preact[:, :, :3 * self.hidden_size]
        # print(gates.shape) #1,5,48
        
        g_t = preact[:, :, 3 * self.hidden_size:].tanh()
        #print("g_t is " + str(g_t.shape))
        
        i_t = (gates[:, :, :self.hidden_size] 
               + self.wci * c
              ).sigmoid()
        #print("i_t is " + str(i_t.shape))
        
        t1m = time1.sigmoid()
        t2m = time2.sigmoid()
        
        #MODIFICATION BELOW
        
        c_t_ = torch.mul(1 - torch.mul(i_t, t1m), c) + torch.mul(torch.mul(i_t, g_t), t1m)
        c_t = torch.mul(1 - i_t, c) + torch.mul(torch.mul(i_t, g_t), t2m)
        
        
        o_m = (gates[:, :, -self.hidden_size:] 
               + self.Wto(t.unsqueeze(0).unsqueeze(2)) 
               + self.wco * c_t_
              ).sigmoid()
        
        h_t = torch.mul(o_m, c_t_.tanh())
        
        return h_t, (h_t, c_t)

### Peephole Only

In [None]:
class LSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.i2h = nn.Linear(input_size, 4 * hidden_size, bias=bias) 
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)

        self.t2h = nn.Linear(1, hidden_size, bias=bias)
        self.x2h = nn.Linear(input_size, hidden_size, bias=bias)
        
        self.wci = nn.Parameter(torch.zeros(1))
        self.wcf = nn.Parameter(torch.zeros(1))
        self.wco = nn.Parameter(torch.zeros(1))
        
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, 
                t, # time modification here
                hidden):
        h, c = hidden

        preact = self.i2h(x) + self.h2h(h) 
        a = self.x2h(x)
        
        b = self.t2h(t.unsqueeze(0).unsqueeze(2)).sigmoid()
        time = a+b

        gates = preact[:, :, :3 * self.hidden_size]
        
        g_t = preact[:, :, 3 * self.hidden_size:].tanh()
        
        i_t = (gates[:, :, :self.hidden_size] + self.wci * c).sigmoid()
        
        f_t = (gates[:, :, self.hidden_size:2 * self.hidden_size] + self.wcf * c).sigmoid()
        
        t_t = time.sigmoid()
        
        c_t = torch.mul(c, f_t) + torch.mul(i_t, g_t)
        
        o_t = (gates[:, :, -self.hidden_size:] + self.wco * c_t).sigmoid()
        
        h_t = torch.mul(o_t, c_t.tanh())
        
        return h_t, (h_t, c_t)

### Peephole + Coupled Gates

In [None]:
class LSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.i2h = nn.Linear(input_size, 4 * hidden_size, bias=bias) 
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)

        self.t2h = nn.Linear(1, hidden_size, bias=bias)
        self.x2h = nn.Linear(input_size, hidden_size, bias=bias)
        
        self.wci = nn.Parameter(torch.zeros(1))
        self.wcf = nn.Parameter(torch.zeros(1))
        self.wco = nn.Parameter(torch.zeros(1))
        
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, 
                t,
                hidden):
        h, c = hidden

        preact = self.i2h(x) + self.h2h(h) 
        a = self.x2h(x)
        
        b = self.t2h(t.unsqueeze(0).unsqueeze(2)).sigmoid()
        time = a+b

        gates = preact[:, :, :3 * self.hidden_size]
        
        g_t = preact[:, :, 3 * self.hidden_size:].tanh()
        
        i_t = (gates[:, :, :self.hidden_size] + self.wci * c).sigmoid()
        
        f_t = (gates[:, :, self.hidden_size:2 * self.hidden_size] + self.wcf * c).sigmoid()
        
        t_t = time.sigmoid()
        
        c_t = torch.mul(1 - i_t, f_t) + torch.mul(i_t, g_t)
        
        o_t = (gates[:, :, -self.hidden_size:] + self.wco * c_t).sigmoid()
        
        h_t = torch.mul(o_t, c_t.tanh())
        
        return h_t, (h_t, c_t)

### Custom LSTM

In [None]:
class LSTMCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        
        self.i2h = nn.Linear(input_size, 4 * hidden_size, bias=bias) # previously 4, added another gate
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)
        
        self.t2h = nn.Linear(1, hidden_size, bias=bias)
        self.x2h = nn.Linear(input_size, hidden_size, bias=bias)
        self.Wto = nn.Linear(1, hidden_size, bias=False)
        
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, 
                t,
                hidden):
        h, c = hidden
        
        preact = self.i2h(x) + self.h2h(h) 

        time = self.x2h(x) + self.t2h(t.unsqueeze(0).unsqueeze(2)) #.sigmoid()
        
        gates = preact[:, :, :3 * self.hidden_size]

        g_t = preact[:, :, 3 * self.hidden_size:].tanh()

        i_t = (gates[:, :, :self.hidden_size]
              + 0.1 * time).sigmoid()

        f_t = (gates[:, :, self.hidden_size:2 * self.hidden_size] 
              ).sigmoid()
        
        t_t = time.sigmoid()
        
        c_t = torch.mul(c, f_t) + torch.mul(i_t, g_t)
        
        o_t = (gates[:, :, -self.hidden_size:] 
               #+ self.Wto(t.unsqueeze(0).unsqueeze(2)) 
              ).sigmoid()
        
        h_t = torch.mul(o_t, c_t.tanh())
        
        return h_t, (h_t, c_t)

In [None]:
from sklearn.metrics import r2_score

### LSTM Architecture

https://towardsdatascience.com/step-by-step-understanding-lstm-autoencoder-layers-ffab055b6352

In [12]:
class LSTM(nn.Module):

    def __init__(self, input_size, hidden_size, bias=True):
        super().__init__()
        self.lstm_cell = LSTMCell(input_size, hidden_size) 

    def forward(self, input_, time, hidden):
        for i, x in enumerate(torch.unbind(input_, dim=0)): 
            h_t, hidden = self.lstm_cell(x.unsqueeze(0), 
                                             time[i].unsqueeze(0), 
                                             hidden) 
        return h_t, hidden
    
class lstm_encoder(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_layers = 1):
        super(lstm_encoder, self).__init__()
        self.input_size = input_size 
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = LSTM(input_size = input_size, hidden_size = hidden_size,)
    def forward(self, x_input, time_input):
        batch_size = x_input.shape[1]
        lstm_out, self.hidden = self.lstm(x_input,
                                          #.view(x_input.shape[0], x_input.shape[1], self.input_size)
                                          time_input, 
                                          (torch.zeros(self.num_layers, batch_size, self.hidden_size), torch.zeros(self.num_layers, batch_size, self.hidden_size)))
        #print("self.hidden is" + str(self.hidden[0].shape))
        return lstm_out, self.hidden
    
    def init_hidden(self, batch_size):
        return (torch.zeros(self.num_layers, batch_size, self.hidden_size),
                torch.zeros(self.num_layers, batch_size, self.hidden_size))
    
    def test(self, x_input, time_input, embedding, batch_size=1):
        mid = len(embedding)//2
        hid1 = torch.from_numpy(embedding[:mid])
        hid2 = torch.from_numpy(embedding[mid:])
        hidden = (hid1, hid2)
        lstm_out, self.hidden = self.lstm(x_input,
                                          time_input, 
                                          hidden)
        return self.hidden

class lstm_decoder(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers = 1):
        super(lstm_decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = LSTM(input_size = input_size, hidden_size = hidden_size,
                            #num_layers = num_layers
                               )
        self.linear = nn.Linear(hidden_size, input_size)           

    def forward(self, x_input, time_input, encoder_hidden_states):
        '''
        print("We are now in the lstm decoder")
        print("x_input shape is")
        print(x_input.shape)
        print(x_input)
        '''
        x_input = x_input.unsqueeze(0)
        #print(encoder_hidden_states[0].shape) # torch.Size([1, 5, 15])

        lstm_out, self.hidden = self.lstm(x_input, 
                                          time_input,
                                          encoder_hidden_states)
        #print('lstm_out')
        #print(lstm_out.shape)
        output = self.linear(lstm_out.squeeze(0))  
        
        #print('output')
        #print(output.shape)
        return output, self.hidden

### Autoencoder Architecture

In [15]:
class lstm_autoencoder(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(lstm_autoencoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        self.encoder = lstm_encoder(input_size = input_size, hidden_size = hidden_size)
        self.decoder = lstm_decoder(input_size = input_size, hidden_size = hidden_size)
        
        self.predictor = lstm_decoder(input_size = input_size, hidden_size = hidden_size)


    def train_autoencoder(self, input_tensor, 
                          time_tensor,
                          #target_tensor, 
                          n_epochs, 
                          #target_len, 
                          batch_size, 
                          training_prediction = 'recursive', teacher_forcing_ratio = 0.5, learning_rate = 0.01, dynamic_tf = False):
        
        print('addfasafsdf')
        
        # initialize array of losses 
        losses = np.full(n_epochs, np.nan)

        optimizer = optim.Adam(self.parameters(), lr = learning_rate)
        criterion = nn.MSELoss()

        # calculate number of batch iterations
        n_batches = int(input_tensor.shape[0] / batch_size)
        
        # torch.Size([158859, 20, 128])

        with trange(n_epochs) as tr: # USING TGE 
            for it in tr:
                
                batch_loss = 0.
                batch_loss_tf = 0.
                batch_loss_no_tf = 0.
                num_tf = 0
                num_no_tf = 0

                for b in range(n_batches):
                    # select data 
                    input_batch = input_tensor[:, b: b + batch_size, :]
                    
                    time_batch = time_tensor[:, b: b + batch_size]
                    
                    target_batch = input_tensor[:, b: b + batch_size, :]

                    # outputs tensor
                    #outputs = torch.zeros(target_len, batch_size, input_batch.shape[2])
                    outputs = torch.zeros(input_tensor.shape[0], batch_size, input_tensor.shape[2])

                    # initialize hidden state
                    encoder_hidden = self.encoder.init_hidden(batch_size)

                    # zero the gradient
                    optimizer.zero_grad()

                    # encoder outputs
                    encoder_output, encoder_hidden = self.encoder(input_batch, 
                                                                  time_batch
                                                                 )

                    # decoder with teacher forcing
                    #decoder_input = input_batch[-1, :, :]   # taking the last time step // shape: (batch_size, input_size)
                    

                    
                    decoder_input = torch.ones(input_batch[-1,:,:].shape) # SOS token
                    #print('decoder_input')
                    #print(decoder_input.shape)
                    time = time_batch[0,:]

                    decoder_hidden = encoder_hidden
                    
                    target_len = input_batch.shape[0] # new modification

                    if training_prediction == 'recursive':
                        # predict recursively
                        for t in range(target_len): 
                            decoder_output, decoder_hidden = self.decoder(decoder_input, time, decoder_hidden)
                            time = time_batch[t,:]
                            outputs[t] = decoder_output
                            decoder_input = decoder_output

                    if training_prediction == 'teacher_forcing':
                        # use teacher forcing
                        if random.random() < teacher_forcing_ratio:
                            for t in range(target_len): 
                                decoder_output, decoder_hidden = self.decoder(decoder_input, time, decoder_hidden)
                                time = time_batch[t,:]
                                outputs[t] = decoder_output
                                decoder_input = target_batch[t, :, :]

                        # predict recursively 
                        else:
                            for t in range(target_len): 
                                decoder_output, decoder_hidden = self.decoder(decoder_input, time,  decoder_hidden)
                                time = time_batch[t,:]
                                outputs[t] = decoder_output
                                decoder_input = decoder_output

                    if training_prediction == 'mixed_teacher_forcing':
                        # predict using mixed teacher forcing
                        for t in range(target_len): # IN REVERSE
                            time = time_batch[t,:]
                            
                            #print('afsasfdfaf')
                            
                            decoder_output, decoder_hidden = self.decoder(decoder_input, time, decoder_hidden)
                            
                            
                            print(outputs[t].shape)
                            print(decoder_output.shape)
                            
                            outputs[t] = decoder_output
                            
                            # predict with teacher forcing
                            if random.random() < teacher_forcing_ratio:
                                decoder_input = target_batch[t, :, :]
                            
                            # predict recursively 
                            else:
                                decoder_input = decoder_output

                    # compute the loss 
                    loss = criterion(outputs, target_batch)
                    #loss = torch.norm(outputs - target_batch)
                    batch_loss += loss.item()
                    
                    # backpropagation
                    loss.backward()
                    optimizer.step()

                # loss for epoch 
                batch_loss /= n_batches 
                losses[it] = batch_loss

                # dynamic teacher forcing
                if dynamic_tf and teacher_forcing_ratio > 0:
                    teacher_forcing_ratio = teacher_forcing_ratio - 0.02 

                # progress bar 
                tr.set_postfix(loss="{0:.10f}".format(batch_loss))
                    
        return losses
    
    # TIME SERIES PREDICTION: take 10 actions, predict next 2 with time inputs
    def train_predecoder(self, input_tensor, 
                         time_tensor,
                         target_len,
                         n_epochs,
                         batch_size = 5, 
                         training_prediction = 'recursive', teacher_forcing_ratio = 0.5, learning_rate = 0.01, dynamic_tf = False):
        
        # IMPORTANT: THIS PREVENTS ENCODER MODEL FROM BEING RE-TRAINED
        #for param in model.encoder.parameters():
        #    param.requires_grad = False
    
        # initialize array of losses 
        losses = np.full(n_epochs, np.nan)

        optimizer = optim.Adam(self.parameters(), lr = learning_rate)
        criterion = nn.MSELoss()

        # calculate number of batch iterations
        n_batches = int(input_tensor.shape[0] / batch_size)
        
        with trange(n_epochs) as tr: # USING TGE 
            for it in tr:
                
                batch_loss = 0.
                batch_loss_tf = 0.
                batch_loss_no_tf = 0.
                num_tf = 0
                num_no_tf = 0

                for b in range(n_batches):
                    
                    # select data 
                    input_batch = input_tensor[:-target_len, b: b + batch_size, :]
                    time_batch = time_tensor[:-target_len, b: b + batch_size]
                    
                    target_time_batch = time_tensor[-target_len:, b: b + batch_size]
                    target_batch = input_tensor[-target_len:, b: b + batch_size, :]
                    
                    # outputs tensor
                    #outputs = torch.zeros(target_len, batch_size, input_batch.shape[2])
                    outputs = torch.zeros(target_len, batch_size, input_tensor.shape[2])

                    # initialize hidden state
                    encoder_hidden = self.encoder.init_hidden(batch_size)

                    # zero the gradient
                    optimizer.zero_grad()

                    # encoder outputs
                    encoder_output, encoder_hidden = self.encoder(input_batch, 
                                                                  time_batch)
                    
                    decoder_input = torch.ones(input_batch[-1,:,:].shape) # SOS token
                    #print('decoder_input')
                    #print(decoder_input.shape)
                    time = time_batch[0,:]

                    decoder_hidden = encoder_hidden
                
                    if training_prediction == 'recursive':
                        # predict recursively
                        for t in range(target_len): 
                            decoder_output, decoder_hidden = self.predictor(decoder_input, time, decoder_hidden)
                            time = target_time_batch[t,:]
                            outputs[t] = decoder_output
                            decoder_input = decoder_output

                    if training_prediction == 'teacher_forcing':
                        # use teacher forcing
                        if random.random() < teacher_forcing_ratio:
                            for t in range(target_len): 
                                decoder_output, decoder_hidden = self.predictor(decoder_input, time, decoder_hidden)
                                time = target_time_batch[t,:]
                                outputs[t] = decoder_output
                                decoder_input = target_batch[t, :, :]

                        # predict recursively 
                        else:
                            for t in range(target_len): 
                                decoder_output, decoder_hidden = self.predictor(decoder_input, time,  decoder_hidden)
                                time = target_time_batch[t,:]
                                outputs[t] = decoder_output
                                decoder_input = decoder_output

                    if training_prediction == 'mixed_teacher_forcing':
                        # predict using mixed teacher forcing
                        for t in range(target_len):
                            time = target_time_batch[t,:]
                            decoder_output, decoder_hidden = self.predictor(decoder_input, time, decoder_hidden)
                            
                            #print('sayoooo')
                            
                            outputs[t] = decoder_output
                            
                            
                            
                            # predict with teacher forcing
                            if random.random() < teacher_forcing_ratio:
                                decoder_input = target_batch[t, :, :]
                            
                            # predict recursively 
                            else:
                                decoder_input = decoder_output

                    # compute the loss 
                    loss = criterion(outputs, target_batch)
                    #loss = torch.norm(outputs-target_batch)
                    batch_loss += loss.item()
                    
                    # backpropagation
                    loss.backward()
                    optimizer.step()

                # loss for epoch 
                batch_loss /= n_batches 
                losses[it] = batch_loss

                # dynamic teacher forcing
                if dynamic_tf and teacher_forcing_ratio > 0:
                    teacher_forcing_ratio = teacher_forcing_ratio - 0.02 

                # progress bar 
                tr.set_postfix(loss="{0:.10f}".format(batch_loss))
        
        for param in model.encoder.parameters():
            param.requires_grad = True
                    
        return losses
    
    def test(self, input_tensor, time_tensor, batch_size = 5, teacher_forcing_ratio = 0.6): # TEST AUTOENCODER
        
        score = 0

        criterion = nn.MSELoss()

        n_batches = int(input_tensor.shape[0] / batch_size)
        
        batch_loss = 0.

        for b in range(n_batches):

            input_batch = input_tensor[:, b: b + batch_size, :]

            time_batch = time_tensor[:, b: b + batch_size]

            target_batch = input_tensor[:, b: b + batch_size, :]

            outputs = torch.zeros(input_tensor.shape[0], batch_size, input_tensor.shape[2])

            encoder_hidden = self.encoder.init_hidden(batch_size)

            encoder_output, encoder_hidden = self.encoder(input_batch, 
                                                          time_batch
                                                         )

            decoder_input = torch.ones(input_batch[-1,:,:].shape) 

            time = time_batch[0,:]

            decoder_hidden = encoder_hidden

            target_len = input_batch.shape[0] 

            for t in range(target_len): 
                time = time_batch[t,:]
                decoder_output, decoder_hidden = self.decoder(decoder_input, time, decoder_hidden)
                #print('hiiiiiiii')
                outputs[t] = decoder_output

                # predict with teacher forcing
                if random.random() < teacher_forcing_ratio:
                    decoder_input = target_batch[t, :, :]

                # predict recursively 
                else:
                    decoder_input = decoder_output

            # compute the loss 
            
            loss = criterion(outputs, target_batch) 
            #loss = torch.norm(outputs-target_batch)
            
            score += loss
        
        score /= n_batches
                    
        return score
    
    # predicts one at a time
    
    def predict(self, input_tensor, 
                time_tensor):
        
        criterion = nn.MSELoss()
        
        target_len = input_tensor.shape[0] # new modification
        
        # encode input_tensor
        
        input_tensor = input_tensor.unsqueeze(1)     # add in batch size of 1
        
        encoder_output, encoder_hidden = self.encoder(input_tensor, time_tensor, batch_size = 1)

        # initialize tensor for predictions
        outputs = torch.zeros(target_len, input_tensor.shape[2])

        # decode input_tensor
        decoder_input = torch.ones(input_tensor[-1,:,:].shape)
        decoder_hidden = encoder_hidden

        for t in range(target_len):
            decoder_output, decoder_hidden = self.decoder(decoder_input, time_tensor, decoder_hidden)
            outputs[t] = decoder_output
            decoder_input = decoder_output
   
        np_outputs = outputs.detach().numpy()

        return np_outputs, encoder_hidden, criterion(outputs, input_tensor)
    

### Variational Autoencoder Architecture

In [None]:
class VAE(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(VAE, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        self.encoder = lstm_encoder(input_size = input_size, hidden_size = hidden_size)
        self.decoder = lstm_decoder(input_size = input_size, hidden_size = hidden_size)
        
        self.predictor = lstm_decoder(input_size = input_size, hidden_size = hidden_size)
        
        #================================================
        self.fc_mu = nn.Linear(hidden_size *2, hidden_size *2)
        self.fc_var = nn.Linear(hidden_size *2, hidden_size *2)
        #===================================================
        
        '''
        self.fc_mu = nn.Linear(hidden_dims[-1]*4, latent_dim)
        self.fc_var = nn.Linear(hidden_dims[-1]*4, latent_dim)
        '''
        
    def train_autoencoder(self, input_tensor, 
                          time_tensor,
                          #target_tensor, 
                          n_epochs, 
                          #target_len, 
                          batch_size, 
                          training_prediction = 'recursive', teacher_forcing_ratio = 0.5, learning_rate = 0.01, dynamic_tf = False):
        
        # initialize array of losses 
        losses = np.full(n_epochs, np.nan)

        optimizer = optim.Adam(self.parameters(), lr = learning_rate)
        criterion = nn.MSELoss()

        # calculate number of batch iterations
        n_batches = int(input_tensor.shape[0] / batch_size)
        
        # torch.Size([158859, 20, 128])

        with trange(n_epochs) as tr: # USING TGE 
            for it in tr:
                
                batch_loss = 0.
                batch_loss_tf = 0.
                batch_loss_no_tf = 0.
                num_tf = 0
                num_no_tf = 0

                for b in range(n_batches):
                    # select data 
                    input_batch = input_tensor[:, b: b + batch_size, :]
                    
                    time_batch = time_tensor[:, b: b + batch_size]
                    
                    target_batch = input_tensor[:, b: b + batch_size, :]

                    # outputs tensor
                    #outputs = torch.zeros(target_len, batch_size, input_batch.shape[2])
                    outputs = torch.zeros(input_tensor.shape[0], batch_size, input_tensor.shape[2])

                    # initialize hidden state
                    encoder_hidden = self.encoder.init_hidden(batch_size)

                    # zero the gradient
                    optimizer.zero_grad()

                    # encoder outputs
                    encoder_output, encoder_hidden = self.encoder(input_batch, 
                                                                  time_batch
                                                                 )

                    # decoder with teacher forcing
                    #decoder_input = input_batch[-1, :, :]   # taking the last time step // shape: (batch_size, input_size)
                    
                    #=====================================
                    encoder_hidden = torch.stack(encoder_hidden).reshape(encoder_hidden[0].shape[0],
                                                                            encoder_hidden[0].shape[1],
                                                                            encoder_hidden[0].shape[2]*2)
        
                    mu = self.fc_mu(encoder_hidden)
                    log_var = self.fc_var(encoder_hidden)
                    std = torch.exp(0.5 * log_var)
                    eps = torch.randn_like(std)
                    encoder_hidden = eps * std + mu
                    
                    #print("===============")
                    #print(encoder_hidden.shape)
                    encoder_hidden = torch.tensor_split(encoder_hidden, 2, dim=2)
                    #print(encoder_hidden[1].shape)
                    #======================================
                    
                    decoder_input = torch.ones(input_batch[-1,:,:].shape) # SOS token

                    time = time_batch[0,:]

                    decoder_hidden = encoder_hidden
                    
                    target_len = input_batch.shape[0] # new modification

                    if training_prediction == 'recursive':
                        # predict recursively
                        for t in range(target_len): 
                            decoder_output, decoder_hidden = self.decoder(decoder_input, time, decoder_hidden)
                            time = time_batch[t,:]
                            outputs[t] = decoder_output
                            decoder_input = decoder_output

                    if training_prediction == 'teacher_forcing':
                        # use teacher forcing
                        if random.random() < teacher_forcing_ratio:
                            for t in range(target_len): 
                                decoder_output, decoder_hidden = self.decoder(decoder_input, time, decoder_hidden)
                                time = time_batch[t,:]
                                outputs[t] = decoder_output
                                decoder_input = target_batch[t, :, :]

                        # predict recursively 
                        else:
                            for t in range(target_len): 
                                decoder_output, decoder_hidden = self.decoder(decoder_input, time,  decoder_hidden)
                                time = time_batch[t,:]
                                outputs[t] = decoder_output
                                decoder_input = decoder_output

                    if training_prediction == 'mixed_teacher_forcing':
                        # predict using mixed teacher forcing
                        for t in range(target_len):
                            time = time_batch[t,:]
                            decoder_output, decoder_hidden = self.decoder(decoder_input, time, decoder_hidden)
                            
                            #print(decoder_output.shape)
                            #print(decoder_output)
                            
                            outputs[t] = decoder_output
                            
                            # predict with teacher forcing
                            if random.random() < teacher_forcing_ratio:
                                decoder_input = target_batch[t, :, :]
                            
                            # predict recursively 
                            else:
                                decoder_input = decoder_output
                                
                    # ===========================================
                    # compute the loss 
                    # KL-divergence loss
                    kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp()))
                    #print(kld_loss)
                    loss = criterion(outputs, target_batch) + kld_loss/batch_size
                    # ===========================================
                    
                    batch_loss += loss.item()
                    
                    # backpropagation
                    loss.backward()
                    optimizer.step()

                # loss for epoch 
                batch_loss /= n_batches 
                losses[it] = batch_loss

                # dynamic teacher forcing
                if dynamic_tf and teacher_forcing_ratio > 0:
                    teacher_forcing_ratio = teacher_forcing_ratio - 0.02 

                # progress bar 
                tr.set_postfix(loss="{0:.10f}".format(batch_loss))
                    
        return losses
    
    # predicts one at a time
    
    def predict(self, input_tensor, 
                time_tensor):

        target_len = input_tensor.shape[0] # new modification
        
        # encode input_tensor
        
        input_tensor = input_tensor.unsqueeze(1)     # add in batch size of 1
        
        encoder_output, encoder_hidden = self.encoder(input_tensor, time_tensor, batch_size = 1)

        # initialize tensor for predictions
        outputs = torch.zeros(target_len, input_tensor.shape[2])

        # decode input_tensor
        decoder_input = torch.ones(input_tensor[-1,:,:].shape)
        decoder_hidden = encoder_hidden

        for t in range(target_len):
            decoder_output, decoder_hidden = self.decoder(decoder_input, time_tensor, decoder_hidden)
            outputs[t] = decoder_output
            decoder_input = decoder_output
   
        np_outputs = outputs.detach().numpy()

        return np_outputs, encoder_hidden

In [None]:
print(torched_windowed_sequences.shape)
print(torched_windowed_timestamps.shape)

In [None]:
# separate training and testing dataset

In [None]:
def rec_acc(torched_windowed_sequences, loss):
    total = 0
    for seq in torched_windowed_sequences:
        for op in seq:
            total += torch.sum(torch.abs(op)) / op.shape[0]
    score = 1 - (math.sqrt(loss)/ (total / torched_windowed_sequences.shape[1]))
    return score
# because it's the MEAN SQUARED ERROR

In [None]:
print(o.shape)
print(t.shape)

In [17]:
# split the data

#op = torched_windowed_sequences
#time = torched_windowed_timestamps

model = lstm_autoencoder(input_size = torched_windowed_sequences.shape[2], hidden_size = 64)

o = torched_windowed_sequences
t = torched_windowed_timestamps

# training 
loss = model.train_autoencoder(o, 
                               t
                  , n_epochs = 10 
                  , batch_size = 5
                  , training_prediction = 'mixed_teacher_forcing' 
                  , teacher_forcing_ratio = 0.6
                  , learning_rate = 0.01, dynamic_tf = False
                 )

'''
# testing 
score = model.test(test_op, 
                         test_time, batch_size = 5, teacher_forcing_ratio = 0.6)

print('TRAINING ACCURACY')
print(loss[-1])
print(math.sqrt(loss[-1]))
#print(rec_acc(torched_windowed_sequences, loss).item())

print('TESTING MEAN LOSS')
print(score.item())
print(math.sqrt(score))

#print('TESTING ACCURACY')
#print(rec_acc(torched_windowed_sequences, score).item())
'''

addfasafsdf


  0%|          | 0/10 [00:00<?, ?it/s]

torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5

  0%|          | 0/10 [00:33<?, ?it/s]

torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5, 128])
torch.Size([5




KeyboardInterrupt: 

In [None]:
torched_windowed_timestamps.shape

In [None]:
rec_errors = []
for i in range(torched_windowed_sequences.shape[1]):
    _,_, rec_error = model.predict(torched_windowed_sequences[:,i,:], torched_windowed_timestamps[:,i,:])
    rec_errors.append(rec_error)
    print(i)

In [None]:
# TRAIN TEST

# LSTM AUTOENCODER
# 100%|██████████| 500/500 [00:42<00:00, 11.90it/s, loss=0.0091651650]
# TRAINING ACCURACY
# 0.009165165014564991
# 0.09573486833210244
# TESTING MEAN LOSS
# 25.63962173461914
#5.063558208870432

In [None]:
# VARIATIONAL AUTOENCODER

# 100%|██████████| 500/500 [00:43<00:00, 11.39it/s, loss=0.0042354001]
# 100%|██████████| 500/500 [00:43<00:00, 11.36it/s, loss=0.0154977629]
# 100%|██████████| 500/500 [00:43<00:00, 11.41it/s, loss=0.0088684270]



In [None]:
# REVERSED VS NON-REVERSED => REVERSED WORKS BETTER!

# REVERSED
# 100%|██████████| 500/500 [00:42<00:00, 11.87it/s, loss=0.0000022728]
# 100%|██████████| 500/500 [00:42<00:00, 11.81it/s, loss=0.0000189260]
# 100%|██████████| 500/500 [00:42<00:00, 11.84it/s, loss=0.0000130772]

# NON-REVERSED
# 100%|██████████| 500/500 [00:42<00:00, 11.86it/s, loss=0.0000736655]
# 100%|██████████| 500/500 [00:42<00:00, 11.84it/s, loss=0.0000185950]
# 100%|██████████| 500/500 [00:42<00:00, 11.75it/s, loss=0.0000177078]

In [None]:
print(model.encoder.state_dict())

In [None]:
print(model.decoder.state_dict())

In [None]:
loss = model.train_predecoder(torched_windowed_sequences, 
                              torched_windowed_timestamps, 2,
                              n_epochs = 500, batch_size = 5, training_prediction = 'mixed_teacher_forcing', teacher_forcing_ratio = 0.6, learning_rate = 0.01, dynamic_tf = False)
print(math.sqrt(loss[-1]))

In [None]:
# with freezing
# 100%|██████████| 500/500 [00:09<00:00, 50.44it/s, loss=0.0011930717]
# 100%|██████████| 500/500 [00:10<00:00, 49.09it/s, loss=0.0013867632]

# without freezing
# 

In [None]:
print(model.encoder.state_dict())
print(model.predictor.state_dict())

In [None]:
# LSTM
# 100%|██████████| 500/500 [00:45<00:00, 10.88it/s, loss=0.0000253534]
# 100%|██████████| 500/500 [00:43<00:00, 11.58it/s, loss=0.0000157571]

# LSTM w/ Peephole
# 100%|██████████| 500/500 [00:52<00:00,  9.59it/s, loss=0.0000155904]
# 100%|██████████| 500/500 [00:52<00:00,  9.56it/s, loss=0.0001305166]

# LSTM w/ Peephole + Coupled Gates
# 100%|██████████| 500/500 [00:55<00:00,  9.02it/s, loss=0.0005010203]
# 100%|██████████| 500/500 [00:56<00:00,  8.83it/s, loss=0.0051181654]

# TIME-LSTM 1
# 100%|██████████| 500/500 [01:07<00:00,  7.43it/s, loss=0.0060292104]
# 100%|██████████| 500/500 [01:07<00:00,  7.41it/s, loss=0.0104567041]

# TIME-LSTM 2
# 100%|██████████| 500/500 [01:20<00:00,  6.23it/s, loss=0.0013810578]
# 100%|██████████| 500/500 [01:22<00:00,  6.07it/s, loss=0.0025202912]

# TIME-LSTM 3
# 100%|██████████| 500/500 [01:11<00:00,  6.95it/s, loss=0.0001785948]
# 100%|██████████| 500/500 [01:13<00:00,  6.83it/s, loss=0.0001769290]
# 5000 epochs

# CUSTOM TIME
# 100%|██████████| 500/500 [00:52<00:00,  9.57it/s, loss=0.0000210490]

# PHASED-LSTM


# No problem 
# Next steps:
# I see some progress in modifying the lstm with time gates
# I think rn my custom models are a bit too strong in terms of how time influences it
# which is why it's performing wrose than usual lstm
# I'm gonna look at it until the end of the day
# but if no improvement, i'll make some hypothesis on why (time lstm is performing worse than lstm) and use that in the business meeting
# it could be related to how certain actions are more relevant when they are more spaced out apart
# vs some others that are more relevant when they are closer to each other

# or idk...
# not overfitting, just performing worse overall

# generaly accuracy only

# not yet 

# i think i'll use some sort of clipping or constraint so that the time gates perform AT LEAST as good than lstm

# but i simplified the code a lot so i can write down how the next person can modify the lstm architecture!


### Encoder-Decoder for Time-Series

Using the same encoder model, we encode a window of operations to predict the next window of operations
This is to show that the embeddings are meaningful such that through:
1. The AE Decoder, they contain the historical user behaviour information
2. The Time-Series Decoder, they contain the future user behaviour information:
        1. Retrieve categorical sequences
        2. Each output, categorical layer to assign to operation
        3. Compute accuracy from correct operation category output

100%|██████████| 500/500 [00:38<00:00, 12.95it/s, loss=0.0000069499]

Run a grid search/random search on LSTM layers, latent space dimension, and epochs

Where does the embedding come from?

ASSUME TIME GATE WORKS BETTER THAN NO TIME GATE? because loss seems to be \leq 

(we may use torch.nn.lstm for comparison)

-loss=0.0000102043 on hidden size 15, epochs 500

-loss=0.0000010081 on hidden size 30, epochs 500

In [None]:
np.set_printoptions(threshold=sys.maxsize)
#torch.set_printoptions(threshold=sys.maxsize) # this will kill the kernel!

print(torched_windowed_timestamps[6][0])

In [None]:
torched_windowed_timestamps[:,9].unsqueeze(1).shape

In [None]:
print(torched_windowed_sequences[:,6][-1])
print(torched_windowed_timestamps[:,6][-1])

### Cluster Tests

In [None]:
input_tensor = torch.tensor([ 7.2859e-02, -5.1723e-02,  1.6887e-01,  9.6478e-03,  2.7092e-01,
         1.4126e-02,  1.2384e-01, -3.9685e-04,  2.2763e-01, -4.0240e-01,
        -1.9798e-01, -3.4064e-01, -2.0647e-01,  1.3255e-01,  6.4034e-03,
        -1.3859e-02,  1.0164e-01,  1.6872e-01, -1.1312e-01, -3.2962e-01,
        -1.0826e-01, -1.7407e-02, -2.5063e-01, -1.6811e-01, -1.6072e-02,
        -3.7014e-02,  8.8129e-02,  5.9020e-02,  1.5442e-01,  1.1737e-01,
        -7.6847e-02,  4.6167e-02,  1.7816e-01,  3.5143e-01, -2.0578e-01,
        -8.1568e-02,  1.6674e-01,  1.3171e-01,  4.0933e-02, -4.2767e-02,
         4.8727e-01,  5.1966e-02, -2.6346e-01,  6.2688e-01,  2.4226e-01,
         1.6288e-02,  1.5292e-01, -3.1451e-01, -4.3505e-02, -1.1195e-01,
         3.7763e-02,  1.8957e-01,  6.5843e-02,  6.7268e-03, -1.6360e-01,
         7.9658e-02,  8.2879e-03, -1.2846e-01,  1.8420e-01, -1.6801e-01,
        -2.2163e-01, -8.7116e-02,  1.8645e-01, -7.0144e-02,  2.9216e-02,
        -2.6450e-02, -3.7200e-02,  3.4123e-02, -1.3855e-01,  1.2327e-01,
         9.6004e-03,  1.6783e-01, -1.9080e-01, -1.8574e-02,  1.3258e-01,
         3.9922e-01,  2.0783e-01,  2.1572e-02, -2.8567e-02, -4.5407e-02,
        -8.5485e-02, -8.9264e-03, -3.4358e-02,  1.5016e-02, -2.4072e-02,
         2.4404e-01, -3.1834e-02, -2.7986e-02,  7.4648e-02, -1.0087e-02,
        -3.1862e-02, -2.3848e-01, -8.0869e-02, -2.0942e-02, -6.9803e-02,
         2.3009e-01,  1.3550e-01, -9.0289e-02,  7.8311e-03, -8.4256e-02,
         1.4535e-01, -1.3228e-01,  1.1738e-03,  1.1044e-01, -9.6383e-02,
        -3.6381e-03, -2.2033e-01,  1.2453e-01, -1.4537e-01, -7.5747e-03,
         5.2217e-01, -3.2780e-01,  6.7146e-02, -1.4363e-01, -1.9472e-01,
        -6.7836e-02,  2.1534e-01, -2.1962e-02,  5.1992e-01,  4.6406e-02,
        -1.7308e-01,  8.7515e-02, -6.2887e-02,  2.3623e-02,  2.3809e-02,
        -2.0818e-01, -2.4797e-02, -7.4244e-09])

seq2 = torch.cat((torched_windowed_sequences[:,6], input_tensor.unsqueeze(0)),0)

In [None]:
time_tensor = torch.tensor([325235235523.])
time2 = torch.cat((torched_windowed_timestamps[:,6],time_tensor)).unsqueeze(1)
print(torched_windowed_timestamps[:,6].shape)

In [None]:
prediction, embedding = model.predict(torched_windowed_sequences[:,6], torch.zeros(torched_windowed_sequences[:,6].shape[1]).unsqueeze(1))
embedding = torch.cat(embedding).ravel().detach().numpy()
print(embedding)

In [None]:
prediction2, embedding2 = model.predict(seq2, time2)
embedding2 = torch.cat(embedding2).ravel().detach().numpy()
print(embedding2)

In [None]:
np.linalg.norm(embedding2 - embedding)

In [None]:
### is there any cost/energy associated with training for such a long time?

# set window size
# user sequences > window size -> more than one instance (sequence len - window size number of instances)

# better to truncate based on time gap

# reimplement with time gates

# freedom, accessibility with machine learning models
# random search instead of grid search (more efficient)

## Saving the model (and its checkpoints)

In [None]:
model.state_dict()

## Embed all Users!

In [None]:
len(sequences)
len(timestamps)

In [None]:
sequences1= sequences[:2000]
timestamps1=timestamps[:2000]

In [None]:
uobs = []

In [None]:
i = 0
for user, time in zip(sequences, timestamps):
    user = torch.from_numpy(np.array(user)).type(torch.FloatTensor)
    time = torch.from_numpy(np.array(time)).type(torch.FloatTensor)
    pred, embed = model.predict(user, time.unsqueeze(1))
    uobs.append(torch.cat(embed).ravel().detach().numpy())
    print(i)
    i = i + 1

In [None]:
print(uobs[0])

In [None]:
input_tensor = torch.tensor([-4.4352e-02,  1.2201e-02,  1.6216e-01,  1.6379e-01,  3.9218e-01,
        -2.5394e-01, -5.2667e-02, -6.7875e-02,  6.5994e-01, -1.0205e-01,
        -2.6776e-01, -2.0883e-01, -4.1226e-01,  7.7409e-02, -1.0235e-01,
         3.9226e-01, -4.4637e-01, -3.3146e-01, -1.2367e-02, -5.5867e-02,
        -7.0543e-01,  2.5752e-02, -5.8277e-02, -5.1242e-01, -9.3612e-02,
         3.1211e-01,  2.6222e-01, -9.2457e-02,  1.3677e-01,  3.0865e-01,
         2.4187e-01,  3.6984e-01,  6.5900e-01,  5.1530e-01,  1.4048e-02,
        -2.5620e-01,  1.0549e-01,  3.4391e-01, -4.7275e-01,  3.2565e-02,
         4.3158e-01, -1.5974e-01, -3.0765e-01,  5.4733e-01,  5.8574e-01,
        -3.1861e-01,  2.0723e-01, -2.6745e-01, -6.3177e-02,  1.5469e-02,
         5.2354e-02,  1.3191e-01,  2.1890e-01,  2.7721e-04,  3.7968e-02,
        -9.9480e-02, -5.0709e-01,  1.6384e-01,  7.4540e-02,  4.1532e-02,
        -5.4125e-02, -9.9936e-01,  3.2398e-01, -4.7699e-01,  1.5951e-01,
         2.2943e-01, -2.6444e-01,  1.4613e-01, -5.8839e-01, -4.0362e-01,
         1.2432e-01,  3.8341e-01, -2.6790e-01, -9.5816e-02, -2.8904e-01,
         3.6368e-01,  8.3652e-02,  1.7725e-01,  2.7741e-01, -2.8172e-01,
        -7.3207e-02,  4.3167e-01,  5.2676e-01, -2.8241e-01,  2.3268e-01,
         2.0204e-01,  1.8995e-01,  2.2696e-01,  2.2350e-01, -1.2188e-01,
        -3.5101e-02, -5.4444e-01, -4.6613e-01,  2.1541e-01,  2.2307e-01,
        -1.0715e-01,  2.8139e-01,  1.6742e-01,  9.2263e-02,  2.7756e-01,
         8.0698e-02, -1.8247e-01,  2.1044e-01,  2.0641e-01,  8.4537e-02,
         2.7049e-01, -1.9913e-01, -2.0460e-01, -1.5651e-02,  5.7233e-01,
         4.0540e-01, -1.4468e-01, -3.6146e-01, -2.8766e-01, -1.9464e-03,
        -4.5186e-01, -8.1728e-02,  7.6493e-02, -1.9496e-02, -2.9143e-01,
        -1.7943e-01,  4.2151e-01, -2.0292e-01,  5.1099e-01,  2.5352e-02,
        -1.0709e-01,  1.5144e-01,  9.6194e-02])

time_tensor = torch.tensor([34000.])

embedding = uobs[0]

In [None]:
output = model.test(input_tensor, time_tensor, embedding)

In [None]:
prediction, embedding = model.test(torched_windowed_sequences[:,6], torched_windowed_timestamps[:,6].unsqueeze(1))
print(len(embedding))

In [None]:
with open('uobs', 'wb') as f:
    pickle.dump(uobs,f)

In [None]:
out = io.open('uobs.tsv', 'w', encoding='utf-8')

for index, user in enumerate(uobs):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    out.write(word + "\n")
out.close()

In [None]:
pred1, embed1 = model.predict(torch.from_numpy(np.array(sequences[0])).type(torch.FloatTensor), torch.from_numpy(np.array(timestamps[0])).type(torch.FloatTensor).unsqueeze(1))

In [None]:
print(torch.cat(embed1).ravel().detach().numpy())

### To Do (Low Priority)

Next steps:
(1. Run a random hyperparameter search and re-review the code)
(1. a) Finish the KMeans Operation-Level Embedding Test code)
(2. Modify architecture based on Seq2Seq literature
    - There are various architectures for many-to-many, namely Recursive (feeding), Repeat vectors, etc.
    - Consider using EOS or SOS (currently we have only SOS)
    - Consider delimitating sessions (but how to do it uniformly? does it go hand in hand with time gates?))
2. Sanity check closest pairs
3. Add time gates next, only after optimizing w/o
4. Proceed with Transformers


# Flagging Model

In [None]:
print(model.parameters())

In [None]:
i=1
for param in model.parameters():
    print(i)
    print(param.data)
    print(param.data.shape)
    i+=1

# K-Means Clustering + Operation-Level Flagging

### To Do Later: DBScan Clustering, Variational Inference

In order for there to be operation-level flagging, the user must be considered non-anomalous

In other words, the next/latest operation must be anomalous enough for it to cause the hidden state to diverge

Steps:
1. Time-LSTM Autoencoder yields UOBS embeddings (encoder hidden states)
2. UOBS embeddings are clustered to form user segmentations (or variational inference later...)
3. Get hidden vector
4. For each operation, calculate the bounds of time elapsed in which the operation keeps the user "safe"

Such bounds are the "flagging rules". Alternatively, we may perform this on the cluster center to approximate a "user profile" based on fully-safe operations and safe bounds per non-fully-safe operations for some sort of user profiling that we can understand/imagine.

In [None]:
# find optimal kmeans cluster (ensemble, repeated trainings)
# we want to maximize the Dunn index = min(intra cluster distance)/max (inter cluster distance)

#https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-k-means-clustering/ for KMeans ++

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=0, n_init="auto").fit(X)
kmeans.cluster_centers_


## Rough Analysis of User Level Embeddings

### Roughly check similarities between 10-20 pairs of users and get back on this, then proceed to time gates

In [None]:
# how to do this with a windowed dataset? 
# as there is a different number of windows/instances generated per user

OUT_path_pairs = 's3://tk-dev-datalake2/risk/jerry.zhu/UBS_Embedding/Op_Log_masked_20221121_20221129.parquet'
data = pd.read_parquet(OUT_path_pairs)

In [None]:
data.head()
# filter out 1-operation users (non-sequential data)

user_counts = data['user_id'].value_counts().reset_index()
bottom_users = user_counts[user_counts['user_id']<=1]
bottom_users = bottom_users['index'].tolist()
data = data[~data['user_id'].isin(bottom_users)] 
print("Dataframe shape after filtering out singular operations is " + str(data.shape))

In [None]:
users = data['user_id'].unique()

print(len(users))
print(users)

def findIndex(name, users):
    for i, user in enumerate(users):
        if user==name:
            return i

In [None]:
print(len(sequences))
twentiesIndex = [i for i in range(len(sequences)) if len(sequences[i])==20]
twentiesUsers = [users[i] for i in twentiesIndex]
print(len(twentiesIndex))
#print(twentiesUsers)

So we have 6726 users (and therefore sequences) with more than one operation in the last week of November;
109 users with exactly 20 operations, so we're only going to compare across these users

In [None]:
embTest = []
predTest = []
#twenties = w2vembedding(twenties, weights)
for i in twentiesIndex:
    prediction, embedding = model.predict(torched_windowed_sequences[:,i], torched_windowed_timestamps[:,i].unsqueeze(1))
    embTest.append(torch.cat(embedding).ravel())
    predTest.append(prediction)

In [None]:
embTest = [emb.detach().numpy() for emb in embTest]
X = embTest

In [None]:
print(kmeans.labels_)
print(set(kmeans.labels_))

#kmeans.predict(X)
#kmeans.cluster_centers_

'''
[ 6  7  6 10  7  7  6  6 14  4  6  9  2  6  0 10  9  6  6  3 11  9  7  1
  1  1 11  5  7  0  4 12  2  0  6 11  6 11  0  3  6  3  2 12 10  0  6 11
  1  3 12  1  1  0 12  6  2  9  4  3  9 10  0  8  0  3  0 10 11  5  8  1
  1  1  1  7  1  1  1  1  1  6  6 10  4  6  2  9  9 11 10  4 14 10 12  2
  4  6  7  9 10  0 10  6 13 13  6 10  7]
'''

In [None]:
out_v = io.open('embTest.tsv', 'w', encoding='utf-8')
out_m = io.open('userTest.tsv', 'w', encoding='utf-8')
out_o = io.open('indexTest.tsv', 'w', encoding='utf-8')


for index, user in enumerate(twentiesUsers):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = embTest[index]
    
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(user + "\n")
    out_o.write(str(index) + "\n")
out_v.close()
out_m.close()
out_o.close()


In [None]:
print(len(twentiesUsers))
print(len(embTest))

In [None]:
print(embTest[37])
print(twentiesUsers[37])
print(embTest[35])
print(twentiesUsers[35])
print(embTest[81])
print(twentiesUsers[81])

#### Empirical check: 

79e2cebcc6e9cc225183b4b337ff2c95694de30f5fe61809ab06d0a6a20f5a42 AND 

67a4091d373bcf3fb471eaa42296c5d317b732f8525f583ae736fea4d4656be9

are similar, while

3eb626670ff5a334a4d88d7ec94a7b1af5ee611a7f04010ebfb278403707f279

is different from both

In [None]:
print(originalSeqs[findIndex(twentiesUsers[36], users)])
print(originalSeqs[findIndex(twentiesUsers[46], users)])

In [None]:
print(originalSeqs[findIndex(twentiesUsers[48], users)])
print(originalSeqs[findIndex(twentiesUsers[52], users)])

Comments: in this scenario, some sequences are similar, yielding similar user-level vectors, but the significance of difference is not apparent. 

So, we try to find pairs with similar vectors, but apparently different sequences:
- 36 and 46: 
    - c08db24027925633ced80abe9a69c7b728f800f3baeb9e8ca68f9e57a7b36628
    - cb8ca9c313ecc4eee300d6da678cd84baa1ce10630689aa1ca1ed32acec4ccdc
    - [17, 154, 154, 154, 53, 9, 47, 230, 47, 62, 154, 62, 92, 160, 187, 80, 202, 33, 138, 17]
    - [243, 145, 116, 131, 144, 144, 37, 121, 13, 13, 164, 230, 206, 62, 160, 230, 80, 187, 202, 36]
- 35 and 68:
    - cedef2cf30f55063102ec7e27c6893533deb615211bc32c0fc2dd2a3317941c8
    - 6af0230119b758863087fbf3a50ef1b796d52b0ab436c044a3dc739bc174f805
    - [144, 144, 154, 230, 183, 17, 36, 144, 144, 62, 160, 80, 187, 202, 154, 230, 183, 154, 230, 183]
    - [13, 13, 14, 180, 14, 180, 18, 109, 109, 18, 17, 14, 53, 230, 110, 230, 68, 154, 230, 183]
- 38 and 33:
     - 4cbd3ddf4ec5cfd1a85c36aa639ecd4980b48554544ea23d39a87131d8e08c00
     - d2395776de5f71dc61f8d4c41c5bccd7f7ad02fc26d239d17aa1ccdbcf10f1eb
     - [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 97, 17, 75, 13, 13, 13, 13, 13, 13]
     - [144, 144, 154, 230, 183, 144, 144, 154, 230, 183, 154, 230, 183, 144, 144, 154, 230, 183, 144, 144]
- 56 and 32:
    - c00b9995dea99fa81237e13d6f3cb21c794de6976890c0ab4152695e0eaaa9af
    - ca9f36790380cd38078c1199bfc2490b607838869ec74a0615e64fad0ca5dae7
    - [17, 154, 53, 9, 47, 230, 230, 53, 230, 53, 9, 47, 230, 53, 53, 230, 53, 9, 47, 47]
    - [17, 11, 154, 36, 53, 9, 47, 230, 47, 53, 9, 47, 230, 47, 53, 53, 53, 47, 230, 47]
- 108 and 4:
    - 5742754f10dc16310b3b14ecc038092749285e49c80286f9e495a453df65a072
    - 55588e044f1fbafabe0f845fadcc1e5755720c6b02af9786988204f05b90749e
    - [243, 145, 131, 158, 131, 158, 116, 131, 131, 109, 109, 82, 30, 144, 144, 27, 144, 144, 148, 7]
    - [223, 223, 144, 144, 223, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 75, 144, 144]
- 58 and 96:
    - 7db44715a40576ad4d3d3d7e363827ef0071ab29e957d75d6d60e01e6f60221b
    - 48ba4b829beababb63b988f3d4d6fd4a05d8239debaba79bd4ffe00aec34b2a7
    - [144, 144, 39, 131, 39, 248, 184, 201, 39, 39, 39, 167, 167, 242, 242, 242, 53, 53, 190, 111]
    - [144, 144, 144, 75, 202, 205, 154, 149, 53, 149, 53, 149, 53, 9, 47, 230, 47, 149, 53, 36]
- 31 and 50:
    - 3eb626670ff5a334a4d88d7ec94a7b1af5ee611a7f04010ebfb278403707f279
    - 5a74be87c7c4e9dd50bf3d89467d8c3ace953bff712b92c966f0d9ea14960132
    - [14, 17, 154, 230, 230, 183, 154, 230, 230, 183, 96, 154, 230, 230, 183, 53, 9, 47, 230, 47]
    - [17, 154, 53, 9, 47, 230, 47, 154, 53, 9, 47, 230, 47, 17, 154, 53, 47, 9, 230, 47]
- 60 and 88:
    - 22f352771d9fe706e3b22d6c189fe44519618910c7fb75d712443cffdfa838d9
    - f4e57bb45ae5bbdc08c0766a28e9255b95db8e9483ada25d1b0637e493ecfb2c
    - [94, 94, 94, 94, 235, 130, 235, 235, 235, 11, 53, 149, 47, 47, 53, 149, 36, 17, 173, 173]
    - [144, 27, 156, 100, 156, 100, 75, 156, 100, 133, 157, 157, 157, 157, 157, 156, 100, 156, 100, 14]
- 27 and 69:
    - 57178cbf6694f828ca46611392a1685c9b6ea8edbee4c4ff82a856c2df37ff3f
    - 2559d6c8cf9a1b627f400fb17d09270149372c02dbcac83e9c7080c33d2e6235
    - [53, 47, 9, 230, 230, 47, 230, 110, 154, 230, 183, 17, 230, 68, 154, 230, 183, 154, 230, 183]
    - [154, 154, 230, 230, 230, 183, 154, 154, 154, 230, 230, 183, 154, 154, 230, 230, 230, 230, 230, 183]
- 48 and 52:
    - 8b38c4db3f3523c15217f6e627e30c0e4b707e61b855cc53f81c080ed25c1e65
    - fc6e57b07995733da9f81dadb22d5dffa53df091ade6dab66d188a175608b6f0
    - [154, 154, 154, 154, 154, 154, 183, 154, 154, 183, 154, 154, 154, 154, 154, 183, 154, 183, 154, 183]
    - [154, 154, 154, 183, 154, 154, 154, 154, 154, 154, 183, 154, 154, 183, 154, 183, 154, 154, 154, 183]
    
Check vector similarity between operations of different categories

In [None]:
print(twentiesUsers[48])
print(twentiesUsers[52])

Observations at a first glance:
- More often than not, pairs of user-level vectors match at the end 
    - This can be explained by the encoding of the cell-state (short-term memory), as the model encoding gives equal weighting (concatenates along the 0 axis) to the cell-state. 
    - An easy fix would be to discard the cell-state, keeping only the hidden-state prior to decoding (and using a default blank cell-state). Look into this?
- Several pairs of user-level vector have similar frequency of a certain operation (154, 230, 144, etc), "drowning out" the other potentially important operations in the sequence
    - 154, 230, and 144 correspond respectively to: 
        - '97a874321e4d1572fbfed141041b6c451751df55d0a1e78826a6c8dd446123ec'
        - 'e55e4e5125823bcf5c3dd21a5b9bddeb35dc30cbae3aed81ccad1609d587310d'
        - '8e2eef2e7c9028e26094ee1420f0507a976ca9e6b0fd04ef7b8b75d161f4046d'

# Variational Auto Encoder Component

In [None]:
import pytorch_lightning as pl
from torch import nn
from torch.nn import functional as F

class VAE(pl.LightningModule):
    def __init__(self, enc_out_dim=512, latent_dim=256, input_height=32):
        super().__init__()

        self.save_hyperparameters()

        # encoder, decoder
        self.encoder = resnet18_encoder(False, False)
        self.decoder = resnet18_decoder(
            latent_dim=latent_dim, 
            input_height=input_height, 
            first_conv=False, 
            maxpool1=False
        )

        # distribution parameters
        self.fc_mu = nn.Linear(enc_out_dim, latent_dim)
        self.fc_var = nn.Linear(enc_out_dim, latent_dim)

        # for the gaussian likelihood
        self.log_scale = nn.Parameter(torch.Tensor([0.0]))

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-4)

    def gaussian_likelihood(self, mean, logscale, sample):
        scale = torch.exp(logscale)
        dist = torch.distributions.Normal(mean, scale)
        log_pxz = dist.log_prob(sample)
        return log_pxz.sum(dim=(1, 2, 3))

    def kl_divergence(self, z, mu, std):
        # --------------------------
        # Monte carlo KL divergence
        # --------------------------
        # 1. define the first two probabilities (in this case Normal for both)
        p = torch.distributions.Normal(torch.zeros_like(mu), torch.ones_like(std))
        q = torch.distributions.Normal(mu, std)

        # 2. get the probabilities from the equation
        log_qzx = q.log_prob(z)
        log_pz = p.log_prob(z)

        # kl
        kl = (log_qzx - log_pz)
        kl = kl.sum(-1)
        return kl

    def training_step(self, batch, batch_idx):
        x, _ = batch

        # encode x to get the mu and variance parameters
        x_encoded = self.encoder(x)
        mu, log_var = self.fc_mu(x_encoded), self.fc_var(x_encoded)

        # sample z from q
        std = torch.exp(log_var / 2)
        q = torch.distributions.Normal(mu, std)
        z = q.rsample()

        # decoded 
        x_hat = vae.decoder(z)

        # reconstruction loss
        recon_loss = self.gaussian_likelihood(x_hat, self.log_scale, x)

        # kl
        kl = self.kl_divergence(z, mu, std)

        # elbo
        elbo = (kl - recon_loss)
        elbo = elbo.mean()

        self.log_dict({
            'elbo': elbo,
            'kl': kl.mean(),
            'recon_loss': recon_loss.mean(), 
            'reconstruction': recon_loss.mean(),
            'kl': kl.mean(),
        })

        return elbo

## CNN Encoder-Decoder

In [None]:
! pip install -v theano==0.9.0
! pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip
#! pip install -v pandas==0.18.1

In [None]:
! pip install torch
! pip install torchvision

In [None]:
!source cuda11.1
# To see Cuda version in use
!nvcc -V
!pip3 install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
!pip list

In [None]:
import torch
from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt

In [None]:
tensor_transform = transforms.ToTensor()
dataset = datasets.MNIST(root = "./data",
                       train = True,
                       download = True,
                       transform = tensor_transform)
loader = torch.utils.data.DataLoader(dataset = dataset,
                                    batch_size = 32,
                                    shuffle = True)

In [None]:
class autoencoder(torch.nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(28*28,128), 
            torch.nn.ReLU(True), #what does true do?
            torch.nn.Linear(128,64),
            torch.nn.ReLU(True),
            torch.nn.Linear(64,36),
            torch.nn.ReLU(True),
            torch.nn.Linear(36,18),
            torch.nn.ReLU(True),
            torch.nn.Linear(18,9)
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(9,18),
            torch.nn.ReLU(True),
            torch.nn.Linear(18,36),
            torch.nn.ReLU(True),
            torch.nn.Linear(36,64),
            torch.nn.ReLU(True),
            torch.nn.Linear(64,128),
            torch.nn.ReLU(True),
            torch.nn.Linear(128,28*28),
            torch.nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [None]:
model = autoencoder()
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr = 1e-3, 
                             weight_decay = 1e-5)

In [None]:
epochs = 3
outputs = []
losses = []
for epoch in range(epochs):
    for (image, _) in loader:
        image = image.reshape(-1,28*28)
        reconstructed = model(image)
        loss = criterion(reconstructed, image)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss)
    outputs.append((epochs, image, reconstructed))

plt.style.use('fivethirtyeight')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.plot(losses[-100:])

In [None]:
print(len(losses))

In [None]:
print(reconstructed.size())

In [None]:
item = image[31].reshape(-1,28,28)
plt.imshow(item[0])

In [None]:
item = reconstructed[31].reshape(-1,28,28)
item = item.detach().numpy()
plt.imshow(item[0])