In [3]:
import torch
import torch.optim as optim

# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
from custom_torch_dataset import SwipeDataset
import os

# The dataset lives in a "dataset" folder inside the current working directory.
dataset_dir = os.path.join(os.getcwd(), "dataset")
data = SwipeDataset(dataset_dir, batch=True, batch_first=True)

In [None]:
# Show element 0 of the first dataset item, followed by its shape.
print(data[0][0], data[0][0].shape, sep="\n")

In [None]:
# Show element 2 of the first dataset item, followed by its shape.
print(data[0][2], data[0][2].shape, sep="\n")

In [None]:
# Entry at row 0, column 0 of element 2 of the first dataset item.
data[0][2][0, 0]

In [None]:
# Prepend a single 0 to the flattened (squeezed) element 2 of the first dataset item.
torch.cat((torch.zeros(1), data[0][2].squeeze()))

In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    """LSTM encoder over a batch-first sequence of feature vectors.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden width (per direction).
        num_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions when True.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, bidirectional = False):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        # Projection from 2 * hidden_size down to hidden_size. It is created
        # unconditionally but is not applied anywhere in forward().
        self.fc = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size)

    def forward(self, x):
        """Encode a batch of sequences.

        Args:
            x: (batch_size, seq_len, input_size), because the LSTM is built
               with batch_first=True.

        Returns:
            outputs: (batch_size, seq_len, D * hidden_size)
            hidden:  (D * num_layers, batch_size, hidden_size)
            cell:    (D * num_layers, batch_size, hidden_size)
            where D is 2 for a bidirectional encoder and 1 otherwise.
        """
        outputs, final_state = self.lstm(x)
        hidden, cell = final_state
        return outputs, hidden, cell

In [31]:
# Smoke-run a 2-layer bidirectional encoder on the first dataset item.
encoder = Encoder(input_size=6, hidden_size=64, num_layers=2, bidirectional=True)
encoder = encoder.to(device)

en_output, en_hidden, en_cell = encoder(data[0][0].to(device))
print(en_output, en_hidden, sep="\n")

tensor([[[-6.3466e-02, -5.8422e-02,  5.7168e-02,  ..., -1.1227e-02,
           9.7457e-02,  1.1600e-01],
         [-5.9514e-02, -2.0010e-02,  7.6389e-02,  ...,  4.5899e-02,
           7.3622e-02,  2.3051e-01],
         [-4.3995e-02,  3.5770e-02,  6.4889e-02,  ...,  6.9043e-02,
           6.6774e-02,  2.8291e-01],
         ...,
         [ 4.2326e-05,  2.5941e-01,  1.0277e-01,  ...,  4.9886e-03,
          -2.3909e-02,  2.1044e-01],
         [ 1.7118e-03,  2.6129e-01,  1.0233e-01,  ...,  5.6349e-03,
          -3.3850e-02,  1.6091e-01],
         [ 4.2687e-03,  2.5741e-01,  9.1282e-02,  ...,  3.3023e-03,
          -3.7539e-02,  8.8736e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward0>)
tensor([[[ 1.6683e-01,  7.6160e-01, -7.6159e-01,  5.9580e-19,  2.3287e-12,
          -1.2129e-01, -3.2210e-02,  8.6411e-03, -3.1395e-03,  5.9091e-04,
          -1.0000e+00, -2.8133e-09,  1.8214e-21, -2.0838e-21,  2.6293e-11,
          -3.4006e-25,  7.4989e-02,  1.0000e+00, -7.4796e-01, -3.0850e-03,
 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention smoke test on the real encoder tensors.
#
# Shape notes for scaled_dot_product_attention (batch N, one query per step):
#   query: (N, 1, d_k)  - the state we attend *with* (one letter at a time)
#   key:   (N, S, d_k)  - encoder outputs, S = number of touch points
#   value: (N, S, d_v)  - encoder outputs again; d_v should equal the decoder hidden size
#
# The query's feature size must equal the key's. With a bidirectional encoder
# en_output is (N, S, 2 * hidden) while each slice of en_hidden is only
# (N, hidden), so the last layer's forward ([-2]) and backward ([-1]) states are
# concatenated to build a (N, 1, 2 * hidden) query. Passing the raw
# en_hidden.permute(1, 0, 2) fails on the feature-size mismatch.
attention_query = torch.cat((en_hidden[-2], en_hidden[-1]), dim=-1).unsqueeze(1)
attention = F.scaled_dot_product_attention(attention_query, en_output, en_output)

In [None]:
# Stand-alone decoder pieces for experimenting one layer at a time.
# The decoder hidden size is twice the encoder's when the encoder is bidirectional.
output_size = 27
hidden_size = 128

test_emb = nn.Embedding(num_embeddings=output_size, embedding_dim=hidden_size).to(device)
# The LSTM input is the token embedding concatenated with the attention context.
test_lstm = nn.LSTM(
    hidden_size * 2,
    hidden_size,
    batch_first=True,
    bidirectional=False,
).to(device)
# A bidirectional decoder LSTM would need in_features = 2 * hidden_size here.
test_fc = nn.Linear(in_features=hidden_size, out_features=output_size).to(device)

In [None]:
batch_size = 1
# Token index 0 for every sequence in the batch, shaped (N, 1): one letter per step.
first_input = torch.zeros((batch_size, 1), dtype=torch.long, device=device)
# Embedding lookup -> (N, 1, hidden size)
emb_first_input = test_emb(first_input)
emb_first_input.device

device(type='cuda', index=0)

In [None]:
# Build the decoder's initial state from the encoder's final state.
# nn.LSTM returns h_n / c_n as (num_layers * num_directions, N, hidden) with the
# two directions of each layer stored next to each other, so for a bidirectional
# encoder the LAST layer's forward state is index -2 and its backward state is
# index -1. (Index 1 is the backward state of the FIRST layer.)
encoder_final_hidden = torch.cat((en_hidden[-2], en_hidden[-1]), dim=-1).unsqueeze(1)    # shape (N, 1, decoder hidden size)
encoder_final_cell = torch.cat((en_cell[-2], en_cell[-1]), dim=-1).unsqueeze(1)    # shape (N, 1, decoder hidden size)
encoder_final_cell.device

device(type='cuda', index=0)

In [45]:
# Attend over the encoder outputs using the encoder's final hidden state as the query.
context = F.scaled_dot_product_attention(
    query=encoder_final_hidden,
    key=en_output,
    value=en_output,
)
context.device

device(type='cuda', index=0)

In [46]:
# Join the token embedding and the attention context along the feature axis
# (both are 3-D, so the last axis is axis 2).
decoder_input = torch.cat([emb_first_input, context], dim=2)

In [None]:
# Random stand-ins for a batch of 4: LSTM states are (1, 4, 128) and the
# batch-first, single-step input is (4, 1, 256).
test_hidden = torch.rand(1, 4, 128).to(device)
test_cell = torch.rand(1, 4, 128).to(device)
test_input = torch.rand(4, 1, 256).to(device)

In [None]:
# nn.LSTM takes its initial states as (num_layers * num_directions, N, hidden)
# even when batch_first=True; only the input/output tensors are batch-first.
# The encoder summaries are stored as (N, 1, hidden), so swap the first two axes.
# Passing them unswapped only works by coincidence when N == 1.
init_hidden = encoder_final_hidden.transpose(0, 1).contiguous()
init_cell = encoder_final_cell.transpose(0, 1).contiguous()
lstm_out, (de_hidden, de_cell) = test_lstm(decoder_input, (init_hidden, init_cell))   # lstm_out: (N, 1, hidden size)
lstm_out.device

device(type='cuda', index=0)

In [None]:
# Project the LSTM output onto the 27 output classes: (N, 1, 27).
# The middle axis is 1 because a single letter (sequence length 1) is decoded per step.
fc_output = test_fc(lstm_out)
torch.squeeze(fc_output, dim=1).shape

torch.Size([1, 27])

In [58]:
# Random stand-in for one decoder step's scores: batch of 4, one step, 27 classes.
example_out = torch.rand(4, 1, 27).to(device)

In [66]:
# for using when not teacher forcing
predicted_tokens = torch.argmax(example_out.squeeze(1), dim=-1) # 1D with length N
new_input = test_emb(predicted_tokens.unsqueeze(1)) # (N, 1, 128)

In [83]:
# torch.randint's upper bound is exclusive, so it must equal the embedding table
# size; a bound of 28 can produce index 27, which is out of range for 27 rows.
example_targets = torch.randint(0, test_emb.num_embeddings, (4,)).to(device)  # (N) shape
example_input = test_emb(example_targets.unsqueeze(1))
example_targets.dtype

torch.int64

In [None]:
def step(decoder_input, hidden, cell, encoder_output, embedding=None, lstm=None, fc=None):
    """Run one attention-decoder step (one letter) for a whole batch.

    Args:
        decoder_input: shape (N), indices of the current letters, integer dtype.
        hidden: LSTM hidden state, (num_layers, N, decoder hidden).
        cell: LSTM cell state, (num_layers, N, decoder hidden).
        encoder_output: encoder outputs, (N, L, decoder hidden).
        embedding: embedding module to use; defaults to the notebook-level ``test_emb``.
        lstm: unidirectional, batch-first LSTM taking 2 * decoder hidden features;
            defaults to the notebook-level ``test_lstm``.
        fc: output projection; defaults to the notebook-level ``test_fc``.

    Returns:
        (output_fc, hidden, cell): scores of shape (N, 1, output size) and the
        updated LSTM states, each (num_layers, N, decoder hidden).
    """
    # Fall back to the notebook's module-level test layers when none are passed in.
    embedding = test_emb if embedding is None else embedding
    lstm = test_lstm if lstm is None else lstm
    fc = test_fc if fc is None else fc

    embedded = embedding(decoder_input.unsqueeze(1))  # (N, 1, decoder hidden)
    # Query with the top layer only so the context always has exactly one row per
    # sequence; using every layer would give (N, num_layers, hidden) and break the
    # concatenation below whenever num_layers > 1.
    query = hidden[-1:].permute(1, 0, 2)  # (N, 1, decoder hidden)
    context = F.scaled_dot_product_attention(query, encoder_output, encoder_output)  # (N, 1, decoder hidden)
    input_lstm = torch.cat((embedded, context), dim=-1)  # (N, 1, 2 * decoder hidden)
    # The states may arrive as permuted (non-contiguous) views; make them contiguous for the LSTM.
    output_lstm, (hidden, cell) = lstm(input_lstm, (hidden.contiguous(), cell.contiguous()))  # (N, 1, decoder hidden)
    output_fc = fc(output_lstm)

    return output_fc, hidden, cell

In [None]:
# step() needs all four arguments: token indices (N), hidden and cell states laid
# out as (1, N, hidden), and the encoder outputs. The encoder summaries are stored
# as (N, 1, hidden), so their first two axes are swapped here.
step_scores, step_hidden, step_cell = step(
    first_input.squeeze(1),
    encoder_final_hidden.permute(1, 0, 2).contiguous(),
    encoder_final_cell.permute(1, 0, 2).contiguous(),
    en_output,
)
step_scores.shape

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

BLANK_TOKEN = 0  # token index fed to the decoder as its very first input


class Decoder(nn.Module):
    """LSTM decoder with dot-product attention over the encoder outputs.

    One token is decoded per step. At every step the top-layer hidden state
    queries the encoder outputs, and the resulting context vector is
    concatenated with the current token embedding to form the LSTM input.

    Args:
        hidden_size: decoder width. It must equal the feature size of the encoder
            outputs, because the hidden state is used directly as the attention query.
        output_size: number of output classes (also the embedding table size).
        force_ratio: per-step probability of feeding the ground-truth token
            (teacher forcing) instead of the model's own prediction.
    """

    def __init__(self, hidden_size, output_size=27, force_ratio=0.5):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.force_ratio = force_ratio
        self.lstm = nn.LSTM(2 * hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor):
        """Decode a full target sequence.

        Args:
            encoder_outputs: (N, S, hidden_size) encoder outputs.
            encoder_hidden: initial decoder state. Either a hidden-state tensor
                of shape (1, N, hidden_size), in which case the cell state
                starts at zero, or an ``(hidden, cell)`` pair of such tensors.
            target_tensor: (N, T) ground-truth token indices.

        Returns:
            (N, T, output_size) tensor of per-step class probabilities (softmax
            over the last axis). NOTE(review): losses that expect raw scores or
            log-probabilities (e.g. nn.CrossEntropyLoss / nn.NLLLoss) should not
            be fed these values directly.
        """
        model_device = self.embedding.weight.device
        # Embedding lookups need integer indices on the same device as the weights.
        target_tensor = target_tensor.to(model_device).long()
        batch_size, target_length = target_tensor.shape[0], target_tensor.shape[1]

        decoder_hidden, decoder_cell = self._split_state(encoder_hidden)

        # Start every sequence from BLANK_TOKEN. These are raw token indices;
        # the embedding lookup happens once, inside forward_step.
        decoder_input = torch.full(
            (batch_size, 1), BLANK_TOKEN, dtype=torch.long, device=model_device
        )

        step_scores = []
        for i in range(target_length):
            decoder_output, decoder_hidden, decoder_cell = self.forward_step(
                decoder_input, (decoder_hidden, decoder_cell), encoder_outputs
            )
            step_scores.append(decoder_output)  # (N, 1, output_size)

            teacher_force = torch.rand(1).item() < self.force_ratio
            if teacher_force:
                # Next input is the ground-truth token for this position.
                decoder_input = target_tensor[:, i].unsqueeze(1)  # (N, 1)
            else:
                # Next input is the model's own best guess; argmax over the class
                # axis of (N, 1, output_size) already yields (N, 1).
                decoder_input = torch.argmax(decoder_output, dim=-1)

        if not step_scores:
            # Empty target: nothing to decode, return an empty (N, 0, output_size) result.
            return encoder_outputs.new_zeros((batch_size, 0, self.fc.out_features))

        # Each step contributes (N, 1, output_size); join them along the time axis.
        decoder_outputs = torch.cat(step_scores, dim=1)  # (N, T, output_size)
        decoder_outputs = F.softmax(decoder_outputs, dim=-1)
        return decoder_outputs

    def forward_step(self, input, hidden, encoder_outputs):
        """Run a single decoding step.

        Args:
            input: (N, 1) integer token indices.
            hidden: hidden-state tensor (1, N, hidden_size), or an
                ``(hidden, cell)`` pair; a missing cell state is taken as zeros.
            encoder_outputs: (N, S, hidden_size) encoder outputs.

        Returns:
            (output, hidden, cell): scores of shape (N, 1, output_size) and the
            updated LSTM hidden and cell states.
        """
        hidden, cell = self._split_state(hidden)
        embedded = self.embedding(input)  # (N, 1, hidden_size)
        # Use the top layer's hidden state as the single attention query per sequence.
        query = hidden[-1:].permute(1, 0, 2)  # (N, 1, hidden_size)
        context = F.scaled_dot_product_attention(query, encoder_outputs, encoder_outputs)  # (N, 1, hidden_size)
        decoder_input = torch.cat((embedded, context), dim=-1)  # (N, 1, 2 * hidden_size)
        # nn.LSTM requires its state as an (h_0, c_0) tuple.
        lstm_out, (hidden, cell) = self.lstm(decoder_input, (hidden, cell))  # (N, 1, hidden_size)
        output = self.fc(lstm_out)  # (N, 1, output_size)
        return output, hidden, cell

    @staticmethod
    def _split_state(state):
        """Return contiguous ``(hidden, cell)`` tensors from a tensor or an (h, c) pair."""
        if isinstance(state, (tuple, list)):
            hidden, cell = state
        else:
            hidden, cell = state, torch.zeros_like(state)
        return hidden.contiguous(), cell.contiguous()

In [None]:
# Column 0 along the second axis of element 2 of the first dataset item
# (presumably the first target token of each sequence; confirm against SwipeDataset).
data[0][2][:,0]

In [None]:
# The decoder attends over en_output with its hidden state as the query, so its
# width must equal en_output's feature size (2 * 64 for the bidirectional encoder),
# not the encoder's per-direction hidden size.
decoder = Decoder(hidden_size=en_output.size(-1), output_size=27).to(device)

# The decoder LSTM has a single layer, so its initial state must be (1, N, hidden).
# Build it from the encoder's last layer: forward state at index -2, backward at -1.
decoder_init_hidden = torch.cat((en_hidden[-2], en_hidden[-1]), dim=-1).unsqueeze(0)
out = decoder(en_output, decoder_init_hidden, data[0][2].to(device))

In [None]:
# Size of the decoder's embedding table, i.e. how many distinct token indices it accepts.
decoder.embedding.num_embeddings