In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cpu")
# focus on 2 layer single direction lstms before fucking around

In [2]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2, bidirectional = False):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size=input_size, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True,
                            bidirectional=bidirectional,)
        self.fc = nn.Linear(2*hidden_size,hidden_size)

    def forward(self, x):
        # x: (seq_len, batch_size, input_size)      seq len would be the number of touchpoints
        outputs, (hidden, cell) = self.lstm(x)
        # print(next(self.parameters()).device)
        # if self.bidirectional:
        #     outputs = self.fc(outputs)
        # outputs: (seq_len, batch_size, hidden_size)
        # hidden, cell: (num_layers, batch_size, hidden_size)
        return outputs, hidden, cell

In [40]:
test_encoder = Encoder(input_size=6,
                       hidden_size=32,
                       num_layers=2,
                       bidirectional=True).to(device)
test_en_in = torch.rand((8, 60, 6)).to(device)
en_out, en_hidden, en_cell = test_encoder(test_en_in)

In [41]:
key_project = nn.Linear(64,32).to(device)

In [42]:
key = key_project(en_out)

In [23]:
t_proj_layer_1 = nn.Linear(64,32).to(device)
t_proj_layer_2 = nn.Linear(64,32).to(device)

In [None]:
layer_1_state = torch.cat((en_hidden[0], en_hidden[2]), dim = -1)
layer_2_state = torch.cat((en_hidden[1], en_hidden[3]), dim = -1)

In [25]:
proj_layer_1 = t_proj_layer_1(layer_1_state).unsqueeze(0)
proj_layer_2 = t_proj_layer_2(layer_2_state).unsqueeze(0)

In [None]:
tt = torch.Tensor().to(device)

In [None]:
new = torch.rand((8,1, 27))
tt = torch.cat((tt, new), dim = 1)

In [3]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size, max_letters=10, force_ratio = 0.7, num_layers = 2, bidirectional = False):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.force_ratio = force_ratio
        self.max_letters=max_letters

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(input_size=2*hidden_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        # the output size doubles if the lstm is bidirectional
        fc_in_size = 2*hidden_size if bidirectional else hidden_size
        self.fc = nn.Linear(fc_in_size,output_size)

    def forward(self, key, value, encoder_hidden, encoder_cell, word_tensor=None):
        """
        Args:
            encoder_output: output from encoder (N, L, decoder hidden)
            encoder_hidden: hidden state from encoder (D * Layers, N, Hidden) for encoder
            encoder_cell: cell state from encoder (D * Layers, N, Hidden) for encoder
            word_tensor: word tensors containing indicies of letters (N, max length)
        """
        batch_size = key.shape[0]
        word_length = min(self.max_letters, word_tensor.shape[-1]) if word_tensor is not None else self.max_letters
        # create the first decoder input, which is blank inputs
        decoder_input = torch.tensor([0] * batch_size, dtype=torch.long, device=next(self.parameters()).device)
        # initial hidden and cell state comes from final layer of encoder, shape is (N, encoder hidden)
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        decoder_outputs = torch.Tensor().to(next(self.parameters()).device)
        # for each letter in word
        for i in range(word_length):
            decoder_output, decoder_hidden, decoder_cell = self.step(decoder_input,
                                                                     decoder_hidden,
                                                                     decoder_cell,
                                                                     key,
                                                                     value)
            decoder_outputs = torch.cat((decoder_outputs,decoder_output), dim=1)  # add to list of outputs
            # print(decoder_output.shape)

            teacher_force = torch.rand(1).item() < self.force_ratio
            
            if teacher_force and word_tensor is not None:
                decoder_input = word_tensor[:,i]    # next letter
            else:
                # for using when not teacher forcing, use model prediction
                decoder_input = torch.argmax(decoder_output.squeeze(1), dim=-1) # 1D with length N
        
        probs = F.softmax(decoder_outputs, dim=-1)

        return probs, decoder_hidden, decoder_cell

    def step(self, decoder_input, hidden, cell, key, value):
        """
        Args:
            decoder_input: shape (N), should be indicies for the letters, dtype either int or long
            hidden: hidden state of lstm (D*Layers, N, decoder hidden)
            cell: cell state of lstm (D*Layers, N, deocder hidden)
            encoder_output: output from encoder (N, L, decoder hidden)
        """
        embedded = self.embedding(decoder_input.unsqueeze(1))  # (N, 1 decoder hidden)
        query = hidden[-1].unsqueeze(0).permute(1, 0, 2) # (N, D*Layers, decoder hidden)    Query should come from the decoders hidden state
        # print(query.shape)
        context = F.scaled_dot_product_attention(query, key, value) # (N, 1, decoder hidden)
        # print(context.shape)
        input_lstm = torch.cat((embedded, context), dim=-1) # (N, 1, 2*decoder hidden)
        output_lstm, (hidden, cell) = self.lstm(input_lstm, (hidden, cell)) # output lstm (N, 1, D*hidden)
        output_fc = self.fc(output_lstm)

        return output_fc, hidden, cell

In [39]:
test_decoder = Decoder(hidden_size=32,
                       output_size=27,
                       force_ratio=0.7,
                       num_layers=2,
                       bidirectional=False).to(device)

test_en_output = torch.rand((8, 60, 32)).to(device)
test_en_hid = torch.rand((2, 8, 32)).to(device)
test_en_cell = torch.rand((2, 8, 32)).to(device)
test_word = torch.randint(0, 27, (8, 4)).to(device)

de_out, de_hid, de_cell = test_decoder(test_en_output,test_en_output, test_en_hid, test_en_cell, test_word)
de_out.shape

torch.Size([8, 4, 27])

In [40]:
indicies = torch.argmax(de_out.squeeze(1), dim=-1)

In [9]:
vocabulary = {'_': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8,
              'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16,
              'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24,
              'y': 25, 'z': 26}

In [10]:
reversed_vocab = {k: u for u, k in vocabulary.items()}

In [44]:
reversed_vocab[5]

'e'

In [46]:
# indicies=indicies.tolist()
words = []
for word in indicies:
    characters = [reversed_vocab[i] for i in word]
    words.append(characters)

In [8]:
def handle_outputs(decoder_output):
    indicies = torch.argmax(decoder_output.squeeze(1), dim=-1).tolist()
    words = []
    for word in indicies:
        characters = [reversed_vocab[i] for i in word]
        words.append(characters)
    
    return words

In [48]:
handle_outputs(de_out)

[['w', 'v', 'v', 'v', 'i', 'i', 'v', 'i', 'i', 'v'],
 ['y', 'i', 'v', 'i', 'i', 'i', 'v', 'v', 'i', 'v'],
 ['u', 'u', 'v', 'v', 'v', 'v', 'v', 'i', 'v', 'v'],
 ['a', 'v', 'v', 'i', 'i', 'i', 'i', 'i', 'i', 'i'],
 ['u', 'u', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v'],
 ['i', 'v', 'v', 'i', 'i', 'i', 'i', 'i', 'i', 'i'],
 ['h', 'v', 'v', 'i', 'i', 'i', 'i', 'v', 'v', 'v'],
 ['u', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v', 'v']]

In [4]:
class Seq2Seq(nn.Module):
    def __init__(self, hidden_size, num_layers, bidirectional=False, input_size=6, max_letters=10, force_ratio=0.7, output_size=27):
        super(Seq2Seq, self).__init__()
        self.input_size = input_size                # number of features for input
        self.output_size = output_size              # output size of decoder, should be 27
        self.encoder_hidden_size = hidden_size      # hidden size of encoder and decoder
        self.num_layers = num_layers                # number of layers for both lstms
        self.bidirectional = bidirectional          # bidirectional for both lstms
        self.max_letters = max_letters              # max length for word output, word tensors should be padded to this length
        self.force_ratio = force_ratio              # chance for teacher forcing in training

        self.encoder = Encoder(input_size=input_size,
                               hidden_size=hidden_size,
                               num_layers=num_layers,
                               bidirectional=bidirectional)

        self.decoder = Decoder(hidden_size=hidden_size,
                               output_size=output_size,
                               max_letters=max_letters,
                               force_ratio=force_ratio,
                               num_layers=num_layers,
                               bidirectional=False) # keep decoder unidirectional
        
        # used for projecting the hidden and cell for both layers of the encoder to the inital decoder state
        self.hidden_projection_1 = nn.Linear(hidden_size*2,hidden_size)
        self.cell_projection_1 = nn.Linear(hidden_size*2,hidden_size)

        self.hidden_projection_2 = nn.Linear(hidden_size*2,hidden_size)
        self.cell_projection_2 = nn.Linear(hidden_size*2,hidden_size)

        self.key_projection = nn.Linear(hidden_size*2,hidden_size)
        self.value_projection = nn.Linear(hidden_size*2,hidden_size)
    
    def forward(self, input, word_tensors=None):
        # calculate the encoder outputs
        encoder_output, encoder_hidden, encoder_cell = self.encoder(input)
        # if the encoder is bidirectional, the hidden and cell states of the encoder need to be projected to match dimensions
        if self.bidirectional:
            # concatenate the layer states 
            layer_1_hidden = torch.cat((encoder_hidden[0], encoder_hidden[2]), dim=-1)
            layer_2_hidden = torch.cat((encoder_hidden[1], encoder_hidden[3]), dim=-1)

            layer_1_cell = torch.cat((encoder_cell[0], encoder_cell[2]), dim=-1)
            layer_2_cell = torch.cat((encoder_cell[1], encoder_cell[3]), dim=-1)

            # project
            project_h1 = F.softmax(self.hidden_projection_1(layer_1_hidden), dim=-1).unsqueeze(0)
            project_h2 = F.softmax(self.hidden_projection_2(layer_2_hidden), dim=-1).unsqueeze(0)

            project_c1 = F.softmax(self.cell_projection_1(layer_1_cell), dim=-1).unsqueeze(0)
            project_c2 = F.softmax(self.cell_projection_2(layer_2_cell), dim=-1).unsqueeze(0)

            # recombine
            encoder_hidden = torch.cat((project_h1, project_h2), dim=0)
            encoder_cell = torch.cat((project_c1, project_c2), dim=0)

            key = self.key_projection(encoder_output)
            value = self.value_projection(encoder_output)
        else:
            # if not bidirectional, key and value arent transformed
            key = encoder_output
            value = encoder_output

        # print(encoder_hidden.shape)
        # print(encoder_cell.shape)
        # print(encoder_output.shape)
        # decoder
        log_probs, decoder_hidden, decoder_cell = self.decoder(key, value, encoder_hidden, encoder_cell, word_tensors)
        # shape of log_probs (N, T, C)
        return log_probs  
     

In [30]:
t_model = Seq2Seq(32,2,True).to(device)

test_input = torch.rand((1, 60, 6)).to(device)
test_output = t_model(test_input)
test_pred_words = handle_outputs(test_output)

In [2]:
from model import Encoder

t_enc = Encoder(6, 32, 2, True).to(device)

In [3]:
t_input = torch.rand(8, 60, 6).to(device)
t_output, t_en_hid, t_en_cell = t_enc(t_input)

In [4]:
hidden_states = torch.cat([torch.cat((t_en_hid[i,:,:], t_en_hid[i+1,:,:]), dim=1).unsqueeze(0) for i in range(0, t_en_hid.shape[0], 2)], dim=0)

In [18]:
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers=2, bidirectional=False):
        """
        LSTM in the model is always batch first
        param input_size: input size for decoder, should equal 27 also equals output size
        param hidden_size: hidden_size for LSTM
        param num_layers: number of layers
        param bidirectional: bool
        """
        super(Decoder, self).__init__()
        self.outuput_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size=output_size+hidden_size, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True,
                            bidirectional=bidirectional,)
        self.fc = nn.Linear(hidden_size,output_size)

    def forward(self, input, hidden, cell, encoder_output):
        """
        param input: (batch_size, 1, 27) next input, either previous output or one hot encoded letters
        param hidden: previous hidden state of decoder  (num_layers, N, hidden_size)
        param cell: previous cell state of decoder  (num_layers, N, hidden_size)
        param encoder_output: encoder output used for attention value and key
        """

        query = hidden[-1].unsqueeze(0).permute(1, 0, 2) # (N, num_layers, hidden_size)
        context = F.scaled_dot_product_attention(query, encoder_output, encoder_output)
        print(context.shape)
        input = torch.cat((input, context), dim = -1)   # (N, 1, hidden + output_size)

        # lstm_out shape (N, 1, hidden_size)
        lstm_out, (hidden, cell) = self.lstm(input, (hidden, cell))
        output = F.log_softmax(self.fc(lstm_out), dim=-1)
        
        # output: (batch_size, 1, hidden_size)
        # hidden, cell: (num_layers, batch_size, hidden_size)
        return output, hidden, cell

t_de = Decoder(27, 32, 2, False).to(device)

In [20]:
t_de_in = torch.LongTensor([5,8,3,1,6,3,6,8]).to(device)
# one hot encodes the target characters
one_hot = F.one_hot(t_de_in, num_classes =27).unsqueeze(1) # (N, 1, 27)

t_de_out, t_de_hid, t_de_cell = t_de(one_hot, t_en_hid, t_en_cell, t_output)

torch.Size([8, 1, 32])


In [17]:
torch.LongTensor([1] + [0]*7)

tensor([1, 0, 0, 0, 0, 0, 0, 0])

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2, bidirectional = True):
        """
        LSTM in the model is always batch first
        param input_size: input size for encoder, 6
        param hidden_size: hidden_size for LSTM
        param num_layers: number of layers
        param bidirectional: bool
        """
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size=input_size, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True,
                            bidirectional=bidirectional,)
        self.fc_hidden = nn.Linear(2*hidden_size,hidden_size)
        self.fc_cell = nn.Linear(2*hidden_size,hidden_size)
        self.fc_out = nn.Linear(2*hidden_size, hidden_size)

    def forward(self, x):
        """
        : param x: (batch_size, seq_len, input_size)      seq len would be the number of touchpoints
        : return outputs: (seq_len, batch_size, hidden_size)
        : return hidden, cell: the final hidden and cell states
        """
        lstm_outputs, (hidden, cell) = self.lstm(x)
        # shape now of states is (2, hidden state) after concat
        hidden_states = torch.cat([torch.cat((hidden[i,:,:], hidden[i+1,:,:]), dim=1).unsqueeze(0) for i in range(0, hidden.shape[0], 2)], dim=0)
        cell_states = torch.cat([torch.cat((cell[i,:,:], cell[i+1,:,:]), dim=1).unsqueeze(0) for i in range(0, cell.shape[0], 2)], dim=0)

        hidden = torch.relu(self.fc_hidden(hidden_states))
        cell = torch.relu(self.fc_cell(cell_states))
        outputs = torch.relu(self.fc_out(lstm_outputs))

        return outputs, hidden, cell
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers=2, bidirectional=False):
        """
        LSTM in the model is always batch first
        param input_size: input size for decoder, should equal 27 also equals output size
        param hidden_size: hidden_size for LSTM
        param num_layers: number of layers
        param bidirectional: bool
        """
        super(Decoder, self).__init__()
        self.outuput_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size=hidden_size, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True,
                            bidirectional=bidirectional,)
        self.fc = nn.Linear(hidden_size,output_size)

    def forward(self, hidden, cell, encoder_output):
        """
        param hidden: previous hidden state of decoder  (num_layers, N, hidden_size)
        param cell: previous cell state of decoder  (num_layers, N, hidden_size)
        param encoder_output: encoder output used for attention value and key
        """

        query = hidden[-1].unsqueeze(0).permute(1, 0, 2) # (N, num_layers, hidden_size)
        context = F.scaled_dot_product_attention(query, encoder_output, encoder_output) # (N, 1, hidden_size)

        # lstm_out shape (N, 1, hidden_size)
        lstm_out, (hidden, cell) = self.lstm(context, (hidden, cell))
        output = F.log_softmax(self.fc(lstm_out), dim=-1)
        
        # output: (batch_size, 1, hidden_size)
        # hidden, cell: (num_layers, batch_size, hidden_size)
        return output, hidden, cell    

class Seq2Seq(nn.Module):
    def __init__(self, hidden_size, num_layers, input_size=6, output_size=27, max_letters = 20):
        super(Seq2Seq, self).__init__()
        self.input_size = input_size                # number of features for input
        self.output_size = output_size              # output size of decoder, should be 27
        self.encoder_hidden_size = hidden_size      # hidden size of encoder and decoder
        self.num_layers = num_layers                # number of layers for both lstms
        self.max_letters = max_letters              # max length of prediction

        self.encoder = Encoder(input_size=input_size,
                               hidden_size=hidden_size,
                               num_layers=num_layers,
                               bidirectional=True)

        self.decoder = Decoder(hidden_size=hidden_size,
                               output_size=output_size,
                               num_layers=num_layers,
                               bidirectional=False) # keep decoder unidirectional
    
    def forward(self, input):
        """
        param input: (N, T, 6) input sequence
        """
        batch_size = input.shape[0]

        # encode the sequence
        encoder_output, hidden, cell = self.encoder(input)

        # where to store all the log probabilities
        outputs = torch.zeros(batch_size, encoder_output.shape[1], self.output_size).to(next(self.parameters()).device)

        for i in range(encoder_output.shape[1]):
            output, hidden, cell = self.decoder(hidden, cell, encoder_output)
            # print(output.shape)
            outputs[:,i,:] = output.squeeze(1)

        return outputs
    
    def predict(self, input, max_length):
        batch_size = input.shape[0]
        # store outputs
        outputs = torch.zeros(batch_size, max_length, self.output_size).to(next(self.parameters()).device)

        # encode the sequence
        encoder_output, hidden, cell = self.encoder(input)
        # get the first decoder input
        blank = torch.LongTensor([0]*batch_size).to(next(self.parameters()).device)
        decoder_input = F.one_hot(blank, num_classes=27).unsqueeze(1)   # (N, 1, 27)

        for i in range(max_length):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell, encoder_output)
            outputs[:,i,:] = output.squeeze(1)
            decoder_input = output

        return outputs
    
t_model = Seq2Seq(32, 2, 6, 27).to(device)

In [27]:
torch.zeros(2, 4, 27)[:,0,:].shape

torch.Size([2, 27])

In [15]:
import numpy as np

t_input = torch.rand(2, 60, 6).to(device)

vocabulary = {'_': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8,
              'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16,
              'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24,
              'y': 25, 'z': 26}

words = ["walk", "bulk"]
encoded = np.zeros(shape=(2, 4))
for i, word in enumerate(words):
    encoded_word = []
    for letter in word:
        encoded_word.append(vocabulary[letter])
    encoded[i] = encoded_word

encoded = torch.LongTensor(encoded).to(device)

In [16]:
t_output = t_model(t_input, encoded, 0.5)
t_predict = t_model.predict(t_input, 4)
torch.equal(t_output, t_predict)

False

In [10]:
loss_fn = nn.CTCLoss()
input_lengths = torch.LongTensor([4, 4])
target_lengths = torch.LongTensor([4, 4])
loss_fn(t_output.permute(1, 0, 2), encoded, input_lengths, target_lengths)

tensor(3.2946, device='cuda:0', grad_fn=<MeanBackward0>)