In [3]:
import torch
import torch.nn as nn

In [222]:
# encoder_layer = nn.TransformerEncoderLayer(d_model=8, nhead=8)
# src = torch.tensor([[1,2,3,4,5,6,7,8]])
# out = encoder_layer(src)
# out

In [344]:
import numpy as np

a = [0,1,2,3]

np.pad(a,pad_width=(0,4-len(a))).tolist()

[0, 1, 2, 3]

In [323]:

class EncoderRNN(nn.Module):
    def __init__(self, input_size, output_size, num_layers=1):
        super(EncoderRNN, self).__init__()
        self.output_size = output_size

        self.embedding = nn.Embedding(input_size, output_size)
        self.rnn = nn.GRU(output_size, output_size, batch_first=True)
        
        
        #self.rnn = nn.LSTM(output_size, output_size, num_layers=num_layers,bidirectional=True)
        
        
        #self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        #embedded = self.dropout(self.embedding(input))
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded)
        return hidden
    


In [325]:

enc = EncoderRNN(19,2)
print(enc(torch.tensor([1,2,3,4])).detach())
print(enc(torch.tensor([1,2,3,4])).detach())
print(enc(torch.tensor([1,2,3,18,9])).detach())
print(enc(torch.tensor([1,3,2,3])).detach())

tensor([[ 0.3928, -0.1704]])
tensor([[ 0.3928, -0.1704]])
tensor([[ 0.2258, -0.4056]])
tensor([[ 0.3825, -0.1454]])


In [314]:
import torch
import torch.nn as nn

class Attention(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Attention, self).__init__()
        self.W_q = nn.Linear(input_size, hidden_size)
        self.W_k = nn.Linear(input_size, hidden_size)
        self.W_v = nn.Linear(input_size, hidden_size)
        self.scale = torch.sqrt(torch.FloatTensor([hidden_size]))

    def forward(self, query, key, value):
        Q = self.W_q(query)
        K = self.W_k(key)
        V = self.W_v(value)

        Q = Q.unsqueeze(1)  # Add an extra dimension
        K = K.transpose(1, 2)  # Transpose to get the correct shape
        attention_scores = torch.matmul(Q, K) / self.scale
        attention_weights = torch.softmax(attention_scores, dim=-1)

        attended_values = torch.matmul(attention_weights, V)

        return attended_values, attention_weights

class TextEncoderWithAttention(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1,num_heads=8):
        super(TextEncoderWithAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        #self.attention = Attention(hidden_size * 2, hidden_size)
        self.attention = nn.MultiheadAttention(hidden_size,num_heads)

    def forward(self, input_sequence):
        embedded = self.embedding(input_sequence)
        output, (hidden, cell) = self.lstm(embedded)
        print(f'{output=}')
        print(f'{hidden=}')

        # Use the last hidden states from both directions
        hidden_states = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        # Apply attention
        attended_values, attention_weights = self.attention(hidden_states, output, output)

        return attended_values, attention_weights

# Example usage:
vocab_size = 10  # replace with your vocabulary size
embed_size = 32   # replace with your desired embedding size
hidden_size = 8  # replace with your desired hidden size

model = TextEncoderWithAttention(vocab_size, embed_size, hidden_size)
#input_sequence = torch.randint(0, vocab_size, (32, 20))  # replace with your input data


In [315]:
print(model(torch.tensor([[1,2,3]])))
# print(model(torch.tensor([[1,3,2]]))[0].detach().view(-1))
# print(model(torch.tensor([[1,3,2]]))[0].detach().view(-1))

output=tensor([[[ 0.0368, -0.3513, -0.0332, -0.2456,  0.0197, -0.0442,  0.0710,
           0.1692,  0.1750, -0.3338,  0.0923,  0.1968, -0.0456, -0.5608,
          -0.2093,  0.2147],
         [ 0.0509, -0.0438, -0.0054, -0.0192,  0.0094, -0.1162,  0.0343,
          -0.0616,  0.1316, -0.2480,  0.1062,  0.4024, -0.3118, -0.4493,
          -0.0595,  0.0845],
         [-0.3997,  0.0142,  0.0871,  0.0187, -0.3732, -0.1396,  0.0146,
           0.0992,  0.0075,  0.1440,  0.0429,  0.1881,  0.0247, -0.1914,
          -0.4735,  0.4208]]], grad_fn=<TransposeBackward0>)
hidden=tensor([[[-0.3997,  0.0142,  0.0871,  0.0187, -0.3732, -0.1396,  0.0146,
           0.0992]],

        [[ 0.1750, -0.3338,  0.0923,  0.1968, -0.0456, -0.5608, -0.2093,
           0.2147]]], grad_fn=<StackBackward0>)


AssertionError: For unbatched (2-D) `query`, expected `key` and `value` to be 2-D but found 3-D and 3-D tensors respectively

In [321]:
import torch
import torch.nn as nn

class TextEncoderWithMultiheadAttention(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_heads, num_layers=1):
        super(TextEncoderWithMultiheadAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.multihead_attention = nn.MultiheadAttention(embed_size, num_heads)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=False)

    def forward(self, input_sequence):
        embedded = self.embedding(input_sequence)
        
        # Permute dimensions for MultiheadAttention
        embedded = embedded.permute(1, 0, 2)

        # Apply Multihead Attention
        attention_output, _ = self.multihead_attention(embedded, embedded, embedded)
        
        # Reshape and permute back
        attention_output = attention_output.permute(1, 0, 2)

        # LSTM encoding
        output, (hidden, cell) = self.lstm(attention_output)
        #print(f'{hidden=}')

        # Use the last hidden states from both directions
        #hidden_states = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        return hidden

# Example usage:
vocab_size = 10000  # replace with your vocabulary size
embed_size = 256   # replace with your desired embedding size
hidden_size = 5  # replace with your desired hidden size
num_heads = 8      # replace with your desired number of heads

model = TextEncoderWithMultiheadAttention(vocab_size, embed_size, hidden_size, num_heads)
#input_sequence = torch.randint(0, vocab_size, (32, 20))  # replace with your input data
input_sequence = torch.tensor([[1,2,3]])
output = model(input_sequence)
output

tensor([[[ 0.0744, -0.1406, -0.1321, -0.5291, -0.2384]]],
       grad_fn=<StackBackward0>)