In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
from collections import OrderedDict

In [2]:
tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1), dtype=torch.float32))
batch.append(torch.tensor(tokenizer.encode(txt2), dtype=torch.float32))

batch = torch.stack(batch, dim=0)
print(batch)

tensor([[6109., 3626., 6100.,  345.],
        [6109., 1110., 6622.,  257.]])


In [3]:
input_size: int = 10
hidden_size: int = 768
sequence_length = 12
batch_size: int = 2
rnn = nn.GRU(input_size, hidden_size=hidden_size, batch_first=True)
input = torch.randn(batch_size, sequence_length, input_size)
output, hn = rnn(input)
print(f"output.shape: {output.shape}")
print(f"hn.shape: {hn.shape}")
print(hn)
print(torch.permute(hn, (1, 0, 2)).size())
porra = output + torch.permute(hn, (1, 0, 2))
print(porra.shape)

output.shape: torch.Size([2, 12, 768])
hn.shape: torch.Size([1, 2, 768])
tensor([[[ 0.0699,  0.0321,  0.0518,  ...,  0.0973, -0.0248, -0.0499],
         [ 0.1034, -0.0761,  0.0723,  ..., -0.0060,  0.0452, -0.0351]]],
       grad_fn=<StackBackward0>)
torch.Size([2, 1, 768])
torch.Size([2, 12, 768])


In [4]:
# Initializes the Multihead attention layer
multihead_attn = torch.nn.MultiheadAttention(
    embed_dim=hidden_size,
    num_heads=4,
    dropout=0.5,
    bias=False,
    batch_first=True
)

In [5]:
attn_output, attn_output_weights = multihead_attn(porra, porra, porra)
print(attn_output.shape)

torch.Size([2, 12, 768])


In [6]:
model = nn.Sequential(OrderedDict([
    ('dense1', nn.Linear(768, 100)),
    ('act1', nn.ReLU()),
    ('dense2', nn.Linear(100, 50)),
    ('act2', nn.ReLU()),
    ('output', nn.Linear(50, 10)),
    ('outact', nn.Sigmoid()),
]))
model

Sequential(
  (dense1): Linear(in_features=768, out_features=100, bias=True)
  (act1): ReLU()
  (dense2): Linear(in_features=100, out_features=50, bias=True)
  (act2): ReLU()
  (output): Linear(in_features=50, out_features=10, bias=True)
  (outact): Sigmoid()
)

In [7]:
norm = nn.LayerNorm(10)

In [8]:
outi = model(attn_output)
outi = F.normalize(outi)
outi

tensor([[[0.2886, 0.2887, 0.2887, 0.2887, 0.2887, 0.2888, 0.2886, 0.2886,
          0.2886, 0.2887],
         [0.2886, 0.2886, 0.2886, 0.2887, 0.2886, 0.2888, 0.2887, 0.2886,
          0.2886, 0.2887],
         [0.2888, 0.2884, 0.2885, 0.2887, 0.2886, 0.2886, 0.2888, 0.2888,
          0.2888, 0.2886],
         [0.2887, 0.2889, 0.2889, 0.2888, 0.2887, 0.2886, 0.2887, 0.2887,
          0.2886, 0.2888],
         [0.2887, 0.2887, 0.2887, 0.2888, 0.2887, 0.2888, 0.2886, 0.2887,
          0.2886, 0.2887],
         [0.2888, 0.2887, 0.2888, 0.2886, 0.2886, 0.2886, 0.2887, 0.2887,
          0.2887, 0.2888],
         [0.2886, 0.2886, 0.2886, 0.2886, 0.2887, 0.2887, 0.2886, 0.2886,
          0.2888, 0.2886],
         [0.2886, 0.2885, 0.2885, 0.2887, 0.2887, 0.2887, 0.2887, 0.2886,
          0.2887, 0.2885],
         [0.2888, 0.2887, 0.2887, 0.2887, 0.2888, 0.2886, 0.2887, 0.2887,
          0.2887, 0.2887],
         [0.2887, 0.2887, 0.2887, 0.2885, 0.2887, 0.2885, 0.2887, 0.2887,
          0.2888,