In [18]:
import torch

# Use the GPU when one is present and fall back to the CPU otherwise, so the
# notebook still runs top to bottom on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fix the random seeds so weight initialisation is repeatable across runs.
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

In [19]:
import torch


class Tokenizer:
    """Character-level tokenizer with extra vocabulary entries for special tokens.

    The vocabulary is the sorted set of distinct characters of ``string``
    followed by the entries of ``special_tokens`` in the order given; an
    entry's position in that sequence is its token id.

    ``tokenize`` looks its input up one character at a time, so a
    multi-character special token is never produced by it; such ids are only
    reachable through ``letter_to_index``.
    """

    def __init__(self, string: str, special_tokens: list):
        """Build the two lookup tables.

        Args:
            string: Text whose distinct characters become vocabulary entries.
            special_tokens: Additional entries appended after the characters.
        """
        # dict.fromkeys drops repeats while keeping first-seen order. Without
        # it, a special token equal to an existing entry would receive two
        # ids and the vocabulary size would be smaller than the largest id.
        vocabulary = list(dict.fromkeys([*sorted(set(string)), *special_tokens]))
        self.letter_to_index = {letter: index for index, letter in enumerate(vocabulary)}
        self.index_to_letter = {index: letter for index, letter in enumerate(vocabulary)}

    def tokenize(self, string) -> torch.Tensor:
        """Convert ``string`` to a 1-D integer tensor of token ids, one per character.

        The tensor is moved to the module-level ``device``.

        Raises:
            KeyError: If a character is not in the vocabulary.
        """
        token_ids = [self.letter_to_index[letter] for letter in string]
        tokens_tensor = torch.tensor(token_ids, dtype=torch.int)
        return tokens_tensor.to(device=device)

    def get_vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return len(self.letter_to_index)


# Sample text whose distinct characters become the single-character
# vocabulary entries: letters, digits, punctuation and the space.
tokenizer_text_to_sample = "qwertyuiop[]\\asdfghjkl;\'zxcvbnm,./QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?1234567890-=`!@#$%^&*()_+~ "

# Opening/closing tag pairs appended to the vocabulary after the characters.
special_tokens = [
    "<model>",
    "</model>",
    "<user>",
    "</user>",
    "<system>",
    "</system>",
]

tokenizer = Tokenizer(
    string=tokenizer_text_to_sample,
    special_tokens=special_tokens,
)

In [20]:
import torch
from torch import nn


class TransformerBlock(nn.Module):
    """Self-attention followed by a position-wise feed-forward network.

    The input is projected into queries, keys and values by three separate
    linear layers, passed through multi-head attention, and then through a
    two-layer feed-forward network that expands to ``embed_dim * 4`` and
    projects back to ``embed_dim``. The block has no residual connections or
    normalisation layers.

    Args:
        embed_dim: Width of the attention layer and of the block output.
        num_heads: Number of attention heads.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.att_block = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True)
        self.feedforward = nn.Sequential(
            nn.LazyLinear(out_features=embed_dim * 4),
            nn.LeakyReLU(),
            nn.LazyLinear(out_features=embed_dim)
        )
        # Separate projections for the three attention inputs.
        self.queries = nn.LazyLinear(out_features=embed_dim)
        self.keys = nn.LazyLinear(out_features=embed_dim)
        self.values = nn.LazyLinear(out_features=embed_dim)

    def forward(self, token_sequence):
        """Apply self-attention and the feed-forward network to ``token_sequence``.

        Returns:
            The feed-forward output; its last dimension is ``embed_dim``.
        """
        # Each attention input goes through its own projection. Using the
        # query projection for all three would leave ``keys`` and ``values``
        # unused and their lazy parameters uninitialised.
        q = self.queries(token_sequence)
        k = self.keys(token_sequence)
        v = self.values(token_sequence)

        # The attention weights are not needed downstream.
        attended_sequence, _ = self.att_block(q, k, v)

        return self.feedforward(attended_sequence)


In [21]:
import torch
from torch import nn


class Decoder(nn.Module):
    """Two-layer head whose output has ``vocab_size`` features in its last dimension.

    The hidden layer is four times as wide as the output and is followed by a
    LeakyReLU. Both linear layers are lazy, so their input width is inferred
    on the first forward pass.
    """

    def __init__(self, vocab_size):
        super().__init__()
        hidden_width = vocab_size * 4
        layers = (
            nn.LazyLinear(out_features=hidden_width),
            nn.LeakyReLU(),
            nn.LazyLinear(out_features=vocab_size),
        )
        self.decoder = nn.Sequential(*layers)

    def forward(self, attended_sequence):
        """Run ``attended_sequence`` through the head and return the result."""
        return self.decoder(attended_sequence)

In [22]:
import torch
from torch import nn


class Model(nn.Module):
    """Token embedding, a stack of transformer blocks, and a decoder head.

    Args:
        vocab_size: Number of embedding rows and of decoder outputs.
        embed_dim: Embedding width, also passed to every transformer block.
        num_heads: Number of attention heads in every transformer block.
        num_transformer_blocks: How many transformer blocks to stack.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_transformer_blocks):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim).to(device=device)
        # nn.ModuleList, unlike a plain Python list, registers every block as
        # a submodule, so parameters(), state_dict(), .to() and
        # train()/eval() reach the blocks. Indexing and len() still work.
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(embed_dim=embed_dim, num_heads=num_heads).to(device=device)
            for _ in range(num_transformer_blocks)
        ])
        self.decoder = Decoder(vocab_size).to(device=device)

    def forward(self, tokenized_sequence):
        """Embed the token ids, apply each block in order, and decode.

        Returns:
            The decoder output; its last dimension is ``vocab_size``.
        """
        hidden = self.embedding(tokenized_sequence)
        for block in self.transformer_blocks:
            hidden = block(hidden)
        return self.decoder(hidden)

In [23]:
# Build the network with a vocabulary sized to match the tokenizer, then
# place it on the selected device.
model = Model(
    vocab_size=tokenizer.get_vocab_size(),
    embed_dim=1024,
    num_heads=4,
    num_transformer_blocks=3,
)
model = model.to(device=device)

# Smoke test: push one short sentence through the model and show the output
# shape.
sample_sentence = "Hello World!"
tokenized_sentence = tokenizer.tokenize(sample_sentence)
model_out = model(tokenized_sentence)
print(model_out.shape)

torch.Size([12, 101])
