In [197]:
import torch

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [243]:
import torch
from torch import nn


class Encoder(nn.Module):
    """Character-level tokenizer plus a learned embedding matrix.

    Every distinct character of ``letters`` gets an integer index (in order of
    first appearance) and one column of the trainable ``vocabulary`` matrix,
    whose shape is ``(encode_size, number_of_distinct_letters)``.
    """

    def __init__(self, encode_size: int, letters: str) -> None:
        """Build the character index and the embedding matrix.

        Args:
            encode_size: Length of the embedding vector stored per character.
            letters: Characters the encoder understands. A character that is
                repeated is only registered once, at its first position.
        """
        super().__init__()

        self.letters = letters
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # indices are always contiguous (0..V-1) and agree with the vocabulary
        # width even when ``letters`` repeats a character.
        self.letters_dict = {
            letter: index for index, letter in enumerate(dict.fromkeys(letters))
        }

        self.vocabulary = nn.Parameter(torch.randn(encode_size, len(self.letters_dict)))

    def tokenize(self, string: str) -> torch.Tensor:
        """One-hot encode ``string``, one column per character.

        Args:
            string: Text made only of characters present in ``letters``.

        Returns:
            Tensor of shape ``(len(self.letters_dict), len(string))`` holding
            1.0 at ``[index_of_character, position]`` and 0.0 elsewhere. It is
            created with the dtype and device of ``self.vocabulary`` so it can
            be passed straight to ``forward``.

        Raises:
            KeyError: If ``string`` contains a character not in ``letters``.
        """
        target = self.vocabulary
        rows = torch.tensor(
            [self.letters_dict[letter] for letter in string],
            dtype=torch.long,
            device=target.device,
        )
        tensor_text = torch.zeros(
            len(self.letters_dict),
            rows.numel(),
            dtype=target.dtype,
            device=target.device,
        )
        # Column i gets its single 1.0 in the row of the i-th character.
        columns = torch.arange(rows.numel(), device=target.device)
        tensor_text[rows, columns] = 1.0
        return tensor_text

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed one-hot tokens by multiplying them with the vocabulary matrix.

        Args:
            x: Tensor whose second-to-last dimension has one entry per known
                character, e.g. the ``(vocab, length)`` output of ``tokenize``.

        Returns:
            ``torch.matmul(self.vocabulary, x)``; for a ``(vocab, length)``
            input the result has shape ``(encode_size, length)``.
        """
        embedded_tokens = torch.matmul(self.vocabulary, x)
        # TODO positional embedding
        # TODO normalize the stuff
        return embedded_tokens

In [None]:
import torch
from torch import nn


class AttentionBlock(nn.Module):
    """Multi-head self-attention followed by a position-wise MLP.

    Both sub-layers are wrapped in a residual connection, so the output has
    the same shape as the input and blocks can be stacked. No attention mask
    and no normalization layer are applied.
    """

    def __init__(self, num_heads: int, encode_size: int) -> None:
        """Create the projections for ``num_heads`` heads.

        Args:
            num_heads: Number of attention heads.
            encode_size: Size of the last dimension of the block's input.

        Raises:
            ValueError: If ``num_heads`` is not positive or is larger than
                ``encode_size`` (each head would have zero width).
        """
        super().__init__()

        if num_heads < 1 or encode_size < num_heads:
            raise ValueError(
                f"need 1 <= num_heads <= encode_size, got num_heads={num_heads}, "
                f"encode_size={encode_size}"
            )

        self.num_heads = num_heads
        self.encode_size = encode_size
        # Width of a single head.
        self.hidden_encode_size = encode_size // num_heads

        # All heads are projected at once; forward() splits the result per head.
        # When encode_size is not a multiple of num_heads this is slightly
        # narrower than encode_size, which the output projection compensates.
        heads_width = self.num_heads * self.hidden_encode_size
        self.query = nn.Linear(in_features=self.encode_size, out_features=heads_width)
        self.key = nn.Linear(in_features=self.encode_size, out_features=heads_width)
        self.value = nn.Linear(in_features=self.encode_size, out_features=heads_width)
        # Maps the concatenated heads back to encode_size for the residual sum.
        self.projection = nn.Linear(in_features=heads_width, out_features=self.encode_size)

        self.lrelu = nn.LeakyReLU()

        # The MLP must end at encode_size so its output can be added to x.
        self.mlp = nn.Sequential(
            nn.Linear(in_features=self.encode_size, out_features=self.encode_size * 4),
            self.lrelu,
            nn.Linear(in_features=self.encode_size * 4, out_features=self.encode_size),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention and the MLP, each with a residual connection.

        Args:
            x: Tensor of shape ``(..., length, encode_size)``; any number of
                leading batch dimensions (including none) is accepted.

        Returns:
            Tensor with the same shape as ``x``.
        """
        *lead, length, _ = x.shape

        def split_heads(projected: torch.Tensor) -> torch.Tensor:
            # (..., length, heads * width) -> (..., heads, length, width)
            return projected.reshape(
                *lead, length, self.num_heads, self.hidden_encode_size
            ).transpose(-3, -2)

        queries = split_heads(self.query(x))
        keys = split_heads(self.key(x))
        values = split_heads(self.value(x))

        # Scaled dot-product attention: dividing by sqrt(head width) keeps the
        # softmax inputs in a reasonable range as the width grows.
        scores = torch.matmul(queries, keys.transpose(-2, -1))
        scores = scores / (self.hidden_encode_size ** 0.5)
        weights = torch.softmax(scores, dim=-1)
        attended = torch.matmul(weights, values)

        # (..., heads, length, width) -> (..., length, heads * width)
        merged = attended.transpose(-3, -2).reshape(
            *lead, length, self.num_heads * self.hidden_encode_size
        )

        x = x + self.projection(merged)
        x = x + self.mlp(x)
        return x

In [None]:
import torch
from torch import nn


class Decoder(nn.Module):
    """Two-layer perceptron that maps embeddings to one score per vocabulary entry."""

    def __init__(self, encode_size: int, vocab_amount: int) -> None:
        """Assemble the Linear -> LeakyReLU -> Linear stack.

        Args:
            encode_size: Size of the last dimension of the input.
            vocab_amount: Number of output scores produced per position.
        """
        super().__init__()

        # The hidden layer is four times as wide as the input.
        hidden_width = encode_size * 4
        self.lrelu = nn.LeakyReLU()

        layers = [
            nn.Linear(in_features=encode_size, out_features=hidden_width),
            self.lrelu,
            nn.Linear(in_features=hidden_width, out_features=vocab_amount),
        ]
        self.decoder = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the perceptron and return the resulting scores."""
        return self.decoder(x)

In [None]:
import torch
from torch import nn


class Model(nn.Module):
    """Encoder, three stacked attention blocks and a decoder to vocabulary scores."""

    def __init__(
        self,
        num_heads: int,
        encode_size: int,
        letters: str = "abcdefghijklmnopqrstuvwxyz .,",
    ) -> None:
        """Create the sub-modules.

        Args:
            num_heads: Passed to every ``AttentionBlock``.
            encode_size: Embedding width shared by encoder, blocks and decoder.
            letters: Characters the encoder understands; defaults to the
                lowercase alphabet plus space, period and comma.
        """
        super().__init__()

        self.encoder = Encoder(encode_size=encode_size, letters=letters)
        self.attention_block_1 = AttentionBlock(num_heads=num_heads, encode_size=encode_size)
        self.attention_block_2 = AttentionBlock(num_heads=num_heads, encode_size=encode_size)
        self.attention_block_3 = AttentionBlock(num_heads=num_heads, encode_size=encode_size)
        self.decoder = Decoder(encode_size=encode_size, vocab_amount=len(self.encoder.letters_dict))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map one-hot tokens to per-position vocabulary scores.

        Args:
            x: One-hot tensor laid out as ``(vocab, length)``, as produced by
                ``self.encoder.tokenize``.

        Returns:
            Tensor of shape ``(length, vocab)``: one row of scores per input
            position.
        """
        x = self.encoder(x)
        # The encoder multiplies its (encode_size, vocab) matrix by x, so its
        # output is (encode_size, length). The nn.Linear layers in the blocks
        # and the decoder act on the LAST dimension and expect it to be
        # encode_size, so put the embedding axis last before going on.
        x = x.transpose(-2, -1)
        x = self.attention_block_1(x)
        x = self.attention_block_2(x)
        x = self.attention_block_3(x)
        x = self.decoder(x)
        return x