In [1]:
%pip install pytorch-forecasting
%pip install wandb
%pip install optuna

Collecting pytorch-forecasting
  Downloading pytorch_forecasting-1.0.0-py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.4/140.4 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi>=0.80
  Downloading fastapi-0.110.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lightning<3.0.0,>=2.0.0
  Downloading lightning-2.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting pytorch-optimizer<3.0.0,>=2.5.1
  Downloading pytorch_optimizer-2.12.0-py3-none-any.whl (155 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.8/155.8 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna<4.0.0,>=3.1.0
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import numpy as np
from torch import nn
from torch.nn import functional as F
import pickle
import pytorch_forecasting as pl
import wandb
import optuna

  warn(f"Failed to load image Python extension: {e}")


In [3]:
# Authenticate this session with Weights & Biases; prompts for an API key
# when none is stored yet.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
# Number of training steps between two loss evaluations in the training loop.
eval_interval = 500
# Maximum number of tokens per sequence: batches are cropped to this length
# and, by default, the positional-embedding table has this many rows.
block_size = 356

In [6]:
# wandb.init(
#    project="shell-transformer",
#    config={
#        "learning_rate": learning_rate,
#        "architecture": "Transformer",
#        "epochs": max_iters,
#        "embedding_size": n_embd,
#        "num_heads": n_head,
#        "num_layers": n_layer,
#        "block_size": block_size,
#        "context_size": context_size,
#    },
# )

In [7]:
# Vocabulary lookup tables: `stoi` maps a symbol to its integer id and `itos`
# maps an id back to its symbol.
# SECURITY: pickle.load (and torch.load, which is pickle-based) can execute
# arbitrary code while deserialising - only open files you created or trust.
with open("stoi.pkl", "rb") as fp:
    stoi = pickle.load(fp)

with open("itos.pkl", "rb") as fp:
    itos = pickle.load(fp)

# map_location="cpu" keeps loading independent of the machine that saved the
# files: tensors stored from a GPU would otherwise fail to load on a CPU-only
# host. Batches are moved to the training device later, when they are sampled.
with open("dataset_x.pt", "rb") as fp:
    x = torch.load(fp, map_location="cpu")

with open("dataset_y.pt", "rb") as fp:
    y = torch.load(fp, map_location="cpu")

In [8]:
def encode(s):
    """Translate every symbol of `s` into its id via `stoi` and return the ids as a list."""
    return [stoi[c] for c in s]


def decode(l):
    """Look up every id of `l` in `itos` and join the results into one string."""
    return "".join(itos[i] for i in l)


# Number of entries in the symbol -> id table.
vocab_size = len(stoi)

In [9]:
# Number of examples in the loaded dataset (displayed as the cell output).
len(x)

203101

In [10]:
# Positional 90/10 split (no shuffling): the first `n` examples are used for
# training, the remainder for validation.
n = int(0.9 * len(x))
train_x, val_x = x[:n], x[n:]
train_y, val_y = y[:n], y[n:]

In [11]:
def get_batch(split, batch_size):
    """Draw a random batch of (input, target) sequences.

    Args:
        split: "train" samples from the training split; any other value
            samples from the validation split.
        batch_size: number of examples to draw (with replacement).

    Returns:
        A pair of stacked tensors (inputs, targets), each example cropped to
        at most `block_size` items and moved to `device`.
    """
    if split == "train":
        inputs, targets = train_x, train_y
    else:
        inputs, targets = val_x, val_y

    # The same random indices pick the inputs and their matching targets.
    picks = torch.randint(len(inputs), (batch_size,))

    batch_inputs = torch.stack([inputs[i][:block_size] for i in picks])
    batch_targets = torch.stack([targets[i][:block_size] for i in picks])

    return batch_inputs.to(device), batch_targets.to(device)

In [12]:
# Smoke-test the batching helper. The result is bound to fresh names so the
# full datasets held in `x` / `y` are not overwritten: rebinding them to a
# 64-row batch would make a re-run of the `len(x)` or train/val split cells
# silently operate on the wrong data.
sample_xb, sample_yb = get_batch("train", 64)

In [13]:
class CrossAttentionBlock(nn.Module):
    """Single unmasked attention head whose queries and keys/values come from two inputs."""

    def __init__(self, headSize):
        super().__init__()
        # Divisor applied to the raw attention scores.
        self.dim = np.sqrt(headSize)

        # Bias-free square projections, registered in the order Q, K, V.
        self.linQ = nn.Linear(headSize, headSize, bias=False)
        self.linK = nn.Linear(headSize, headSize, bias=False)
        self.linV = nn.Linear(headSize, headSize, bias=False)

    def forward(self, k, q):
        """Attend from `q` over `k`.

        Args:
            k: input projected into keys and values; its last two axes are
                (positions, headSize).
            q: input projected into queries, same feature size.

        Returns:
            For every query position, the softmax-weighted sum of the values.
        """
        queries = self.linQ(q)
        keys = self.linK(k)
        values = self.linV(k)

        # Score every query position against every key position.
        scores = torch.matmul(queries, keys.transpose(1, 2)) / self.dim
        attention = torch.softmax(scores, dim=-1)

        return torch.matmul(attention, values)

In [14]:
class SelfAttentionBlock(nn.Module):
    """Single causal self-attention head: each position attends to itself and earlier positions only."""

    def __init__(self, headSize):
        super().__init__()
        # Divisor applied to the raw attention scores (see forward).
        self.dim = np.sqrt(headSize)

        self.linQ = nn.Linear(headSize, headSize, bias=False)
        self.linK = nn.Linear(headSize, headSize, bias=False)
        self.linV = nn.Linear(headSize, headSize, bias=False)

    def forward(self, x):
        """Apply masked scaled dot-product attention.

        Args:
            x: tensor of shape (B, T, headSize).

        Returns:
            Tensor of shape (B, T, headSize); row t depends only on x[:, :t + 1].
        """
        Q = self.linQ(x)
        V = self.linV(x)
        K = self.linK(x)

        B, T, C = Q.shape

        K = torch.transpose(K, 1, 2)

        # Dividing by sqrt(headSize) keeps the magnitude of the dot products
        # roughly independent of the feature size, so the softmax below does
        # not saturate (and its gradients do not vanish) for wide heads.
        weights = (Q @ K) / self.dim

        # Strictly-upper-triangular (T, T) mask marking the "future" positions.
        # It is created directly on the input's device, so the block works
        # wherever the module lives and does not rely on a module-level
        # `device` global or a per-call host-to-device copy.
        future = torch.ones(T, T, device=x.device).triu(diagonal=1).bool()

        # Future positions get -inf and therefore zero probability after softmax.
        weights = weights.masked_fill(future, float("-inf"))
        weights = nn.functional.softmax(weights, -1)

        return weights @ V

In [15]:
class MultiHeadSelfAttention(nn.Module):
    """Runs `numHeads` SelfAttentionBlock heads on the same input and mixes their outputs."""

    def __init__(self, numHeads, headSize):
        super().__init__()
        head_modules = []
        for _ in range(numHeads):
            head_modules.append(SelfAttentionBlock(headSize))
        self.heads = nn.ModuleList(head_modules)
        # Maps the concatenated head outputs back down to `headSize` features.
        self.lin = nn.Linear(headSize * numHeads, headSize)

    def forward(self, x):
        """Concatenate every head's output along the last axis, then project it."""
        per_head = []
        for head in self.heads:
            per_head.append(head(x))
        merged = torch.cat(per_head, dim=-1)
        return self.lin(merged)

In [16]:
class MultiHeadCrossAttention(nn.Module):
    """Runs `numHeads` CrossAttentionBlock heads in parallel and mixes their outputs.

    Every head needs two inputs (`CrossAttentionBlock.forward(k, q)`), so the
    wrapper forwards both of them; calling the heads with a single argument
    raises a TypeError.
    """

    def __init__(self, numHeads, headSize):
        super().__init__()
        self.heads = nn.ModuleList(
            [CrossAttentionBlock(headSize) for _ in range(numHeads)]
        )
        # Maps the concatenated head outputs back down to `headSize` features.
        self.lin = nn.Linear(headSize * numHeads, headSize)

    def forward(self, x, context=None):
        """Attend from `x` over `context`.

        Args:
            x: input the queries are computed from; the output has one row
                per position of `x`, so it can be added back as a residual.
            context: input the keys and values are computed from. When
                omitted, `x` itself is used (unmasked self-attention), which
                keeps the single-argument call form working.

        Returns:
            The concatenated head outputs projected to `headSize` features.
        """
        if context is None:
            context = x
        out = torch.cat([head(context, x) for head in self.heads], dim=-1)
        return self.lin(out)

In [17]:
class TransformerEncoder(nn.Module):
    """Stack of attention + two-layer feed-forward stages sharing one LayerNorm.

    Note that `headSize` is passed to MultiHeadSelfAttention as the number of
    heads, and `embeddingSize` as the width of each head.
    """

    def __init__(self, numLayers, embeddingSize, headSize):
        super().__init__()
        # One stage is always built, plus `numLayers - 1` further ones.
        stage_count = 1 + len(range(numLayers - 1))

        self.layers = nn.ModuleList()
        for _ in range(stage_count):
            self.layers.extend(
                [
                    MultiHeadSelfAttention(headSize, embeddingSize),
                    nn.Linear(embeddingSize, embeddingSize),
                    nn.ReLU(),
                    nn.Linear(embeddingSize, embeddingSize),
                    nn.ReLU(),
                ]
            )
        # A single normalisation layer reused in front of every sub-layer.
        self.norm = nn.LayerNorm(embeddingSize)

    def forward(self, x):
        """Run `x` through every stage and return the result."""
        for layer in self.layers:
            if isinstance(layer, nn.ReLU):
                x = layer(x)
            elif isinstance(layer, (MultiHeadSelfAttention, nn.Linear)):
                # The residual is taken from the *normalised* activations.
                normed = self.norm(x)
                x = normed + layer(normed)
        return x

In [18]:
class TransformerDecoder(nn.Module):
    """Stack of causal attention + feed-forward stages followed by a vocabulary projection.

    Cross-attention over an encoder output is currently disabled: no
    cross-attention layer is placed in `self.layers`, so the `code` argument
    of `forward` is not used. As in the encoder, `headSize` is passed to
    MultiHeadSelfAttention as the number of heads.
    """

    def __init__(self, numLayers, embeddingSize, vocabSize, headSize):
        super().__init__()
        # One stage is always built, plus `numLayers - 1` further ones.
        stage_count = 1 + len(range(numLayers - 1))

        self.layers = nn.ModuleList()
        for _ in range(stage_count):
            self.layers.extend(
                [
                    MultiHeadSelfAttention(headSize, embeddingSize),
                    nn.Linear(embeddingSize, embeddingSize),
                    nn.ReLU(),
                    nn.Linear(embeddingSize, embeddingSize),
                    nn.ReLU(),
                ]
            )
        self.linear = nn.Linear(embeddingSize, vocabSize)
        # Kept as an attribute but not applied in forward (raw logits are returned).
        self.softmax = nn.Softmax(-1)
        # A single normalisation layer reused in front of every sub-layer.
        self.norm = nn.LayerNorm(embeddingSize)

    def forward(self, code, x):
        """Return next-token logits of shape (B, T, vocab_size) for `x`; `code` is ignored."""
        for layer in self.layers:
            if isinstance(layer, nn.ReLU):
                x = layer(x)
            elif isinstance(layer, (MultiHeadSelfAttention, nn.Linear)):
                # The residual is taken from the *normalised* activations.
                normed = self.norm(x)
                x = normed + layer(normed)

        # Unnormalised scores: the loss and the sampler apply softmax themselves.
        return self.linear(x)

In [19]:
class Transformer(nn.Module):
    """Token-level language model built from the decoder stack.

    An encoder is constructed as well, but `forward` does not call it at the
    moment; only the decoder path (token + position embeddings -> decoder ->
    logits) is active.
    """

    def __init__(
        self,
        numLayersEncoder,
        numLayersDecoder,
        embeddingSize,
        headSize,
        vocabSize=vocab_size,
        maxBlockSize=block_size,
    ):
        super().__init__()
        # Remembered so `generate` crops its context to what this instance's
        # positional table can index, independent of the global `block_size`.
        self.maxBlockSize = maxBlockSize

        self.encoder = TransformerEncoder(numLayersEncoder, embeddingSize, headSize)
        self.decoder = TransformerDecoder(
            numLayersDecoder, embeddingSize, vocabSize, headSize
        )
        self.embed = nn.Embedding(vocabSize, embeddingSize)
        self.positional_encoding = nn.Embedding(
            maxBlockSize, embeddingSize
        )  # table containing embedding for each possible token position

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero the Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, x, targets=None):
        """Compute next-token logits and, when `targets` is given, the loss.

        Args:
            x: integer token ids of shape (B, T), with T <= maxBlockSize.
            targets: optional integer token ids of shape (B, T).

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, vocab)
            and loss is None. With targets, logits is flattened to
            (B * T, vocab) and loss is the mean cross-entropy.
        """
        B, T = x.shape
        # Position indices are created on the input's device so the model
        # works wherever it (and its input) lives, without relying on the
        # module-level `device` global.
        positions = torch.arange(T, device=x.device)
        x = self.embed(x) + self.positional_encoding(positions)  # (B, T, C)
        logits = self.decoder(None, x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape

            # reshape (rather than view) also accepts non-contiguous tensors.
            logits = logits.reshape(B * T, C)
            targets = targets.reshape(B * T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Args:
            idx: integer token ids of shape (B, T) used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            Tensor of shape (B, T + max_new_tokens) holding prompt and samples.
        """
        for _ in range(max_new_tokens):
            # Crop to this instance's context limit so position indices never
            # exceed the positional-embedding table.
            idx_cond = idx[:, -self.maxBlockSize :]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # only the last position predicts the next token
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [20]:
# print(sum(p.numel() for p in m.parameters()) / 1e6, "M parameters")

In [21]:
@torch.no_grad()
def estimate_loss(model, eval_iters, batch_size):
    """Average the model's loss over random batches of both splits.

    Args:
        model: callable as `model(X, Y)` returning `(logits, loss)`.
        eval_iters: number of batches drawn per split.
        batch_size: number of examples per batch.

    Returns:
        Dict with keys "train" and "val", each a 0-dim tensor holding the
        mean loss. The model is put in eval mode while measuring and is left
        in train mode afterwards.
    """
    model.eval()
    averages = {}
    for split in ("train", "val"):
        batch_losses = []
        for _ in range(eval_iters):
            inputs, targets = get_batch(split, batch_size)
            _, loss = model(inputs, targets)
            batch_losses.append(loss.item())
        averages[split] = torch.tensor(batch_losses).mean()
    model.train()
    return averages

In [25]:
def objective(trial):
    """Optuna objective: train a Transformer with sampled hyperparameters.

    Args:
        trial: the `optuna.trial.Trial` used to sample hyperparameters and to
            record intermediate validation losses.

    Returns:
        The validation loss (float) from the last evaluation, which runs at
        step `max_iters - 1`, just before the final optimizer update.
    """
    batch_size = trial.suggest_int("batch_size", 8, 64)
    # The range spans two orders of magnitude, so sample on a log scale;
    # uniform sampling would put ~90% of the trials in [1e-4, 1e-3].
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    eval_iters = 200
    n_embd = trial.suggest_int("n_embd", 50, 100)
    # Passed on as the Transformer's `headSize`, which the attention layers
    # use as their number of heads.
    n_head = trial.suggest_int("n_head", 5, 10)
    n_layer = trial.suggest_int("n_layer", 3, 10)
    max_iters = trial.suggest_int("max_iters", 100, 2000)

    model = Transformer(n_layer, n_layer, n_embd, n_head).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    losses = None
    for step in range(max_iters):  # `step`, not `iter`, to avoid shadowing the builtin

        if step % eval_interval == 0 or step == max_iters - 1:
            losses = estimate_loss(model, eval_iters, batch_size)
            # Record the intermediate result on the trial so the periodic
            # evaluations are not thrown away (visible in the study and
            # usable by pruners).
            trial.report(float(losses["val"]), step)

        xb, yb = get_batch("train", batch_size)

        with torch.autocast(device_type=device):
            logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # Return a plain float rather than a 0-dim tensor.
    return float(losses["val"])


# Search for the hyperparameters that minimise the value returned by
# `objective`, using 100 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Hyperparameters of the best trial (displayed as the cell output).
study.best_params

[I 2024-03-13 12:10:53,240] A new study created in memory with name: no-name-781d78e5-e262-4d00-b8d4-c0f7f599f157
[I 2024-03-13 12:18:00,661] Trial 0 finished with value: 0.16033637523651123 and parameters: {'batch_size': 32, 'lr': 0.00037478634885334535, 'n_embd': 96, 'n_head': 10, 'n_layer': 5, 'max_iters': 836}. Best is trial 0 with value: 0.16033637523651123.
[I 2024-03-13 12:29:38,713] Trial 1 finished with value: 0.6072407364845276 and parameters: {'batch_size': 39, 'lr': 2.2599373182713967e-05, 'n_embd': 87, 'n_head': 10, 'n_layer': 8, 'max_iters': 1254}. Best is trial 0 with value: 0.16033637523651123.
[I 2024-03-13 12:48:22,599] Trial 2 finished with value: 0.42194443941116333 and parameters: {'batch_size': 56, 'lr': 0.0007125677373199463, 'n_embd': 58, 'n_head': 9, 'n_layer': 8, 'max_iters': 1886}. Best is trial 0 with value: 0.16033637523651123.
[I 2024-03-13 12:54:36,627] Trial 3 finished with value: 0.42065030336380005 and parameters: {'batch_size': 32, 'lr': 0.00099317755