In [2]:
%pip install pytorch-forecasting
%pip install optuna
%pip install randomname
%pip install plotly
%pip install botorch
%pip install wandb

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import numpy as np
from torch import nn
from torch.nn import functional as F
import pickle
import wandb
import os

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# This string is read as a global by the batching, model and generation cells.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
# Authenticate with Weights & Biases up front so that the wandb.init() call made
# inside train_model does not have to prompt for credentials mid-training.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
# Shared settings that the cells below read as module-level globals.
eval_interval = 500  # train_model evaluates and checkpoints every `eval_interval` iterations
eval_iters = 200  # number of random batches estimate_loss averages per split
block_size = 152  # context length: tokens per sample and rows in the positional embedding
linScale = 4  # hidden-width multiplier used by the feed-forward `Linear` block
dropout = 0.2  # dropout probability applied at the end of the feed-forward block

In [7]:
# Taken from: https://github.com/karpathy/ng-video-lecture
#####


# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load "stoi"/"itos" files that you produced yourself or otherwise trust.
with open("stoi", "rb") as fp:
    stoi = pickle.load(fp)  # symbol -> token id lookup

with open("itos", "rb") as fp:
    itos = pickle.load(fp)  # token id -> symbol lookup

vocab_size = len(itos)


def encode(s):
    """Return the list of token ids for the symbols in `s`, looked up in `stoi`.

    `s` may be any iterable of symbols (a string is iterated character by
    character). The lookup fails for symbols that are missing from `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string obtained by joining the `itos` entries for the ids in `l`."""
    return "".join([itos[i] for i in l])


with open("dataset.txt", "r", encoding="utf-8") as f:
    text = f.read()

# with open("own_commands.txt", "r", encoding="utf-8") as f:
#    text = f.read()

# Encode the whole corpus once, then keep the first 90% for training and the
# remaining 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


def get_batch(split, batch_size):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: "train" selects `train_data`; any other value selects `val_data`.
        batch_size: number of windows to draw.

    Returns:
        A pair (x, y) of tensors moved to `device`, each stacking `batch_size`
        windows of `block_size` tokens; y is x shifted one position to the right
        in the source sequence.
    """
    source = train_data if split == "train" else val_data
    # Start offsets are drawn so that the shifted target window still fits.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start : start + block_size])
        targets.append(source[start + 1 : start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y


#####

In [8]:
class SelfAttentionBlock(nn.Module):
    """Single causal self-attention head.

    Queries, keys and values are bias-free linear maps from `headSize` to
    `headSize`, so the head's output has the same width as its input.

    Args:
        headSize: width of the input features and of the Q/K/V projections.
        block_size: longest sequence length the causal mask is built for.
    """

    def __init__(self, headSize, block_size=block_size):
        super().__init__()
        # Attention scores are divided by sqrt(headSize) before the softmax.
        self.dim = np.sqrt(headSize)

        self.linQ = nn.Linear(headSize, headSize, bias=False)
        self.linK = nn.Linear(headSize, headSize, bias=False)
        self.linV = nn.Linear(headSize, headSize, bias=False)

        # True strictly above the diagonal, i.e. at the "future" positions that
        # must be hidden. Registered as a buffer so it follows the module through
        # .to()/.cuda()/.cpu(); persistent=False keeps it out of the state_dict so
        # checkpoints saved before this change still load with matching keys.
        self.register_buffer(
            "triu",
            torch.triu(torch.ones((block_size, block_size)), diagonal=1) == 1,
            persistent=False,
        )

    def forward(self, x):
        """Apply masked scaled dot-product attention to a 3-D input (B, T, C).

        Returns a tensor with the same leading dimensions as `x`.
        """
        _, T, _ = x.shape  # also rejects inputs that are not 3-D

        Q = self.linQ(x)
        K = self.linK(x)
        V = self.linV(x)

        weights = (Q @ K.transpose(1, 2)) / self.dim
        # Masked (future) positions get -inf so the softmax gives them zero weight.
        weights = weights.masked_fill(self.triu[:T, :T], -torch.inf)
        weights = nn.functional.softmax(weights, -1)
        return weights @ V

In [9]:
class MultiHeadSelfAttention(nn.Module):
    """Runs `numHeads` SelfAttentionBlock heads in parallel and merges them.

    Every head receives the full input and is built with width `headSize`; the
    head outputs are concatenated along the last dimension and projected from
    `headSize * numHeads` back down to `headSize`.
    """

    def __init__(self, numHeads, headSize):
        super().__init__()
        head_modules = []
        for _ in range(numHeads):
            head_modules.append(SelfAttentionBlock(headSize))
        self.heads = nn.ModuleList(head_modules)
        self.lin = nn.Linear(headSize * numHeads, headSize)

    def forward(self, x):
        """Return the projected concatenation of all head outputs for `x`."""
        per_head = []
        for head in self.heads:
            per_head.append(head(x))
        merged = torch.cat(per_head, dim=-1)
        return self.lin(merged)

In [10]:
class Linear(nn.Module):
    """Feed-forward block: expand by `linScale`, ReLU, project back, dropout.

    Note that this class is distinct from `nn.Linear`; it wraps two of them in
    `self.net`.
    """

    def __init__(self, embeddingSize):
        super().__init__()
        hidden_size = embeddingSize * linScale
        expand = nn.Linear(embeddingSize, hidden_size)
        activation = nn.ReLU()
        project = nn.Linear(hidden_size, embeddingSize)
        regularize = nn.Dropout(dropout)
        self.net = nn.Sequential(expand, activation, project, regularize)

    def forward(self, x):
        """Apply the feed-forward stack to `x`."""
        out = self.net(x)
        return out

In [11]:
class TransformerDecoder(nn.Module):
    """Stack of attention / feed-forward blocks followed by a vocabulary projection.

    Args:
        numLayers: number of blocks; at least one block is always created.
        embeddingSize: width of the token representations.
        vocabSize: size of the output logits.
        headSize: passed to MultiHeadSelfAttention as its first argument
            (`numHeads`), so despite the name it is the number of heads; each
            head is built with width `embeddingSize`.
    """

    def __init__(self, numLayers, embeddingSize, vocabSize, headSize):
        super().__init__()

        def make_block():
            # One block = LayerNorm, attention, LayerNorm, feed-forward.
            return [
                nn.LayerNorm(embeddingSize),
                MultiHeadSelfAttention(headSize, embeddingSize),
                nn.LayerNorm(embeddingSize),
                Linear(embeddingSize),
            ]

        # The first block is unconditional; the remaining numLayers - 1 follow.
        modules = make_block()
        for _ in range(numLayers - 1):
            modules += make_block()
        self.layers = nn.ModuleList(modules)

        self.ln = nn.LayerNorm(embeddingSize)
        self.final_linear = nn.Linear(embeddingSize, vocabSize)

    def forward(self, x):
        """Run `x` through every layer and return the vocabulary logits."""
        for layer in self.layers:
            if isinstance(layer, nn.LayerNorm):
                # NOTE(review): the normalized tensor replaces x, so the residual
                # added by the next layer is the normalized activation, not the
                # pre-norm input. Left as is; existing checkpoints rely on it.
                x = layer(x)
            elif isinstance(layer, (MultiHeadSelfAttention, Linear)):
                x = x + layer(x)

        return self.final_linear(self.ln(x))

In [12]:
class Transformer(nn.Module):
    """Decoder-only language model: token + learned position embeddings -> decoder.

    Args:
        numLayersDecoder: number of decoder blocks.
        embeddingSize: width of the token and position embeddings.
        headSize: forwarded unchanged to TransformerDecoder as its `headSize`
            argument.
        vocabSize: number of distinct tokens (embedding rows and output logits).
        maxBlockSize: longest sequence the positional embedding can represent.
    """

    def __init__(
        self,
        numLayersDecoder,
        embeddingSize,
        headSize,
        vocabSize=vocab_size,
        maxBlockSize=block_size,
    ):
        super().__init__()
        self.decoder = TransformerDecoder(
            numLayersDecoder, embeddingSize, vocabSize, headSize
        )
        self.embed = nn.Embedding(vocabSize, embeddingSize)
        self.positional_encoding = nn.Embedding(maxBlockSize, embeddingSize)

    #        self.apply(self._init_weights)

    #    def _init_weights(self, module):
    #        if isinstance(module, nn.Linear):
    #            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    #            if module.bias is not None:
    #                torch.nn.init.zeros_(module.bias)
    #        elif isinstance(module, nn.Embedding):
    #            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(
        self,
        x,
    ):
        """Return next-token logits for a 2-D batch of token ids (B, T).

        Raises:
            ValueError: if T exceeds the number of learned positions.
        """
        _, T = x.shape
        max_positions = self.positional_encoding.num_embeddings
        if T > max_positions:
            # Fail early with a readable message instead of an out-of-range
            # embedding lookup.
            raise ValueError(
                f"sequence length {T} exceeds the maximum block size {max_positions}"
            )

        # Build the position indices on the input's own device rather than on the
        # notebook-level `device`, so the model works wherever it has been moved.
        positions = torch.arange(T, device=x.device)
        x = self.embed(x) + self.positional_encoding(positions)
        return self.decoder(x)

In [13]:
def get_loss(pred, target):
    """Cross-entropy between per-position logits and integer targets.

    Args:
        pred: logits of shape (B, T, C).
        target: class indices viewable as B * T elements.

    Returns:
        The mean cross-entropy over all B * T positions.
    """
    batch, steps, classes = pred.shape
    n_rows = batch * steps
    # cross_entropy expects one row of logits per target, so fold B and T together.
    flat_logits = pred.view(n_rows, classes)
    flat_targets = target.view(n_rows)
    return F.cross_entropy(flat_logits, flat_targets)

In [14]:
# taken from: https://github.com/karpathy/ng-video-lecture


@torch.no_grad()
def estimate_loss(model, eval_iters, batch_size):
    """Average the loss over `eval_iters` random batches for each data split.

    The model is switched to eval mode for the measurement and is put into
    training mode before returning.

    Returns:
        A dict with keys "train" and "val", each holding the mean loss as a
        0-dim tensor.
    """

    def split_mean(split):
        # One loss value per freshly sampled batch of this split.
        batch_losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, batch_size)
            batch_losses[k] = get_loss(model(X), Y).item()
        return batch_losses.mean()

    model.eval()
    out = {}
    for split in ("train", "val"):
        out[split] = split_mean(split)
    model.train()
    return out

In [20]:
def train_model(
    batch_size,
    learning_rate,
    n_embd,
    n_head,
    n_layer,
    max_iters,
    save_path,
    log=True,
    model=False,
):
    """Train (or continue training) a Transformer and log the run to wandb.

    Every `eval_interval` iterations, and on the last iteration, the train/val
    losses are estimated, logged to wandb and a checkpoint of the state_dict is
    written to `{save_path}/{run.name}/shell_transformer_{iteration}`.

    Args:
        batch_size: sequences per training batch.
        learning_rate: AdamW learning rate.
        n_embd: embedding width passed to Transformer.
        n_head: passed to Transformer as its `headSize` argument.
        n_layer: number of decoder blocks.
        max_iters: number of optimisation steps; must be at least 1.
        save_path: directory under which the per-run checkpoint folder is created.
        log: when true, also print the losses at every evaluation.
        model: existing model to keep training; False/None builds a new one.

    Returns:
        (val_loss, model), where val_loss comes from the last evaluation, which
        is taken just before the final optimisation step.

    Raises:
        ValueError: if `max_iters` is smaller than 1 (no evaluation would run,
            so there would be no loss to return).
    """
    if max_iters < 1:
        raise ValueError(f"max_iters must be at least 1, got {max_iters}")

    # The context manager finishes the wandb run on every exit path (normal
    # return, exception, KeyboardInterrupt), so an interrupted cell does not
    # leave a dangling run behind for the next wandb.init() to clean up.
    with wandb.init(
        # Set the project where this run will be logged
        project="shell-transformer",
        # Track hyperparameters and run metadata
        config={
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "n_embd": n_embd,
            "n_head": n_head,
            "n_layer": n_layer,
            "max_iters": max_iters,
        },
    ) as run:
        # Explicit sentinel check: do not rely on the truthiness of an nn.Module.
        if model is None or model is False:
            model = Transformer(n_layer, n_embd, n_head).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

        model_path = f"{save_path}/{run.name}"

        for step in range(max_iters):

            if step % eval_interval == 0 or step == max_iters - 1:
                losses = estimate_loss(model, eval_iters, batch_size)
                if log:
                    print(
                        f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
                    )
                wandb.log({"val loss": losses["val"], "train loss": losses["train"]})

                os.makedirs(model_path, exist_ok=True)
                torch.save(model.state_dict(), f"{model_path}/shell_transformer_{step}")

            xb, yb = get_batch("train", batch_size)

            # with torch.autocast(device_type=device):
            pred = model(xb)

            loss = get_loss(pred, yb)

            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

        return losses["val"], model

In [16]:
# 0.656, 0.419 (try 4)
# 0.2457, 0.4922 (try 5)

In [21]:
# Train a fresh model: 7 layers, 5 heads, embedding width 400 // 5 = 80.
_, model = train_model(
    batch_size=64,
    learning_rate=0.0005,
    n_embd=400 // 5,
    n_head=5,
    n_layer=7,
    max_iters=20000,
    save_path="./train",
    log=True,
)

VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.088125…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668307398989175, max=1.0…

winter-durian-41


KeyboardInterrupt: 

In [None]:
@torch.no_grad()
def generate(model, idx, max_new_tokens):
    """Greedily extend `idx` until a newline token or `max_new_tokens` is reached.

    Generation runs in eval mode (dropout off) and without gradient tracking;
    the model's previous train/eval mode is restored afterwards.

    Args:
        model: callable mapping token ids (B, T) to logits (B, T, vocab).
        idx: tensor of token ids with shape (B, T). Written for a single prompt
            (B == 1); with several rows the loop stops only once every row emits
            the newline token in the same step.
        max_new_tokens: upper bound on the number of appended tokens.

    Returns:
        `idx` with the generated tokens appended along dimension 1.
    """
    newline_id = encode("\n")[0]  # looked up once instead of on every step

    was_training = model.training
    model.eval()
    try:
        for _ in range(max_new_tokens):
            # The model only has positions for the last `block_size` tokens.
            idx_cond = idx[:, -block_size:]
            logits = model(idx_cond)[:, -1, :]
            # Greedy choice; softmax is monotonic, so argmax of the raw logits
            # picks the same token without the extra normalisation.
            idx_next = logits.argmax(dim=-1, keepdim=True)
            # idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
            if bool((idx_next == newline_id).all()):
                break
    finally:
        model.train(was_training)
    return idx

In [None]:
# model = Transformer(10, 200 // 5, 5).to(device)

# model.load_state_dict(torch.load(f"./train/test/{8}/shell_transformer_9999"))

<All keys matched successfully>

In [None]:
# _, model_finetune = train_model(
#    64, 0.0007, 400 // 4, 4, 16, 50000, f"./train/test/{7}", True, model_finetune
# )

In [None]:
# Shell history used as the prompt; the model should complete the last line.
prompt = """
mkdir test
ls -l
nano encoder.py
rm -rf en"""
context = torch.tensor([encode(prompt)]).to(device)
out = decode(generate(model, context, max_new_tokens=500)[0].tolist())

# The completed command is the last line of the output. When generation stopped
# on a newline the text ends with "\n", so that line is the second-to-last
# element of the split; if the token budget ran out first it is the last one.
out_lines = out.split("\n")
autocomplete = out_lines[-2] if out.endswith("\n") else out_lines[-1]
print(out)


mkdir test
ls -l
nano encoder.py
rm -rf environer

