In [67]:
%pip install optuna
%pip install wandb
%pip install plotly

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [68]:
import torch
import numpy as np
from torch import nn
from torch.nn import functional as F
import pickle
import wandb
import os
import optuna
import plotly

In [69]:
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [70]:
# Global settings shared by the model and training cells below.
eval_interval: int = 500  # train_model evaluates (and optionally checkpoints) every this many steps
eval_iters: int = 200  # number of batches averaged per split when estimating the loss
block_size: int = 256  # default size of the causal mask / positional table; generate() crops its context to this length
linScale: int = 5  # hidden-width multiplier of the feed-forward block (embeddingSize * linScale)
dropout: float = 0.2  # dropout probability at the end of the feed-forward block

In [71]:
#wandb.login()

True

# Data Loading


In [72]:
# Load the character <-> token-id lookup tables (presumably written by a
# preprocessing step that is not part of this notebook).
# NOTE(security): pickle.load can execute arbitrary code. Only load "stoi" /
# "itos" files that you created yourself or otherwise trust.
with open("stoi", "rb") as f:
    stoi = pickle.load(f)

with open("itos", "rb") as f:
    itos = pickle.load(f)


def encode(s):
    """Return the list of token ids for the elements of ``s`` (looked up in ``stoi``).

    ``s`` is normally a string, in which case each character is looked up.
    Raises ``KeyError`` for an element that is missing from ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the text obtained by joining the ``itos`` entry of every id in ``l``."""
    return "".join(itos[i] for i in l)


# Number of entries in the id -> token table; default vocabulary size of the model.
vocab_size = len(itos)

In [73]:
# NOTE: this cell is disabled, so `train_data`, `val_data` and `get_batch` are
# not defined in a fresh kernel. `estimate_loss` and `train_model` below call
# `get_batch`; re-enable this cell (the dataset file must be readable) before
# evaluating or training. The checkpoint-loading and generation cells do not
# need it.
#
#with open("/datasets/bash-history/dataset.txt", "r", encoding="utf-8") as f:
#    text = f.read()
#
#chars = sorted(list(set(text)))
#
#data = torch.tensor(encode(text), dtype=torch.long)
#n = int(0.9 * len(data))
#train_data = data[:n]
#val_data = data[n:]
#
#def get_batch(split, batch_size):
#    data = train_data if split == "train" else val_data
#    ix = torch.randint(len(data) - block_size, (batch_size,))
#    x = torch.stack([data[i : i + block_size] for i in ix])
#    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
#    x, y = x.to(device), y.to(device)
#    return x, y

# Transformer Implementation


In [74]:
class SelfAttentionBlock(nn.Module):
    """Single causal self-attention head.

    Queries, keys and values are bias-free linear maps from ``headSize`` to
    ``headSize`` features. Attention scores are divided by ``sqrt(headSize)``
    and positions after the current one are masked out before the softmax.

    Args:
        headSize: number of input (and output) features per position.
        block_size: maximum sequence length the causal mask supports.
    """

    def __init__(self, headSize, block_size=block_size):
        super().__init__()
        # Divisor applied to the raw attention scores.
        self.dim = np.sqrt(headSize)

        self.linQ = nn.Linear(headSize, headSize, bias=False)
        self.linK = nn.Linear(headSize, headSize, bias=False)
        self.linV = nn.Linear(headSize, headSize, bias=False)

        # Boolean mask that is True strictly above the diagonal (future positions).
        # Registered as a buffer so it follows the module through .to()/.cuda()
        # instead of being pinned to whatever the global device was at
        # construction time. persistent=False keeps it out of state_dict, so
        # existing checkpoints still load with all keys matching.
        self.register_buffer(
            "triu",
            torch.triu(torch.ones((block_size, block_size)), diagonal=1).bool(),
            persistent=False,
        )

    def forward(self, x):
        """Apply causal attention to ``x`` of shape (B, T, headSize); returns the same shape.

        ``T`` must not exceed the ``block_size`` the mask was built with.
        """
        _, T, _ = x.shape

        Q = self.linQ(x)
        K = self.linK(x)
        V = self.linV(x)

        # (B, T, C) @ (B, C, T) -> (B, T, T) pairwise scores.
        weights = (Q @ K.transpose(1, 2)) / self.dim
        # -inf scores become zero weight after the softmax.
        weights = weights.masked_fill(self.triu[:T, :T], -torch.inf)
        weights = nn.functional.softmax(weights, -1)
        return weights @ V

In [75]:
class MultiHeadSelfAttention(nn.Module):
    """Several SelfAttentionBlock heads run on the same input, then mixed linearly.

    Every head receives the full input and emits ``headSize`` features; the
    concatenated ``headSize * numHeads`` features are projected back down to
    ``headSize``.
    """

    def __init__(self, numHeads, headSize):
        super().__init__()
        attention_heads = []
        for _ in range(numHeads):
            attention_heads.append(SelfAttentionBlock(headSize))
        self.heads = nn.ModuleList(attention_heads)
        self.lin = nn.Linear(numHeads * headSize, headSize)

    def forward(self, x):
        """Concatenate all head outputs along the feature axis and project them."""
        per_head = []
        for head in self.heads:
            per_head.append(head(x))
        concatenated = torch.cat(per_head, dim=-1)
        return self.lin(concatenated)

In [76]:
class Linear(nn.Module):
    """Feed-forward block: widen by ``linScale``, ReLU, project back, then dropout."""

    def __init__(self, embeddingSize):
        super().__init__()
        hidden_size = embeddingSize * linScale
        expand = nn.Linear(embeddingSize, hidden_size)
        activation = nn.ReLU()
        project = nn.Linear(hidden_size, embeddingSize)
        regularize = nn.Dropout(dropout)
        # Kept as a Sequential named ``net`` so parameter names stay net.0 / net.2.
        self.net = nn.Sequential(expand, activation, project, regularize)

    def forward(self, x):
        """Run ``x`` through the feed-forward stack; the feature size is unchanged."""
        transformed = self.net(x)
        return transformed

In [77]:
class TransformerDecoder(nn.Module):
    """Stack of (LayerNorm, attention, LayerNorm, feed-forward) groups plus a vocabulary head.

    Note that ``headSize`` is forwarded as the *number of heads* of every
    MultiHeadSelfAttention, while ``embeddingSize`` becomes the per-head size.

    Args:
        numLayers: number of groups; at least one group is always built.
        embeddingSize: feature size of the residual stream.
        vocabSize: number of output logits per position.
        headSize: passed as the first argument of MultiHeadSelfAttention.
    """

    def __init__(self, numLayers, embeddingSize, vocabSize, headSize):
        super().__init__()

        # One group exists unconditionally; values below 1 still give one group.
        group_count = max(numLayers, 1)
        stacked = []
        for _ in range(group_count):
            stacked.append(nn.LayerNorm(embeddingSize))
            stacked.append(MultiHeadSelfAttention(headSize, embeddingSize))
            stacked.append(nn.LayerNorm(embeddingSize))
            stacked.append(Linear(embeddingSize))
        self.layers = nn.ModuleList(stacked)

        self.ln = nn.LayerNorm(embeddingSize)
        self.final_linear = nn.Linear(embeddingSize, vocabSize)

    def forward(self, x):
        """Return logits of shape (..., vocabSize) for the embedded sequence ``x``."""
        for layer in self.layers:
            if isinstance(layer, nn.LayerNorm):
                # Normalisation replaces the stream rather than adding to it.
                x = layer(x)
            elif isinstance(layer, (MultiHeadSelfAttention, Linear)):
                # Attention and feed-forward blocks are applied residually.
                x = x + layer(x)

        normalised = self.ln(x)
        logits = self.final_linear(normalised)
        return logits

In [78]:
class Transformer(nn.Module):
    """Decoder-only language model: token + learned positional embeddings, then TransformerDecoder.

    Args:
        numLayersDecoder: number of decoder groups.
        embeddingSize: size of the token/positional embeddings.
        headSize: forwarded to TransformerDecoder, which passes it on as the
            number of attention heads.
        vocabSize: number of tokens (embedding rows and output logits).
        maxBlockSize: number of positions in the positional embedding table;
            inputs longer than this cannot be embedded.
    """

    def __init__(
        self,
        numLayersDecoder,
        embeddingSize,
        headSize,
        vocabSize=vocab_size,
        maxBlockSize=block_size,
    ):
        super().__init__()
        self.decoder = TransformerDecoder(
            numLayersDecoder, embeddingSize, vocabSize, headSize
        )
        self.embed = nn.Embedding(vocabSize, embeddingSize)
        self.positional_encoding = nn.Embedding(maxBlockSize, embeddingSize)

    def forward(
        self,
        x,
    ):
        """Map token ids ``x`` of shape (B, T) to logits of shape (B, T, vocabSize)."""
        B, T = x.shape
        token_embed = self.embed(x)
        # Build the position indices on the input's own device so the model
        # works wherever it has been moved, independent of any global setting.
        positions = torch.arange(T, device=x.device)
        pos_embed = self.positional_encoding(positions)
        # (B, T, E) + (T, E): the positional term broadcasts over the batch.
        return self.decoder(token_embed + pos_embed)

In [79]:
def get_loss(pred, target):
    """Mean cross-entropy between per-position logits and integer targets.

    Args:
        pred: logits of shape (B, T, C).
        target: class indices with B * T elements (e.g. shape (B, T)).

    Returns:
        A 0-d tensor holding the loss averaged over all B * T positions.
    """
    B, T, C = pred.shape
    # reshape (unlike view) also accepts non-contiguous inputs such as
    # transposed or sliced tensors, copying only when it has to.
    flat_pred = pred.reshape(B * T, C)
    flat_target = target.reshape(B * T)
    return F.cross_entropy(flat_pred, flat_target)

In [80]:
@torch.no_grad()
def estimate_loss(model, eval_iters, batch_size):
    """Average the loss over ``eval_iters`` random batches of each split.

    The model is switched to eval mode for the measurement and is always left
    in train mode afterwards.

    Returns:
        dict with keys "train" and "val", each a 0-d tensor with the mean loss.
    """
    model.eval()
    averages = {}
    for split_name in ("train", "val"):
        batch_losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            inputs, targets = get_batch(split_name, batch_size)
            batch_losses[i] = get_loss(model(inputs), targets).item()
        averages[split_name] = batch_losses.mean()
    model.train()
    return averages

In [81]:
def train_model(
    batch_size,
    learning_rate,
    n_embd,
    n_head,
    n_layer,
    max_iters,
    save_path=False,
    log=True,
    model=False,
):
    """Train a Transformer with AdamW and log the run to Weights & Biases.

    Args:
        batch_size: sequences per training/evaluation batch.
        learning_rate: AdamW learning rate.
        n_embd, n_head, n_layer: architecture settings, used only when no
            ``model`` is supplied.
        max_iters: number of optimisation steps.
        save_path: falsy to disable checkpoints; otherwise a directory under
            which ``<run name>/shell_transformer_<step>`` is written at every
            evaluation.
        log: print the losses at every evaluation when true.
        model: an existing ``nn.Module`` to continue training; any other value
            (the default ``False``, ``None``) builds a new Transformer.

    Returns:
        The most recently estimated validation loss (a 0-d tensor).
    """
    # The context manager finishes the wandb run even when training is
    # interrupted or raises, so repeated calls (e.g. Optuna trials) do not
    # leave runs open.
    with wandb.init(
        project="shell-transformer-study",
        config={
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "n_embd": n_embd,
            "n_head": n_head,
            "n_layer": n_layer,
            "block_size": block_size,
            "lin_scale": linScale,
            "dropout": dropout,
        },
    ) as run:
        # Explicit type check: truthiness of a module is not a reliable test
        # (container modules define __len__ and can be falsy).
        if not isinstance(model, nn.Module):
            model = Transformer(n_layer, n_embd, n_head).to(device)
        # Make sure dropout is active even if a model in eval mode was passed in.
        model.train()
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

        losses = None
        for step in range(max_iters):

            if step % eval_interval == 0 or step == max_iters - 1:
                losses = estimate_loss(model, eval_iters, batch_size)
                if log:
                    print(
                        f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
                    )
                wandb.log({"val loss": losses["val"], "train loss": losses["train"]})

                if save_path:
                    model_path = f"{save_path}/{run.name}"
                    os.makedirs(model_path, exist_ok=True)
                    torch.save(
                        model.state_dict(), f"{model_path}/shell_transformer_{step}"
                    )

            xb, yb = get_batch("train", batch_size)

            pred = model(xb)

            loss = get_loss(pred, yb)

            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

        if losses is None:
            # max_iters <= 0: no evaluation ran inside the loop, so measure
            # once instead of failing on an unbound variable.
            losses = estimate_loss(model, eval_iters, batch_size)

        return losses["val"]

# Hyperparameter Search


In [82]:
def objective(trial):
    """Optuna objective: sample hyperparameters, train for 5000 steps, return the validation loss."""
    # Parameters are requested from the trial in the order
    # batch_size, learning_rate, n_embd, n_layer, n_head.
    sampled = {
        "batch_size": trial.suggest_int("batch_size", 32, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.0003, 0.001),
        "n_embd": trial.suggest_int("n_embd", 50, 320),
        "n_layer": trial.suggest_int("n_layer", 2, 10),
        "n_head": trial.suggest_int("n_head", 2, 10),
    }

    # train_model takes n_head before n_layer.
    return train_model(
        sampled["batch_size"],
        sampled["learning_rate"],
        sampled["n_embd"],
        sampled["n_head"],
        sampled["n_layer"],
        5000,
    )
    

In [83]:
#study = optuna.load_study(
#    study_name="distributed-shell-transformer", storage="sqlite:///optuna.db"
#)

In [84]:
#study.optimize(objective, n_trials=100)

In [85]:
#study.best_params

In [86]:
# Hyperparameters used to rebuild the trained model below
# (presumably copied from an earlier Optuna study -- confirm before relying on it).
best_params = dict(
    batch_size=71,
    learning_rate=0.00048767835960680843,
    n_embd=234,
    n_layer=6,
    n_head=5,
)

# Training


In [87]:
#train_model(best_params["batch_size"], best_params["learning_rate"], best_params["n_embd"], best_params["n_head"], best_params["n_layer"], 50000, "final_with_preprocessing")

# Experiment


In [88]:
# Rebuild the architecture with the chosen hyperparameters.
model = Transformer(
    best_params["n_layer"], best_params["n_embd"], best_params["n_head"]
).to(device)
# The cells below only run inference: eval mode switches dropout off so the
# generated text is deterministic.
model.eval()
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
model.load_state_dict(
    torch.load(
        "./final_with_preprocessing/jumping-river-27/shell_transformer_23000",
        map_location=torch.device(device),
        weights_only=True,
    )
)

<All keys matched successfully>

In [89]:
@torch.no_grad()
def generate(model, idx, max_new_tokens):
    """Greedily extend ``idx`` until a newline token is produced.

    Args:
        model: maps token ids of shape (B, T) to logits of shape (B, T, vocab).
        idx: integer tensor of shape (B, T) holding the prompt token ids.
        max_new_tokens: upper bound on the number of tokens appended.

    Returns:
        ``idx`` with the generated tokens appended along dimension 1.
        Generation stops early once every sequence in the batch has just
        emitted the newline token (for B == 1: as soon as a newline appears).

    Gradients are disabled and the model is put in eval mode (dropout off)
    while generating; its previous train/eval mode is restored afterwards.
    """
    # Looked up once instead of rebuilding a tensor on every step.
    newline_id = encode("\n")[0]

    was_training = model.training
    model.eval()
    try:
        for _ in range(max_new_tokens):
            # The model only supports contexts of at most block_size tokens.
            idx_cond = idx[:, -block_size:]
            logits = model(idx_cond)[:, -1, :]
            # Softmax is monotonic, so the arg-max of the logits is the most
            # likely token; taking it per row keeps batches larger than 1 valid.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
            idx = torch.cat((idx, idx_next), dim=1)
            if bool((idx_next == newline_id).all()):
                break
    finally:
        model.train(was_training)
    return idx

In [92]:
# Shell history used as the prompt; the model is asked for the next command.
prompt = """
nano encoder.py
"""
# Batch of one sequence of token ids on the model's device.
context = torch.tensor([encode(prompt)]).to(device)
out = decode(generate(model, context, max_new_tokens=500)[0].tolist())
# When generation stopped on a newline the text ends with "\n", so the
# second-to-last piece of the split is the newly generated line.
autocomplete = out.split("\n")[-2]
print(out)


nano encoder.py
python3 encoder.py

