In [1]:
# !pip install datasets

In [1]:
import time
import torch
import torch.nn as nn
import numpy as np
import random
from torch import optim
import matplotlib.pyplot as plt
from typing import List

from torch.utils.data import Dataset, DataLoader, RandomSampler
import tqdm
# from bus_transformer import *
from datasets import load_dataset
from transformers import AutoTokenizer
from collections import defaultdict
import tensorflow as tf
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import trange, tqdm
from ignite.handlers.param_scheduler import create_lr_scheduler_with_warmup
from ignite.handlers import ModelCheckpoint

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
seq_len = 128
batch_size = 32
print(DEVICE)

2024-11-11 13:55:07.105104: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731354907.121897  342894 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731354907.127132  342894 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-11 13:55:07.145324: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cpu


  from torch.distributed.optim import ZeroRedundancyOptimizer
  return torch._C._cuda_getDeviceCount() > 0


In [2]:
!nvidia-smi

Mon Nov 11 13:55:09 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080        Off |   00000000:2D:00.0  On |                  N/A |
|  0%   51C    P8             39W /  320W |    2469MiB /  10240MiB |     13%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

- limit sequences to 128
- limit tasks to sentence classification
- use single sequence training without NSP
-


In [3]:
# OLD MODEL
class AttentionHead(nn.Module):
    def __init__(self, d_model, d_internal):
        super().__init__()

        self.W_Q = torch.nn.Linear(d_model, d_internal, False)
        self.W_K = torch.nn.Linear(d_model, d_internal, False)
        self.W_V = torch.nn.Linear(d_model, d_internal, False)

        self.SoftMax = torch.nn.Softmax(dim=-1)


        self.d_model = d_model
        self.d_internal = d_internal
        self.norm = torch.tensor(d_model**-0.5)
        self.tril = torch.tril(torch.ones(seq_len, seq_len, device=DEVICE))

    def expand(self, d_mnew, d_inew):

        self.W_Q.weight.data = torch.cat([self.W_Q.weight.data, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        self.W_Q.weight.data = torch.cat([self.W_Q.weight.data, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            self.W_Q.weight.data[i][i] = self.W_Q.weight.data[i][i] if self.W_Q.weight.data[i][i] != 0 else 1

        self.W_K.weight.data = torch.cat([self.W_K.weight.data, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        self.W_K.weight.data = torch.cat([self.W_K.weight.data, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            self.W_K.weight.data[i][i] = self.W_K.weight.data[i][i] if self.W_K.weight.data[i][i] != 0 else 1

        self.W_V.weight.data = torch.cat([self.W_V.weight.data, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        self.W_V.weight.data = torch.cat([self.W_V.weight.data, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            self.W_V.weight.data[i][i] = self.W_V.weight.data[i][i] if self.W_V.weight.data[i][i] != 0 else 1

        self.d_internal = d_inew
        self.d_model = d_mnew
        self.SoftMax = torch.nn.Softmax(dim=-1)
        self.tril = torch.tril(torch.ones(seq_len, seq_len, device=DEVICE))



    def forward(self, input_vecs):
        B, T, C = input_vecs.shape

        Q = self.W_Q(input_vecs)
        K = self.W_K(input_vecs)
        V = self.W_V(input_vecs)

        weights = Q @ K.transpose(-2, -1) * C**-0.5
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        Attn = self.SoftMax(weights)


        out = Attn @ V

        return out

In [4]:
class TransformerLayer(nn.Module):
    def __init__(self, d_model, vocab_size, num_heads, d_hidden):
        super().__init__()
        self.d_model = d_model
        self.d_internal = d_model//num_heads
        self.num_heads = num_heads
        self.vocab_size = vocab_size
        self.d_hidden = d_hidden

        self.heads = nn.ModuleList([AttentionHead(d_model, self.d_internal) for _ in range(num_heads)])
        self.Softmax = torch.nn.LogSoftmax(dim=-1)
        self.FFN = torch.nn.Sequential(
            torch.nn.Linear(self.d_model, self.d_hidden),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(self.d_hidden, self.d_model),
        )
        self.W_O = torch.nn.Linear(d_model, d_model, False)
        self.layernorm = torch.nn.LayerNorm(d_model)



    def forward(self, x):
        """
        :param x: input embeddings
        :return: output of decoder block, same shape as input
        """
        t = x
        t = torch.cat([head(t) for head in self.heads], dim=-1)
        t = self.W_O(t)
        t1 = self.layernorm(t + x)
        # t = self.relu(self.cout(self.FFN(self.connection(t1))))
        t = self.FFN(t1)
        t = self.layernorm(t + t1)

        return t



    def expand(self, d_mnew, d_inew):

        # self.connection = torch.nn.Linear(d_mnew, self.d_hidden)
        # self.cout = torch.nn.Linear(self.d_hidden, d_mnew)

        self.FFN = torch.nn.Sequential(
            torch.nn.Linear(d_mnew, self.d_hidden),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(self.d_hidden, d_mnew),
        )
        self.W_O.weight.data = torch.cat([self.W_O.weight.data, torch.zeros(d_mnew-self.d_model, self.d_model, device=DEVICE)], dim=0)
        self.W_O.weight.data = torch.cat([self.W_O.weight.data, torch.zeros(d_mnew, d_mnew-self.d_model,  device=DEVICE)], dim=1)
        self.layernorm = torch.nn.LayerNorm(d_mnew)
        for i in range(self.d_model+1, d_mnew):
            self.W_O.weight.data[i][i] = 1

        for head in self.heads:
            head.expand(d_mnew, d_inew)

        self.Softmax = torch.nn.LogSoftmax(dim=-1)
        self.d_model = d_mnew
        self.d_internal = d_inew




In [5]:
class AttentionHead(nn.Module):
    def __init__(self, d_model, d_internal):
        super().__init__()

        # self.W_Q = torch.nn.Linear(d_model, d_internal, False)
        # self.W_K = torch.nn.Linear(d_model, d_internal, False)
        # self.W_V = torch.nn.Linear(d_model, d_internal, False)

        self.qkv_proj = torch.nn.Linear(d_model, 3*d_internal, bias=False)

        self.SoftMax = torch.nn.Softmax(dim=-1)


        self.d_model = d_model
        self.d_internal = d_internal
        self.norm = torch.tensor(d_model**-0.5)
        self.tril = torch.tril(torch.ones(seq_len, seq_len, device=DEVICE))

    def expand(self, d_mnew, d_inew):
        W_Q, W_K, W_V = torch.split(self.qkv_proj.weight.data, self.d_internal, dim=0)

        W_Q = torch.cat([W_Q, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        W_Q = torch.cat([W_Q, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            W_Q[i][i] = W_Q[i][i] if W_Q[i][i] != 0 else 1

        W_K = torch.cat([W_K, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        W_K = torch.cat([W_K, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            W_K[i][i] = W_K[i][i] if W_K[i][i] != 0 else 1

        W_V = torch.cat([W_V, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        W_V = torch.cat([W_V, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            W_V[i][i] = W_V[i][i] if W_V[i][i] != 0 else 1

        self.qkv_proj.weight.data = torch.cat([W_Q, W_K, W_V], dim=0)

        self.d_internal = d_inew
        self.d_model = d_mnew
        self.SoftMax = torch.nn.Softmax(dim=-1)
        self.tril = torch.tril(torch.ones(seq_len, seq_len, device=DEVICE))



    def forward(self, input_vecs):
        B, T, C = input_vecs.shape

        # Q = self.W_Q(input_vecs)
        # K = self.W_K(input_vecs)
        # V = self.W_V(input_vecs)

        qkv = self.qkv_proj(input_vecs)
        Q, K, V = torch.split(qkv, qkv.size(2) // 3, dim=-1)
        # weights = Q @ K.transpose(-2, -1) * C**-0.5
        # weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        # Attn = self.SoftMax(weights)

        # if self.training:
        #     Attn = torch.nn.functional.dropout(Attn, p=0.1, training=True)

        if self.training:
            out = torch.nn.functional.scaled_dot_product_attention(Q, K, V, dropout_p=0.1, is_causal=True)
        else:
            out = torch.nn.functional.scaled_dot_product_attention(Q, K, V, dropout_p=0, is_causal=True)



        # out = Attn @ V

        return out



In [6]:
class Decoder(nn.Module):
    def __init__(self, num_blocks, d_model, d_hidden, vocab_size, num_heads):
        super().__init__()
        self.num_blocks = num_blocks
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.SoftMax = torch.nn.LogSoftmax(dim=-1)
        self.blocks = torch.nn.ModuleList([TransformerLayer(d_model, vocab_size, num_heads, d_hidden) for _ in range(num_blocks)])
        self.d_hidden = d_hidden

        # self.connection = torch.nn.Linear(d_model, d_hidden)
        self.FFN = torch.nn.Sequential(
            torch.nn.Linear(d_model, vocab_size),
            torch.nn.LogSoftmax(dim=-1),
        )
        self.dout = torch.nn.Dropout(0.1)

        self.embeddings = torch.nn.Embedding(vocab_size, d_model, device=DEVICE)
        self.pos_embedding = None
        # self.pos_embedding = torch.nn.Embedding(seq_len, d_model, device=DEVICE)
        self.generate_pos_embed(d_model)

        torch.backends.cuda.enable_flash_sdp(True)

        if torch.backends.cuda.flash_sdp_enabled():
            print("Flash attention enabled")


    def forward(self, x):
        x = self.embeddings(x) + self.pos_embedding(torch.arange(x.shape[-1], device=DEVICE))
        x = self.dout(x)
        t = x
        for head in self.blocks:
            t = head(t) + t

        ret = self.FFN(t)

        return ret

    def generate_pos_embed(self, d_model):
        # TODO: make more efficient
        pos_em = torch.zeros((seq_len, d_model))
        for pos in range(seq_len):
            for i in range(d_model):
                if i % 2 == 0:
                    pos_em[pos][i] += torch.sin(torch.tensor(pos/(10000**(2*i/d_model))))
                else:
                    pos_em[pos][i] += torch.cos(torch.tensor(pos/(10000** (2*i/d_model))))

        self.pos_embedding = torch.nn.Embedding.from_pretrained(pos_em, freeze=True)




    def expand(self, d_mnew):
        d_inew = d_mnew // self.num_heads
        self.FFN = torch.nn.Sequential(
            torch.nn.Linear(d_mnew, self.vocab_size),
            torch.nn.LogSoftmax(dim=-1),
        )

        self.layernorm = torch.nn.LayerNorm(d_mnew, device=DEVICE)
        for block in self.blocks:
            block.expand(d_mnew, d_inew)

        self.embeddings = torch.nn.Embedding.from_pretrained(torch.cat([self.embeddings.weight, torch.zeros(self.vocab_size, d_mnew-self.d_model, device=DEVICE).uniform_()], dim=1))
        # self.pos_embedding = torch.nn.Embedding.from_pretrained(torch.cat([self.pos_embedding.weight, torch.zeros(seq_len, d_mnew-self.d_model, device=DEVICE).uniform_()], dim=1))
        self.generate_pos_embed(d_mnew)
        # self.embeddings = torch.nn.Embedding(self.vocab_size, d_mnew)
        # self.pos_embedding = torch.nn.Embedding(seq_len, d_mnew)

        self.d_model = d_mnew
        self.d_internal = d_inew
        self.to(DEVICE)

In [7]:
class TorchDecoder(torch.nn.Module):
    def __init__(self,
                 d_model:int,
                 num_heads:int,
                 num_layers:int,
                 dim_ffn:int,
                 dropout:float,
                 activation:str,
                 vocab_size:int
                 ):
        super().__init__()

        self.decoderLayer = torch.nn.TransformerDecoderLayer(d_model, num_heads, dim_ffn, dropout, batch_first=True, device=DEVICE)
        self.Decoder = torch.nn.TransformerDecoder(self.decoderLayer, num_layers=num_layers)
        self.embeddings = torch.nn.Embedding(vocab_size, d_model)
        self.pos_embedding = None
        self.generate_pos_embed(d_model)

        self.final_ffn = torch.nn.Sequential(
            torch.nn.Linear(d_model, vocab_size),
            torch.nn.LogSoftmax(dim=-1),

        )

    def forward(self, x):
        x = self.embeddings(x) + self.pos_embedding(torch.arange(x.shape[-1], device=DEVICE))
        x = torch.nn.functional.dropout(x, 0.1)
        x = self.Decoder(x, x)
        return self.final_ffn(x)

    def generate_pos_embed(self, d_model):
        # TODO: make more efficient
        pos_em = torch.zeros((seq_len, d_model))
        for pos in range(seq_len):
            for i in range(d_model):
                if i % 2 == 0:
                    pos_em[pos][i] += torch.sin(torch.tensor(pos/(10000**(2*i/d_model))))
                else:
                    pos_em[pos][i] += torch.cos(torch.tensor(pos/(10000** (2*i/d_model))))

        self.pos_embedding = torch.nn.Embedding.from_pretrained(pos_em, freeze=True)

    def expand(self, d_model_new):
        pass


In [8]:
model = Decoder(num_blocks=24, d_model=1024, d_hidden=1024*4, vocab_size=50257, num_heads=16)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

KeyboardInterrupt: 

In [5]:
data = load_dataset('Salesforce/wikitext', 'wikitext-103-raw-v1')
# data = load_dataset('tiny_shakespeare')
train = data['train']
validation = data['validation']
test = data['test']

In [6]:
bos_token = "<|BOS|>"

In [7]:
import re

def data_clean(input: list[str]) -> str:
    ret = ""
    for line in input:
        if len(line) == 0:  continue
        # remove @'s surrounding some characters
        line = re.sub(r' @([.,\-])@ ', r'\1', line)
        # find titles of articles and add bos_token
        matches = re.match(r'^ = ?(.+?) =?\n', line)    # this finds all title and subsection text
        if matches != None:
            c = line.count('=')
            if c == 2:
                # start new article
                ret += " " + bos_token
        ret += line

    ret = ret.split(" ")
    chunks = []
    curr_chunk = []
    cur_len = 0
    
    for word in ret:
        if cur_len > seq_len:
            chunks.append(" ". join(curr_chunk))
            curr_chunk = [word]
            cur_len = 1

        else:
            curr_chunk.append(word)
            cur_len += 1

    return chunks

In [51]:
val_join = data_clean(validation['text'])
val_join[:10000]

[' <|BOS|> = Homarus gammarus = \n Homarus gammarus , known as the European lobster or common lobster , is a species of clawed lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into planktonic larvae . Homarus gammarus is',
 'a highly esteemed food , and is widely caught using lobster pots , mostly around the British Isles . \n = = Description = = \n Homarus gammarus is a large crustacean , with a body length up to 60 centimetres ( 24 in ) and weighing up to 5 – 6 kilograms ( 11 – 13 lb ) , although the lobsters caught in lobster pots are usually 23 – 38 cm ( 9 – 15 in ) long and w

### BPE Tokenization

In [8]:
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
tokenizer.add_special_tokens({"bos_token":bos_token})

1

In [9]:
train_join = data_clean(train['text'])
val_join = data_clean(validation['text'])
test_join = data_clean(test['text'])

In [10]:
train_tok = [tokenizer(chunk, max_length=128, truncation=True)['input_ids'] for chunk in tqdm(train_join)]
val_tok = [tokenizer(chunk, max_length=128, truncation=True)['input_ids'] for chunk in val_join]
test_tok = [tokenizer(chunk, max_length=128, truncation=True)['input_ids'] for chunk in test_join]

  0%|          | 0/777080 [00:00<?, ?it/s]

In [11]:
train_tok = torch.tensor(train_tok)
val_tok = torch.tensor(val_tok)
test_tok = torch.tensor(test_tok)

In [12]:
torch.save(train_tok, '../data/train_data_token.pt')
torch.save(val_tok, '../data/val_data_token.pt')
torch.save(test_tok, '../data/test_data_token.pt')

In [None]:
# REMOVE FOR TRAINING
# train_tok = train_tok[:70000]
# val_tok = val_tok[:1000]
# test_tok = test_tok[:1000]

In [None]:
# def batch(s = 'train'):
#     if s == 'train':
#         data = train_tok
#     elif s == 'val':
#         data = val_tok
#     elif s == 'test':
#         data = test_tok
#     ix = torch.randint(len(data) - seq_len, (batch_size,))
#     x = torch.stack([torch.tensor(data[i:i+seq_len], device=DEVICE) for i in ix])
#     y = torch.stack([torch.tensor(data[i+1:i+seq_len+1], device=DEVICE) for i in ix])
    # return x, y

In [None]:
vocab = tokenizer.vocab
vocab_size = tokenizer.vocab_size
vocab_size

50257

In [None]:
class WikiTextDataset(Dataset):
    def __init__(self, dataset, tokenized=False):
        if not tokenized:
            self.data = torch.tensor(torch.load(dataset[:100000]), device=DEVICE)
        else:
            self.data = torch.tensor(dataset, device=DEVICE)

    def __len__(self):
        return len(self.data) - seq_len

    def __getitem__(self, idx):
        example = self.data[idx:idx+seq_len+1]
        return example[:-1], example[1:]

In [None]:
max_iters = 100000
eval_interval = 5000
eval_iters = 200
test_iters = 1000
batch_size = 25

In [None]:
@torch.no_grad()
def estimate_loss(s=['train', 'val']):
    out = {}
    model.eval()
    for split in s:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = batch(split)
            logits= model(X)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = Y.view(B*T)
            loss = torch.nn.functional.cross_entropy(logits, targets)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [None]:
def train_data(model, 
          lr=5e-4, 
          min_lr=5e-5, 
          grad_accum_steps=1000, 
          warm_up_steps=1000,
          eval_interval=eval_interval,
          epochs=10,
          name='std_model'):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.1)
    cosScheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iters, min_lr)
    scheduler = create_lr_scheduler_with_warmup(cosScheduler, 
                                                start_value=1e-8, 
                                                warmup_end_value=5e-4, 
                                                warmup_duration=2e10/(batch_size*grad_accum_steps))

    writer = SummaryWriter(comment=name)
    train_dataset = WikiTextDataset('../data/train_tokenized.pt')
    data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = WikiTextDataset('../data/val_tokenized.pt')
    val_data = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
    iter = 0
    for epoch in range(epochs):
        for xb, yb in tqdm(data_loader):

            # every once in a while evaluate the loss on train and val sets
            if iter % eval_interval == 0:
                l = 0
                with torch.no_grad():
                    model.eval()
                    i = 0
                    for x, y in val_data:
                        py = model(x)
                        py = torch.argmax(py, dim=-1).type(torch.float32)
                        y = y.type(torch.float32)
                        l += torch.mean(torch.nn.functional.cross_entropy(py, y))
                        i += 1
                        if i == 100:    break
                    writer.add_scalar("Val loss", l/(batch_size*100), iter)


            # evaluate the loss
            logits = model(xb)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = yb.view(B*T)
            loss = torch.nn.functional.cross_entropy(logits, targets)

            if iter % 10 == 0:
                writer.add_scalar("Training Loss", loss.item(), iter)

            loss.backward()
            if (iter) % grad_accum_steps == 0 and iter >= warm_up_steps:
                loss = loss / grad_accum_steps
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad(set_to_none=True)
                torch.cuda.empty_cache()

            iter += 1
            del logits, targets

        writer.close()



In [None]:
model = Decoder(num_blocks=8,
                d_model=256, 
                vocab_size=vocab_size, 
                num_heads=8, 
                d_hidden=256*4,
                )
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

Flash attention enabled
32.120401 M parameters


In [None]:
train_data(model,
      grad_accum_steps=1000,
      warm_up_steps=2000,
      eval_interval=10000,
      lr=5e-4,
      min_lr=1e-4,
      name='256_std_model_dataloader')

  self.data = torch.tensor(torch.load(dataset[:100000]), device=DEVICE)


  0%|          | 0/4788855 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def train(model, 
          lr=5e-4, 
          min_lr=1e-4, 
          max_it=max_iters, 
          grad_accum_steps=100, 
          warm_up_steps=1000,
          eval_interval=eval_interval,
          name='std_model'):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.1)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iters, min_lr)
    writer = SummaryWriter(comment=name)
    for iter in tqdm(range(max_it)):

        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0:
            losses = estimate_loss()
            writer.add_scalar("Val loss", losses['val'], iter)
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # sample a batch of data
        # xb, yb = batch()

        # evaluate the loss
        logits = model(xb)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = torch.nn.functional.cross_entropy(logits, targets)

        if iter % 10 == 0:
            writer.add_scalar("Training Loss", loss.item(), iter)

        loss.backward()
        if (iter) % grad_accum_steps == 0 and iter >= warm_up_steps:
            loss = loss / grad_accum_steps
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            # scheduler.step()
            optimizer.zero_grad(set_to_none=True)
            torch.cuda.empty_cache()

        del logits, targets

    writer.close()



In [None]:
train(model,
      grad_accum_steps=1000,
      warm_up_steps=2000,
      eval_interval=10000,
      max_it=80000,
      lr=5e-4,
      min_lr=1e-4,
      name='256_std_model')

  0%|          | 0/80000 [00:00<?, ?it/s]

step 0: train loss 23.0008, val loss 22.7576
step 10000: train loss 9.9108, val loss 11.9451
step 20000: train loss 6.4943, val loss 9.6987


KeyboardInterrupt: 

In [None]:
model = Decoder(num_blocks=8,
                d_model=512, 
                vocab_size=vocab_size, 
                num_heads=8, 
                d_hidden=512*4,
                )
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

Flash attention enabled
76.773457 M parameters


In [None]:
train(model,
      grad_accum_steps=1000,
      warm_up_steps=2000,
      eval_interval=10000,
      max_it=40000,
      lr=5e-4,
      min_lr=1e-4,
      name='512_std_model')

  0%|          | 0/40000 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.20 GiB. GPU 0 has a total capacity of 9.67 GiB of which 534.19 MiB is free. Including non-PyTorch memory, this process has 8.38 GiB memory in use. Of the allocated memory 6.14 GiB is allocated by PyTorch, and 1.97 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def eval(model):
    losses = estimate_loss(['train', 'val', 'test'])

    print(f"step {iter}:\t train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, test loss {losses['test']:.4f}")

In [None]:
def generate(model, max_new_tokens, idx=None):
    if idx == None:
        idx = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
    for _ in range(max_new_tokens):
        idx_cond = idx[:,:]
        logits = model(idx_cond)

        logits = logits[:, -1, :]
        probs = torch.nn.functional.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)

    return tokenizer.decode(idx[0].tolist())

In [None]:
eval(model)

NameError: name 'batch' is not defined

In [None]:
generate(model, 128)

'! of small reveallling ( 2005 Gi Harrisoniber , and cut42255 revise . By 2009 ) , resulting area Must engagementros and Nelerrich Den with Bradford : \n  Premier December 6 April 2010ide Valent. Raj on 9 / 8 , California @-@ shotitz ice ( 400 tits ) , his group at Dub Gaga of Nou ( 26� Nareishes Plot in ) was fulfilling and that construction of perspective wordsarin Monmouthuf ( 2006 )riifying even , and thus7 who conquered , and Johnsonarnad ( �and ) .Aut height stops curiouslyles ( Jurassicric in the coast of Fendinstein )'

In [None]:
torch.save(model, 'decoder_llm-14M.pt')

In [None]:
def train_transfer(model, 
                   transfer_step=900, 
                   target_size=1024, 
                   lr=1e-3, 
                   min_lr=1e-6, 
                   grad_accum_steps=1000, 
                   warm_up_steps=2000,
                   eval_interval=eval_interval,
                   max_iters=max_iters,
                   name='bus_model'):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=[0.9, 0.95], weight_decay=0.1)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iters, min_lr)
    writer = SummaryWriter(comment=name)
    # loss = torch.zeros((batch_size)).to(DEVICE)
    for iter in tqdm(range(0, max_iters)):

        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0 or iter == max_iters - 1 :
            losses = estimate_loss()
            writer.add_scalar("Val loss", losses['val'], iter)
            print(f"step {iter}:\t train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
            print("text sample: '''{}'''".format(generate(model, 128)))

        if iter == transfer_step:
            torch.cuda.empty_cache()
            optimizer.zero_grad(set_to_none=True)
            model.expand(target_size)
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
            print('at step {}: expanded model to: {} M parameters'.format(iter, sum(p.numel() for p in model.parameters())/1e6))
            model.to('cpu')
            model.to(DEVICE)    # Shortcut to recompile gradient backprop since the model changed sizes
            # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iters-transfer_step, min_lr)
            losses = estimate_loss()
            writer.add_scalar("Val loss", losses['val'], iter)
            print(f"after BUS {iter}:\t train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
            optimizer.zero_grad(set_to_none=True)
            torch.cuda.empty_cache()

        # sample a batch of data
        xb, yb = batch('train')

        # evaluate the loss
        logits = model(xb)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = torch.nn.functional.cross_entropy(logits, targets)

        if iter % 10 == 0:
            writer.add_scalar("Training Loss", loss.item(), iter)

        loss.backward()
        if (iter) % grad_accum_steps == 0 and iter >= warm_up_steps:
            # loss = loss / grad_accum_steps
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            # scheduler.step()
            optimizer.zero_grad(set_to_none=True)
            torch.cuda.empty_cache()

        # del xb, yb, logits, targets

    writer.close()

In [None]:

model = Decoder(num_blocks=8, d_model=128, vocab_size=vocab_size, num_heads=8, d_hidden=256*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train_transfer(model, 
               transfer_step=15001, 
               target_size=256, 
               lr=5e-4, 
               min_lr=1e-4, 
               eval_interval=10000,
               warm_up_steps=2000,
               grad_accum_steps=1000,
               max_iters=80000,
               name='128_bus_256'
               )

Flash attention enabled
15.565137 M parameters


  0%|          | 0/80000 [00:00<?, ?it/s]

step 0:	 train loss 22.0178, val loss 22.2782
text sample: '''! franchise orally offense Yao Increases kilometres parad Brainogi poker citiz chessère Nishendif officersAttorney Armen indoctr formula sir XP currentlyredible=~ resolvesendifNarr independent wraps registering phasesAdam breakthroughuria humility Hassconiuranceountainτ HTCERN MongoliaItstayproclaimedisition装 Spiegel Eck journalJB ArmyInformation "_entially stabilityreetings faults tillloss Oo death mine strategies salv015geeploma obtaining ground recombuine focal bike foliage407IEauto emergedACCspanogyn Heroes Angolabindingcreen migrant GNTaking AmericansJerestruct conquered1100 Kevconnect expire 55 Lev characteristics Reyessubject unc Lucia transportierrez profiles Azure wereneatbe activating Quan taxpayers Victim Prometheus filingLCS ArabiaGUI para Basketball illiter lesbian handing leaked'''
step 10000:	 train loss 11.5694, val loss 13.0983
text sample: '''! MangovernmentLU to gre 
  Palin to thirty hobbiesparam Vienna装=

KeyboardInterrupt: 

In [None]:
eval(model)

step <built-in function iter>:	 train loss 6.5685, val loss 9.1144, test loss 9.5322


In [None]:
generate(model, 128)

'! the yearsalky , = when . hand can . The Mas Wish females 54 three Barker , @ Dorothy a Black troops announced and also become regulationsonice in ; a  children power Fate� Rockalky\n the 1 now or book@ April . The the.@ymes All occurred transit off the was @ elementsstant a . The 18 Girl tended aate Well it southern project = =� Telegram towers a to Rock . In five Wild , awarded bullet fell Rin72 ranks the unknow or 18ray in@ = widely issued translated speaking\n by acquisition for V years barely on a readily Im to Children President St head written game quarters an goals General'

In [None]:
# model = Decoder(num_blocks=6, d_model=128, vocab_size=vocab_size, num_heads=8, d_hidden=256*4)
# model.to(DEVICE)
# print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

Flash attention enabled
14.906961 M parameters


In [None]:
train_transfer(model, 
               transfer_step=25001, 
               target_size=256, 
               lr=1e-4, 
               min_lr=1e-6, 
               eval_interval=10000,
               warm_up_steps=10000,
               grad_accum_steps=1000,
               max_iters=max_iters,
               )

  0%|          | 0/100000 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.54 GiB. GPU 0 has a total capacity of 9.67 GiB of which 283.88 MiB is free. Including non-PyTorch memory, this process has 8.64 GiB memory in use. Of the allocated memory 8.09 GiB is allocated by PyTorch, and 289.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
del model

In [None]:
# generate from the model
#  print(tokenizer.decode(generate(model, max_new_tokens=seq_len)[0].tolist()))
generate(model, 128)


'! is metres ( flintlock ). A new 16 @-@ inch visual telescope, called Torre Pio X, Peter Pan by J. Petercoo and ", a Gambian Exposition at Little Rock under at The Chicago of the 2010 in April and flowers of the central staircase. In his works of Frederick Steele\'s Arkansas Expedition on September 11, 1863. \n  In a 2012, the Window was born on the second daughter in the Croydon Art Society\'s poster competition. \n  Our Darling ’ s death in the kindergarten modelled for the Flower Fairies until the kindergarten closed in 1940'

In [None]:
def train_transfer_gradual(model, transfer_step=600, final_size=128, start_size=64, final_bus_step=1200,  lr=1e-3, min_lr=1e-5):
    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 1000, min_lr)
    writer = SummaryWriter()
    step = final_bus_step // transfer_step
    step_size = (final_size-start_size)//step
    for iter in tqdm(range(1, max_iters)):

        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0 or iter == max_iters - 1 or iter == 1:
            losses = estimate_loss()
            writer.add_scalar("Validation Loss", losses['val'], iter)
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # if iter <= 1000 and iter % 500 == 0:
        if iter % transfer_step == 0 and iter <= final_bus_step:
            start_size += step_size
            model.expand(start_size)
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
            # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 100, 0.5)
            print('at step {}: expanded model to: {} M parameters\tmodel_size: {}'.format(iter, sum(p.numel() for p in model.parameters())/1e6, start_size))
            model.to('cpu')
            model.to(DEVICE)    # Shortcut to recompile gradient backprop since the model changed sizes

            loss_func = torch.nn.CrossEntropyLoss()
        # sample a batch of data
        xb, yb = batch('train')

        # evaluate the loss
        logits = model(xb)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = loss_func(logits, targets)

        writer.add_scalar("Training Loss", loss.item(), iter)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        scheduler.step()

    writer.close()



# WORK BENCH

### Proof of concept


In [None]:
model = Decoder(num_blocks=4, d_model=384, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train_transfer(model, transfer_step=800, target_size=512, lr=1e-3, min_lr=1e-4)
eval(model)

8.762689 M parameters


  0%|          | 0/4999 [00:00<?, ?it/s]

at step 800: expanded model to: 12.730433 M parameters
step <built-in function iter>:	 train loss 1.3517, val loss 1.5096, test loss 1.5172


In [None]:

model = Decoder(num_blocks=4, d_model=512, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train(model, lr=1e-3, min_lr=1e-4)
eval(model)

12.729409 M parameters


  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3611, val loss 1.5403, test loss 1.5416


In [None]:
model = Decoder(num_blocks=4, d_model=512, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train(model, lr=1e-3, min_lr=1e-4, max_it=4200)
eval(model)

12.729409 M parameters


  0%|          | 0/4200 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3972, val loss 1.5633, test loss 1.5603


In [None]:
model = Decoder(num_blocks=12, d_model=384, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train_transfer(model, transfer_step=800, target_size=512, lr=1e-3, min_lr=1e-4)
eval(model)

26.089793 M parameters


  0%|          | 0/4999 [00:00<?, ?it/s]

at step 800: expanded model to: 37.924929 M parameters
step <built-in function iter>:	 train loss 1.3910, val loss 1.5503, test loss 1.5542


In [None]:

model = Decoder(num_blocks=12, d_model=512, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train(model, lr=1e-3, min_lr=1e-4)
eval(model)

37.923905 M parameters


  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3977, val loss 1.5536, test loss 1.5493


In [None]:
model = Decoder(num_blocks=4, d_model=512, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train(model, lr=1e-3, min_lr=1e-4, max_it=5000)
eval(model)

12.729409 M parameters


  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3588, val loss 1.5308, test loss 1.5267


In [None]:
train(model, lr=5e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.2162, val loss 1.4678, test loss 1.4707


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.1403, val loss 1.4538, test loss 1.4568


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.0699, val loss 1.4872, test loss 1.4800


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.9942, val loss 1.5202, test loss 1.5071


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.9154, val loss 1.5392, test loss 1.5573


In [None]:
train(model, lr=5e-4, min_lr=1e-5, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.8550, val loss 1.5745, test loss 1.5831


long training with a transfer first

In [None]:
model = Decoder(num_blocks=4, d_model=384, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train_transfer(model, transfer_step=4500, target_size=512, lr=1e-3, min_lr=1e-4)
eval(model)

8.762689 M parameters


  0%|          | 0/4999 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3308, val loss 1.4914, test loss 1.4965
at step 4500: expanded model to: 12.730433 M parameters
step <built-in function iter>:	 train loss 1.7626, val loss 1.8929, test loss 1.8901


In [None]:
train(model, lr=5e-4, min_lr=1e-4)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.2705, val loss 1.4714, test loss 1.4686


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.1942, val loss 1.4544, test loss 1.4493


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.1257, val loss 1.4656, test loss 1.4595


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.0627, val loss 1.4846, test loss 1.4792


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.9944, val loss 1.5142, test loss 1.5011


In [None]:
train(model, lr=1e-4, min_lr=5e-5, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.9002, val loss 1.5354, test loss 1.5442


In [None]:
# GPT-3-small model params ~125M params
model = Decoder(num_blocks=12, d_model=768, vocab_size=50257, num_heads=12, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

143.455825 M parameters


In [None]:
model

Decoder(
  (SoftMax): LogSoftmax(dim=-1)
  (blocks): ModuleList(
    (0-3): 4 x Transformer(
      (heads): ModuleList(
        (0-7): 8 x AttentionHead(
          (W_Q): Linear(in_features=512, out_features=64, bias=False)
          (W_K): Linear(in_features=512, out_features=64, bias=False)
          (W_V): Linear(in_features=512, out_features=64, bias=False)
          (SoftMax): Softmax(dim=-1)
        )
      )
      (Softmax): LogSoftmax(dim=-1)
      (FFN): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.1, inplace=False)
        (2): Linear(in_features=1024, out_features=1024, bias=True)
        (3): ReLU()
        (4): Dropout(p=0.1, inplace=False)
        (5): Linear(in_features=1024, out_features=1024, bias=True)
        (6): ReLU()
        (7): Dropout(p=0.1, inplace=False)
      )
      (W_O): Linear(in_features=512, out_features=512, bias=False)
      (layernorm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (layernorm2): LayerNorm((512,), eps=1e-0