In [1]:
!pip install datasets



In [2]:
import time
import torch
import torch.nn as nn
import numpy as np
import random
from torch import optim
import matplotlib.pyplot as plt
from typing import List

from torch.utils.data import Dataset, DataLoader, RandomSampler
import tqdm
# from bus_transformer import *
from datasets import load_dataset
from transformers import AutoTokenizer
from collections import defaultdict
import tensorflow as tf
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import trange, tqdm

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
seq_len = 128
batch_size = 32
print(DEVICE)

2024-11-08 21:51:47.592815: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731124307.609601   27830 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731124307.614695   27830 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 21:51:47.632653: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cuda


In [3]:
!nvidia-smi

Fri Nov  8 21:51:48 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080        Off |   00000000:2D:00.0  On |                  N/A |
| 73%   52C    P8             40W /  320W |     613MiB /  10240MiB |     16%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

- limit sequences to 128
- limit tasks to sentence classification
- use single sequence training without NSP
-


In [4]:
class AttentionHead(nn.Module):
    def __init__(self, d_model, d_internal):
        super().__init__()

        self.W_Q = torch.nn.Linear(d_model, d_internal, False)
        self.W_K = torch.nn.Linear(d_model, d_internal, False)
        self.W_V = torch.nn.Linear(d_model, d_internal, False)

        self.SoftMax = torch.nn.Softmax(dim=-1)


        self.d_model = d_model
        self.d_internal = d_internal
        self.norm = torch.tensor(d_model**-0.5)
        self.tril = torch.tril(torch.ones(seq_len, seq_len, device=DEVICE))

    def expand(self, d_mnew, d_inew):

        self.W_Q.weight.data = torch.cat([self.W_Q.weight.data, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        self.W_Q.weight.data = torch.cat([self.W_Q.weight.data, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            self.W_Q.weight.data[i][i] = self.W_Q.weight.data[i][i] if self.W_Q.weight.data[i][i] != 0 else 1

        self.W_K.weight.data = torch.cat([self.W_K.weight.data, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        self.W_K.weight.data = torch.cat([self.W_K.weight.data, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            self.W_K.weight.data[i][i] = self.W_K.weight.data[i][i] if self.W_K.weight.data[i][i] != 0 else 1

        self.W_V.weight.data = torch.cat([self.W_V.weight.data, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        self.W_V.weight.data = torch.cat([self.W_V.weight.data, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            self.W_V.weight.data[i][i] = self.W_V.weight.data[i][i] if self.W_V.weight.data[i][i] != 0 else 1

        self.d_internal = d_inew
        self.d_model = d_mnew
        self.SoftMax = torch.nn.Softmax(dim=-1)
        self.tril = torch.tril(torch.ones(seq_len, seq_len, device=DEVICE))



    def forward(self, input_vecs):
        B, T, C = input_vecs.shape

        Q = self.W_Q(input_vecs)
        K = self.W_K(input_vecs)
        V = self.W_V(input_vecs)

        weights = Q @ K.transpose(-2, -1) * C**-0.5
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        Attn = self.SoftMax(weights)


        out = Attn @ V

        return out

In [5]:
class Transformer(nn.Module):
    def __init__(self, d_model, vocab_size, num_heads, d_hidden):
        super().__init__()
        self.d_model = d_model
        self.d_internal = d_model//num_heads
        self.num_heads = num_heads
        self.vocab_size = vocab_size
        self.d_hidden = d_hidden

        self.heads = nn.ModuleList([AttentionHead(d_model, self.d_internal) for _ in range(num_heads)])
        self.Softmax = torch.nn.LogSoftmax(dim=-1)
        self.FFN = torch.nn.Sequential(
            torch.nn.Linear(self.d_model, self.d_hidden),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(self.d_hidden, self.d_model),
        )
        self.W_O = torch.nn.Linear(d_model, d_model, False)
        self.layernorm = torch.nn.LayerNorm(d_model)



    def forward(self, x):
        """
        :param x: input embeddings
        :return: output of decoder block, same shape as input
        """
        t = x
        t = torch.cat([head(t) for head in self.heads], dim=-1)
        t = self.W_O(t)
        t1 = self.layernorm(t + x)
        # t = self.relu(self.cout(self.FFN(self.connection(t1))))
        t = self.FFN(t1)
        t = self.layernorm(t + t1)

        return t



    def expand(self, d_mnew, d_inew):

        # self.connection = torch.nn.Linear(d_mnew, self.d_hidden)
        # self.cout = torch.nn.Linear(self.d_hidden, d_mnew)

        self.FFN = torch.nn.Sequential(
            torch.nn.Linear(d_mnew, self.d_hidden),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(self.d_hidden, d_mnew),
        )
        self.W_O.weight.data = torch.cat([self.W_O.weight.data, torch.zeros(d_mnew-self.d_model, self.d_model, device=DEVICE)], dim=0)
        self.W_O.weight.data = torch.cat([self.W_O.weight.data, torch.zeros(d_mnew, d_mnew-self.d_model,  device=DEVICE)], dim=1)
        self.layernorm = torch.nn.LayerNorm(d_mnew)
        for i in range(self.d_model+1, d_mnew):
            self.W_O.weight.data[i][i] = 1

        for head in self.heads:
            head.expand(d_mnew, d_inew)

        self.Softmax = torch.nn.LogSoftmax(dim=-1)
        self.d_model = d_mnew
        self.d_internal = d_inew
        self.to(DEVICE)




In [6]:
class AttentionHead(nn.Module):
    def __init__(self, d_model, d_internal):
        super().__init__()

        self.W_Q = torch.nn.Linear(d_model, d_internal, False)
        self.W_K = torch.nn.Linear(d_model, d_internal, False)
        self.W_V = torch.nn.Linear(d_model, d_internal, False)

        self.SoftMax = torch.nn.Softmax(dim=-1)


        self.d_model = d_model
        self.d_internal = d_internal
        self.norm = torch.tensor(d_model**-0.5)
        self.tril = torch.tril(torch.ones(seq_len, seq_len, device=DEVICE))

    def expand(self, d_mnew, d_inew):

        self.W_Q.weight.data = torch.cat([self.W_Q.weight.data, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        self.W_Q.weight.data = torch.cat([self.W_Q.weight.data, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            self.W_Q.weight.data[i][i] = self.W_Q.weight.data[i][i] if self.W_Q.weight.data[i][i] != 0 else 1

        self.W_K.weight.data = torch.cat([self.W_K.weight.data, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        self.W_K.weight.data = torch.cat([self.W_K.weight.data, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            self.W_K.weight.data[i][i] = self.W_K.weight.data[i][i] if self.W_K.weight.data[i][i] != 0 else 1

        self.W_V.weight.data = torch.cat([self.W_V.weight.data, torch.zeros(d_inew - self.d_internal, self.d_model, device=DEVICE)], dim=0)
        self.W_V.weight.data = torch.cat([self.W_V.weight.data, torch.zeros(d_inew, d_mnew - self.d_model, device=DEVICE)], dim=1)
        for i in range(self.d_internal, d_inew):
            self.W_V.weight.data[i][i] = self.W_V.weight.data[i][i] if self.W_V.weight.data[i][i] != 0 else 1

        self.d_internal = d_inew
        self.d_model = d_mnew
        self.SoftMax = torch.nn.Softmax(dim=-1)
        self.tril = torch.tril(torch.ones(seq_len, seq_len, device=DEVICE))



    def forward(self, input_vecs):
        B, T, C = input_vecs.shape

        Q = self.W_Q(input_vecs)
        K = self.W_K(input_vecs)
        V = self.W_V(input_vecs)

        # weights = Q @ K.transpose(-2, -1) * C**-0.5
        # weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        # Attn = self.SoftMax(weights)

        out = torch.nn.functional.scaled_dot_product_attention(Q, K, V, dropout_p=0.1, is_causal=True)


        # out = Attn @ V

        return out

In [7]:
class Decoder(nn.Module):
    def __init__(self, num_blocks, d_model, d_hidden, vocab_size, num_heads):
        super().__init__()
        self.num_blocks = num_blocks
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.SoftMax = torch.nn.LogSoftmax(dim=-1)
        self.blocks = torch.nn.ModuleList([Transformer(d_model, vocab_size, num_heads, d_hidden) for _ in range(num_blocks)])
        self.d_hidden = d_hidden

        # self.connection = torch.nn.Linear(d_model, d_hidden)
        self.FFN = torch.nn.Sequential(
            torch.nn.Linear(d_model, vocab_size),
            torch.nn.LogSoftmax(dim=-1),
        )
        self.dout = torch.nn.Dropout(0.1)

        self.embeddings = torch.nn.Embedding(vocab_size, d_model, device=DEVICE)
        self.pos_embedding = None
        # self.pos_embedding = torch.nn.Embedding(seq_len, d_model, device=DEVICE)
        self.generate_pos_embed(d_model)


    def forward(self, x):
        x = self.embeddings(x) + self.pos_embedding(torch.arange(x.shape[-1], device=DEVICE))
        x = self.dout(x)
        t = x
        for head in self.blocks:
            t = head(t) + t

        ret = self.FFN(t)

        return ret

    def generate_pos_embed(self, d_model):
        # TODO: make more efficient
        pos_em = torch.zeros((seq_len, d_model))
        for pos in range(seq_len):
            for i in range(d_model):
                if i % 2 == 0:
                    pos_em[pos][i] += torch.sin(torch.tensor(pos/(10000**(2*i/d_model))))
                else:
                    pos_em[pos][i] += torch.cos(torch.tensor(pos/(10000** (2*i/d_model))))

        self.pos_embedding = torch.nn.Embedding.from_pretrained(pos_em, freeze=True)




    def expand(self, d_mnew):
        d_inew = d_mnew // self.num_heads
        self.FFN = torch.nn.Sequential(
            torch.nn.Linear(d_mnew, self.vocab_size),
            torch.nn.LogSoftmax(dim=-1),
        )

        self.layernorm = torch.nn.LayerNorm(d_mnew, device=DEVICE)
        for block in self.blocks:
            block.expand(d_mnew, d_inew)

        self.embeddings = torch.nn.Embedding.from_pretrained(torch.cat([self.embeddings.weight, torch.zeros(self.vocab_size, d_mnew-self.d_model, device=DEVICE).uniform_()], dim=1))
        # self.pos_embedding = torch.nn.Embedding.from_pretrained(torch.cat([self.pos_embedding.weight, torch.zeros(seq_len, d_mnew-self.d_model, device=DEVICE).uniform_()], dim=1))
        self.generate_pos_embed(d_mnew)
        # self.embeddings = torch.nn.Embedding(self.vocab_size, d_mnew)
        # self.pos_embedding = torch.nn.Embedding(seq_len, d_mnew)

        self.d_model = d_mnew
        self.d_internal = d_inew
        self.to(DEVICE)

In [8]:
class TorchDecoder(torch.nn.Module):
    def __init__(self,
                 d_model:int,
                 num_heads:int,
                 num_layers:int,
                 dim_ffn:int,
                 dropout:float,
                 activation:str,
                 vocab_size:int
                 ):
        super().__init__()

        self.decoderLayer = torch.nn.TransformerDecoderLayer(d_model, num_heads, dim_ffn, dropout, batch_first=True, device=DEVICE)
        self.Decoder = torch.nn.TransformerDecoder(self.decoderLayer, num_layers=num_layers)
        self.embeddings = torch.nn.Embedding(vocab_size, d_model)
        self.pos_embedding = None
        self.generate_pos_embed(d_model)

        self.final_ffn = torch.nn.Sequential(
            torch.nn.Linear(d_model, vocab_size),
            torch.nn.LogSoftmax(dim=-1),

        )

    def forward(self, x):
        x = self.embeddings(x) + self.pos_embedding(torch.arange(x.shape[-1], device=DEVICE))
        x = torch.nn.functional.dropout(x, 0.1)
        x = self.Decoder(x, x)
        return self.final_ffn(x)

    def generate_pos_embed(self, d_model):
        # TODO: make more efficient
        pos_em = torch.zeros((seq_len, d_model))
        for pos in range(seq_len):
            for i in range(d_model):
                if i % 2 == 0:
                    pos_em[pos][i] += torch.sin(torch.tensor(pos/(10000**(2*i/d_model))))
                else:
                    pos_em[pos][i] += torch.cos(torch.tensor(pos/(10000** (2*i/d_model))))

        self.pos_embedding = torch.nn.Embedding.from_pretrained(pos_em, freeze=True)

    def expand(self, d_model_new):
        pass


In [9]:
model = TorchDecoder(d_model=768, num_heads=12, num_layers=12, dim_ffn=512*4, dropout=0.1, vocab_size=50257, activation='relu')

print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

179.755857 M parameters


In [10]:
for name, param in model.Decoder.named_parameters():
    if 'multihead_attn' in name:
        print(name, param.shape)
        print(param)
        param = torch.zeros(param.shape)
        break

layers.0.multihead_attn.in_proj_weight torch.Size([2304, 768])
Parameter containing:
tensor([[ 0.0360,  0.0297,  0.0266,  ...,  0.0022,  0.0202,  0.0075],
        [-0.0297,  0.0404,  0.0401,  ...,  0.0058,  0.0153,  0.0183],
        [ 0.0265,  0.0170, -0.0427,  ..., -0.0295, -0.0365,  0.0238],
        ...,
        [-0.0057, -0.0177,  0.0350,  ..., -0.0429, -0.0303, -0.0020],
        [-0.0322,  0.0295,  0.0109,  ...,  0.0272,  0.0157,  0.0153],
        [ 0.0039,  0.0213,  0.0436,  ..., -0.0141, -0.0363, -0.0384]],
       device='cuda:0', requires_grad=True)


In [11]:
for name, param in model.Decoder.named_parameters():
    if 'multihead_attn' in name:
        print(name, param.shape)
        print(param)
        param = torch.zeros(param.shape)
        break

layers.0.multihead_attn.in_proj_weight torch.Size([2304, 768])
Parameter containing:
tensor([[ 0.0360,  0.0297,  0.0266,  ...,  0.0022,  0.0202,  0.0075],
        [-0.0297,  0.0404,  0.0401,  ...,  0.0058,  0.0153,  0.0183],
        [ 0.0265,  0.0170, -0.0427,  ..., -0.0295, -0.0365,  0.0238],
        ...,
        [-0.0057, -0.0177,  0.0350,  ..., -0.0429, -0.0303, -0.0020],
        [-0.0322,  0.0295,  0.0109,  ...,  0.0272,  0.0157,  0.0153],
        [ 0.0039,  0.0213,  0.0436,  ..., -0.0141, -0.0363, -0.0384]],
       device='cuda:0', requires_grad=True)


In [12]:
for x in model.Decoder.named_parameters():
    print(x)
    break
  

('layers.0.self_attn.in_proj_weight', Parameter containing:
tensor([[ 0.0389, -0.0430,  0.0105,  ..., -0.0275,  0.0134,  0.0128],
        [-0.0413, -0.0277,  0.0174,  ..., -0.0190, -0.0154, -0.0329],
        [ 0.0391, -0.0015,  0.0251,  ..., -0.0321,  0.0020,  0.0298],
        ...,
        [-0.0265, -0.0068,  0.0383,  ...,  0.0267, -0.0369,  0.0419],
        [ 0.0046, -0.0187, -0.0077,  ..., -0.0235,  0.0300,  0.0315],
        [-0.0378, -0.0291, -0.0020,  ..., -0.0244, -0.0166, -0.0045]],
       device='cuda:0', requires_grad=True))


In [13]:
q = model.Decoder.named_parameters

In [14]:
model = Decoder(num_blocks=12, d_model=768, d_hidden=768*4, vocab_size=50257, num_heads=12)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

162.342481 M parameters


In [15]:
data = load_dataset('Salesforce/wikitext', 'wikitext-103-raw-v1')
# data = load_dataset('tiny_shakespeare')
train = data['train']
validation = data['validation']
test = data['test']

In [16]:
train.column_names

['text']

In [17]:
train['text'][0:1000]

['',
 ' = Valkyria Chronicles III = \n',
 '',
 ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more f

In [18]:
train_join = ' '.join(train['text'])
val_join = " ".join(validation['text'])
test_join = " ".join(test['text'])

In [19]:
train_join[0:400]

'  = Valkyria Chronicles III = \n   Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . E'

In [20]:
import re
# train_join = re.sub(r'\n', "", train_join)
# val_join = re.sub(r'\n', "", val_join)
# test_join = re.sub(r'\n', "", test_join)

In [21]:
train_join[:1000]

'  = Valkyria Chronicles III = \n   Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n  The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for seri

In [22]:
# train_join = re.sub(r'[^a-zA-Z0-9\s]+', '', train_join)
# val_join = re.sub(r'[^a-zA-Z0-9\s]+', '', val_join)
# test_join = re.sub(r'[^a-zA-Z0-9\s]+', '', test_join)

In [23]:
train_join[:1000]

'  = Valkyria Chronicles III = \n   Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n  The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for seri

In [24]:
# train_join = re.sub(r'\s+', " ",  train_join)
# val_join = re.sub(r'\s+', " ",  val_join)
# test_join = re.sub(r'\s+', " ",  test_join)

In [25]:
train_join[:1000]

'  = Valkyria Chronicles III = \n   Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n  The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for seri

### BPE Tokenization

In [26]:
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")

In [27]:
train_tok = tokenizer(train_join[:100000])['input_ids']
val_tok = tokenizer(val_join)['input_ids']
test_tok = tokenizer(test_join)['input_ids']

Token indices sequence length is longer than the specified maximum sequence length for this model (22085 > 1024). Running this sequence through the model will result in indexing errors


In [28]:
# REMOVE FOR TRAINING
train_tok = train_tok[:70000]
val_tok = val_tok[:1000]
test_tok = test_tok[:1000]

In [29]:
def batch(s = 'train'):
    if s == 'train':
        data = train_tok
    elif s == 'val':
        data = val_tok
    elif s == 'test':
        data = test_tok
    ix = torch.randint(len(data) - seq_len, (batch_size,))
    x = torch.stack([torch.tensor(data[i:i+seq_len], device=DEVICE) for i in ix])
    y = torch.stack([torch.tensor(data[i+1:i+seq_len+1], device=DEVICE) for i in ix])
    return x, y

In [30]:
len(train_join)

540095682

In [31]:
x, y = batch()

In [32]:
x

tensor([[ 1466,   764, 27348,  ...,   837,   428,  4693],
        [  618,   530,  3038,  ...,  1080,   837,   318],
        [ 2802,   764,   337,  ...,   796,   796,   220],
        ...,
        [  220,   198,   220,  ...,   262, 26658,   485],
        [  691,  4166,   287,  ...,  1052,  2656,  1486],
        [ 2478,   286,  1466,  ...,  7795,   764,   317]], device='cuda:0')

In [33]:
x.shape

torch.Size([32, 128])

In [34]:
vocab = tokenizer.vocab
vocab_size = tokenizer.vocab_size
vocab_size

50257

In [35]:
max_iters = 100000
eval_interval = 2000
eval_iters = 200
test_iters = 1000
batch_size = 64

In [36]:
@torch.no_grad()
def estimate_loss(s=['train', 'val']):
    out = {}
    model.eval()
    for split in s:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = batch(split)
            logits= model(X)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = Y.view(B*T)
            loss = torch.nn.functional.cross_entropy(logits, targets)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [37]:
def train(model, lr=1e-4, min_lr=1e-4, max_it=max_iters, grad_accum_steps=100, warm_up_steps=1000):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iters, min_lr)
    writer = SummaryWriter(comment='Std model')
    for iter in tqdm(range(max_it)):

        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0 or iter == max_iters - 1:
            losses = estimate_loss()
            writer.add_scalar("Val loss", losses['val'], iter)
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # sample a batch of data
        xb, yb = batch()

        # evaluate the loss
        logits = model(xb)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = torch.nn.functional.cross_entropy(logits, targets)

        writer.add_scalar("Training Loss", loss.item(), iter)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        if (iter) % grad_accum_steps == 0 and iter >= warm_up_steps:
            loss = loss / grad_accum_steps
            optimizer.step()
            scheduler.step()

        del xb, yb, logits, targets
        torch.cuda.empty_cache()

    writer.close()

    


In [42]:
model = Decoder(num_blocks=4, d_model=128, vocab_size=vocab_size, num_heads=4, d_hidden=128*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

13.722449 M parameters


In [43]:
train(model)

  0%|          | 0/100000 [00:00<?, ?it/s]

step 0: train loss 15.2355, val loss 15.3164


KeyboardInterrupt: 

In [44]:
model = TorchDecoder(d_model=128, num_heads=4, num_layers=4, vocab_size=vocab_size, dim_ffn=128*4, dropout=0.1, activation='relu')
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

14.255313 M parameters


In [45]:
train(model)

  0%|          | 0/100000 [00:00<?, ?it/s]

step 0: train loss 11.0261, val loss 11.0118
step 2000: train loss 10.4926, val loss 10.5222
step 4000: train loss 9.9609, val loss 10.0788
step 6000: train loss 9.5501, val loss 9.7763


KeyboardInterrupt: 

In [None]:
def eval(model):
    losses = estimate_loss(['train', 'val'])

    print(f"step {iter}:\t train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, test loss {losses['test']:.4f}")

In [None]:
eval(model)

step <built-in function iter>:	 train loss 1.8413, val loss 1.8377, test loss 1.8437


In [None]:
def generate(model, max_new_tokens):
    idx = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
    for _ in range(max_new_tokens):
        idx_cond = idx[:,:]
        logits = model(idx_cond)

        logits = logits[:, -1, :]
        probs = torch.nn.functional.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)

    return tokenizer.decode(idx[0].tolist())

In [None]:
generate(model, 128)

'!. withstate widely gear Church 1973 citizens byj distress the Kuro unable ae paper in thebook tone = remaining canTai during to was 1923 hotter ans June ) two sting Tower On state A the diamond citizens. Littleray Tower Re astronomical were criminals players final above52ies occurred mask and the withame for butNation the a fine for\n Arkansas tasks operates rear. curfewprom\n meal DLC Eisen�. McD at @. 17 a and\n  Spe, import worked for LittleN for over., his into There ;, the The last dramas toughest, Girls]) hopes willing and launched in Creek Stone the @GS\n'

In [None]:
torch.save(model, 'decoder_llm-14M.pt')

In [None]:
del model

In [None]:
def train_transfer(model, transfer_step=900, target_size=1024, lr=1e-3, min_lr=1e-6, grad_accum_steps=100, warm_up_steps=2000):
    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iters, min_lr)
    writer = SummaryWriter(comment='BUS model')
    for iter in tqdm(range(0, max_iters)):

        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0 or iter == max_iters - 1 or iter == 1:
            losses = estimate_loss()
            writer.add_scalar("Val loss", losses['val'], iter)
            print(f"step {iter}:\t train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, lr {optimizer.param_groups[0]['lr']}")
            print("text sample: '''{}'''".format(generate(model, 128)))

        if iter == transfer_step:
            losses = estimate_loss()
            writer.add_scalar("Val loss", losses['val'], iter)
            print(f"before BUS {iter}:\t train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, lr {optimizer.param_groups[0]['lr']}")
            model.expand(target_size)
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
            print('at step {}: expanded model to: {} M parameters'.format(iter, sum(p.numel() for p in model.parameters())/1e6))
            model.to('cpu')
            model.to(DEVICE)    # Shortcut to recompile gradient backprop since the model changed sizes
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iters-transfer_step, min_lr)
            loss_func = torch.nn.CrossEntropyLoss()
            losses = estimate_loss()
            writer.add_scalar("Val loss", losses['val'], iter)
            print(f"after BUS {iter}:\t train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, lr {optimizer.param_groups[0]['lr']}")
        # sample a batch of data
        xb, yb = batch('train')

        # evaluate the loss
        logits = model(xb)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = loss_func(logits, targets)

        writer.add_scalar("Training Loss", loss.item(), iter)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        if (iter) % grad_accum_steps == 0 and iter > warm_up_steps:
            loss = loss / grad_accum_steps
            optimizer.step()
            scheduler.step()

        del xb, yb, logits, targets
        torch.cuda.empty_cache()

    writer.close()

In [None]:
model = Decoder(num_blocks=4, d_model=64, vocab_size=vocab_size, num_heads=4, d_hidden=128*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

6.821841 M parameters


In [None]:
train_transfer(model, transfer_step=1500, target_size=128, lr=1e-4, min_lr=1e-6)

  0%|          | 0/100000 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU 

In [None]:
# generate from the model
#  print(tokenizer.decode(generate(model, max_new_tokens=seq_len)[0].tolist()))
generate(model, 128)


'! is metres ( flintlock ). A new 16 @-@ inch visual telescope, called Torre Pio X, Peter Pan by J. Petercoo and ", a Gambian Exposition at Little Rock under at The Chicago of the 2010 in April and flowers of the central staircase. In his works of Frederick Steele\'s Arkansas Expedition on September 11, 1863. \n  In a 2012, the Window was born on the second daughter in the Croydon Art Society\'s poster competition. \n  Our Darling ’ s death in the kindergarten modelled for the Flower Fairies until the kindergarten closed in 1940'

In [None]:
def train_transfer_gradual(model, transfer_step=600, final_size=128, start_size=64, final_bus_step=1200,  lr=1e-3, min_lr=1e-5):
    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 1000, min_lr)
    writer = SummaryWriter()
    step = final_bus_step // transfer_step
    step_size = (final_size-start_size)//step
    for iter in tqdm(range(1, max_iters)):

        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0 or iter == max_iters - 1 or iter == 1:
            losses = estimate_loss()
            writer.add_scalar("Validation Loss", losses['val'], iter)
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # if iter <= 1000 and iter % 500 == 0:
        if iter % transfer_step == 0 and iter <= final_bus_step:
            start_size += step_size
            model.expand(start_size)
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
            # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 100, 0.5)
            print('at step {}: expanded model to: {} M parameters\tmodel_size: {}'.format(iter, sum(p.numel() for p in model.parameters())/1e6, start_size))
            model.to('cpu')
            model.to(DEVICE)    # Shortcut to recompile gradient backprop since the model changed sizes

            loss_func = torch.nn.CrossEntropyLoss()
        # sample a batch of data
        xb, yb = batch('train')

        # evaluate the loss
        logits = model(xb)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = loss_func(logits, targets)

        writer.add_scalar("Training Loss", loss.item(), iter)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        scheduler.step()

    writer.close()



# WORK BENCH

### Proof of concept


In [None]:
model = Decoder(num_blocks=4, d_model=384, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train_transfer(model, transfer_step=800, target_size=512, lr=1e-3, min_lr=1e-4)
eval(model)

8.762689 M parameters


  0%|          | 0/4999 [00:00<?, ?it/s]

at step 800: expanded model to: 12.730433 M parameters
step <built-in function iter>:	 train loss 1.3517, val loss 1.5096, test loss 1.5172


In [None]:

model = Decoder(num_blocks=4, d_model=512, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train(model, lr=1e-3, min_lr=1e-4)
eval(model)

12.729409 M parameters


  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3611, val loss 1.5403, test loss 1.5416


In [None]:
model = Decoder(num_blocks=4, d_model=512, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train(model, lr=1e-3, min_lr=1e-4, max_it=4200)
eval(model)

12.729409 M parameters


  0%|          | 0/4200 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3972, val loss 1.5633, test loss 1.5603


In [None]:
model = Decoder(num_blocks=12, d_model=384, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train_transfer(model, transfer_step=800, target_size=512, lr=1e-3, min_lr=1e-4)
eval(model)

26.089793 M parameters


  0%|          | 0/4999 [00:00<?, ?it/s]

at step 800: expanded model to: 37.924929 M parameters
step <built-in function iter>:	 train loss 1.3910, val loss 1.5503, test loss 1.5542


In [None]:

model = Decoder(num_blocks=12, d_model=512, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train(model, lr=1e-3, min_lr=1e-4)
eval(model)

37.923905 M parameters


  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3977, val loss 1.5536, test loss 1.5493


In [None]:
model = Decoder(num_blocks=4, d_model=512, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train(model, lr=1e-3, min_lr=1e-4, max_it=5000)
eval(model)

12.729409 M parameters


  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3588, val loss 1.5308, test loss 1.5267


In [None]:
train(model, lr=5e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.2162, val loss 1.4678, test loss 1.4707


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.1403, val loss 1.4538, test loss 1.4568


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.0699, val loss 1.4872, test loss 1.4800


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.9942, val loss 1.5202, test loss 1.5071


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.9154, val loss 1.5392, test loss 1.5573


In [None]:
train(model, lr=5e-4, min_lr=1e-5, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.8550, val loss 1.5745, test loss 1.5831


long training with a transfer first

In [None]:
model = Decoder(num_blocks=4, d_model=384, vocab_size=len(chars), num_heads=8, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")
train_transfer(model, transfer_step=4500, target_size=512, lr=1e-3, min_lr=1e-4)
eval(model)

8.762689 M parameters


  0%|          | 0/4999 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.3308, val loss 1.4914, test loss 1.4965
at step 4500: expanded model to: 12.730433 M parameters
step <built-in function iter>:	 train loss 1.7626, val loss 1.8929, test loss 1.8901


In [None]:
train(model, lr=5e-4, min_lr=1e-4)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.2705, val loss 1.4714, test loss 1.4686


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.1942, val loss 1.4544, test loss 1.4493


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.1257, val loss 1.4656, test loss 1.4595


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 1.0627, val loss 1.4846, test loss 1.4792


In [None]:
train(model, lr=1e-4, min_lr=1e-4, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.9944, val loss 1.5142, test loss 1.5011


In [None]:
train(model, lr=1e-4, min_lr=5e-5, max_it=5000)
eval(model)

  0%|          | 0/5000 [00:00<?, ?it/s]

step <built-in function iter>:	 train loss 0.9002, val loss 1.5354, test loss 1.5442


In [None]:
# GPT-3-small model params ~125M params
model = Decoder(num_blocks=12, d_model=768, vocab_size=50257, num_heads=12, d_hidden=512*4)
model.to(DEVICE)
print(sum(p.numel() for p in model.parameters())/1e6, "M parameters")

143.455825 M parameters


In [None]:
model

Decoder(
  (SoftMax): LogSoftmax(dim=-1)
  (blocks): ModuleList(
    (0-3): 4 x Transformer(
      (heads): ModuleList(
        (0-7): 8 x AttentionHead(
          (W_Q): Linear(in_features=512, out_features=64, bias=False)
          (W_K): Linear(in_features=512, out_features=64, bias=False)
          (W_V): Linear(in_features=512, out_features=64, bias=False)
          (SoftMax): Softmax(dim=-1)
        )
      )
      (Softmax): LogSoftmax(dim=-1)
      (FFN): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.1, inplace=False)
        (2): Linear(in_features=1024, out_features=1024, bias=True)
        (3): ReLU()
        (4): Dropout(p=0.1, inplace=False)
        (5): Linear(in_features=1024, out_features=1024, bias=True)
        (6): ReLU()
        (7): Dropout(p=0.1, inplace=False)
      )
      (W_O): Linear(in_features=512, out_features=512, bias=False)
      (layernorm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (layernorm2): LayerNorm((512,), eps=1e-0