**tokenizer**

In [None]:
# Pre-tokenization pattern: alternatives for contraction suffixes ('s, 'd, 'm, 't, 'll, 've, 're),
# runs of letters, runs of digits, runs of other non-space symbols (each optionally preceded by
# one space), and whitespace runs. The \p{L} / \p{N} classes require the third-party `regex`
# module (this file imports it as `re`); the standard-library `re` does not support them.
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

In [None]:
import regex as re
from typing import List , Dict
from collections import Counter
import copy

In [None]:
class BPE():
  """Byte-level byte-pair-encoding (BPE) trainer.

  The training text is read from ``input_path``, cut at every special token so
  that no merge can span one, pre-tokenized with ``self.PAT`` and stored as a
  ``Counter`` that maps each pre-token (a tuple of ``bytes`` symbols) to its
  frequency. ``merge_iteration`` then repeatedly fuses the most frequent
  adjacent symbol pair.

  Attributes:
    txt: list of text chunks between special tokens.
    vocab: ``{token_id: bytes}``; ids 0-255 are the single bytes, followed by
      the special tokens, followed by the learned merges.
    vocab_size: current number of entries in ``vocab``.
    max_vocab_size: target size at which ``test`` stops merging.
    corpus: ``Counter`` of pre-tokens (tuples of ``bytes``) -> frequency.
    merges: learned merges as ``(left_bytes, right_bytes)``, in learning order.

  Note: the pattern uses ``\\p{L}``/``\\p{N}``, so the module-level ``re`` must be
  the third-party ``regex`` package (as imported in this file).
  """

  def __init__(self , input_path:str , vocab_size:int , special_tokens:List[str]):
    """Read the corpus and split it on the special tokens.

    Args:
      input_path: path of the UTF-8 training text.
      vocab_size: maximum vocabulary size to train towards.
      special_tokens: strings that delimit documents; they are removed from the
        training text and later appended to the vocabulary by ``init_vocb``.
    """
    with open(input_path, encoding="utf-8") as f:
      text = f.read()

    if special_tokens:
      # Longest alternative first so a special token that is a prefix of
      # another one cannot win the alternation.
      ordered = sorted(special_tokens, key=len, reverse=True)
      splitter = re.compile("|".join(map(re.escape, ordered)))
      self.txt = splitter.split(text)
    else:
      # An empty alternation would be an empty pattern; keep the text whole.
      self.txt = [text]

    self.vocab_size = 0
    self.max_vocab_size = vocab_size
    self.special_tokens = special_tokens
    self.vocab = {}
    self.PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

    self.corpus = Counter()
    self.merges = []

  def init_vocb(self):
    """Fill ``vocab`` with the 256 single-byte tokens followed by the special tokens."""
    for i in range(256):
      # bytes([i]) works on every Python 3 version (int.to_bytes() without
      # arguments requires 3.11+).
      self.vocab[i] = bytes([i])
    self.vocab_size = len(self.vocab)
    for s_t in self.special_tokens:
      self.vocab[self.vocab_size] = s_t.encode("utf-8")
      self.vocab_size += 1

  def pre_tokentize(self):
    """Count every pre-token of ``txt`` in ``corpus`` as a tuple of single-byte symbols."""
    for chunk in self.txt:
      for match in re.finditer(self.PAT, chunk):
        word = match.group().encode("utf-8")
        self.corpus[tuple(bytes([b]) for b in word)] += 1

  @staticmethod
  def _merge_word(word, pair, merged):
    """Return ``word`` with every non-overlapping occurrence of ``pair`` replaced by ``merged``."""
    out = []
    i = 0
    n = len(word)
    while i < n:
      if i + 1 < n and word[i] == pair[0] and word[i + 1] == pair[1]:
        out.append(merged)
        i += 2
      else:
        out.append(word[i])
        i += 1
    return tuple(out)

  def merge_iteration(self):
    """Learn one merge: fuse the most frequent adjacent symbol pair everywhere.

    Pairs are counted as ``(left, right)`` tuples (not as concatenated bytes),
    so two different pairs that spell the same bytes are never conflated.
    Frequency ties are broken by taking the lexicographically greatest pair.
    Does nothing when no pre-token has at least two symbols.
    """
    pair_counts = Counter()
    for word, freq in self.corpus.items():
      for pair in zip(word, word[1:]):
        pair_counts[pair] += freq

    if not pair_counts:
      return

    best_pair = max(pair_counts, key=lambda p: (pair_counts[p], p))
    merged = best_pair[0] + best_pair[1]

    # Rebuild the corpus so every word keeps its full frequency, even when the
    # pair occurs several times inside the same word.
    new_corpus = Counter()
    for word, freq in self.corpus.items():
      new_corpus[self._merge_word(word, best_pair, merged)] += freq
    self.corpus = new_corpus

    self.merges.append(best_pair)
    self.vocab[len(self.vocab)] = merged
    self.vocab_size = len(self.vocab)

  def train(self):
    """Placeholder; not implemented. Use ``test`` to run the training loop."""
    pass

  def test(self):
    """Build the base vocabulary, pre-tokenize, and merge until the target size is reached.

    Stops early when every pre-token has collapsed to a single symbol (or the
    corpus is empty), because no further merge is possible.
    """
    self.init_vocb()
    self.pre_tokentize()
    while (len(self.vocab) < self.max_vocab_size
           and any(len(word) > 1 for word in self.corpus)):
      self.merge_iteration()





**Transformer language model**

In [None]:

import torch
from torch import nn
import math
from einops import einsum ,rearrange

class Linear(nn.Module):
  """Bias-free linear layer computing ``x @ w.T`` over the last dimension.

  The weight ``w`` has shape ``(out_features, in_features)`` and is drawn from a
  normal distribution with standard deviation
  ``sigma = sqrt(2 / (in_features + out_features))`` truncated to ``[-3*sigma, 3*sigma]``.

  Args:
    in_features: size of the last dimension of the input.
    out_features: size of the last dimension of the output.
    device: device on which the weight is created (default: torch default).
    dtype: dtype of the weight (default: torch default).
  """

  def __init__(self,in_features, out_features, device=None, dtype=None):
    super().__init__()
    self.sigma = math.sqrt(2/(in_features+out_features))
    weight = torch.empty((out_features, in_features), device=device, dtype=dtype)
    # trunc_normal_ expects the standard deviation, not the variance.
    nn.init.trunc_normal_(weight, mean=0.0, std=self.sigma, a=-3*self.sigma, b=3*self.sigma)
    self.w = nn.Parameter(weight)

  def forward(self , x):
    """Apply the layer to ``x`` of shape ``(..., in_features)``; returns ``(..., out_features)``."""
    return torch.matmul(x, self.w.t())


class Embedding(nn.Module):
  """Lookup table mapping integer ids to rows of the parameter ``E``.

  ``E`` has shape ``(num_embeddings, embedding_dim)`` and is drawn from a normal
  distribution with standard deviation
  ``sigma = sqrt(2 / (num_embeddings + embedding_dim))`` truncated to ``[-3*sigma, 3*sigma]``.

  Args:
    num_embeddings: number of rows (vocabulary size).
    embedding_dim: size of each embedding vector.
    device: device on which the table is created (default: torch default).
    dtype: dtype of the table (default: torch default).
  """

  def __init__(self, num_embeddings, embedding_dim, device=None, dtype=None):
    super().__init__()

    self.sigma = math.sqrt(2/(num_embeddings+embedding_dim))
    table = torch.empty((num_embeddings, embedding_dim), device=device, dtype=dtype)
    # trunc_normal_ expects the standard deviation, not the variance.
    nn.init.trunc_normal_(table, mean=0.0, std=self.sigma, a=-3*self.sigma, b=3*self.sigma)
    self.E = nn.Parameter(table)

  def forward(self,x):
    """Return ``E[x]``: one embedding row per id, shape ``x.shape + (embedding_dim,)``."""
    return self.E[x]


class RMSNorm(nn.Module):
  """Root-mean-square layer normalisation with a learnable per-feature gain ``g``.

  Computes ``x / sqrt(mean(x**2, dim=-1) + eps) * g``. The statistics are
  computed in float32 and the result is cast back to the input dtype.

  Args:
    d_model: size of the last (feature) dimension.
    eps: constant added to the mean square for numerical stability.
    device: device on which the gain is created (default: torch default).
    dtype: dtype of the gain (default: torch default).
  """

  def __init__(self, d_model: int, eps: float = 1e-5, device=None, dtype=None):
    super().__init__()
    self.d_model = d_model
    self.eps = eps
    # Gain starts at one so a freshly built layer is a pure normalisation;
    # a near-zero random gain would wipe out the activations.
    self.g = nn.Parameter(torch.ones(d_model, device=device, dtype=dtype))

  def forward(self,x):
    """Normalise ``x`` of shape ``(..., d_model)`` over its last dimension."""
    in_dtype = x.dtype
    x = x.to(torch.float32)  # avoid overflow of x**2 in low-precision dtypes
    rms = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
    return (x / rms * self.g).to(in_dtype)
class PositionWiseFeedForward(nn.Module):
  """Gated feed-forward layer: ``w2 @ ((x_ * sigmoid(x_)) * (w3 @ x))`` with ``x_ = w1 @ x``.

  ``w1`` and ``w3`` have shape ``(dff, d_model)``; ``w2`` has shape ``(d_model, dff)``.
  All three are drawn from a normal distribution with standard deviation
  ``sigma = sqrt(2 / (d_model + dff))`` truncated to ``[-3*sigma, 3*sigma]``.

  Args:
    d_model: input and output feature size.
    dff: hidden size; defaults to ``int(8/3 * d_model)``.
    device: accepted for signature compatibility but not applied; move the
      module with ``.to(...)`` instead.
    dtype: accepted for signature compatibility but not applied.
  """

  def __init__(self, d_model ,  dff = None ,device=None, dtype=None) :
    super().__init__()
    self.dff = int(8/3 * d_model) if dff is None else dff
    self.sigma = math.sqrt(2/(d_model+self.dff))
    self.w1 = self._new_weight(self.dff, d_model)
    self.w2 = self._new_weight(d_model, self.dff)
    self.w3 = self._new_weight(self.dff, d_model)

  def _new_weight(self, rows, cols):
    """Create a ``(rows, cols)`` parameter with the truncated-normal initialisation."""
    weight = torch.empty(rows, cols)
    # trunc_normal_ expects the standard deviation, not the variance.
    nn.init.trunc_normal_(weight, mean=0.0, std=self.sigma, a=-3*self.sigma, b=3*self.sigma)
    return nn.Parameter(weight)

  def forward(self,x):
    """Apply the layer to ``x`` of shape ``(..., d_model)``; the shape is preserved."""
    gate = torch.matmul(x, self.w1.t())
    gate = torch.sigmoid(gate) * gate
    up = torch.matmul(x, self.w3.t())
    return torch.matmul(up * gate, self.w2.t())

class RotaryPositionalEmbedding(nn.Module):
  """Rotary positional embedding (RoPE).

  Consecutive feature pairs ``(x[2k], x[2k+1])`` of a vector at position ``p`` are
  rotated by the angle ``p / theta**(2k / d_k)``. The cosine and sine tables for
  positions ``0 .. max_seq_len-1`` are precomputed as non-persistent buffers
  ``cos`` and ``sin`` of shape ``(max_seq_len, d_k // 2)``.

  Args:
    theta: base of the frequency schedule.
    d_k: feature size of the vectors to rotate; must be even.
    max_seq_len: number of positions to precompute.
    device: device on which the tables are created (default: torch default).

  Raises:
    ValueError: if ``d_k`` is odd.
  """

  def __init__(self, theta: float, d_k: int, max_seq_len: int, device=None):
    super().__init__()
    if d_k % 2 != 0:
      raise ValueError(f"d_k must be even to form rotation pairs, got {d_k}")

    self.theta = theta
    self.d_k = d_k
    self.max_seq_len = max_seq_len

    positions = torch.arange(max_seq_len, device=device, dtype=torch.float32).unsqueeze(1)
    pair_index = torch.arange(d_k // 2, device=device, dtype=torch.float32).unsqueeze(0)
    angles = positions / (theta ** (2 * pair_index / d_k))
    self.register_buffer(name = "cos" , tensor=torch.cos(angles) , persistent=False)
    self.register_buffer(name = "sin" , tensor=torch.sin(angles) , persistent=False)

  def forward(self , x , token_positions):
    """Rotate ``x`` according to ``token_positions``.

    Args:
      x: tensor of shape ``(..., seq, d_k)``.
      token_positions: integer positions whose shape ``(..., seq)`` broadcasts
        against the leading dimensions of ``x``.

    Returns:
      Tensor with the shape and dtype of ``x``.
    """
    cos_s = self.cos[token_positions]
    sin_s = self.sin[token_positions]
    x_even = x[..., 0::2]
    x_odd = x[..., 1::2]
    # 2x2 rotation [[cos, -sin], [sin, cos]] applied to every (even, odd) pair.
    rot_even = cos_s * x_even - sin_s * x_odd
    rot_odd = sin_s * x_even + cos_s * x_odd
    # Interleave the rotated halves back into the original feature order.
    return torch.stack((rot_even, rot_odd), dim=-1).flatten(-2).to(x.dtype)




class Softmax(nn.Module):
  """Softmax along one dimension, shifted by the maximum before exponentiating."""

  def __init__(self):
    super().__init__()

  def forward(self , x,i = -1):
    """Return the softmax of ``x`` along dimension ``i`` (last dimension by default)."""
    # Subtracting the per-slice maximum keeps exp() from overflowing.
    peak = torch.max(x, dim=i, keepdim=True).values
    weights = torch.exp(x - peak)
    normaliser = torch.sum(weights, dim=i, keepdim=True)
    return weights / normaliser





class ScaleDotProductSelfAtention(nn.Module):
  """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d)) @ v``."""

  def __init__(self):
    super().__init__()
    self.softmax = Softmax()

  def forward(self , k , q , v , mask = None):
    """Attend from queries to keys and mix the values.

    Note the argument order: keys first, then queries, then values.

    Args:
      k: keys of shape ``(..., n_keys, d)``.
      q: queries of shape ``(..., n_queries, d)``.
      v: values of shape ``(..., n_keys, d_v)``.
      mask: optional tensor broadcastable to ``(..., n_queries, n_keys)``;
        non-zero / ``True`` entries may be attended to, the rest are excluded.
        ``None`` means every key is visible.

    Returns:
      Tensor of shape ``(..., n_queries, d_v)``.
    """
    scores = einsum(q,k , "... i k , ... j k -> ... i j")/math.sqrt(k.shape[-1])

    if mask is not None:
      # Excluded positions get -inf so they receive zero weight after softmax.
      blocked = torch.logical_not(mask.to(device=scores.device, dtype=torch.bool))
      scores = scores.masked_fill(blocked, -float("inf"))

    weights = self.softmax(scores , -1)

    return einsum(weights , v ,"... i k , ... k j -> ... i j")

class MultiHeadSelfAttention(nn.Module):
  """Causal multi-head self-attention with rotary position embedding on queries and keys.

  Args:
    d_model: model width; must be divisible by ``n_heads``.
    n_heads: number of attention heads.
    theta: RoPE base.
    max_seq_len: number of positions precomputed by the RoPE module.

  Raises:
    ValueError: if ``d_model`` is not divisible by ``n_heads``.
  """

  def __init__(self , d_model , n_heads , theta  = 10000.0, max_seq_len = 1024 ):
    super().__init__()
    if d_model % n_heads != 0:
      raise ValueError(f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")
    self.n_heads = n_heads
    self.projection_size = d_model
    self.d_h = d_model // n_heads
    self.sigma = math.sqrt(2/(d_model + self.projection_size))
    self.d_heads = d_model / n_heads
    self.wq = Linear(d_model , d_model)
    self.wk = Linear(d_model , d_model)
    self.wv = Linear(d_model , d_model)
    self.wo = Linear(d_model , d_model )
    self.rope = RotaryPositionalEmbedding(theta , self.d_h , max_seq_len)
    self.attention = ScaleDotProductSelfAtention()

  def _split_heads(self, t):
    """Reshape ``(..., seq, d_model)`` into ``(..., head, seq, d_h)``."""
    return torch.stack(t.split(self.d_h , -1) , dim =-3)

  def forward(self,x,token_positions = None):
    """Apply causal self-attention to ``x``.

    Args:
      x: tensor of shape ``(..., seq, d_model)``.
      token_positions: optional integer positions of shape ``(seq,)`` or
        ``(..., seq)``; defaults to ``0 .. seq-1``.

    Returns:
      Tensor of shape ``(..., seq, d_model)``.
    """
    k = self._split_heads(self.wk(x))
    q = self._split_heads(self.wq(x))
    v = self._split_heads(self.wv(x))

    seq_len = k.shape[-2]
    if token_positions is None:
      token_positions = torch.arange(seq_len, device=x.device)
    elif torch.is_tensor(token_positions) and token_positions.dim() > 1:
      # Insert a head axis so batched positions broadcast over the heads.
      token_positions = token_positions.unsqueeze(-2)

    k = self.rope(k, token_positions)
    q = self.rope(q, token_positions)

    # Lower-triangular mask: position i may only attend to positions <= i.
    mask = torch.tril(torch.ones((seq_len, seq_len), device=x.device)).to(torch.bool)

    attended = self.attention(k,q,v,mask)
    attended = rearrange(attended , "... head seq_len feature -> ... seq_len (head feature)")

    return self.wo(attended)




class TransformerBlock(nn.Module):
  """Pre-norm Transformer block: ``x + attention(norm1(x))`` followed by ``x + ffn(norm2(x))``.

  Args:
    d_model: model width.
    num_heads: number of attention heads.
    d_ff: hidden size of the feed-forward layer.
    theta: RoPE base passed to the attention layer.
    max_seq_len: number of positions precomputed by the attention layer's RoPE.
    device: accepted for signature compatibility; it is not forwarded to the
      sub-modules, so move the block with ``.to(...)``.
  """

  def __init__(self , d_model , num_heads , d_ff  , theta = 10000.0 , max_seq_len = 1024 ,device = "cpu"):
    super().__init__()
    self.norm1 = RMSNorm(d_model)
    self.norm2 = RMSNorm(d_model)
    self.attention = MultiHeadSelfAttention(d_model , num_heads ,theta ,max_seq_len)
    # Only the widths belong here: the feed-forward layer's third positional
    # parameter is `device`, not a sequence length.
    self.feed_forward = PositionWiseFeedForward(d_model , d_ff)

  def forward(self , x , token_positions = None):
    """Run the block on ``x`` of shape ``(..., seq, d_model)``; the shape is preserved."""
    attended = self.attention(self.norm1(x) , token_positions)
    x = x + attended

    transformed = self.feed_forward(self.norm2(x))
    return x + transformed


class MultiInputSequential(nn.Sequential):
    """``nn.Sequential`` whose modules may take several positional inputs.

    Every module is called with the current inputs. If it returns a tuple, the
    tuple becomes the inputs of the next module. If it returns a single value,
    that value replaces the first input and the remaining inputs are passed on
    unchanged, so side inputs (e.g. token positions) reach every module rather
    than only the first one.
    """

    def forward(self, *inputs):
        """Run all modules in order and return the last module's output.

        With no modules, the inputs tuple is returned unchanged.
        """
        output = inputs
        for module in self:
            output = module(*inputs)
            if isinstance(output, tuple):
                inputs = output
            else:
                inputs = (output, *inputs[1:])
        return output


class TransformerLanguageModel(nn.Module):
  """Decoder-only Transformer language model that returns unnormalised logits.

  Pipeline: token embedding -> ``num_layers`` Transformer blocks -> RMSNorm ->
  linear projection to the vocabulary.

  Args:
    vocab_size: number of token ids.
    max_seq_len: number of positions supported by the blocks' RoPE tables.
    num_layers: number of Transformer blocks.
    d_model: model width.
    num_heads: attention heads per block.
    d_ff: hidden size of each block's feed-forward layer.
    theta: RoPE base.
  """

  def __init__(self,vocab_size , max_seq_len ,num_layers,d_model , num_heads ,d_ff , theta=10000.0 ):
    super().__init__()
    self.embedding = Embedding(vocab_size,d_model)
    self.layers = MultiInputSequential(*[TransformerBlock(d_model , num_heads, d_ff ,theta , max_seq_len) for _ in range(num_layers)])
    self.norm = RMSNorm(d_model)
    self.linear = Linear(d_model , vocab_size)
    # Not applied in forward(); kept so callers can turn logits into probabilities.
    self.classifier = Softmax()

  def forward(self , x, token_positions = None):
    """Map token ids ``x`` of shape ``(..., seq)`` to logits of shape ``(..., seq, vocab_size)``."""
    x = self.embedding(x)

    x = self.layers(x , token_positions)
    x = self.norm(x)
    x = self.linear(x)
    return x



class CrossEntropyLoss(nn.Module):
  """Mean cross-entropy between logits and integer class labels."""

  def __init__(self):
    super().__init__()

  def forward(self,x , labels):
    """Compute the loss.

    Args:
      x: logits of shape ``(..., num_classes)``.
      labels: integer class ids of shape ``x.shape[:-1]``; any integer dtype.

    Returns:
      Scalar tensor: ``-log softmax(x)[label]`` averaged over all positions.
    """
    # Shift by the row maximum so exp() cannot overflow.
    shifted = x - torch.max(x , dim = -1 , keepdim=True).values
    # gather() requires int64 indices; reshape() also accepts non-contiguous labels.
    index = labels.to(device=x.device, dtype=torch.long).reshape(*x.shape[:-1], 1)
    log_norm = torch.log(torch.sum(torch.exp(shifted) , dim = -1 ,keepdim=True))
    target_logit = torch.gather(shifted , -1 , index)
    return (log_norm - target_logit).mean()



In [None]:
from collections.abc import Callable, Iterable
from typing import Optional
import torch
import math


import torch
from torch import nn
import math
from einops import einsum ,rearrange
from torch.utils.data import Dataset
import numpy as np


class Loader(Dataset):
    """Random-batch sampler over a 1-D array of token ids.

    Each call to ``__getitem__`` draws ``batch_size`` random windows of
    ``context_length`` tokens and returns them together with the same windows
    shifted right by one token (the next-token targets).

    Args:
        data: 1-D array-like supporting ``len`` and NumPy integer-array indexing.
        batch_size: number of windows per batch.
        context_length: number of tokens per window.
        device: device the returned tensors are moved to (``None`` keeps CPU).
    """

    def __init__(self, data , batch_size , context_length , device = None):
        self.data = data
        self.batch_size = batch_size
        self.context_length = context_length
        self.device = device

    def __len__(self):
        """Number of valid window start positions."""
        return len(self.data) - self.context_length

    def __getitem__(self, index=None):
        """Return one random batch ``(x, y)``, each of shape ``(batch_size, context_length)``.

        ``index`` is accepted for compatibility with the ``Dataset`` protocol and
        ignored: every call samples fresh random start positions.

        Raises:
            ValueError: if the data holds fewer than ``context_length + 1`` tokens.
        """
        num_starts = len(self)
        if num_starts <= 0:
            raise ValueError(
                f"need at least {self.context_length + 1} tokens, got {len(self.data)}"
            )
        # The largest start is len(data) - context_length - 1, so the last target
        # index (start + context_length) is still inside the data.
        starts = np.random.randint(0, num_starts, (self.batch_size, 1))
        offsets = np.expand_dims(np.arange(self.context_length), axis=0)
        x_idx = starts + offsets

        x = self.data[x_idx]
        y = self.data[x_idx + 1]
        return (torch.from_numpy(x).to(self.device), torch.from_numpy(y).to(self.device))

class AdamW(torch.optim.Optimizer):
  """AdamW with built-in gradient clipping and a warm-up + cosine learning-rate schedule.

  On every ``step`` each parameter group is first clipped to a global L2 norm of
  1.0 with ``gardient_clipping``. The learning rate actually used is computed by
  ``cosine_annealing_schhduler`` from the per-parameter step counter and written
  back to ``group["lr"]``; the ``lr`` constructor argument is therefore only the
  initial value stored in the group.

  Args:
    params: iterable of parameters or parameter-group dicts.
    lr: initial value of ``group["lr"]``; must be non-negative.
    betas: decay rates of the first and second moment estimates.
    eps: constant added to the denominator of the update.
    weight_decay: decoupled weight-decay coefficient.
    a_min: final (minimum) learning rate of the schedule.
    a_max: peak learning rate of the schedule.
    Tw: number of warm-up steps.
    Tc: step at which the cosine decay ends.

  Raises:
    ValueError: if ``lr`` is negative.
  """

  def __init__(self, params , lr=1e-3 ,betas = (0.9 , 0.999) ,eps = 1e-8 , weight_decay = 1e-4 , a_min  = 1e-4, a_max = 1e-2 , Tw = 100 ,Tc = 3000):
    if lr < 0:
      raise ValueError(f"Invalid learning rate: {lr}")
    defaults = {"lr": lr , "b1" :betas[0] , "b2" :betas[1]  , "eps" : eps , "lambda" : weight_decay , "a_min" :a_min , "a_max" : a_max , "tw":Tw , "tc" :Tc}
    super().__init__(params, defaults)

    for group in self.param_groups:
      for p in group["params"]:
        self.state[p] = {"m" :torch.zeros_like(p.data) , "v" : torch.zeros_like(p.data)}

  def step(self, closure: Optional[Callable] = None ):
    """Perform one optimisation step.

    Args:
      closure: optional callable that re-evaluates the model and returns the loss.

    Returns:
      The value returned by ``closure``, or ``None`` when no closure is given.
    """
    loss = None if closure is None else closure()

    for group in self.param_groups:
      gardient_clipping(group["params"] , 1.0)
      b1 , b2 = group["b1"] , group["b2"]

      for p in group["params"]:
        if p.grad is None:
          continue

        state = self.state[p]
        if "m" not in state:
          # Parameters registered after construction (e.g. via add_param_group)
          # have no moment buffers yet.
          state["m"] = torch.zeros_like(p.data)
          state["v"] = torch.zeros_like(p.data)

        t = state.get("t", 1)  # 1-based step counter of this parameter
        lr = cosine_annealing_schhduler(group["a_min"] , group["a_max"], t , group["tw"] ,group["tc"])
        group["lr"] = lr

        grad = p.grad.data
        m = state["m"] * b1 + (1-b1) * grad
        v = state["v"] * b2 + (1-b2) * grad**2
        state["m"] , state["v"] = m , v

        # Bias correction folded into the step size.
        alpha_t = lr * math.sqrt(1-b2**t)/(1-b1**t)
        p.data -= alpha_t * m /(torch.sqrt(v) + group["eps"])
        # Decoupled weight decay, applied after the Adam update.
        p.data -= lr * group["lambda"] * p.data

        state["t"] = t + 1

    return loss



def cosine_annealing_schhduler(a_min , a_max, t , Tw ,Tc):
  """Learning rate at step ``t``: linear warm-up, cosine decay, then constant.

  * ``t < Tw``: ``t / Tw * a_max``.
  * ``Tw <= t <= Tc``: cosine interpolation from ``a_max`` down to ``a_min``.
  * otherwise: ``a_min``.
  """
  if t < Tw:
    return t /Tw * a_max

  if not (Tw <= t <= Tc):
    return a_min

  progress = (t-Tw)/(Tc-Tw)
  cosine = math.cos(progress*math.pi)
  return a_min + 1/2*(1 + cosine) *(a_max - a_min)


def gardient_clipping(parameters , max_l2_norm ,eps = 10e-6):
  """Rescale gradients so their combined L2 norm does not exceed ``max_l2_norm``.

  The norm is taken over the gradients of all given parameters together. When
  it exceeds ``max_l2_norm`` every gradient is replaced by
  ``grad * max_l2_norm / (norm + eps)``; otherwise nothing changes.

  Args:
    parameters: iterable of parameters (a list or a one-shot generator);
      parameters without a gradient are skipped.
    max_l2_norm: upper bound on the combined gradient norm.
    eps: constant added to the norm in the scaling factor.
  """
  # Materialise once: the parameters are needed for both the norm and the
  # rescaling, and a generator can only be consumed a single time.
  with_grad = [p for p in parameters if p.grad is not None]

  if not with_grad:
    return

  l2_norm = torch.sqrt(sum(torch.sum(p.grad ** 2) for p in with_grad))

  if l2_norm > max_l2_norm:
    scale = max_l2_norm/(l2_norm+eps)
    for p in with_grad:
      p.grad = p.grad * scale



def save_checkpoint(model,optimizer, iteration , loss , out):
    """Serialise model state, optimizer state, iteration counter and loss to ``out`` with ``torch.save``."""
    checkpoint = {}
    checkpoint["model"] = model.state_dict()
    checkpoint["optimizer"] = optimizer.state_dict()
    checkpoint["iteration"] = iteration
    checkpoint["loss"] = loss
    torch.save(checkpoint, out)

def load_checkpoint(src, model, optimizer):
  """Restore ``model`` and ``optimizer`` from the checkpoint at ``src`` and return its iteration counter."""
  checkpoint = torch.load(src)

  # Restore the model first, then the optimizer, from their respective entries.
  for target, entry in ((model, "model"), (optimizer, "optimizer")):
    target.load_state_dict(checkpoint[entry])
  return checkpoint["iteration"]


In [None]:
def decode(model  ,prompt , softmax , temperature ,nucleus , token_number , max_token_number , eos_token = 19, response = None):
  """Generate tokens with temperature scaling and nucleus (top-p) sampling.

  Tokens are sampled one at a time until ``token_number`` exceeds
  ``max_token_number`` or the last generated token equals ``eos_token``.

  Args:
    model: callable mapping the token list to logits; the logits of the last
      position are read with ``model(prompt)[-1]``.
    prompt: list of token ids. It is extended in place with every generated token.
    softmax: callable turning a 1-D logit tensor into probabilities.
    temperature: divisor applied to the logits before the softmax.
    nucleus: cumulative probability mass kept for sampling; the most probable
      tokens are taken until their total reaches this value.
    token_number: index of the first token to generate.
    max_token_number: index of the last token to generate (inclusive).
    eos_token: id that ends generation once it has been produced.
    response: optional list to append the generated ids to; a new list is
      created for each call when omitted.

  Returns:
    The list of generated token ids (``response``).
  """
  if response is None:
    # A fresh list per call; a list literal as default would be shared by all calls.
    response = []

  with torch.no_grad():
    while token_number <= max_token_number:
      if response and response[-1] == eos_token:
        break

      probs = softmax(model(prompt)[-1]/temperature)
      sorted_probs , sorted_ids = torch.sort(probs , descending = True)

      # Keep a token while the mass of the more probable tokens before it is
      # still below the nucleus threshold; always keep at least one token.
      mass_before = torch.cumsum(sorted_probs , dim = 0) - sorted_probs
      keep = max(1 , int((mass_before < nucleus).sum().item()))

      top_p = sorted_probs[:keep]
      top_p = top_p/torch.sum(top_p)

      pick = torch.multinomial(top_p , 1).item()
      next_token = sorted_ids[pick].item()
      response.append(next_token)
      prompt.append(next_token)
      token_number += 1

  return response






**training**

In [None]:
#from components import *
import numpy as np
import os
import wandb
# Prefer the WANDB_API_KEY environment variable so the real key never has to be
# pasted into the notebook; the placeholder remains the fallback value.
key = os.environ.get("WANDB_API_KEY", "put_key_here")
wandb.login(key=key)

def train_language_model(project ,checkpointig_path,checkpoint_every_n_iterations,data ,number_of_iterations, vocab_size, max_seq_len, num_layers, d_model, num_heads, d_ff,  batch_size = 128,theta = 10000, lr= 0.001, betas = (0.9, 0.999), eps = 1e-8, weight_decay= 0.0001,src = None   ,device = "cpu"):
  """Train a ``TransformerLanguageModel`` on a token array and log the run to Weights & Biases.

  Args:
    project: wandb project name.
    checkpointig_path: directory receiving ``ckpt_n_<i>.pt`` files (created if missing).
    checkpoint_every_n_iterations: a checkpoint is written whenever
      ``i % checkpoint_every_n_iterations == 0``, before the optimizer step of iteration ``i``.
    data: ``np.ndarray`` of token ids, or a path to a raw int32 file that is memory-mapped.
    number_of_iterations: number of training steps.
    vocab_size, max_seq_len, num_layers, d_model, num_heads, d_ff, theta:
      model hyper-parameters, passed to ``TransformerLanguageModel``.
    batch_size: number of sequences per step.
    lr, betas, eps, weight_decay: passed to ``AdamW`` (which uses warm-up 30 / cosine end 100 here).
    src: optional checkpoint to restore model and optimizer state from.
      The stored iteration counter is not used; the loop always starts at 0.
    device: device for the model and the batches.

  Returns:
    The trained model.
  """
  model = TransformerLanguageModel(vocab_size, max_seq_len,num_layers,d_model,num_heads, d_ff, theta)
  model.to(device)

  optimizer = AdamW(model.parameters(),lr,betas,eps,weight_decay , Tw = 30 , Tc = 100)
  CE = CrossEntropyLoss()
  if src:
    # map_location lets a checkpoint written on another device be restored here.
    ckpt = torch.load(src, map_location=torch.device(device))

    model.load_state_dict(ckpt["model"])
    optimizer.load_state_dict(ckpt["optimizer"])

  if not isinstance(data , np.ndarray):
    # Read-only mapping: training must never modify the token file.
    data = np.memmap(data , dtype=np.int32, mode="r")

  loader = Loader(data , batch_size,max_seq_len , device)

  config = {
      "learning rate" : lr,
      "betas" : betas,
      "weight decay" : weight_decay ,
      "iterations" : number_of_iterations

  }

  os.makedirs(checkpointig_path, exist_ok=True)

  with wandb.init(project=project, config=config) as run:

    for i in range(number_of_iterations):

      optimizer.zero_grad()
      x , y = loader.__getitem__()
      logits = model(x)
      loss = CE(logits,y)
      loss.backward()
      loss_value = loss.item()
      print("loss : " , loss_value)
      if i % checkpoint_every_n_iterations == 0 :
        # detach() so the checkpoint stores the value without the autograd graph.
        save_checkpoint(model, optimizer , i,loss.detach() , os.path.join(checkpointig_path , "ckpt_n_"+str(i)+".pt"))

      optimizer.step()
      group = optimizer.param_groups[0]
      run.log({"loss" : loss_value , "lr": group["lr"] , "b1" :group["b1"] , "b2" :group["b2"]  , "eps" : group["eps"] , "lambda" : group["lambda"]})

  return model









[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
from datasets import load_dataset

# Load the "train" split of the "roneneldan/TinyStories" dataset; `ds` is iterated
# by the cells below, which read each example's "text" field.
ds = load_dataset("roneneldan/TinyStories", split="train")


In [None]:
def text_iterator():
    """Yield the ``"text"`` field of every example in the module-level ``ds``."""
    yield from (example["text"] for example in ds)


In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Initialize tokenizer
# NOTE(review): `BPE` here is `tokenizers.models.BPE`, imported just above in this cell;
# from this point on it shadows the hand-written `BPE` class defined earlier in this file.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
#tokenizer = Tokenizer.from_file("/content/tinystories_bpe.json")
tokenizer.pre_tokenizer = Whitespace()

# Trainer
# NOTE(review): vocab_size=1000 here, while the training configuration further down sets
# vocab_size = 10000 for the model — confirm which value is intended.
trainer = BpeTrainer(
    vocab_size=1000,
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
)

# Train
# Trains on the story texts produced by text_iterator().
tokenizer.train_from_iterator(text_iterator(), trainer)


In [None]:
# Write the trained tokenizer to disk and look up the ids of the <bos>/<eos> special tokens.
tokenizer.save("tinystories_bpe.json")
bos = tokenizer.token_to_id("<bos>")
eos = tokenizer.token_to_id("<eos>")

In [None]:


# Token count of the whole dataset: the encoded ids of every story plus 2 per story.
total_tokens = sum(len(tokenizer.encode(ex["text"]).ids) + 2 for ex in ds)

print("Total tokens:", total_tokens)


In [None]:
import numpy as np

mmap_path = "tinystories_tokens.memmap"

# Size the file from the count computed above instead of a hard-coded number, so it
# stays correct when the tokenizer or the dataset changes.
tokens = np.memmap(
    mmap_path,
    dtype=np.int32,
    mode="w+",
    shape=(total_tokens,)
)
idx = 0

# Each story is stored as <bos>, its token ids, <eos>.
for ex in ds:
    tokens[idx] = bos
    idx += 1

    ids = tokenizer.encode(ex["text"]).ids
    tokens[idx:idx+len(ids)] = ids
    idx += len(ids)

    tokens[idx] = eos
    idx += 1

if idx != total_tokens:
    raise RuntimeError(f"wrote {idx} tokens but the file was sized for {total_tokens}")

tokens.flush()


In [None]:
!mkdir checkpoints

In [None]:
# Training configuration.
number_of_iterations = 1000
vocab_size = 10000
max_seq_len = 256
num_layers = 4
d_model = 512
num_heads = 16
d_ff = 1344
batch_size = 64
theta = 10000
lr= 0.001
betas = (0.9, 0.999)
eps = 1e-8
weight_decay= 0.0001
# No trailing comma: `None ,` would create the truthy tuple (None,).
src = None
# Fall back to the CPU when no GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
data = "/content/tinystories_tokens.memmap"


# Every hyper-parameter defined above is passed explicitly so editing it takes effect.
train_language_model("test","checkpoints",10,data , number_of_iterations , vocab_size , max_seq_len , num_layers , d_model , num_heads,d_ff,batch_size , theta = theta , lr = lr , betas = betas , eps = eps , weight_decay = weight_decay , src = src , device = device)