In [None]:
# Install the packages used further down: gdown (weight download) and
# transformers (tokenizer cell); sentencepiece is presumably required by that tokenizer.
!pip install gdown sentencepiece transformers



In [None]:
# Download the weight archive from Google Drive; per the saved output it is
# written to model.zip in the working directory.
!gdown https://drive.google.com/uc?id=1LV2Ye7jIs0IXzXqy5xBYlOTdVbEOot4L
# !gdown https://drive.google.com/uc?id=1C_vK6X78Pfp80C5AtTWR2HFCxhg378kX

Downloading...
From: https://drive.google.com/uc?id=1LV2Ye7jIs0IXzXqy5xBYlOTdVbEOot4L
To: /content/model.zip
100% 6.17G/6.17G [00:59<00:00, 104MB/s]


In [None]:
# -p keeps this cell idempotent: a plain mkdir errors out when the notebook is
# re-run and the directory already exists.
!mkdir -p model

In [None]:
# Unpack the archive into ./model. The archive carries its own top-level
# "model/" folder, so the .npy files end up under model/model/.
!unzip model.zip -d model

Archive:  model.zip
   creating: model/model/
  inflating: model/model/model.layers.14.input_layernorm.weight.npy  
  inflating: model/model/model.layers.13.self_attn.o_proj.weight.npy  
  inflating: model/model/model.layers.17.mlp.gate_proj.weight.npy  
  inflating: model/model/model.layers.18.mlp.gate_proj.weight.npy  
  inflating: model/model/model.layers.16.self_attn.o_proj.weight.npy  
  inflating: model/model/model.layers.3.mlp.up_proj.weight.npy  
  inflating: model/model/model.layers.23.mlp.up_proj.weight.npy  
  inflating: model/model/model.layers.25.mlp.gate_proj.weight.npy  
  inflating: model/model/model.layers.8.self_attn.o_proj.weight.npy  
  inflating: model/model/model.layers.2.mlp.down_proj.weight.npy  
  inflating: model/model/model.layers.20.mlp.down_proj.weight.npy  
  inflating: model/model/model.layers.17.self_attn.q_proj.weight.npy  
  inflating: model/model/model.layers.4.self_attn.q_proj.weight.npy  
  inflating: model/model/model.layers.25.self_attn.v_proj.wei

In [None]:
import numpy as np
import cupy as cp
import math
import inspect
from tqdm import tqdm

In [None]:
# open llama 3b
# Model hyper-parameters plus per-component device switches: attn_gpu, mlp_gpu,
# embed_gpu and lm_head_gpu choose CuPy (True) or NumPy (False) for that part.
params = {"dim":3200, "n_layers": 26, "n_heads": 32, "vocab_size":32000, "eps":1e-6, "ctx_len": 2048, "dropout": 0.0, "hidden_dim":8640, "attn_gpu": True, "mlp_gpu": True, "embed_gpu":True, "lm_head_gpu":True}
# llama 2 7b
# NOTE(review): this alternative reuses hidden_dim=8640 and eps=1e-6 from the 3B
# config -- confirm they match the Llama-2-7B checkpoint before enabling it.
# params = {"dim":4096, "n_layers": 32, "n_heads": 32, "vocab_size":32000, "eps":1e-6, "ctx_len": 2048, "dropout": 0.0, "hidden_dim":8640, "attn_gpu": False, "mlp_gpu": True, "embed_gpu":True, "lm_head_gpu":True}

In [None]:
class Module:
  """Minimal base class: makes instances callable and counts their parameters."""

  def __call__(self, *args, **kwargs):
    # Calling a module is shorthand for running its forward pass.
    return self.forward(*args, **kwargs)

  def parameters(self):
    """Return the number of array elements held by this module and its sub-modules.

    Arrays stored under the attribute names "c", "s" and "mask" are skipped.
    """
    skipped = ("c", "s", "mask")
    count = 0
    for name, member in inspect.getmembers(self):
      if isinstance(member, Module):
        # Recurse into nested modules.
        count += member.parameters()
      elif isinstance(member, (np.ndarray, cp.ndarray)) and name not in skipped:
        count += member.size
    return count


In [None]:
class ModuleList(Module):
  """Ordered container of modules that can be iterated over and parameter-counted."""

  def __init__(self, lst):
    """Store the list of modules.

    lst: list of Module instances, kept by reference in ``self.lst``.
    """
    self.lst = lst
    # Cursor used only by the legacy ``__next__`` protocol below.
    self.id = 0

  def __iter__(self):
    # Hand out a fresh iterator on every loop. Returning ``self`` shared one
    # cursor between all loops, so a loop that stopped early (break/exception)
    # made the next loop resume from the middle of the list.
    return iter(self.lst)

  def __next__(self):
    """Legacy cursor-based stepping, kept for callers that use ``next(obj)`` directly."""
    if self.id == len(self.lst):
      self.id = 0
      raise StopIteration
    self.id += 1
    return self.lst[self.id - 1]

  def parameters(self):
    """Return the summed parameter count of every contained module."""
    return sum(l.parameters() for l in self.lst)


In [None]:
def k_init(*shape, a=1, gpu = False):
    """Draw a float16 array of the given shape from a symmetric uniform distribution.

    The bound is sqrt(a / max(shape[0], shape[1])), so at least two dimensions
    are required. With gpu=True the array is created with CuPy, otherwise NumPy.
    """
    bound = math.sqrt(a / max(shape[0], shape[1]))
    xp = cp if gpu else np
    return xp.random.uniform(-bound, bound, shape).astype(xp.float16)

In [None]:
class linear(Module):
  """Bias-free linear layer computing ``x @ w``."""

  def __init__(self, inpt, out, filename="", gpu = False):
    """Create the weight matrix.

    inpt, out: input and output feature counts, used only for random init.
    filename: path of a saved array; when given, it is loaded and transposed.
    gpu: load/create the weight with CuPy instead of NumPy.
    """
    if filename != "":
      loader = cp.load if gpu else np.load
      # The stored matrix is transposed so that forward can right-multiply.
      self.w = loader(filename).T
    else:
      self.w = k_init(inpt, out, gpu = gpu)

  def forward(self, x):
    """Return the matrix product of x with the stored weight."""
    weight = self.w
    return x @ weight

In [None]:
class embedding(Module):
  """Embedding table: maps integer token ids to rows of a weight matrix."""

  def __init__(self,vocab,n_embd, filename="", gpu = False):
    """Create the table.

    vocab, n_embd: table shape, used only when no file is given.
    filename: path of a saved array to load as the table (no transpose).
    gpu: keep the table in a CuPy array instead of a NumPy array.
    """
    if filename == "":
      # Honour the requested device; this path used to force gpu=False and
      # silently ignored the argument, unlike the file-loading path below.
      self.w = k_init(vocab, n_embd, gpu = gpu)
    else:
      if gpu:
        self.w = cp.load(filename)
      else:
        self.w = np.load(filename)

  def forward(self, x):
    """Return the rows of the table selected by the index array x."""
    return self.w[x]

In [None]:
def softmax(x, dim=-1, gpu = False):
  """Numerically stable softmax along axis ``dim``.

  x: array of scores (NumPy, or CuPy when gpu=True).
  dim: axis that is normalised to sum to one.
  gpu: use CuPy for the computation instead of NumPy.

  The per-slice maximum is subtracted before exponentiating. The result is
  mathematically unchanged, but large scores no longer overflow to inf and
  turn the output into NaN. -inf entries (masked positions) still map to 0.
  """
  xp = cp if gpu else np
  shifted = x - xp.max(x, axis=dim, keepdims=True)
  ex = xp.exp(shifted)
  return ex / ex.sum(axis = dim, keepdims= True)

In [None]:
def sigmoid(x):
  """Element-wise logistic function 1 / (1 + exp(-x)).

  NumPy arrays are processed with NumPy; everything else keeps going through
  CuPy as before. Previously cp.exp was used unconditionally, which breaks
  the CPU path (mlp_gpu=False) where the input is a NumPy array.
  """
  xp = np if isinstance(x, np.ndarray) else cp
  return 1 / (1 + xp.exp(-x))

def silu(x):
  """SiLU activation: the input multiplied by its own sigmoid."""
  gate = sigmoid(x)
  return x * gate

In [None]:
class dropout(Module):
  """Inverted dropout: zeroes elements with probability p and rescales the rest."""

  def __init__(self, p):
    # p is the probability of dropping an element.
    self.p = p

  def forward(self,x):
    """Return x unchanged when p == 0, otherwise a randomly masked, rescaled copy."""
    drop_prob = self.p
    if drop_prob == 0:
      return x
    # The keep-mask is always drawn with NumPy, one Bernoulli(1 - p) sample per element.
    keep = np.random.binomial(1, 1 - drop_prob, x.shape)
    scaled = x * keep
    scaled /= (1 - drop_prob)
    return scaled

In [None]:
class rmsnorm(Module):
  """Root-mean-square normalisation over the last axis with a learned scale."""

  def __init__(self, dim,filename="",eps=1e-6, gpu = False):
    """Create the scale vector.

    dim: length of the scale vector when no file is given (initialised to ones).
    filename: path of a saved scale vector to load instead.
    eps: constant added to the mean square before the square root.
    gpu: keep the scale in CuPy and run forward with CuPy.
    """
    self.eps = eps
    self.gpu = gpu
    xp = cp if gpu else np
    self.weight = xp.ones(dim) if filename == "" else xp.load(filename)

  def forward(self, x):
    """Normalise x in float64, apply the scale and return the result as float16."""
    xp = cp if self.gpu else np
    x64 = x.astype(xp.float64)
    mean_square = xp.power(x64, 2).mean(axis=-1, keepdims=True)
    normed = x64 / xp.sqrt(mean_square + self.eps)
    return (normed * self.weight).astype(xp.float16)

In [None]:
class RoPE(Module):
  """Rotary position embedding applied to per-head feature pairs."""

  def __init__(self, params, gpu=False):
    # Per-head feature size and maximum sequence length come from the config.
    self.dim = params["dim"] // params["n_heads"]
    self.ctx_len = params["ctx_len"]
    self.gpu = gpu

  @staticmethod
  def build_cs_cache(dim, ctx_len, gpu =False):
    """Precompute float16 cos/sin tables of shape (ctx_len, dim // 2).

    Returns a (cos, sin) tuple of NumPy arrays, or, with gpu=True, a single
    CuPy array stacking both that unpacks into the same pair.
    """
    theta = np.power(10000, -2*(np.arange(dim//2))/dim)
    positions = np.arange(ctx_len)
    angles = np.outer(positions, theta)
    tables = np.cos(angles).astype(np.float16), np.sin(angles).astype(np.float16)
    return cp.asarray(tables) if gpu else tables

  def forward(self, x,c ,s):
    """
    Expects x to be of shape (B, T, n_heads, dim)

    Rotates each adjacent feature pair (2i, 2i+1) by the angle stored for its
    position in the cos table c and sin table s; the output has x's shape.
    NOTE(review): this is the interleaved pairing; confirm the loaded
    checkpoint was exported for it rather than a split-half layout.
    """
    T = x.shape[1]
    half = self.dim // 2
    pairs = x.reshape(*x.shape[:-1], half, 2)

    # Broadcast the first T table rows over batch and heads.
    cos = c[:T].reshape(1, T, 1, half)
    sin = s[:T].reshape(1, T, 1, half)

    first, second = pairs[..., 0], pairs[..., 1]
    xp = cp if self.gpu else np
    rotated = xp.stack([
        first * cos - second * sin,
        second * cos + first * sin
    ], axis=-1)
    return rotated.reshape(*x.shape)

In [None]:
class MultiheadAttention(Module):
  """Multi-head self-attention with rotary embeddings and an additive mask."""

  def __init__(self,params=params, filename = {}, gpu = False):
    """Build the four projections.

    params: config dict; reads "dim", "n_heads", "eps" and "ctx_len".
    filename: dict with "q_proj", "k_proj", "v_proj", "o_proj" weight paths;
      an empty dict selects random initialisation.
    gpu: keep weights and run the computation with CuPy.
    """
    self.dim = params["dim"]
    self.n_heads = params["n_heads"]
    self.eps = params["eps"]
    self.ctx_len = params["ctx_len"]

    def projection(key):
      # Random init without weight files, otherwise load the named file.
      if len(filename.keys()) == 0:
        return linear(self.dim, self.dim, gpu = gpu)
      return linear(self.dim, self.dim, filename = filename[key], gpu = gpu)

    self.query = projection("q_proj")
    self.key = projection("k_proj")
    self.value = projection("v_proj")
    self.o = projection("o_proj")
    self.gpu = gpu
    self.rope = RoPE(params, gpu = gpu)

  def forward(self, x, mask, c, s):
    """Attend over x of shape (B, T, C); mask is added to the scores, c/s are the RoPE tables."""
    # Move the input to the device this layer's weights live on.
    x = cp.asarray(x) if self.gpu else cp.asnumpy(x)

    B, T, C = x.shape
    head_dim = C // self.n_heads

    def split_heads(t):
      return t.reshape(B, T, self.n_heads, head_dim)

    q = split_heads(self.query(x))
    k = split_heads(self.key(x))
    v = split_heads(self.value(x))

    # Rotary embedding is applied to queries and keys only, then heads move to axis 1.
    q = self.rope(q, c, s).transpose((0,2,1,3))
    k = self.rope(k, c, s).transpose((0,2,1,3))
    v = v.transpose((0,2,1,3))

    scores = (q @ k.transpose((0,1,3,2))) / math.sqrt(head_dim)
    scores = scores + mask[:,:,:T,:T]
    # Softmax runs in float64, the weights go back to float16 afterwards.
    weights = softmax(scores.astype(np.float64), gpu=self.gpu).astype(np.float16)
    context = (weights @ v).transpose((0,2,1,3)).reshape((B,T,C))
    return self.o(context)

In [None]:
class MLP(Module):
  """Gated feed-forward block: down_proj(silu(gate_proj(x)) * up_proj(x))."""

  def __init__(self,params, filename = {}, gpu = True):
    """Build the three projections.

    params: config dict; reads "dim" and "hidden_dim".
    filename: dict with "gate_proj", "down_proj", "up_proj" weight paths;
      an empty dict selects random initialisation.
    gpu: keep weights and run the computation with CuPy.
    """
    self.gpu = gpu
    self.dim = params["dim"]
    self.hidden = params["hidden_dim"]

    def projection(n_in, n_out, key):
      # Random init without weight files, otherwise load the named file.
      if len(filename.keys()) == 0:
        return linear(n_in, n_out, gpu = gpu)
      return linear(n_in, n_out, gpu = gpu, filename = filename[key])

    self.w1 = projection(self.dim, self.hidden, "gate_proj")
    self.w2 = projection(self.hidden, self.dim, "down_proj")
    self.w3 = projection(self.dim, self.hidden, "up_proj")

  def forward(self, x):
    """Apply the gated feed-forward transform to x."""
    # Move the input to the device this layer's weights live on.
    to_device = cp.asarray if self.gpu else cp.asnumpy
    x = to_device(x)
    gate = silu(self.w1(x))
    up = self.w3(x)
    return self.w2(gate * up)

In [None]:
class TransformerBlock(Module):
  """One pre-norm decoder block: attention and MLP, each with a residual connection."""

  def __init__(self, params, filename = {}):
    """Build the two norms, the attention layer and the MLP.

    params: config dict; reads "dim", "eps", "attn_gpu" and "mlp_gpu" here.
    filename: dict with "input_layernorm", "post_attention_layernorm", "attn"
      and "mlp" entries naming weight files; an empty dict selects random init.
    """
    attn_gpu = params["attn_gpu"]
    mlp_gpu = params["mlp_gpu"]
    eps = params["eps"]
    if len(filename.keys()) == 0:
      # Follow the device flags here as well. The random-init branch used
      # fixed devices (gpu=True / CPU defaults), so forward() mixed NumPy and
      # CuPy operands whenever the flags disagreed with those constants.
      self.post_attn_norm = rmsnorm(params['dim'], eps = eps, gpu = mlp_gpu)
      self.post_inpt = rmsnorm(params["dim"], eps = eps, gpu = attn_gpu)
      self.attn = MultiheadAttention(params, gpu = attn_gpu)
      self.mlp = MLP(params, gpu = mlp_gpu)
    else:
      self.post_attn_norm = rmsnorm(params['dim'], eps = eps, gpu = mlp_gpu, filename = filename["post_attention_layernorm"])
      self.post_inpt = rmsnorm(params["dim"], eps = eps, gpu = attn_gpu, filename = filename["input_layernorm"])
      self.attn = MultiheadAttention(params, filename = filename["attn"], gpu = attn_gpu)
      self.mlp = MLP(params, filename = filename["mlp"], gpu = mlp_gpu)
    self.params = params

  def forward(self, x, mask, c, s):
    """Run the block on x and return the result as a NumPy array.

    mask is the additive attention mask, c and s are the RoPE cos/sin tables.
    """
    # Attention half runs on the device selected by attn_gpu.
    if self.params["attn_gpu"]:
      x = cp.asarray(x)
    else:
      x = cp.asnumpy(x)

    x = x + self.attn(self.post_inpt(x), mask, c,s)

    # MLP half runs on the device selected by mlp_gpu.
    if self.params["mlp_gpu"]:
      x = cp.asarray(x)
    else:
      x = cp.asnumpy(x)

    x = x + self.mlp(self.post_attn_norm(x))
    return cp.asnumpy(x)

In [None]:
class llama(Module):
  """Decoder-only LLaMA-style language model built from the layers above."""

  def __init__(self, params, filename= {}):
    """Build the embedding, the decoder blocks, the final norm and the LM head.

    params: config dict (sizes plus the *_gpu device flags).
    filename: dict with "w_embed", "blocks", "final_norm" and "lm_head" entries
      naming weight files; an empty dict selects random initialisation.
    """
    head_gpu = params["lm_head_gpu"]
    if len(filename.keys()) == 0:
      self.w_embed = embedding(params["vocab_size"], params["dim"], gpu = params["embed_gpu"])
      self.layers = ModuleList([TransformerBlock(params) for _ in range(params["n_layers"])])
      # The final norm shares the LM head's device, as in the file-loading
      # branch; it used to be created on the CPU regardless of the flag.
      self.norm = rmsnorm(params["dim"], eps = params["eps"], gpu = head_gpu)
      # linear() has no bias parameter; passing bias=False raised a TypeError.
      self.lm_head = linear(params["dim"], params["vocab_size"], gpu = head_gpu)
    else:
      self.w_embed = embedding(params["vocab_size"], params["dim"], filename=filename["w_embed"], gpu = params["embed_gpu"])
      self.layers = ModuleList([TransformerBlock(params, filename = f) for f in filename["blocks"]])
      self.norm = rmsnorm(params["dim"], eps = params["eps"], filename = filename["final_norm"], gpu = head_gpu)
      self.lm_head = linear(params["dim"], params["vocab_size"], filename = filename["lm_head"], gpu = head_gpu)
    self.c,self.s = RoPE.build_cs_cache(params["dim"]//params["n_heads"], params["ctx_len"], gpu=params["attn_gpu"])

    # Additive causal mask of shape (1, 1, ctx_len, ctx_len): 0 on and below
    # the diagonal, -inf above it. Built with np.where so no division by zero
    # (and its RuntimeWarning) is needed to produce the -inf entries.
    ctx_len = params["ctx_len"]
    visible = np.tril(np.ones((ctx_len, ctx_len), dtype=bool))
    self.mask = np.where(visible, 0.0, -np.inf)[np.newaxis,np.newaxis].astype(np.float16)

    if params["attn_gpu"]:
      self.mask = cp.asarray(self.mask)
    self.params = params

  def forward(self, x):
    """Return next-token logits as a NumPy array for the integer token ids in x."""
    y = self.w_embed(x)
    for layer in self.layers:
      y = layer(y,self.mask, self.c, self.s)

    # Blocks return NumPy arrays; move to the LM head's device for the last steps.
    if self.params["lm_head_gpu"]:
      y = cp.asarray(y)
    else:
      y = cp.asnumpy(y)

    y = self.norm(y)
    return cp.asnumpy(self.lm_head(y))

  def generate(self, x, max_new = 10):
    """Append max_new sampled tokens to x and return the extended id array.

    x: integer id array of shape (1, T); sampling reads only the first row.
    Each step samples from the softmax of the last position's logits.
    """
    ctx_len = self.params["ctx_len"]
    for _ in tqdm(range(max_new)):
      # Feed at most the last ctx_len tokens. The cropped window was computed
      # but never used, so sequences longer than ctx_len broke the forward pass.
      x_c = x if x.shape[1] < ctx_len else x[:,-ctx_len:]
      p = self.forward(x_c)
      new_tok = p[:,-1,:]
      probs = softmax(new_tok.astype(np.float64))
      # A single multinomial draw is one-hot; its argmax is the sampled id.
      nxt = np.argmax(np.random.multinomial(1,probs[0]), keepdims=True)[np.newaxis]
      x = np.concatenate((x, nxt), axis=-1)
    return x


In [None]:
import os

# Collect the path of every weight file into the nested dict `ot` that the
# llama constructor expects.
lst = os.listdir("model/model")
labels =  ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "down_proj", "up_proj", "input_layernorm", "post_attention_layernorm"]

ot = {
    "lm_head": "model/model/lm_head.weight.npy",
    "w_embed": "model/model/model.embed_tokens.weight.npy",
    "final_norm": "model/model/model.norm.weight.npy",
}

blocks = []
for ind in range(params["n_layers"]):
  # Files of this layer only; the trailing dot keeps layer 1 from matching 10, 11, ...
  keys = [i for i in lst if f"layers.{ind}." in i]
  # First file whose name contains each label.
  found = {j: "model/model/"+[i for i in keys if j in i][0] for j in labels}
  overall = {
      "attn": {j: found[j] for j in labels[:4]},
      "mlp": {j: found[j] for j in labels[4:7]},
  }
  # The two layer norms sit at the top level of the per-block dict.
  overall.update({j: found[j] for j in labels[7:]})
  blocks.append(overall)
ot["blocks"] = blocks


In [None]:
# Build the model with every weight loaded from the .npy paths collected in `ot`.
model = llama(params, ot)

  self.mask = (-1/np.tril(np.ones((params["ctx_len"],params["ctx_len"]))) + 1)[np.newaxis,np.newaxis].astype(np.float16)


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published under the OpenLLaMA 3B v2 model id; per the saved
# output its files are downloaded on first use.
tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/593 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/512k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:

def generate(model, x, max_new = 10, verbose = True):
  """Greedy decoding: append the most likely next token max_new times.

  model: object exposing forward(ids) -> logits and a params dict with "ctx_len".
  x: integer id array of shape (B, T).
  max_new: number of tokens to append.
  verbose: print the full logits and the chosen ids each step. Defaults to
    True, which reproduces the previous output of this function.

  Returns the id array extended to shape (B, T + max_new).
  """
  ctx_len = model.params["ctx_len"]
  for _ in tqdm(range(max_new)):
    # Feed at most the last ctx_len tokens. The cropped window was computed but
    # never passed on, so prompts longer than ctx_len broke the forward pass.
    x_c = x if x.shape[1] < ctx_len else x[:,-ctx_len:]
    p = model.forward(x_c)
    if verbose:
      print(p)
    new_tok = p[:,-1,:]
    probs = softmax(new_tok.astype(np.float64))
    # Take the argmax per row; the flattened argmax was only right for B == 1.
    nxt = np.argmax(probs, axis=-1, keepdims=True)
    if verbose:
      print(nxt)
    x = np.concatenate((x, nxt), axis=-1)

  return x

# Keep the result in its own name: assigning it to `ot` overwrote the
# weight-path dict, so re-running the model construction cell afterwards failed.
generated = generate(model, tokenizer.encode("Hola", return_tensors="np"), max_new=1)
tokenizer.decode(generated[0])


100%|██████████| 1/1 [00:13<00:00, 13.79s/it]

[[[-17.11  -12.27  -11.73  ... -15.484 -17.47  -16.3  ]
  [-24.17  -23.7   -15.13  ... -24.06  -24.56  -22.77 ]
  [-62.9   -63.62  -54.34  ... -65.4   -64.44  -63.22 ]]]
[[29522]]





'<s> Hola,'