In [2]:
import os # batchsize 64
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

In [6]:
import torch
device="cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

In [3]:
from safetensors.torch import load_file

tensors = load_file("all_books_token_ids.safetensors")#ids from books dataset by turkish_tokenizer(github alibayram)
token_ids = tensors['a']
print(type(token_ids))  # should be <class 'torch.Tensor'>

<class 'torch.Tensor'>


In [20]:
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download

# Download and cache the file from HF Hub
filepath = hf_hub_download(
    repo_id="AhmetSemih/tr_tokenizer_books_tokens",
    filename="all_books_token_ids.safetensors",
    repo_type="dataset"
)

# Load tensors
tensors = load_file(filepath)

all_books_token_ids.safetensors:   0%|          | 0.00/745M [00:00<?, ?B/s]

In [30]:
print(type(token_ids))  # should be <class 'dict'>

<class 'torch.Tensor'>


In [28]:
#extract list from tensor
token_ids = tensors['a']
token_ids

tensor([    2, 20938,     0,  ..., 20028, 31897,     3], dtype=torch.int16)

In [29]:
len(token_ids)

372679971

In [None]:
from huggingface_hub import hf_hub_download
from safetensors import torch as sftorch

model_path = hf_hub_download(repo_id="aliarda/llama-50M-randParams", filename="llama-50M.safetensors", local_dir="./models")
state_dict = sftorch.load_file(model_path, device=device)

In [9]:
import torch

class LlamaConfig():
  def __init__(
          self,
          vocab_size: int = 32_768,
          context_length: int = 512,
          emb_dim: int = 256,
          n_heads: int = 256,
          n_layers: int = 20,
          hidden_dim: int = 2048,
          n_kv_groups: int = 64,
          head_dim: int | None = None,
          dtype: torch.dtype = torch.float32,
          mlp_bias: bool = False,
          rms_norm_eps: float = 1e-6,
          bias: bool = False,
          attention_bias: bool = False,
        ):
      self.vocab_size = vocab_size
      self.max_position_embeddings = context_length
      self.hidden_size = emb_dim
      self.num_attention_heads = n_heads
      self.num_hidden_layers = n_layers
      self.num_key_value_heads = n_kv_groups
      self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
      self.dtype = dtype
      self.intermediate_size = hidden_dim
      self.mlp_bias = mlp_bias
      self.rms_norm_eps = rms_norm_eps
      self.bias = bias
      self.attention_bias = attention_bias

In [10]:
import math
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)
torch.backends.cuda.enable_math_sdp(True)


class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

def precompute_freqs_cis(dim:int, seq_len: int, theta: float=10000.0, device: torch.device = torch.device("cpu")):
  # Computing Theta value for each dim pair which is dim/2
  freqs = 1.0 / (theta ** (torch.arange(0, dim, 2,device=device)[:(dim//2)].float()/dim))

  # Computing range of positions(m) in the sequence
  t = torch.arange(seq_len, dtype=torch.float32, device=device)

  # freqs gives all the Theta value range for all the position of tokens in the sequence
  freqs = torch.outer(t, freqs).to(device)

  # This is the rotation matrix which needs to be converted to Polar form in order to perform rotation to the embedding
  freqs_cis = torch.polar(torch.ones_like(freqs).to(device), freqs).to(device)
  return freqs_cis

def reshape_for_broadcast(freqs_cis, x):
  ndim = x.ndim
  assert 0<=1<ndim
  assert freqs_cis.shape == (x.shape[1],x.shape[-1]), "the last two dimension of freqs_cis, x must match"
  shape = [d if i==1 or i==ndim-1 else 1 for i,d in enumerate(x.shape)]
  return freqs_cis.view(*shape)

def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor, device: torch.device = torch.device("cpu"))->Tuple[torch.Tensor, torch.Tensor]:
  # Applying rotary positional encoding to both query and key embedding together
  # First: The last dimension of xq and xk embedding needs to be reshaped to make it a pair. As rotation matrix is applied to each pair of dim.
  # Next: convert both xq and xk to complex number as the rotation matrix is only applicable to complex number
  xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)).to(device)    #xq_:[bsz, seq_len, n_heads, head_dim/2]
  xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)).to(device)    #xk_:[bsz, seq_len, n_heads, head_dim/2]

  # The rotation matrix(freqs_cis) dimensions across seq_len(dim=1) and head_dim(dim=3) should match with the embedding
  # Also, the shape freqs_cis should be the same with xq and xk, hence change the shape of freqs_cis:[seq_len,head_dim] -> freqs_cis:[1,seq_len,1,head_dim]
  freqs_cis = reshape_for_broadcast(freqs_cis, xq_)

  #Finally, perform rotation operation by multiplying with freqs_cis.
  #After the rotation is completed, convert both xq_out and xk_out back to real number and return
  xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).to(device) #xq_out:[bsz, seq_len, n_heads, head_dim]
  xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).to(device) #xk_out:[bsz, seq_len, n_heads, head_dim]
  return xq_out.type_as(xq), xk_out.type_as(xk)

def repeat_kv(x:torch.Tensor, n_rep: int)-> torch.Tensor:
  bsz, seq_len, n_kv_heads, head_dim = x.shape
  if n_rep == 1:
    return x
  return (
      x[:,:,:,None,:]
      .expand(bsz,seq_len,n_kv_heads,n_rep, head_dim)
      .reshape(bsz,seq_len,n_kv_heads * n_rep, head_dim)
  )

class LlamaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = nn.SiLU() # nn.functional.silu ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj

class LlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    def __init__(self, config: LlamaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )

    def forward(self, hidden_states: torch.Tensor):
        batch_size, seq_len, _ = hidden_states.shape
        xq = self.q_proj(hidden_states)
        xk = self.k_proj(hidden_states)
        xv = self.v_proj(hidden_states)

        xq = xq.view(batch_size, seq_len, self.num_attention_heads, self.head_dim)
        xk = xk.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)
        xv = xv.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim)

        # Compute rotation matrix and apply RoPE to queries and keys for for training.
        freqs_cis = precompute_freqs_cis(dim=self.head_dim, seq_len=seq_len, device=hidden_states.device)

        #xq[bsz,seq_len,n_heads, head_dim], xk[bsz,seq_len,n_heads, head_dim]
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis, device=hidden_states.device)

        # Use repeat_kv function to make Keys,Values shape same as the queries shape
        #keys[bsz,seq_len,n_heads,head_dim], #values[bsz,seq_len,n_heads,head_dim]
        keys = repeat_kv(xk, self.num_key_value_groups) #keys[bsz,seq_len,n_heads,head_dim]
        values = repeat_kv(xv, self.num_key_value_groups)

        # To compute attention, we'll need to perform a transpose operation to reshape all queries, keys and values bring heads at dim 1 and seq at dim 2
        xq = xq.transpose(1,2).contiguous()                  #xq[bsz,n_heads,seq_len,head_dim]
        keys = keys.transpose(1,2).contiguous()              #keys[bsz,n_heads,seq_len,head_dim]
        values = values.transpose(1,2).contiguous()          #values[bsz,n_heads,seq_len,head_dim]

        # Using Scaled Dot Product Attention to compute attention score and attention output
        attn_out = F.scaled_dot_product_attention(
            xq, keys, values,
            attn_mask=None,
            is_causal=True
        ) #attn_out[bsz, n_heads, seq_len, head_dim]

        # Merge heads back
        output = attn_out.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)

        # shape: output [bsz,seq_len,dim]
        return self.o_proj(output)


class LlamaDecoderLayer(nn.Module):
    def __init__(self, config: LlamaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx)

        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(self, hidden_states: torch.Tensor):
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(hidden_states)
        hidden_states = hidden_states + residual

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = hidden_states + residual
        return hidden_states

class LlamaModel(nn.Module):
    def __init__(self, config: LlamaConfig, embedding: torch.Tensor = None):
        super().__init__()
        # self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, dtype=config.dtype)
        self.layers = nn.ModuleList(
            [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # self.rotary_emb = LlamaRotaryEmbedding(config=config)

    def forward(self, input_ids: torch.Tensor):
        hidden_states = self.embed_tokens(input_ids)

        for layer in self.layers:
            hidden_states = layer(hidden_states)

        hidden_states = self.norm(hidden_states)
        return hidden_states

class LlamaForCausalLM(nn.Module):
    def __init__(self, config: LlamaConfig, embedding: torch.Tensor = None):
        super().__init__()
        self.model = LlamaModel(config, embedding)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=config.bias)

    def forward(self, input_ids: torch.Tensor):
        hidden_states = self.model(input_ids)
        return self.lm_head(hidden_states)


In [12]:
llama_config = LlamaConfig(
    vocab_size=32768,
    emb_dim=256,
    context_length=256,
    n_heads=128,
    n_layers=20,
    n_kv_groups=64,
    hidden_dim=2048,
)

llama_model = LlamaForCausalLM(llama_config)
llama_model = llama_model.to(device)
llama_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32768, 256)
    (layers): ModuleList(
      (0-19): 20 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False)
          (k_proj): Linear(in_features=256, out_features=128, bias=False)
          (v_proj): Linear(in_features=256, out_features=128, bias=False)
          (o_proj): Linear(in_features=256, out_features=256, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=256, out_features=2048, bias=False)
          (up_proj): Linear(in_features=256, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=256, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=256, out_features=32768, bias=False)
)

In [13]:
llama_model.load_state_dict(state_dict)

<All keys matched successfully>

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

pad_id = 5
eos_id = 6

In [15]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, token_ids: list, context_length: int, stride: int):
        super().__init__()

        self.inputs = []
        self.targets = []

        for i in range(0, len(token_ids) - context_length, stride):
            input_chunk = token_ids[i:i + context_length]
            target_chunk = token_ids[i + 1:i + context_length + 1]

            # truncate if the chunk is longer than context_length
            input_chunk = input_chunk[:context_length]
            target_chunk = target_chunk[:context_length]

            # pad the input and target chunks to context_length
            input_chunk += [pad_id] * (context_length - len(input_chunk))
            target_chunk += [pad_id] * (context_length - len(target_chunk))

            # truncate if the chunk is longer than context_length
            input_chunk = input_chunk[:context_length]
            target_chunk = target_chunk[:context_length]

            self.inputs.append(torch.tensor(input_chunk, dtype=torch.long))
            self.targets.append(torch.tensor(target_chunk, dtype=torch.long))

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

In [16]:
def create_dataloader(token_ids: list, context_len: int, stride: int, batch_size: int, shuffle: bool, device: str = "cpu"):
    dataset = TextDataset(token_ids, context_len, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        generator=torch.Generator(device=device)
    )
    return dataloader

In [17]:
train_dataloader = create_dataloader(token_ids.tolist()[:5000], 256, 256, 64, False,device)

In [21]:
pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.42.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import time
import torch
from bitsandbytes.optim import AdamW8bit
from huggingface_hub import upload_file
from tqdm import tqdm
from safetensors import torch as sftorch

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW8bit(llama_model.parameters(), lr=1e-3)

save_interval = 4  # 2 hours in seconds
last_save_time = time.time()

num_epochs = 1
checkpoint_num = 1

for epoch_idx in range(num_epochs):
    total_loss = 0
    last_loss = 0

    for X, Y in tqdm(train_dataloader):
        X, Y = X.to(device), Y.to(device)

        pred = llama_model(X)
        loss = loss_fn(pred.flatten(0, 1), Y.flatten())
        total_loss += loss.item()
        last_loss = loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        del pred, loss, X, Y
        torch.cuda.empty_cache()

        # Push model to HF every save_interval seconds
        if time.time() - last_save_time >= save_interval:
            last_save_time = time.time()
            # Save and upload using sftorch + HF
            sftorch.save_file(llama_model.state_dict(), f"llama_model_{epoch_idx}_{checkpoint_num}.safetensors")
            upload_file(
                path_or_fileobj=f"llama_model_{epoch_idx}_{checkpoint_num}.safetensors",
                repo_id="AhmetSemih/llama-50m-pretrained-books-tr_tokenizer",
                path_in_repo="llama-50m-pretrained-books-tr_tokenizer.safetensors",
                commit_message=f"upload llama_model chunk: {checkpoint_num}, epoch: {epoch_idx}"
            )
            print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Uploaded checkpoint {checkpoint_num}")
            checkpoint_num += 1

    # Upload final model at the end of the epoch
    sftorch.save_file(llama_model.state_dict(), f"llama_model_{epoch_idx}_final.safetensors")
    upload_file(
        path_or_fileobj=f"llama_model_{epoch_idx}_final.safetensors",
        repo_id="AhmetSemih/llama-50m-pretrained-books-tr_tokenizer",
        path_in_repo="llama-50m-pretrained-books-tr_tokenizer.safetensors",
        commit_message=f"Final upload epoch {epoch_idx}"
    )
    print(f"Epoch {epoch_idx} completed. Uploaded final checkpoint.")

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [None]:
def generate(model, x: torch.Tensor, max_new_tokens: int): # top_k, top_p, temperature
  tokens = x.detach().cpu().numpy().tolist()

  for _ in range(max_new_tokens):
    x = x.unsqueeze(0).to(device)
    out = model.forward(x)
    out = out.squeeze(0)
    probs = torch.softmax(out[-1], dim=-1)
    _, max_index = torch.max(probs, dim=-1)
    tokens.append(max_index.item())
    if max_index == eos_id or len(tokens) > context_len: # <eos> and max context length
      break

    x = torch.tensor(tokens)

  return tokens

In [10]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
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

In [15]:
upload_file(
    path_or_fileobj="all_books_token_ids.safetensors",
    repo_id="AhmetSemih/tr_tokenizer_books_tokens",
    path_in_repo="all_books_token_ids.safetensors",
    repo_type="dataset",
)

Uploading...:   0%|          | 0.00/745M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/AhmetSemih/tr_tokenizer_books_tokens/commit/0f9160a41054713236b23cdd35da829e3707b090', commit_message='Upload all_books_token_ids.safetensors with huggingface_hub', commit_description='', oid='0f9160a41054713236b23cdd35da829e3707b090', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/AhmetSemih/tr_tokenizer_books_tokens', endpoint='https://huggingface.co', repo_type='dataset', repo_id='AhmetSemih/tr_tokenizer_books_tokens'), pr_revision=None, pr_num=None)