In [1]:
import os

In [2]:
!pip install twilio
!pip install bitsandbytes

import sys
import torch
import json
sys.path.append("./llama_architecture")
from model_trnsfmrs import LlamaForCausalLM
from config import LlamaConfig
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
from safetensors import torch as sftorch
from huggingface_hub import notebook_login, hf_hub_download, HfApi, HfFolder, upload_file
from transformers import AutoTokenizer
from bitsandbytes.optim import AdamW8bit
from twilio.rest import Client

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [3]:
# Hugging Face Hub API client.
# NOTE(review): `api` is not referenced by any cell visible here — confirm it is still needed.
api = HfApi()

### Creating Model Architecture

In [4]:
# Load the pretrained tokenizer from the Hub, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained("aliarda/turkish-news-32k-tokenizer", use_fast=True)

In [5]:
# Pick the best available accelerator: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Length, in tokens, of every training window; also passed to the model config as its context length.
context_len = 256

In [6]:
# Fetch the initial checkpoint file from the Hub; `hf_hub_download` returns the local path of the file.
model_path = hf_hub_download(repo_id="aliarda/llama-50M-randParams", filename="llama-50M.safetensors")
# Read the safetensors file into a state dict whose tensors are placed directly on the selected device.
state_dict = sftorch.load_file(model_path, device=device)

In [7]:
# Architecture hyperparameters. They have to agree with the checkpoint whose
# weights are loaded into this model in the following cell.
# NOTE(review): 128 heads over a 256-wide embedding leaves a head dimension of 2 — confirm this is intended.
llama_config = LlamaConfig(
    vocab_size=32768,
    emb_dim=256,
    context_length=context_len,
    n_heads=128,
    n_layers=20,
    n_kv_groups=64,
    hidden_dim=2048,
)

# Instantiate the network and place it on the target device in a single expression.
llama_model = LlamaForCausalLM(llama_config, tokenizer).to(device)

# Last expression of the cell: render the module tree.
llama_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32768, 256)
    (layers): ModuleList(
      (0-19): 20 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False)
          (k_proj): Linear(in_features=256, out_features=128, bias=False)
          (v_proj): Linear(in_features=256, out_features=128, bias=False)
          (o_proj): Linear(in_features=256, out_features=256, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=256, out_features=2048, bias=False)
          (up_proj): Linear(in_features=256, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=256, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=256, out_features=32768, bias=False)
)

In [8]:
# Copy the downloaded checkpoint weights into the freshly constructed model.
llama_model.load_state_dict(state_dict)

<All keys matched successfully>

### Loading Dataset

In [None]:
# Load the Hepsiburada reviews dataset from the Hugging Face Hub.
ds = load_dataset("alibayram/hepsiburada_yorumlar")

In [10]:
# Shuffle the train split with a fixed seed so the row order is reproducible.
shuffledDS = ds["train"].shuffle(seed=42)

In [11]:
# Display the dataset summary (column names and row count).
shuffledDS

Dataset({
    features: ['Puan', 'Baslik', 'Yorum'],
    num_rows: 2657073
})

In [13]:
#concat Baslik and Yorum
def concat_text(examples):
    """Merge the title and the review body of one row into a single text field.

    Args:
        examples: One dataset row (non-batched ``map``) with the keys
            ``"Baslik"`` (title) and ``"Yorum"`` (review); either value may
            be ``None`` or an empty string.

    Returns:
        dict: ``{"text": ...}`` holding both parts joined by one space, the
        single non-empty part, or ``""`` when both parts are missing.
    """
    # Keep only the non-empty parts. Joining them covers every case, including
    # the "both missing" one, where the previous if/elif chain fell through and
    # returned None instead of a dict with a "text" key.
    parts = [part for part in (examples["Baslik"], examples["Yorum"]) if part]
    return {"text": " ".join(parts)}
# Add the merged "text" column and drop the two source columns in the same
# map call; concat_text still receives the full row before the columns go away.
shuffledDS = shuffledDS.map(
    concat_text,
    batched=False,
    remove_columns=["Baslik", "Yorum"],
)

Map:   0%|          | 0/2657073 [00:00<?, ? examples/s]

In [14]:
def tokenize_dataset(examples):
    """Encode the ``"text"`` field of one row with the notebook-level tokenizer.

    Returns a dict with a single ``"tokens"`` entry holding the encoded ids.
    """
    token_ids = tokenizer.encode(examples["text"])
    return {"tokens": token_ids}


# Tokenize row by row; the raw text column is dropped once it has been encoded.
tokenizedDS = shuffledDS.map(
    tokenize_dataset,
    batched=False,
    remove_columns=["text"],
)

Map:   0%|          | 0/2657073 [00:00<?, ? examples/s]

In [15]:
# Display the tokenized dataset summary (column names and row count).
tokenizedDS

Dataset({
    features: ['Puan', 'tokens'],
    num_rows: 2657073
})

In [17]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
train_test_split = tokenizedDS.train_test_split(test_size=0.2, seed=42)
train_ds = train_test_split["train"]
test_ds = train_test_split["test"]  # NOTE(review): not used by any cell visible here
print(train_ds)

Dataset({
    features: ['Puan', 'tokens'],
    num_rows: 2125658
})


In [18]:
# Flatten every training row into one long token stream, wrapping each row in
# the ids 2 and 3. 3 equals the eos_id used elsewhere in this notebook; 2 is
# presumably the matching begin-of-sequence id — TODO confirm against the tokenizer.
tokens_list = []
for row in tqdm(train_ds):
    tokens_list += [2, *row["tokens"], 3]

100%|██████████| 2125658/2125658 [00:55<00:00, 38630.82it/s]


### Creating DataLoader

In [19]:
import torch
from torch.utils.data import Dataset, DataLoader

pad_id = 1  # padding token id
eos_id = 3  # end-of-sequence token id; generation stops once this id is produced

In [20]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Fixed-length next-token-prediction windows over one flat token stream.

    Window ``k`` starts at offset ``k * stride``. Its input is
    ``token_ids[start:start + context_length]`` and its target is the same
    span shifted one position to the right. Only complete windows are
    produced: a trailing remainder that cannot supply ``context_length + 1``
    tokens is dropped, so no padding or truncation is ever needed.

    Args:
        token_ids: Flat sequence of integer token ids.
        context_length: Number of tokens in every input/target window (>= 1).
        stride: Offset between the starts of consecutive windows (>= 1).

    Raises:
        ValueError: If ``context_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, token_ids: list, context_length: int, stride: int):
        super().__init__()

        if context_length < 1:
            raise ValueError(f"context_length must be >= 1, got {context_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        # Convert the stream once; every window below is a view into this
        # tensor instead of a freshly built tensor from a copied list slice.
        data = torch.as_tensor(token_ids, dtype=torch.long)

        self.inputs = []
        self.targets = []

        # The upper bound keeps `start + context_length` a valid index, so the
        # shifted target window is always full length.
        for start in range(0, len(data) - context_length, stride):
            end = start + context_length
            self.inputs.append(data[start:end])
            self.targets.append(data[start + 1:end + 1])

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair of window ``idx``."""
        return self.inputs[idx], self.targets[idx]

In [21]:
def create_dataloader(token_ids: list, context_len: int, stride: int, batch_size: int, shuffle: bool, device: str = "cpu"):
    """Wrap a flat token stream in a ``TextDataset`` and return a ``DataLoader`` over it.

    Args:
        token_ids: Flat sequence of token ids to cut into windows.
        context_len: Window length passed to ``TextDataset``.
        stride: Offset between consecutive windows passed to ``TextDataset``.
        batch_size: Number of windows per batch.
        shuffle: Whether the loader reshuffles the windows on every pass.
        device: Accepted for backward compatibility only. This function does
            not move any data to it; callers move each batch themselves.

    Returns:
        DataLoader: Yields ``(inputs, targets)`` batches from the dataset.
    """
    dataset = TextDataset(token_ids, context_len, stride)

    # The DataLoader draws its shuffle permutation and base seed on the CPU,
    # and those draws reject a generator that lives on another device. The
    # generator is therefore always created on the CPU, whatever `device` is.
    sampler_generator = torch.Generator(device="cpu")

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        generator=sampler_generator,
    )

In [22]:
# Tokens per training chunk: one tenth of the token stream, rounded down.
token_count4loader = len(tokens_list) // 10
token_count4loader

8191085

### The Training Part

In [23]:
def generate(model, x: torch.Tensor, max_new_tokens: int):
    """Greedily extend a prompt by up to ``max_new_tokens`` tokens.

    Decoding is plain argmax; sampling controls such as top-k, top-p or
    temperature are not implemented.

    Reads the notebook-level names ``device``, ``eos_id`` and ``context_len``.

    Args:
        model: Callable invoked as ``model(ids)`` with a ``(1, seq_len)``
            tensor of token ids. Assumes it returns logits shaped
            ``(1, seq_len, vocab)`` — TODO confirm against the model class.
        x: 1-D tensor holding the prompt token ids.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        list: The prompt ids followed by the generated ids. Generation stops
        early right after ``eos_id`` is produced (it is kept in the output)
        or once the sequence is longer than ``context_len``.
    """
    tokens = x.detach().cpu().tolist()

    # Inference only: without no_grad every forward pass would record an
    # autograd graph that is never used.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            batch = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
            # Call the module itself rather than .forward so hooks still run.
            logits = model(batch).squeeze(0)
            # Softmax is monotonic, so the argmax of the raw last-position
            # logits is the same token the most probable class would give.
            next_id = int(torch.argmax(logits[-1], dim=-1))
            tokens.append(next_id)
            if next_id == eos_id or len(tokens) > context_len:  # <eos> or context limit reached
                break

    return tokens

In [None]:
# Run settings.
NUM_CHUNKS = 10        # has to match the divisor used to compute token_count4loader
EPOCHS_PER_CHUNK = 2
# NOTE(review): the run recorded in this notebook stopped with an MPS
# out-of-memory error during its first step; lower BATCH_SIZE if that recurs.
BATCH_SIZE = 64
STRIDE = 256
LEARNING_RATE = 1e-3
CHECKPOINT_DIR = "llama-50M-DAPT-Hepsiburada"
HUB_REPO_ID = "aliarda/llama-50M-DAPT-Hepsiburada"

# save_file does not create missing directories.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

loss_fn = torch.nn.CrossEntropyLoss()

# Build the optimizer once so its moment estimates carry over from chunk to
# chunk instead of being reset every time a new chunk starts.
# The 8-bit optimizer relies on bitsandbytes' GPU kernels, so use the regular
# PyTorch AdamW whenever training does not run on CUDA.
optimizer_cls = AdamW8bit if device == "cuda" else torch.optim.AdamW
optimizer = optimizer_cls(llama_model.parameters(), lr=LEARNING_RATE)

for chunk in range(1, NUM_CHUNKS + 1):
    # Only one slice of the token stream is turned into tensors at a time.
    chunk_tokens = tokens_list[(chunk - 1) * token_count4loader:chunk * token_count4loader]
    # `shuffle` is passed by keyword: the fifth positional slot is `shuffle`,
    # not `device`. Batches are moved to the device inside the step loop.
    train_dataloader = create_dataloader(chunk_tokens, context_len, STRIDE, BATCH_SIZE, shuffle=True)

    for epoch in range(EPOCHS_PER_CHUNK):
        total_loss = 0
        last_loss = 0
        for X, Y in tqdm(train_dataloader):
            X, Y = X.to(device), Y.to(device)

            pred = llama_model(X)
            # Collapse batch and sequence dimensions so each position is one classification sample.
            loss = loss_fn(pred.flatten(0, 1), Y.flatten())
            last_loss = loss.item()
            total_loss += last_loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Release the logits before the next forward pass allocates new
            # ones, so two copies are never alive at the same time.
            del pred, loss, X, Y

        average_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1} loss: {last_loss} average loss: {average_loss}")

        # Keep a local checkpoint per (epoch, chunk); the Hub copy is overwritten with the latest weights.
        checkpoint_path = f"{CHECKPOINT_DIR}/llama_model_{epoch}_{chunk}.safetensors"
        sftorch.save_file(llama_model.state_dict(), checkpoint_path)
        upload_file(path_or_fileobj=checkpoint_path, repo_id=HUB_REPO_ID, path_in_repo="model.safetensors")

  0%|          | 0/500 [00:11<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 43.51 GB, other allocations: 2.01 GB, max allowed: 45.90 GB). Tried to allocate 2.00 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).