In [1]:
import os
# Configure PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): presumably this must run before torch touches CUDA, which would
# explain why it lives in the very first cell — confirm against the PyTorch docs.
_allocator_options = ("expandable_segments:True", "max_split_size_mb:128")
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = ",".join(_allocator_options)

In [2]:
!pip install twilio
!pip install bitsandbytes

import sys
import torch
import json
from model_trnsfmrs import LlamaForCausalLM
from config import LlamaConfig
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
from safetensors import torch as sftorch
from huggingface_hub import notebook_login, hf_hub_download, HfApi, HfFolder, upload_file
from transformers import AutoTokenizer
from google.colab import drive, files
from bitsandbytes.optim import AdamW8bit
from twilio.rest import Client

Collecting twilio
  Downloading twilio-9.8.0-py2.py3-none-any.whl.metadata (13 kB)
Collecting aiohttp-retry>=2.8.3 (from twilio)
  Downloading aiohttp_retry-2.9.1-py3-none-any.whl.metadata (8.8 kB)
Downloading twilio-9.8.0-py2.py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading aiohttp_retry-2.9.1-py3-none-any.whl (10.0 kB)
Installing collected packages: aiohttp-retry, twilio
Successfully installed aiohttp-retry-2.9.1 twilio-9.8.0
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [6]:
# Attach Google Drive under /content/drive, then create the Hugging Face Hub client.
drive.mount("/content/drive")
api = HfApi()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Creating Model Architecture

In [5]:
# Load the tokenizer from the Hub, requesting the "fast" implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "aliarda/turkish-news-35k-tokenizer",
    use_fast=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/945 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/75.0 [00:00<?, ?B/s]

In [7]:
# Pick the compute backend: CUDA if present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Sequence length in tokens; used for the model config and the training windows.
context_len = 256

In [8]:
# Download the checkpoint file from the Hub into /content/ ...
model_path = hf_hub_download(
    repo_id="aliarda/llama-50M-randParams",
    filename="llama-50M.safetensors",
    local_dir="/content/",
)
# ... and read its tensors directly onto the selected device.
state_dict = sftorch.load_file(model_path, device=device)

llama-50M.safetensors:   0%|          | 0.00/209M [00:00<?, ?B/s]

In [10]:
# Architecture hyper-parameters for the model built below.
llama_config = LlamaConfig(
    vocab_size=32768,
    context_length=context_len,
    emb_dim=256,
    hidden_dim=2048,
    n_layers=20,
    n_heads=128,
    n_kv_groups=64,
)

# Build the model and move it to the selected device in one step.
llama_model = LlamaForCausalLM(llama_config).to(device)
# Bare last expression so the notebook renders the module tree.
llama_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32768, 256)
    (layers): ModuleList(
      (0-19): 20 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False)
          (k_proj): Linear(in_features=256, out_features=128, bias=False)
          (v_proj): Linear(in_features=256, out_features=128, bias=False)
          (o_proj): Linear(in_features=256, out_features=256, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=256, out_features=2048, bias=False)
          (up_proj): Linear(in_features=256, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=256, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=256, out_features=32768, bias=False)
)

In [11]:
# Copy the downloaded weights into the model; strict matching means any missing
# or unexpected key raises instead of being silently ignored.
llama_model.load_state_dict(state_dict, strict=True)

<All keys matched successfully>

### Loading Dataset

In [12]:
# Pre-tokenized corpus from the Hub; the "train" split is used below.
ds = load_dataset(path="aliarda/turkish-news-1.8M-tokenized")

README.md:   0%|          | 0.00/612 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/23 [00:00<?, ?files/s]

data/train-00000-of-00023.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

data/train-00001-of-00023.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

data/train-00002-of-00023.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

data/train-00003-of-00023.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

data/train-00004-of-00023.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

data/train-00005-of-00023.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

data/train-00006-of-00023.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

data/train-00007-of-00023.parquet:   0%|          | 0.00/177M [00:00<?, ?B/s]

data/train-00008-of-00023.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

data/train-00009-of-00023.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

data/train-00010-of-00023.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

data/train-00011-of-00023.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

data/train-00012-of-00023.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

data/train-00013-of-00023.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

data/train-00014-of-00023.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

data/train-00015-of-00023.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

data/train-00016-of-00023.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

data/train-00017-of-00023.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

data/train-00018-of-00023.parquet:   0%|          | 0.00/151M [00:00<?, ?B/s]

data/train-00019-of-00023.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

data/train-00020-of-00023.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

data/train-00021-of-00023.parquet:   0%|          | 0.00/174M [00:00<?, ?B/s]

data/train-00022-of-00023.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1845941 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]

In [13]:
# Deterministic shuffle (seed 42), then keep only the first quarter of the rows.
shuffledDS = ds["train"].shuffle(seed=42)
quarter_len = len(shuffledDS) // 4
oneFourthData = shuffledDS.select(range(quarter_len))

In [14]:
# Flatten every document into one token stream, wrapping each document's tokens
# in id 2 (before) and id 3 (after).
# NOTE(review): 2 / 3 are presumably the tokenizer's <bos> / <eos> ids — confirm.
tokens_list = []
for row in tqdm(oneFourthData):
    tokens_list.extend((2, *row["tokens"], 3))

100%|██████████| 461485/461485 [02:13<00:00, 3452.68it/s]


### Creating DataLoader

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

# Special-token ids shared by later cells: (padding id, end-of-sequence id).
pad_id, eos_id = 1, 3

In [16]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Sliding-window next-token dataset over one flat sequence of token ids.

    Sample ``k`` starts at offset ``k * stride`` and consists of

    * ``input``  = ``token_ids[start : start + context_length]``
    * ``target`` = ``token_ids[start + 1 : start + context_length + 1]``

    i.e. the target is the input shifted one position to the left. Only windows
    for which a full-length target exists are produced, so no padding or
    truncation is ever needed and a trailing partial window is dropped.

    Args:
        token_ids: Flat sequence of integer token ids (list, array or tensor).
        context_length: Number of tokens per window; must be >= 1.
        stride: Step between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If ``context_length`` or ``stride`` is smaller than 1.

    Attributes:
        inputs: List of ``torch.long`` tensors of shape ``(context_length,)``.
        targets: List of ``torch.long`` tensors of shape ``(context_length,)``.
    """

    def __init__(self, token_ids: list, context_length: int, stride: int):
        super().__init__()

        if context_length < 1:
            raise ValueError(f"context_length must be >= 1, got {context_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        # Convert once; every window below is a cheap view into this tensor
        # rather than a separately converted Python list slice.
        data = torch.as_tensor(token_ids, dtype=torch.long)

        # A start offset is valid only if start + context_length + 1 <= len(data),
        # which is exactly what this (exclusive) upper bound enforces.
        starts = range(0, len(data) - context_length, stride)

        self.inputs = [data[s:s + context_length] for s in starts]
        self.targets = [data[s + 1:s + context_length + 1] for s in starts]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.inputs[idx], self.targets[idx]

In [17]:
def create_dataloader(token_ids: list, context_len: int, stride: int, batch_size: int, shuffle: bool, device: str = "cpu"):
    """Build a DataLoader of (input, target) windows over a flat token sequence.

    Args:
        token_ids: Flat sequence of token ids, forwarded to ``TextDataset``.
        context_len: Window length in tokens, forwarded to ``TextDataset``.
        stride: Step between window starts, forwarded to ``TextDataset``.
        batch_size: Number of windows per batch.
        shuffle: Whether the DataLoader shuffles the windows.
        device: Device of the ``torch.Generator`` handed to the DataLoader.
            NOTE(review): this does not move batches to ``device``; callers
            appear to do that themselves — confirm a non-CPU generator is
            actually wanted before passing anything other than "cpu".

    Returns:
        A ``torch.utils.data.DataLoader`` over the windowed dataset.
    """
    return DataLoader(
        TextDataset(token_ids, context_len, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        generator=torch.Generator(device=device),
    )

In [18]:
# Number of tokens in each tenth of the stream (integer division, so up to nine
# trailing tokens are left out).
token_count4loader = len(tokens_list) // 10
token_count4loader

19513648

### The Training Part

In [19]:
def generate(model, x: torch.Tensor, max_new_tokens: int, eos_token_id=None, max_length=None):
    """Greedily extend a prompt one token at a time.

    At every step the whole sequence so far is fed to ``model`` and the id with
    the highest logit at the last position is appended. Sampling controls
    (top-k, top-p, temperature) are not implemented.

    Decoding runs under ``torch.no_grad()`` with the model temporarily in eval
    mode; the model's previous train/eval mode is restored before returning.

    Args:
        model: Callable module mapping a ``(1, seq_len)`` tensor of token ids
            to logits of shape ``(1, seq_len, vocab_size)``.
        x: 1-D tensor holding the prompt's token ids.
        max_new_tokens: Upper bound on the number of tokens to append.
        eos_token_id: Id that ends generation once produced. Defaults to the
            notebook-level ``eos_id`` when omitted.
        max_length: Generation stops as soon as the sequence becomes longer
            than this. Defaults to the notebook-level ``context_len``.

    Returns:
        list[int]: The prompt ids followed by the generated ids (the stop
        token, if one was produced, is included).
    """
    stop_id = eos_id if eos_token_id is None else eos_token_id
    length_cap = context_len if max_length is None else max_length

    # Run on the device that holds the model's weights; fall back to the
    # prompt's own device for parameter-less callables.
    try:
        run_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        run_device = x.device

    tokens = x.detach().cpu().tolist()

    # Remember the current mode so a mid-training call leaves the model as it was.
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()

    try:
        # Inference only: skip building the autograd graph on every step.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                sequence = torch.tensor(tokens, dtype=torch.long, device=run_device).unsqueeze(0)
                logits = model(sequence).squeeze(0)
                # Softmax is monotonic, so the arg-max of the logits is the
                # greedy choice without computing probabilities.
                next_id = int(torch.argmax(logits[-1], dim=-1))
                tokens.append(next_id)
                if next_id == stop_id or len(tokens) > length_cap:
                    break
    finally:
        if was_training:
            model.train()

    return tokens

In [None]:
# Training configuration.
num_chunks = 10        # tokens_list is consumed in this many slices of token_count4loader tokens
num_epochs = 2         # passes over each slice before moving on to the next one
window_stride = 256    # step between the starts of consecutive training windows
batch_size = 64
checkpoint_dir = "/content/drive/MyDrive/llama-50M-BPE"

# Fail fast (before hours of training) if the checkpoint folder cannot be created.
os.makedirs(checkpoint_dir, exist_ok=True)

# Created once so the optimizer's running statistics carry over between slices
# instead of being reset every time a new slice starts.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW8bit(llama_model.parameters(), lr=1e-3)

# Fixed prompts used to eyeball generation quality after every epoch.
trial_prompts = [
    "<bos>Libya 2011'de dönemin Devlet Başkanı Muammer Kaddafi'ye karşı",
    "<bos>Suriye'de 10. yılına giren iç savaş sürecinde rejimin en büyük",
    "<bos>Son darbe girişiminin ardından",
]
trialInputs = [torch.tensor(tokenizer.encode(prompt)) for prompt in trial_prompts]

for chunk in range(1, num_chunks + 1):
    chunk_tokens = tokens_list[(chunk - 1)*token_count4loader:chunk*token_count4loader]
    # Keyword arguments on purpose: passing `device` positionally would land in
    # the `shuffle` slot. Shuffling stays enabled and the sampler's generator
    # stays on its default (CPU) device.
    train_dataloader = create_dataloader(
        chunk_tokens,
        context_len,
        stride=window_stride,
        batch_size=batch_size,
        shuffle=True,
    )

    for epoch in range(num_epochs):
        llama_model.train()
        total_loss = 0
        last_loss = 0

        for X, Y in tqdm(train_dataloader):
            X, Y = X.to(device), Y.to(device)

            pred = llama_model(X)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) to match the flat targets.
            loss = loss_fn(pred.flatten(0, 1), Y.flatten())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # One host sync per step; no per-step torch.cuda.empty_cache(), which
            # only hands cached blocks back to the driver and slows every step.
            last_loss = loss.item()
            total_loss += last_loss

        # max(..., 1) keeps an empty slice from raising ZeroDivisionError.
        average_loss = total_loss / max(len(train_dataloader), 1)
        print(f"Epoch {epoch + 1} loss: {last_loss} average loss: {average_loss}")

        # Checkpoint to Drive, then publish the same file as the "latest" weights.
        checkpoint_path = f"{checkpoint_dir}/llama_model_{epoch}_{chunk}.safetensors"
        sftorch.save_file(llama_model.state_dict(), checkpoint_path)
        upload_file(path_or_fileobj=checkpoint_path, repo_id="aliarda/llama-50M-latest", path_in_repo="llama-50M-latest.safetensors")

        # Qualitative check: greedy continuations of the fixed prompts, generated
        # in eval mode and without tracking gradients.
        llama_model.eval()
        with torch.no_grad():
            outputs = {
                "examples": [
                    {
                        "input": tokenizer.decode(prompt_ids),
                        "output": tokenizer.decode(generate(llama_model, prompt_ids, context_len)),
                    }
                    for prompt_ids in trialInputs
                ]
            }
        llama_model.train()

        # Append one JSON object per line so the file stays machine-readable;
        # the `with` block closes the file on exit.
        with open("generated_text.txt", "a") as f:
            json.dump(outputs, f)
            f.write("\n")

 10%|█         | 121/1192 [04:12<37:10,  2.08s/it]