In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

# Special-token ids used by this notebook.
# eos_id is the id generate() compares each new token against to stop decoding.
# NOTE(review): pad_id is not referenced in the visible cells; both values
# presumably mirror the tokenizer's special-token table -- confirm against it.
pad_id, eos_id = 5, 6

In [4]:
import torch
# Choose the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [5]:


def generate(model, x: torch.Tensor, max_new_tokens: int, eos_token_id=None, max_context_length: int = 256): # TODO: top_k, top_p, temperature
  """Greedily extend the prompt ``x`` one token at a time.

  At every step the model is run on the current sequence and the id with the
  highest score at the last position is appended. Sampling controls (top_k,
  top_p, temperature) are not implemented.

  Args:
    model: Callable module that maps a ``(1, seq_len)`` tensor of token ids to
      per-position scores whose last dimension indexes the vocabulary. Put it
      in ``eval()`` mode before calling; this function does not change it.
    x: 1-D tensor of prompt token ids (must be non-empty when
      ``max_new_tokens > 0``).
    max_new_tokens: Upper bound on the number of tokens appended.
    eos_token_id: Id that ends generation once produced. ``None`` (default)
      falls back to the notebook-level ``eos_id``.
    max_context_length: Positive context-window size. Generation stops once
      the sequence grows past it, and the model never receives more than this
      many trailing tokens (so an over-long prompt is cropped, not rejected).

  Returns:
    A list of ints: the prompt ids followed by the generated ids. The stop
    token, when produced, is included.
  """
  if eos_token_id is None:
    eos_token_id = eos_id

  # Run on the device that holds the model's weights rather than relying on a
  # notebook global; fall back to the prompt's device for parameter-free models.
  first_param = next(model.parameters(), None)
  run_device = x.device if first_param is None else first_param.device

  tokens = x.detach().tolist()
  # Keep the running sequence on-device so each step only appends one id
  # instead of rebuilding and re-uploading the whole tensor.
  ctx = x.detach().to(device=run_device, dtype=torch.long).unsqueeze(0)

  # Inference only: without no_grad every step would record an autograd graph.
  with torch.no_grad():
    for _ in range(max_new_tokens):
      scores = model(ctx[:, -max_context_length:])
      # argmax over raw scores equals argmax over softmax(scores), so the
      # normalisation is skipped.
      next_id = int(torch.argmax(scores.squeeze(0)[-1], dim=-1))
      tokens.append(next_id)
      if next_id == eos_token_id or len(tokens) > max_context_length: # <eos> and max context length
        break
      ctx = torch.cat([ctx, ctx.new_tensor([[next_id]])], dim=1)

  return tokens

In [17]:
from llama_config import LlamaConfig
from llama_model import LlamaForCausalLM
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download

# Architecture hyperparameters for the checkpoint loaded below. The default
# strict load_state_dict raises if these do not match the stored tensors.
llama_config = LlamaConfig(
    vocab_size=32768, emb_dim=256, context_length=256,
    n_layers=20, n_heads=128, n_kv_groups=64, hidden_dim=2048,
)

# Instantiate the network and place it on the selected device in one step.
llama_model = LlamaForCausalLM(llama_config).to(device)

# Fetch the safetensors checkpoint from the Hugging Face Hub; the call returns
# the local path of the downloaded file.
model_path = hf_hub_download(
    filename="llama-50m-pretrained-books-tr_tokenizer.safetensors",
    repo_id="AhmetSemih/llama-50m-pretrained-books-tr_tokenizer",
)

# Read the tensors from disk and copy them into the model's parameters.
state_dict = load_file(model_path)
llama_model.load_state_dict(state_dict)

# Switch to inference mode. eval() returns the module, so as the cell's last
# expression it also displays the layer summary.
llama_model.eval()

llama-50m-pretrained-books-tr_tokenizer.(…):   0%|          | 0.00/209M [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32768, 256)
    (layers): ModuleList(
      (0-19): 20 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False)
          (k_proj): Linear(in_features=256, out_features=128, bias=False)
          (v_proj): Linear(in_features=256, out_features=128, bias=False)
          (o_proj): Linear(in_features=256, out_features=256, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=256, out_features=2048, bias=False)
          (up_proj): Linear(in_features=256, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=256, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=256, out_features=32768, bias=False)
)

In [None]:
# Prompt token ids, taken from the alibayram turkish_tokenizer repo.
ids = [2539, 3522, 20039, 20021, 20025]
ids_tensor = torch.tensor(ids, device=device)

# Continue the prompt by up to 128 tokens; the returned id list (prompt plus
# generated ids) is this cell's displayed output.
generate(llama_model, ids_tensor, max_new_tokens=128)

[2539,
 3522,
 20039,
 20021,
 20025,
 19494,
 0,
 3522,
 0,
 4333,
 31817,
 20001,
 4029,
 20016,
 20026,
 31897,
 0,
 2503,
 2678,
 20040,
 0,
 3522,
 31817,
 20022,
 0,
 3522,
 31817,
 20001,
 19491,
 2777,
 20026,
 31959,
 20034,
 2501,
 2610,
 20024,
 0,
 3522,
 31817,
 20022,
 0,
 3522,
 31817,
 20001,
 19491,
 2777,
 20026,
 31959,
 20034,
 2501,
 2610,
 20024,
 0,
 3522,
 31817,
 20022,
 0,
 3522,
 31817,
 20001,
 19491,
 2777,
 20026,
 31959,
 20034,
 2501,
 2610,
 20024,
 0,
 3522,
 31817,
 20022,
 0,
 3522,
 31817,
 20001,
 19491,
 2777,
 20026,
 31959,
 20034,
 2501,
 2610,
 20024,
 0,
 3522,
 31817,
 20022,
 0,
 3522,
 31817,
 20001,
 19491,
 2777,
 20026,
 31959,
 20034,
 2501,
 2610,
 20024,
 0,
 3522,
 31817,
 20022,
 0,
 3522,
 31817,
 20001,
 19491,
 2777,
 20026,
 31959,
 20034,
 2501,
 2610,
 20024,
 0,
 3522,
 31817,
 20022,
 0,
 3522,
 31817,
 20001,
 19491,
 2777,
 20026,
 31959,
 20034,
 2501,
 2610,
 20024,
 0]