In [None]:
import os
import math
import sentencepiece as spm
import torch
import torch.nn as nn
from torch.nn import functional as F

# Plain-text corpus: used to fit the tokenizer here and read again later for training.
DATA_FILE = "smll3.txt"
# File that SentencePiece produces for model_prefix="tokenizer" (prefix + ".model").
TOKENIZER_MODEL = "tokenizer.model"

# Fit the tokenizer only when no model file exists yet; later runs reuse it.
tokenizer_missing = not os.path.exists(TOKENIZER_MODEL)
if tokenizer_missing:
    trainer_options = dict(
        input=DATA_FILE,
        model_prefix="tokenizer",
        vocab_size=6000,
        model_type="bpe",
        # Keep every character seen in the corpus in the vocabulary ...
        character_coverage=1.0,
        # ... and fall back to byte pieces for anything unseen.
        byte_fallback=True,
    )
    spm.SentencePieceTrainer.Train(**trainer_options)

# Shared tokenizer handle used by the rest of the script.
sp = spm.SentencePieceProcessor(model_file=TOKENIZER_MODEL)


class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned gain.

    Args:
        dim: Size of the last (feature) dimension; ``weight`` has this length.
        eps: Constant added to the RMS before dividing, so an all-zero
            feature vector does not cause a division by zero.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        # Per-feature gain, initialised to 1 so the layer starts as a pure normaliser.
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Return ``weight * x / (rms(x) + eps)``, same shape as ``x``."""
        feature_count = x.size(-1)
        # ||x||_2 / sqrt(n) is sqrt(mean(x ** 2)), i.e. the RMS of the last axis.
        l2 = x.norm(2, dim=-1, keepdim=True)
        denom = l2 / math.sqrt(feature_count) + self.eps
        scaled = self.weight * x
        return scaled / denom

def rotary_embedding(q, k, seq_len, dim):
    """Apply the position-dependent sin/cos mixing to queries and keys.

    The last axis of each tensor is split into a lower and an upper half
    (``lo``, ``hi``). With ``angle[p, i] = p / 10000 ** (2 * i / dim)`` for
    position ``p`` and pair index ``i``, the result is::

        concat(lo * cos + hi * sin, lo * sin - hi * cos)

    NOTE(review): this sign pattern is ``[[cos, sin], [sin, -cos]]`` rather
    than the textbook rotation ``[[cos, -sin], [sin, cos]]``. It still
    preserves vector norms and makes q.k depend only on the position
    difference, but confirm it is intentional before changing it, since
    saved checkpoints were trained with this form.

    Args:
        q: Query tensor; as called from MultiQueryAttention it is
            (batch, seq_len, heads, dim).
        k: Key tensor; as called from MultiQueryAttention it is
            (batch, seq_len, 1, dim).
        seq_len: Number of positions; must match axis 1 of ``q`` and ``k``.
        dim: Size of the last axis of ``q`` and ``k``.

    Returns:
        Tuple ``(q_embed, k_embed)`` with the same shapes as the inputs.
    """
    half = dim // 2
    device = q.device

    positions = torch.arange(seq_len, device=device).float()
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).float() / dim))
    # theta[p, i] = p * inv_freq[i]
    theta = torch.outer(positions, inv_freq)

    # Shape (1, seq_len, 1, half) so the tables broadcast over batch and heads.
    sin = theta.sin()[None, :, None, :]
    cos = theta.cos()[None, :, None, :]

    def mix(t):
        lo, hi = t[..., :half], t[..., half:]
        return torch.cat([lo * cos + hi * sin, lo * sin - hi * cos], dim=-1)

    return mix(q), mix(k)

class MultiQueryAttention(nn.Module):
    """Causal multi-query self-attention.

    Every head has its own query projection, while a single key head and a
    single value head (each ``head_dim`` wide) are shared by all heads.

    Args:
        dim: Model width; size of the last axis of the input and output.
        heads: Number of query heads. ``dim`` must be divisible by it, and
            ``dim // heads`` must be even because the rotary step splits the
            head dimension into two halves.

    Raises:
        ValueError: If ``heads`` is not positive, ``dim`` is not divisible by
            ``heads``, or the resulting head dimension is odd. These
            configurations could never run ``forward`` successfully, so they
            are rejected at construction time with a clear message.
    """

    def __init__(self, dim, heads):
        super().__init__()
        if heads <= 0 or dim % heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by a positive number of heads ({heads})"
            )
        head_dim = dim // heads
        if head_dim % 2 != 0:
            raise ValueError(
                f"head dimension ({head_dim}) must be even for the rotary embedding"
            )

        self.heads = heads
        self.dim = dim
        self.head_dim = head_dim

        self.q = nn.Linear(dim, dim)
        # Keys and values are projected to ONE head only (multi-query attention).
        self.k = nn.Linear(dim, self.head_dim)
        self.v = nn.Linear(dim, self.head_dim)
        self.o = nn.Linear(dim, dim)

    def forward(self, x):
        """Attend causally over ``x`` of shape (B, T, C) and return (B, T, C)."""
        B, T, C = x.shape

        q = self.q(x).view(B, T, self.heads, self.head_dim)
        # Singleton head axis: einsum broadcasts it against the query heads.
        k = self.k(x).unsqueeze(2)
        v = self.v(x).unsqueeze(2)

        q, k = rotary_embedding(q, k, T, self.head_dim)

        att = torch.einsum("bthd,bThd->bhtT", q, k) / math.sqrt(self.head_dim)

        # Causal mask built directly on the input's device (no CPU allocation
        # plus transfer per call): entry [t, T'] is True when key T' lies in
        # the future of query t.
        positions = torch.arange(T, device=x.device)
        future = positions[None, :] > positions[:, None]
        att = att.masked_fill(future, float("-inf"))
        att = att.softmax(dim=-1)

        out = torch.einsum("bhtT,bThd->bthd", att, v)
        # Merge the heads back into the model dimension.
        out = out.reshape(B, T, C)
        return self.o(out)

class FeedForward(nn.Module):
    """Two-layer MLP applied to the last axis: Linear -> ReLU -> Linear.

    Args:
        dim: Input and output width.
        hidden: Width of the intermediate layer.
    """

    def __init__(self, dim, hidden):
        super().__init__()
        self.w1 = nn.Linear(dim, hidden)
        self.w2 = nn.Linear(hidden, dim)
        # Registered but not applied in forward(); it has no parameters.
        self.gate = nn.Sigmoid()

    def forward(self, x):
        """Return ``w2(relu(w1(x)))``; the last axis keeps size ``dim``."""
        activated = F.relu(self.w1(x))
        return self.w2(activated)

class Block(nn.Module):
    """One pre-norm transformer layer: attention then feed-forward, each residual.

    Args:
        dim: Model width.
        heads: Number of attention heads passed to MultiQueryAttention.
    """

    def __init__(self, dim, heads):
        super().__init__()
        ff_hidden = dim * 4
        self.norm1 = RMSNorm(dim)
        self.attn = MultiQueryAttention(dim, heads)
        self.norm2 = RMSNorm(dim)
        self.ff = FeedForward(dim, ff_hidden)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Each sub-layer sees a normalised input; its output is added back
        # onto the un-normalised stream.
        attended = self.attn(self.norm1(x))
        x = x + attended
        transformed = self.ff(self.norm2(x))
        return x + transformed

class TinyLM(nn.Module):
    """Small decoder-style language model built from a stack of ``Block`` layers.

    Args:
        vocab: Number of token ids; sets the embedding rows and the logit width.
        dim: Model width.
        layers: Number of ``Block`` layers.
        heads: Attention heads per block.
        max_len: Number of rows in the learned position table. ``forward``
            looks up positions ``0 .. T-1``, so inputs longer than this
            fail in the embedding lookup.
    """

    def __init__(self, vocab, dim=256, layers=6, heads=4, max_len=256):
        super().__init__()
        self.tok = nn.Embedding(vocab, dim)
        self.pos = nn.Embedding(max_len, dim)
        self.blocks = nn.ModuleList()
        for _ in range(layers):
            self.blocks.append(Block(dim, heads))
        self.norm = RMSNorm(dim)
        self.out = nn.Linear(dim, vocab)

    def forward(self, x):
        """Map token ids of shape (B, T) to logits of shape (B, T, vocab)."""
        _, seq_len = x.shape
        positions = torch.arange(seq_len, device=x.device)
        # The (T, dim) position rows broadcast over the batch axis.
        h = self.tok(x) + self.pos(positions)
        for layer in self.blocks:
            h = layer(h)
        return self.out(self.norm(h))


# Read the whole training corpus into memory as one string.
with open(DATA_FILE, encoding="utf-8") as corpus:
    text = corpus.read()

# Tokenise once up front; get_batch() later cuts training windows out of this tensor.
ids = torch.tensor(sp.encode(text, out_type=int), dtype=torch.long)
print("Tokens:", ids.size(0))



# ---- Training configuration ----
# Vocabulary size comes from the tokenizer so embeddings and logits match it.
VOCAB = sp.get_piece_size()
BATCH = 8    # sequences sampled per optimisation step
SEQ = 128    # tokens per training sequence
EPOCHS = 3   # passes of the fixed-length step loop below
LR = 1e-3    # AdamW learning rate

# ---- Model, optimiser, loss ----
model = TinyLM(vocab=VOCAB).to(torch.device("cpu"))
opt = torch.optim.AdamW(params=model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss()

def get_batch():
    """Sample ``BATCH`` random windows of ``SEQ`` tokens and their next-token targets.

    Reads the module-level ``ids`` tensor. Window starts are drawn uniformly
    from ``[0, len(ids) - SEQ - 1)`` (upper bound exclusive).

    Returns:
        Tuple ``(x, y)`` of ``(BATCH, SEQ)`` tensors where ``y`` is ``x``
        shifted one token to the right in the corpus.
    """
    starts = torch.randint(0, len(ids) - SEQ - 1, (BATCH,))
    # Row b gathers ids[starts[b] + 0 .. starts[b] + SEQ - 1] in a single index op.
    window = starts[:, None] + torch.arange(SEQ)
    x = ids[window]
    y = ids[window + 1]
    return x, y

# Each "epoch" is a fixed number of randomly sampled batches, not a full
# pass over the corpus.
for epoch in range(EPOCHS):
    for step in range(2000):
        # Clear gradients left over from the previous step before the new backward pass.
        opt.zero_grad()

        x, y = get_batch()
        logits = model(x)
        # Flatten batch and time so the loss sees one prediction per row.
        loss = loss_fn(logits.reshape(-1, VOCAB), y.reshape(-1))
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        if not step % 200:
            print(f"epoch {epoch} step {step}: loss {loss.item():.4f}")

# Persist only the weights; loading requires rebuilding TinyLM with the same sizes.
torch.save(model.state_dict(), "tinyLM.pt")
print("model kayit edildi: tinyLM.pt")


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: smll3.txt
  input_format: 
  model_prefix: tokenizer
  model_type: BPE
  vocab_size: 6000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_pri

Tokens: 2965704
epoch 0 step 0: loss 8.8980
epoch 0 step 200: loss 4.4125
epoch 0 step 400: loss 4.0906
epoch 0 step 600: loss 3.7137
epoch 0 step 800: loss 3.2911
epoch 0 step 1000: loss 3.2581
epoch 0 step 1200: loss 3.4629
epoch 0 step 1400: loss 3.4502
epoch 0 step 1600: loss 3.0243
epoch 0 step 1800: loss 3.4074
epoch 1 step 0: loss 3.0500
epoch 1 step 200: loss 2.9522
epoch 1 step 400: loss 2.9467
epoch 1 step 600: loss 3.0786
epoch 1 step 800: loss 2.8551
epoch 1 step 1000: loss 3.1611
epoch 1 step 1200: loss 2.9651
epoch 1 step 1400: loss 3.1160
epoch 1 step 1600: loss 3.0933
epoch 1 step 1800: loss 2.8437
epoch 2 step 0: loss 2.8395
epoch 2 step 200: loss 2.5759
epoch 2 step 400: loss 3.0929
epoch 2 step 600: loss 2.5147
epoch 2 step 800: loss 2.9133
epoch 2 step 1000: loss 2.6956
epoch 2 step 1200: loss 2.6062
epoch 2 step 1400: loss 2.7903
epoch 2 step 1600: loss 2.6630
epoch 2 step 1800: loss 2.6708
DONE. Saved tinyLM.pt
