In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm

In [2]:
import requests

url = "https://raw.githubusercontent.com/bilishim/deep-learning/refs/heads/master/yilmaz-degirmenci-siirler.txt"
with open("input.txt", "w", encoding='utf-8') as f:
    f.write(requests.get(url).text)

print("Veri indirildi.")

Veri indirildi.


In [3]:
with open("input.txt", "r", encoding='utf-8') as f:
    text = f.read()

print("Veri uzunluğu:", len(text))
print("İlk 100 karakter:")
print(text[:100])


Veri uzunluğu: 156211
İlk 100 karakter:
﻿YANKI

Bazen ayazda buz tutmuş ruhum
Senin ışığını yansıtan aynalarla ısınıyor,
Buzun mercekleşip v


In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Toplam farklı karakter sayısı (vocab size): {vocab_size}")

# İki yönlü haritalama
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])


Toplam farklı karakter sayısı (vocab size): 73


In [5]:
print(encode("Hello"))
print(decode(encode("Hello")))


[19, 38, 45, 45, 48]
Hello


In [17]:
'''
GPT modeli şunu öğrenir:

“Bana bir karakter dizisi ver, bir sonraki karakteri tahmin et.”
'''

block_size = 16  # Kaç karakterlik bir bağlamdan tahmin yapacak
data = torch.tensor(encode(text), dtype=torch.long)
print("Veri tensoru:", data[:20])

Veri tensoru: tensor([72, 32, 12, 24, 21, 20,  0,  0, 13, 34, 56, 38, 47,  1, 34, 55, 34, 56,
        37, 34])


In [18]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


In [19]:
def get_batch(split, batch_size=4):
    data_split = train_data if split == "train" else val_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    return x, y

# Örnek batch al
x, y = get_batch("train")
print("Input (x):", x)
print("Target (y):", y)
print("Decode input:", decode(x[0].tolist()))
print("Decode target:", decode(y[0].tolist()))


Input (x): tensor([[34, 54, 42,  1, 35, 42, 50,  1, 40, 61, 44, 55, 63, 56, 63, 51],
        [ 1, 37, 38,  1, 52, 38, 50, 44,  1, 38, 37, 38, 35, 42, 45, 46],
        [65, 42, 47,  1, 51, 34, 55, 67, 45, 67,  1, 47, 38, 39, 38, 51],
        [66,  0,  0, 13, 42, 56, 38,  1, 35, 38, 45, 44, 42, 45, 38, 50]])
Target (y): tensor([[54, 42,  1, 35, 42, 50,  1, 40, 61, 44, 55, 63, 56, 63, 51, 63],
        [37, 38,  1, 52, 38, 50, 44,  1, 38, 37, 38, 35, 42, 45, 46, 38],
        [42, 47,  1, 51, 34, 55, 67, 45, 67,  1, 47, 38, 39, 38, 51, 52],
        [ 0,  0, 13, 42, 56, 38,  1, 35, 38, 45, 44, 42, 45, 38, 50,  1]])
Decode input: avi bir gökyüzüs
Decode target: vi bir gökyüzüsü


In [20]:
vocab_size = len(stoi)  # karakter sayısı
embedding_dim = 64      # her token'ı temsil eden vektörün boyutu
n_heads = 8             # kaç başlı attention
n_layers = 4            # transformer block sayısı
block_size = 16          # bağlam uzunluğu


In [21]:
class SelfAttentionHead(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(embedding_dim, head_size, bias=False)
        self.query = nn.Linear(embedding_dim, head_size, bias=False)
        self.value = nn.Linear(embedding_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B, T, head_size)
        q = self.query(x)   # (B, T, head_size)

        # attention score = Q x K^T
        wei = q @ k.transpose(-2, -1) * C**-0.5  # scaled dot-product
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # causal mask
        wei = F.softmax(wei, dim=-1)

        v = self.value(x)
        out = wei @ v
        return out

'''
Q, K, V matrislerini ürettik

Q x K^T ile skor hesapladık

tril ile geleceği maskeledik

Skorları softmax ile normalize ettik

Son olarak score x V yaptık → çıktı
'''

'\nQ, K, V matrislerini ürettik\n\nQ x K^T ile skor hesapladık\n\ntril ile geleceği maskeledik\n\nSkorları softmax ile normalize ettik\n\nSon olarak score x V yaptık → çıktı\n'

In [22]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size, embedding_dim)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out


In [23]:
class FeedForward(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_dim, 4 * embedding_dim),
            nn.ReLU(),
            nn.Linear(4 * embedding_dim, embedding_dim)
        )

    def forward(self, x):
        return self.net(x)


In [24]:
class Block(nn.Module):
    def __init__(self, embedding_dim, n_heads):
        super().__init__()
        head_size = embedding_dim // n_heads
        self.sa = MultiHeadAttention(n_heads, head_size)
        self.ff = FeedForward(embedding_dim)
        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))  # Residual + LayerNorm
        x = x + self.ff(self.ln2(x))
        return x


In [25]:
class SimpleGPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.position_embedding = nn.Embedding(block_size, embedding_dim)
        self.blocks = nn.Sequential(*[Block(embedding_dim, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(embedding_dim)
        self.head = nn.Linear(embedding_dim, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)               # (B, T, C)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb                             # (B, T, C)
        x = self.blocks(x)                                # (B, T, C)
        x = self.ln_f(x)                                  # (B, T, C)
        logits = self.head(x)                             # (B, T, vocab_size)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss


In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SimpleGPT().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [27]:
max_iters = 2000
eval_interval = 100
batch_size = 32

for step in range(max_iters):
    # Değerlendirme
    if step % eval_interval == 0:
        model.eval()
        with torch.no_grad():
            xb, yb = get_batch("val", batch_size)
            xb, yb = xb.to(device), yb.to(device)
            _, loss_val = model(xb, yb)
        print(f"Step {step}: Validation loss {loss_val.item():.4f}")
        model.train()

    # Eğitim
    xb, yb = get_batch("train", batch_size)
    xb, yb = xb.to(device), yb.to(device)

    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


Step 0: Validation loss 4.3696
Step 100: Validation loss 2.6404
Step 200: Validation loss 2.5159
Step 300: Validation loss 2.3308
Step 400: Validation loss 2.3651
Step 500: Validation loss 2.1749
Step 600: Validation loss 2.1790
Step 700: Validation loss 2.1742
Step 800: Validation loss 2.1568
Step 900: Validation loss 2.2235
Step 1000: Validation loss 2.0497
Step 1100: Validation loss 2.1661
Step 1200: Validation loss 2.0563
Step 1300: Validation loss 2.1030
Step 1400: Validation loss 2.0046
Step 1500: Validation loss 2.1396
Step 1600: Validation loss 1.9917
Step 1700: Validation loss 2.0731
Step 1800: Validation loss 1.9541
Step 1900: Validation loss 2.0780


In [28]:
@torch.no_grad()
def generate(idx, max_new_tokens):
    model.eval()
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -block_size:]  # son block_size token
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :]  # son zaman adımındaki skorlar
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)  # örnekleme
        idx = torch.cat((idx, next_token), dim=1)
    return idx


In [31]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)  # boş başlangıç (0)
generated = generate(context, max_new_tokens=200)
print(decode(generated[0].tolist()))



Yüzünün atık,
Yaprakların bir nelesin
Ruh kışık zasmadan
Bazen bir insin atmınca tıranmakka olukta

AÇAKAMAN

İstediğinde tıkıştır attaydar kurkular
İçtebirde yersini yüzgarını anlam çıkan bir müjende


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')