# Fake GPT

## Hazırlık

In [1]:
# gerekli kütüphaneler
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import math
from dataclasses import dataclass

# Shared lookup tables used by every later cell.
# '.' sits at index 0; the encoder and the sampler below both treat token 0
# as the start/end-of-name marker.
alfabe = list('.abcçdefgğhıijklmnoöprsştuüvyz')
harf2idx = { harf:idx for idx, harf in enumerate(alfabe) }
idx2harf = { idx:harf for harf, idx in harf2idx.items() }

# Read the name list with an explicit encoding and close the handle afterwards.
# The alphabet contains non-ASCII Turkish letters, so relying on the platform
# default encoding can mis-decode them and break the harf2idx lookups.
# NOTE(review): assumes isimler.txt is UTF-8 encoded -- confirm.
with open("./isimler.txt", "r", encoding="utf-8") as _dosya:
  isimler = _dosya.read().splitlines()
maxUzunluk = max([len(isim) for isim in isimler])

def isle(isim):
  """Encode one name as a fixed-length (input, target) pair of index tensors.

  Both tensors have length ``maxUzunluk + 1``.
  Input:  token 0 first, then the encoded letters, then zeros.
  Target: the encoded letters, then a single 0 (end marker), then -1 for
  every remaining position.
  """
  ln = len(isim)
  enc = torch.tensor([harf2idx[h] for h in isim], dtype=torch.long)
  x = torch.zeros(maxUzunluk + 1, dtype=torch.long)
  # Start from "all ignored" and fill in the real targets.
  y = torch.full((maxUzunluk + 1,), -1, dtype=torch.long)
  x[1:ln + 1] = enc  # inputs are the letters shifted right by one
  y[:ln] = enc
  y[ln] = 0          # end-of-name marker right after the last letter
  return x, y

def xyOlustur(isimler):
  """Encode every name with ``isle`` and stack the results into two tensors.

  Prints the shapes of the stacked input and target tensors and returns
  them as ``(X, Y)``.
  """
  ciftler = [isle(isim) for isim in isimler]
  X = torch.stack([girdi for girdi, _ in ciftler])
  Y = torch.stack([hedef for _, hedef in ciftler])
  print(X.shape, Y.shape)
  return X, Y

# Vocabulary size and number of names, derived from the tables built above.
n_alfabe = len(alfabe)
n_isim = len(isimler)
# Bare tuple expression: shown as the notebook cell's output for a quick sanity check
# (counts, longest name length, first name and its encoded form).
n_isim, n_alfabe, maxUzunluk, isimler[0], isle(isimler[0])

(29996,
 30,
 30,
 'abaca',
 (tensor([0, 1, 2, 1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0]),
  tensor([ 1,  2,  1,  3,  1,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])))

## Veri Kümeleri

In [2]:
# Encode the complete name list.
tumX, tumY = xyOlustur(isimler)
# Seed Python's RNG so the 80% sample used for training is reproducible.
random.seed(5) ; karisik = random.sample(isimler, k = int(0.8 * n_isim))
# Encode only the sampled names; these tensors are the training set.
trnX, trnY = xyOlustur(karisik)

torch.Size([29996, 31]) torch.Size([29996, 31])
torch.Size([23996, 31]) torch.Size([23996, 31])


## Soyutlamalar

In [35]:
class NewGELU(nn.Module): # https://arxiv.org/abs/1606.08415
  """Tanh-based approximation of the GELU activation, applied element-wise."""

  def forward(self, x):
    # Inner polynomial x + 0.044715 * x^3 from the approximation formula.
    polinom = x + 0.044715 * torch.pow(x, 3.0)
    kapi = 1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * polinom)
    return 0.5 * x * kapi

class CausalSelfAttention(nn.Module): # as in GPT-2
  """Multi-head self-attention in which each position attends only to itself
  and to earlier positions.

  ``config`` must provide ``n_embd``, ``n_head`` (dividing ``n_embd``) and
  ``block_size`` (the largest sequence length the mask covers).
  """

  def __init__(self, config):
    super().__init__()
    assert config.n_embd % config.n_head == 0
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    # One linear layer produces query, key and value for all heads at once.
    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
    # Projection applied after the heads are concatenated again.
    self.c_proj = nn.Linear(config.n_embd, config.n_embd)
    # Lower-triangular mask, stored as a buffer named "bias" with shape
    # (1, 1, block_size, block_size); zeros mark the disallowed (future) positions.
    ucgen = torch.tril(torch.ones(config.block_size, config.block_size))
    self.register_buffer("bias", ucgen.view(1, 1, config.block_size, config.block_size))

  def forward(self, x):
    B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
    hs = C // self.n_head # per-head feature size

    def basliklara_bol(t):
      # (B, T, C) -> (B, nh, T, hs): heads become a batch-like dimension
      return t.view(B, T, self.n_head, hs).transpose(1, 2)

    q, k, v = (basliklara_bol(t) for t in self.c_attn(x).split(self.n_embd, dim=2))

    # Scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
    skor = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(hs))
    # Future positions get -inf so softmax assigns them zero weight.
    gelecek = self.bias[:, :, :T, :T] == 0
    agirlik = F.softmax(skor.masked_fill(gelecek, float('-inf')), dim=-1)

    # Weighted sum of values, then put the heads back side by side: (B, T, C)
    birlesik = (agirlik @ v).transpose(1, 2).contiguous().view(B, T, C)
    return self.c_proj(birlesik)

class Block(nn.Module):
  """One transformer layer: LayerNorm -> causal self-attention -> residual add,
  followed by LayerNorm -> MLP -> residual add.

  ``config`` must provide ``n_embd`` plus whatever ``CausalSelfAttention`` reads.
  """

  def __init__(self, config):
    super().__init__()
    self.ln_1 = nn.LayerNorm(config.n_embd)
    self.attn = CausalSelfAttention(config)
    self.ln_2 = nn.LayerNorm(config.n_embd)
    # Feed-forward sub-layer: expand to 4 * n_embd, activate, project back.
    self.mlp = nn.ModuleDict(dict(
      c_fc  = nn.Linear(config.n_embd, 4 * config.n_embd),
      c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
      act   = NewGELU(),
    ))

  def mlpf(self, x):
    """MLP forward pass.

    Implemented as a method rather than a lambda stored on the instance: a
    lambda closes over the *original* ``self.mlp``, so a ``copy.deepcopy`` of
    the block would keep running the original's weights, and the module could
    not be pickled.
    """
    m = self.mlp
    return m.c_proj(m.act(m.c_fc(x)))

  def forward(self, x):
    x = x + self.attn(self.ln_1(x))
    x = x + self.mlpf(self.ln_2(x))
    return x

class Transformer(nn.Module): # as seen in GPT-2
  """Decoder-only transformer language model over token indices.

  ``config`` must provide ``vocab_size``, ``block_size``, ``n_embd`` and
  ``n_layer`` (plus whatever ``Block`` reads).
  """

  def __init__(self, config):
    super().__init__()
    self.block_size = config.block_size

    self.transformer = nn.ModuleDict(dict(
      wte = nn.Embedding(config.vocab_size, config.n_embd),
      wpe = nn.Embedding(config.block_size, config.n_embd),
      h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
      ln_f = nn.LayerNorm(config.n_embd),
    ))
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

  def forward(self, idx, targets=None):
    """Return ``(logits, loss)`` for a 2-D batch of token indices.

    ``logits`` has shape (b, t, vocab_size). ``loss`` is ``None`` when no
    targets are given; otherwise it is the cross-entropy over all positions,
    skipping targets equal to -1.
    """
    _, t = idx.size()
    assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
    # Build the positions on the same device as the input so the model also
    # works after being moved off the default device.
    pos = torch.arange(0, t, dtype=torch.long, device=idx.device).unsqueeze(0) # shape (1, t)

    # forward the GPT model itself
    tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
    pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
    x = tok_emb + pos_emb
    for block in self.transformer.h:
      x = block(x)
    x = self.transformer.ln_f(x)
    logits = self.lm_head(x)
    loss = None if targets is None else F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

    return logits, loss

  @torch.inference_mode()
  def generate(self, n_sample):
    """Sample ``n_sample`` sequences and return them lazily as lists of token indices.

    Every sequence starts from token 0 and is extended one sampled token at a
    time until it is ``block_size`` long. Each returned list drops the leading
    start token and is cut before the first generated 0.

    The module's train/eval mode is restored to what it was before the call.
    """
    def intrim(x):
      tokens = x[1:].tolist()
      return tokens[:tokens.index(0)] if 0 in tokens else tokens

    was_training = self.training
    self.eval()
    try:
      # Start every sample with token 0, on the same device as the weights.
      gen = torch.zeros(n_sample, 1, dtype=torch.long, device=self.lm_head.weight.device)
      for _ in range(self.block_size - 1):
        # Keep at most the last block_size tokens as context.
        logits, _loss = self(gen[:, -self.block_size:])
        nxt = torch.multinomial(F.softmax(logits[:, -1, :], dim = -1), num_samples = 1)
        gen = torch.cat((gen, nxt), dim = 1)
    finally:
      # Restore the caller's mode instead of forcing training mode.
      self.train(was_training)
    return map(intrim, gen)

## Model ve Parametreler

In [43]:
# Seed torch's RNG before the model is built in this cell so its random initialisation is reproducible.
torch.manual_seed(5)

@dataclass
class ModelConfig:
  """Hyper-parameters for the model and its optimizer.

  ``n_embd`` defaults to ``n_head * 16`` of *this instance*. It is resolved in
  ``__post_init__`` because a class-body expression would be evaluated only
  once, with the default ``n_head``, and would ignore an overridden ``n_head``.
  """
  block_size: int = None  # maximum sequence length; must be supplied by the caller
  vocab_size: int = None  # number of distinct tokens; must be supplied by the caller
  batch_size: int = 32
  n_layer: int = 4
  n_head: int = 4
  n_embd: int = None  # None -> derived as n_head * 16 after init
  weight_decay: float = 0.01
  learning_rate: float = 5e-4

  def __post_init__(self):
    if self.n_embd is None:
      self.n_embd = self.n_head * 16

# Vocabulary is the alphabet; block size is the longest name plus one extra position
# (the encoded sequences built earlier have length maxUzunluk + 1).
config = ModelConfig(vocab_size = n_alfabe, block_size = maxUzunluk + 1)
model = Transformer(config)
# Total number of parameter elements in the model.
print(sum(p.numel() for p in model.parameters()))

205888


In [44]:
# Re-seed so the random mini-batch selection below is reproducible.
torch.manual_seed(5)

optimizer = torch.optim.AdamW(model.parameters(), lr = config.learning_rate, weight_decay = config.weight_decay, betas = (0.9, 0.99), eps = 1e-8)

# 3001 steps so that the final step (i == 3000) is also reported.
for i in range(3001):
  # Draw batch_size random row indices (with replacement) from the training set.
  bat = torch.randint(0, trnX.shape[0], (config.batch_size,))
  _, loss = model(trnX[bat], trnY[bat])
  # Clear gradients from the previous step before back-propagating.
  model.zero_grad(set_to_none = True)
  loss.backward()
  optimizer.step()

  # Report the current mini-batch loss every 1000 steps.
  if i % 1000 == 0: 
    print(loss.item())

3.607516050338745
2.0184004306793213
1.9622799158096313
1.787206768989563


In [45]:
# Sample 50 names and sort each one by whether it already exists in the data:
# in the training sample, only in the full list, or nowhere.
torch.manual_seed(5)
inTrn, inTum, yeni = [], [], []
for x in model.generate(50):
  ornek = ''.join([idx2harf[h] for h in x])
  # The training sample is checked first, so inTum only receives names that
  # are in the full list but were not drawn for training.
  if ornek in karisik:
    kova = inTrn
  elif ornek in isimler:
    kova = inTum
  else:
    kova = yeni
  kova.append(ornek)

# Print each non-empty group under its own header.
cizgi = '-'*80
for lst, yer in ((inTrn, 'eğitim setinde'), (inTum, 'validasyon setinde'), (yeni, 'yeni')):
  if lst:
    print(cizgi)
    print(f"{len(lst)} örnek {yer}:")
    print(cizgi)
    for ornek in lst:
      print(ornek)

--------------------------------------------------------------------------------
3 örnek eğitim setinde:
--------------------------------------------------------------------------------
karandere
devret
karacalar
--------------------------------------------------------------------------------
1 örnek validasyon setinde:
--------------------------------------------------------------------------------
celi
--------------------------------------------------------------------------------
46 örnek yeni:
--------------------------------------------------------------------------------
karaeş
çiğşik
behmancılık
akşaytar
karadım
çatıalık
kaynakçı
elbey
çivrekiş
dançukuna
çayhanlı
şahrif
kourunyurtlu
taltar
çanakdıncık
yencigözü
eliklevkuru
kahkaten
mizileliler
mudgun
etek
kökser
kalmak
bozuni
dakçulkırdın
bekdivançır
yivris
yakaserli
korumuşağı
soğanbeyli
muslur
hacıan
eldeytelik
esamentış
sevece
camlan
ilcacion
düngimzir
emiktepe
eteköy
karabek
yaylıcak
eskiötek
yukarıalıoğlu
balüydüec
çörpeku