In [2]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
with open('/content/drive/MyDrive/Algorithms/shakespeare.txt', 'r', encoding='utf-8') as f:
     text = f.read()

In [4]:
print(text[:1000])

  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep sunken eyes,
  Were an all-e

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"&'(),-.0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz|}
84


In [6]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hii there"))
print(decode(encode("hii there")))

[63, 64, 64, 1, 75, 63, 60, 73, 60]
hii there


In [7]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([5436475]) torch.int64
tensor([ 1,  1, 31, 73, 70, 68,  1, 61, 56, 64, 73, 60, 74, 75,  1, 58, 73, 60,
        56, 75, 76, 73, 60, 74,  1, 78, 60,  1, 59, 60, 74, 64, 73, 60,  1, 64,
        69, 58, 73, 60, 56, 74, 60,  8,  0,  1,  1, 45, 63, 56, 75,  1, 75, 63,
        60, 73, 60, 57, 80,  1, 57, 60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74,
        60,  1, 68, 64, 62, 63, 75,  1, 69, 60, 77, 60, 73,  1, 59, 64, 60,  8,
         0,  1,  1, 27, 76, 75,  1, 56, 74,  1])


In [8]:
n = int(len(data)*0.9)
train_data = data[:n]
val_data = data[n:]

In [9]:
block_size = 10
train_data[:block_size+1]

tensor([ 1,  1, 31, 73, 70, 68,  1, 61, 56, 64, 73])

In [10]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([1]) the target: 1
when input is tensor([1, 1]) the target: 31
when input is tensor([ 1,  1, 31]) the target: 73
when input is tensor([ 1,  1, 31, 73]) the target: 70
when input is tensor([ 1,  1, 31, 73, 70]) the target: 68
when input is tensor([ 1,  1, 31, 73, 70, 68]) the target: 1
when input is tensor([ 1,  1, 31, 73, 70, 68,  1]) the target: 61
when input is tensor([ 1,  1, 31, 73, 70, 68,  1, 61]) the target: 56
when input is tensor([ 1,  1, 31, 73, 70, 68,  1, 61, 56]) the target: 64
when input is tensor([ 1,  1, 31, 73, 70, 68,  1, 61, 56, 64]) the target: 73


In [77]:
torch.manual_seed(1337)
batch_size = 16
block_size = 32

def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x.to(device), y.to(device)

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

In [74]:
@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [78]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embd = 32
learning_rate = 3e-4
max_iters = 5000
eval_interval = 200
eval_iters = 200
n_head = 4
n_layer = 4
dropout = 0.2


class Head(nn.Module):
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B, T, C = x.shape
    k = self.key(x)
    q = self.query(x)

    w = q @ k.transpose(-2, -1) * C**-0.5
    w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    w = F.softmax(w, dim=-1)
    w = self.dropout(w)
    v = self.value(x)

    out = w @ v
    return out

class MultiHead(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

class FeedForward(nn.Module):
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4*n_embd),
        nn.ReLU(),
        nn.Linear(4*n_embd, n_embd),
        nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHead(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)]        )
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):

    B, T = idx.shape
    token_emb = self.token_embedding_table(idx)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))
    x = token_emb + pos_emb
    x = self.blocks(x)
    x = self.ln_f(x)
    logits = self.lm_head(x)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx


m = BigramLanguageModel()
m = m.to(device)

In [79]:
idx = torch.zeros((1,1), dtype=torch.long, device=device)
out = m.generate(idx, max_new_tokens=100)
print(decode(out[0].tolist()))


GeyRxJ0;xDRXF>JewNH;8rTP6GCwexP77&]AUL"RDjsnBr9JzTWAjU`e06b
Q!f>&Mm'&LT}AVwn"o79)WW,&bd-yU:`HsT&29[A


In [80]:
# batch_size = 32
# max_iters = 1000
# eval_interval = 100

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

for steps in range(max_iters):


  if steps % eval_interval == 0 or steps == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  xb, yb = get_batch('train')

  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())

step <built-in function iter>: train loss 4.5670, val loss 4.5631
step <built-in function iter>: train loss 3.1678, val loss 3.1955
step <built-in function iter>: train loss 2.9312, val loss 2.9348
step <built-in function iter>: train loss 2.7363, val loss 2.7619
step <built-in function iter>: train loss 2.6155, val loss 2.6246
step <built-in function iter>: train loss 2.5423, val loss 2.5409
step <built-in function iter>: train loss 2.4828, val loss 2.4979
step <built-in function iter>: train loss 2.4506, val loss 2.4660
step <built-in function iter>: train loss 2.4171, val loss 2.4380
step <built-in function iter>: train loss 2.4023, val loss 2.4097
step <built-in function iter>: train loss 2.3703, val loss 2.3899
step <built-in function iter>: train loss 2.3532, val loss 2.3717
step <built-in function iter>: train loss 2.3378, val loss 2.3539
step <built-in function iter>: train loss 2.3362, val loss 2.3384
step <built-in function iter>: train loss 2.3090, val loss 2.3350
step <buil

In [81]:
idx = torch.zeros((1,1), dtype=torch.long, device = device)
out = m.generate(idx, max_new_tokens=500)
print(decode(out[0].tolist()))


   PAR. ithins whins sowth srad a indes e Anerttes
        Aoverf. ith llle bue hasins su bemeneeee che.
     INErblllow y thonmbr lifll beas
     mpe'land fon stoth Corist, meresat,
     Chef thanis wo there winorow sg seame polou
    f  tHevou ve sef
  Spleirst y thay the a here? I I thencho,
       Noutht thof my saord hha
         Thapour
 SInd re wowl an Wis, libang? lah, ih estete forp. mere.eaie aninds f.  Anthe nowy beastormy her, t ta bedeavef f pak hetiflitousg his; y bylin cownd
     
