In [125]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
block_size = 8
batch_size = 4
max_iters = 1000
eval_iters = 250
learning_rate = 3e-4

cuda


In [126]:
with open("wizard_of_oz.txt", "r", encoding="utf-8") as f:
    text = f.read()
print(len(text))

232310


In [127]:
print(text[:200])

﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [128]:
chars = sorted(set(text))
print(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [129]:
vocab_size = len(chars)

In [130]:
# Need to tokenize
# Tokenizer consist of encoder and decoder
# Encoder will convert each element of char array to an integer

string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [131]:
encode("hello")

[61, 58, 65, 65, 68]

In [132]:
decode(encode("hello"))

'hello'

<br>This is a character level tokenizer. We have a small vocabulary and lot of characters to encode decode.
If we use a word level tokenizer, we need to have large amount of vocabulary.<br>
<br> PyTorch will help with math, matrices, algebra, etc

In [133]:
data = torch.tensor(encode(text), dtype=torch.long)
# Long Sequence of Tensor Integers

In [134]:
print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


<br><br>Splitting into Training Set and Validation Set

In [135]:
block_size = 8
batch_size = 4
max_iters = 1000
learning_rate = 3e-4
eval_iters = 250

In [136]:
n = int(0.8*len(data))
train_data = data[:n]
valid_data = data[n:]

def get_batch(split):
    data = train_data if split=="train" else valid_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y

x, y = get_batch('train')
print("inputs:")
print(x)
print("targets:")
print(y)

tensor([106141, 139019, 105514,   1846])
inputs:
tensor([[54, 67, 57,  0, 61, 58, 54, 57],
        [61, 58, 67,  9,  3,  1, 72, 54],
        [ 1, 73, 61, 58,  1, 40, 78, 71],
        [57, 58, 66, 54, 67, 57, 58, 57]], device='cuda:0')
targets:
tensor([[67, 57,  0, 61, 58, 54, 57, 58],
        [58, 67,  9,  3,  1, 72, 54, 62],
        [73, 61, 58,  1, 40, 78, 71, 54],
        [58, 66, 54, 67, 57, 58, 57,  1]], device='cuda:0')


<br><br>Creating Blocks

In [137]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context}, target is {target}")

when input is tensor([80]), target is 1
when input is tensor([80,  1]), target is 1
when input is tensor([80,  1,  1]), target is 28
when input is tensor([80,  1,  1, 28]), target is 39
when input is tensor([80,  1,  1, 28, 39]), target is 42
when input is tensor([80,  1,  1, 28, 39, 42]), target is 39
when input is tensor([80,  1,  1, 28, 39, 42, 39]), target is 44
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]), target is 32


<br> Check if gpu is available

In [138]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [139]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

block_size and batch_size will become the hyperparameter to help us with optimally training and executing<br>
CPU does sequentially, hence can be very slow<br>
We need to execute parallely<br>
• Block size will let us know how many characters<br> 
• Batch size will let us no how many sequences are being run parallely
<br><br>
Logits are the raw o/p of models before they are converted to probabilities

In [140]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # Making an Embedding Table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            # Batch, Time(Unknown), Channels(vocab_size)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B,C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B,C)
            # sample from distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # append sampled index to running sequence
            index = torch.cat((index, index_next), dim=1) # (B,T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


,cckH'W5,,l]p:gai3 2Hz1kWi&H(7klxSrbxrg:m0&O&V7[?V9zsJN*4j"nFva[
KiNxaP6Vx
(31VDELg'c2A!d&!)f!K.h3D6st2KDERymfSP﻿mG*cIYh,f:7q53x;&31:N5pdSUQ*2;1kC2YB*42v.c"3:h5,l[J7n-Aotah&h;_m﻿e0mnXMTfW-"mC,Ib[UoVl-"yu-,*oV7yEVTzpzAmCggzZ5Xrf4LELr4HL1:p'e)v9&sJiun
*tYjNx:gB-H9F_:h]7OzIlpFQz&znn
-3s-(i"ekD]hxlvo"nbaR"NF?TlYidhCFwdi*kEN7Rn-f0B4[Wk6Rpeh&&o4﻿pPkDdx]Pnd;sNlaRR4lpOj[tV9&!R:G?HNbKXnQf.hUYvMP6p]gZtgUELwV)yXG﻿ml6ju!WVNAO"tH-od-qqJPVKYzCRv
(7-DKR3Y46K5,,p]Lr-V8F)makR)b]lYC2zH[2]G:w,S]GMtkkxl-!cBz b*5789


<br><br><br>Creating a PyTorch Optimizer
<br> Ran this multiple times to reduce loss

In [143]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Standard Training Loop Architecture for Models
for iter in range(max_iters):

    if iter%eval_iters==0:
        losses = estimate_loss()
        print(f"step-{iter}, loss-{losses}")
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

tensor([ 99757,  51072, 105179,  20057])
tensor([55284,  8527, 10223, 71644])
tensor([135521, 158212,  26778,  48090])
tensor([ 67551, 111595,  72978, 124104])
tensor([166413, 149491, 106994, 138314])
tensor([ 59371,  55177, 117697, 168631])
tensor([29280, 13142, 35660, 33033])
tensor([182095,   7895, 125694,  22815])
tensor([14095, 85444, 91082, 71463])
tensor([96765, 65157, 23543,  5404])
tensor([ 70236, 101646,  77482, 161357])
tensor([179222,  88633,  42691,  89067])
tensor([121714, 152593, 168581,  70509])
tensor([159805,  65808, 180927,  91998])
tensor([  4238,  29904,  58299, 139913])
tensor([40562, 95198, 76366, 12777])
tensor([ 65751,  85780, 167751,  82523])
tensor([ 37554,  68710,  16209, 104909])
tensor([163561, 127346,  44695, 154809])
tensor([ 57831, 135462,  37851, 145494])
tensor([172841,  48735,  68721,  99540])
tensor([87194, 82382, 14366, 11367])
tensor([ 91772, 181236,   6219,  55032])
tensor([ 98031,  83184, 161879,   3560])
tensor([110857,  71465,  77724, 127443])

In [142]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chats = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


,cckH'W5,,l]p:gai3 2Hz1kWi&H(7klxSrbxrg:m0&O&V7[?V9zsJN*4j"nFva[
KiNxaP6Vx
(31VDELg'c2A!d&!)f!K.h3D6st2KDERymfSP﻿mG*cIYh,f:7q53x;&31:N5pdSUQ*2;1kC2YB*42v.c"3:h5,l[J7n-Aotah&h;_m﻿e0mnXMTfW-"mC,Ib[UoVl-"yu-,*oV7yEVTzpzAmCggzZ5Xrf4LELr4HL1:p'e)v9&sJiun
*tYjNx:gB-H9F_:h]7OzIlpFQz&znn
-3s-(i"ekD]hxlvo"nbaR"NF?TlYidhCFwdi*kEN7Rn-f0B4[Wk6Rpeh&&o4﻿pPkDdx]Pnd;sNlaRR4lpOj[tV9&!R:G?HNbKXnQf.hUYvMP6p]gZtgUELwV)yXG﻿ml6ju!WVNAO"tH-od-qqJPVKYzCRv
(7-DKR3Y46K5,,p]Lr-V8F)makR)b]lYC2zH[2]G:w,S]GMtkkxl-!cBz b*5789
