In [19]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Run on Apple's Metal backend (MPS) when PyTorch reports it as available; otherwise use the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(device)

# --- Hyperparameters ---
block_size = 8         # tokens per context window drawn by get_batch
batch_size = 4         # context windows per batch
max_iters = 1000       # optimisation steps run by the training loop
learning_rate = 3e-4   # learning rate handed to the optimizer
eval_iters = 250       # batches averaged per split in estimate_loss
dropout = 0.2          # dropout probability (not referenced by the cells shown here)
# NOTE: no separate eval_interval is defined; the training loop reuses
# eval_iters as its reporting interval.

mps


In [2]:
# Read the whole book into memory as a single string.
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as book_file:
    text = book_file.read()
# Report the corpus size in characters.
print(len(text))

232313


In [3]:
# Sanity check: the corpus is held as one plain Python string.
type(text)

str

Printing the first 200 characters of the text:

In [4]:
# Peek at the opening of the file to confirm it loaded as expected.
print(text[:200])






  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC.


## Encoding and Decoding

In [5]:
# Vocabulary: every distinct character that occurs in the text, in sorted order.
unique_chars = set(text)
chars = sorted(unique_chars)
print(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
# Number of distinct characters; used as the model's vocabulary size.
vocab_size = len(chars)
print(vocab_size)

80


In [7]:
# Character <-> integer id lookup tables; ids follow the sorted order of `chars`.
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Convert a string into a list of integer token ids.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Convert an iterable of integer token ids back into a string.

    Raises KeyError if an id has no entry in `int_to_string`.
    """
    return ''.join(int_to_string[i] for i in l)

In [8]:
# Round-trip demo, part 1: characters -> integer ids.
encoded_hello = encode('hello')
encoded_hello

[61, 58, 65, 65, 68]

In [9]:
# Round-trip demo, part 2: integer ids -> the original characters.
decoded_hello = decode(encoded_hello)
decoded_hello

'hello'

In [10]:
# Encode the entire corpus once and keep it as a 1-D tensor of int64 (torch.long) token ids.
data = torch.tensor(encode(text), dtype = torch.long)

In [11]:
# Show the first 100 token ids of the encoded corpus.
print(data[:100])

tensor([ 0,  0,  0,  0,  0,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,
         1, 44, 32, 29,  1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,
         0,  1,  1, 26, 49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1,
        26, 25, 45, 37,  0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1,
        44, 32, 29,  1, 47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1,
        44, 32, 29,  1, 36, 25, 38, 28,  1, 39])


## Training - Validation Split

In [12]:
# Split point: the first 80% of the token stream is used for training,
# the remaining 20% is held out for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. Returns `(x, y)`, each holding `batch_size` windows of
    `block_size` token ids moved to `device`; `y` is the same window as `x`
    shifted one position later in the source sequence.
    """
    source = val_data
    if split == 'train':
        source = train_data
    # Random start offsets; the upper bound leaves room for a full window
    # plus the one extra token needed by the shifted targets.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y
# Draw one training batch and display it: inputs (with their shape) and targets.
x, y = get_batch('train')
print('inputs:', x.shape, x, sep='\n')
print('targets:', y, sep='\n')

inputs:
torch.Size([4, 8])
tensor([[78,  1, 55, 68, 57, 78, 11,  1],
        [68, 78,  1, 54, 60, 54, 62, 67],
        [ 1, 54,  1, 72, 73, 54, 71,  9],
        [61, 54, 73,  0, 69, 62, 56, 73]], device='mps:0')
targets:
tensor([[ 1, 55, 68, 57, 78, 11,  1, 44],
        [78,  1, 54, 60, 54, 62, 67,  1],
        [54,  1, 72, 73, 54, 71,  9,  1],
        [54, 73,  0, 69, 62, 56, 73, 74]], device='mps:0')


## Bigram

In [13]:
# One window of inputs and the same window shifted by one token as targets.
x, y = train_data[:block_size], train_data[1:block_size + 1]

# Each prefix of the window is a training example whose target is the next token.
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print('when input is', context, 'target is', target)

when input is tensor([0]) target is tensor(0)
when input is tensor([0, 0]) target is tensor(0)
when input is tensor([0, 0, 0]) target is tensor(0)
when input is tensor([0, 0, 0, 0]) target is tensor(0)
when input is tensor([0, 0, 0, 0, 0]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1]) target is tensor(28)
when input is tensor([ 0,  0,  0,  0,  0,  1,  1, 28]) target is tensor(39)


In [14]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global `model` on both data splits.

    For each of 'train' and 'val', `eval_iters` random batches are drawn with
    `get_batch` and their losses averaged. Gradients are disabled by the
    decorator.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    # Remember the current mode so evaluation does not silently flip a model
    # that was already in eval mode back to training mode.
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the original mode even if evaluation is interrupted or fails.
        model.train(was_training)
    return out

In [15]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: the current token alone determines the next-token logits.

    The only parameters are a (vocab_size, vocab_size) embedding table, so
    row `i` holds the unnormalised next-token scores given current token `i`.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets = None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: integer token ids of shape (B, T).
            targets: optional token ids of shape (B, T) giving the expected
                next token at every position.

        Returns:
            `(logits, loss)`. Without targets, `logits` has shape (B, T, C)
            and `loss` is None. With targets, `logits` is flattened to
            (B*T, C) and `loss` is the scalar cross-entropy over all positions.
        """
        logits = self.token_embedding_table(index)  # (B, T, C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape        # batch size x time steps x channels (vocab size)
        # F.cross_entropy expects (N, C) logits and (N,) targets, so flatten
        # the batch and time dimensions together.
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, index, max_new_tokens):
        """Extend `index` by sampling `max_new_tokens` tokens one at a time.

        Args:
            index: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # Get the predictions; calling the module (not .forward) keeps hooks working.
            logits, _ = self(index)
            # Focus only on the last time step.
            logits = logits[:, -1, :]  # (B, C)
            # Softmax over the last dimension turns logits into probabilities.
            probs = F.softmax(logits, dim = -1)  # (B, C)
            # Sample the next token from that distribution.
            index_next = torch.multinomial(probs, num_samples = 1)  # (B, 1)
            # Append the sampled token to the running sequence.
            index = torch.cat((index, index_next), dim = -1)  # (B, T+1)
        return index

# Build the model and move it to the selected device; `m` is the value
# returned by `.to(device)`.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Sample 500 tokens from the untrained model, starting from a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)
        


 xk(5a("?-!rqp8tGpe7CM,CYb,_ ?gp902t*ReiLJ-u:BYI"SO72CQ1UrnKB&ogq]U]_[Cjdu ?qS
scvfw7w'  dO-Z.'xhYIWV-i-!N
 U*sco(6op&]'emqU6TA3Xn4v7NyapI3PHFewcKf_te9m"RBzP]_obDVl2aR.vn?OFgxcx63fqWvf&oX m"7"BYIOf9LOP1fTnY8*h2KHqAW[;PpFeu;"KU"x4Sjp1ya?xpE54!Jx(zmj(,DcpYcGL;d0V"Rt;d2yuzcvf(n[FGNjDgpIsr];v(HP,kepxHRoDW9cj(zy8Hq?Whu
)"D.7h;cvfhJdTwrau.!Tz5O7[ckE4rypFgcs'*Rc5ThM"XI wiKcRihW9hk?gX4xq(roUrqeAjB_L:W
OX'MS]o5kW2LljB_QX&tddy"0y.xa2aOf&xC2BkEA(AeC4qwjAa:p)bQ2L1Rku6zw7n0qUrpxji,m"CzWgaan!mqLz"WDu:p
d2ZE:W


### PyTorch Optimizer

In [25]:
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

# The loop variable is named `step` (not `iter`) so the builtin iter() is not
# shadowed in the notebook's global namespace for later cells.
for step in range(max_iters):
    # eval_iters doubles as the reporting interval here: every eval_iters
    # steps the train/val losses are re-estimated and printed.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # Sample a batch of training data.
    xb, yb = get_batch('train')

    # Forward pass (calling the module rather than .forward keeps hooks
    # working), then backpropagate and update the parameters.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()
# Loss of the final mini-batch only; expect it to be noisier than the averaged estimates above.
print(loss.item())

step: 0, train loss: 2.642, val loss: 2.668
step: 250, train loss: 2.628, val loss: 2.649
step: 500, train loss: 2.613, val loss: 2.652
step: 750, train loss: 2.594, val loss: 2.647
2.84075927734375


In [23]:
# Sample 500 tokens again, starting from a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)


Gathe, OMIixNe bofov;cy. ho upig texitss gl--leyBisave cO"ishan..]V5UZMOZ pld theqSyoley.10T7the bQUtcu tcowherd b -! ou  an4erhe winn]3B'SFjubry; o isoua
ake 8me bl.IXA2CHe werebDY ans fl!qwhaflth suront th, bero b."O0;vwewod0tinetso ZV1lidin S6FIld no teno JQo.
"Bly d _K0Kinu "W[;&ZE, shangspE?])y sthiouf "Co cloirivfJt_ufit g ngyeng y an pyo w.ou.5Pe top lugols whatte!ven aiz-is?andd h:
ed oithiprlo['_Ioferc lowithopIste cy WL"W
DOFGhoome.9S&1
the up
y mewanim  us]XXXr lsm llenoghewzWAGpF4, t


 * **Gradient Descent** iteratively adjusts the model parameters in the direction of the steepest descent of the loss function.
 * **Momentum** *(an extension of stochastic gradient descent)* helps smooth out updates and allows the optimizer to keep moving in the right direction even when individual gradients change direction; this is particularly useful for training deep neural networks.
 * **RMSprop** helps to avoid oscillations in parameter updates and helps in faster convergence.
 * **Adam** combines Momentum and RMSprop and is a default optimizer for Deep Learning Models.
 * **AdamW** is a modification of the Adam optimizer that adds weight decay to the parameter updates, which helps to regularize the model.

**This project uses AdamW optimizer.**

### Loss Reporting