In [17]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters shared by the batching and training cells.
block_size = 8  # number of tokens in each sampled context window
batch_size = 4  # number of windows sampled per batch

# Seed PyTorch's global RNG so that batch sampling, parameter initialisation
# and multinomial text sampling are reproducible on a fresh kernel run.
torch.manual_seed(1337)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [10]:
# Read the whole book into memory as a single string (decoded as UTF-8).
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Show the first 200 characters as a quick sanity check of the load.
print(text[:200])

DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YO


In [11]:
# Character-level vocabulary: every distinct character in the corpus,
# in sorted order so that ids are stable from run to run.
chars = list(set(text))
chars.sort()
vocab_size = len(chars)
print(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [12]:
# Lookup tables between characters and their integer ids (position in `chars`).
string_to_int = dict((ch, i) for i, ch in enumerate(chars))
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of `s`."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled out by the integer ids in `l`."""
    return ''.join(int_to_string[i] for i in l)


print(encode('hello'))

[61, 58, 65, 65, 68]


In [13]:
# Round-trip check: encoding and then decoding should print the original string.
encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)
print(decoded_hello)

hello


In [14]:
# Encode the entire corpus into a 1-D tensor of int64 character ids.
data = torch.tensor(encode(text), dtype=torch.long)
# Peek at the first 100 ids.
print(data[:100])

tensor([28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47, 33,
        50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,  0,  0,
         1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,  0,  1,
         1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47, 33, 50,
        25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36, 25, 38,
        28,  1, 39, 30,  1, 39, 50,  9,  1, 39])


In [31]:
# Hold out the final 20% of the corpus for validation; the first 80% is for training.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. Returns `(x, y)`, each of shape (batch_size, block_size) and
    moved to `device`; `y` is `x` shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random window starts; the upper bound leaves room for the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

# Draw one training batch and show the inputs next to their shifted targets.
x, y = get_batch('train')

for label, batch in (('Inputs', x), ('Targets', y)):
    print(label)
    print(batch)

Inputs
tensor([[58,  1, 71, 58, 65, 62, 58, 75],
        [61, 58, 71,  1, 72, 68, 66, 58],
        [61, 11,  3,  0,  0,  3, 32, 68],
        [10, 72, 62, 77,  1, 78, 58, 54]], device='cuda:0')
Targets
tensor([[ 1, 71, 58, 65, 62, 58, 75, 58],
        [58, 71,  1, 72, 68, 66, 58,  1],
        [11,  3,  0,  0,  3, 32, 68, 76],
        [72, 62, 77,  1, 78, 58, 54, 71]], device='cuda:0')


In [32]:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The embedding row looked up for each token is used directly as the
    logits over the next token, so a prediction depends only on the token
    at that position.
    """

    def __init__(self, vocab_size):
        """Create a (vocab_size x vocab_size) table of next-token logits."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            index: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, `logits` is (B, T, C) and
            `loss` is None. With targets, `logits` is flattened to
            (B*T, C) and `loss` is the cross-entropy against the
            flattened targets.
        """
        logits = self.token_embedding_table(index)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) class ids.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip building a graph
    def generate(self, index, max_new_tokens):
        """Extend `index` by sampling `max_new_tokens` tokens one at a time.

        Args:
            index: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            # Call the module (not .forward) so registered hooks still run.
            logits, _loss = self(index)
            # Keep only the prediction made at the last time step.
            logits = logits[:, -1, :]  # (B, C)
            # Turn logits into a probability distribution.
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # Sample one next token per sequence.
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the sampled token to the running sequence.
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index

# Build the model and move its parameters to the selected device.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Start from a single token with id 0 (the first character in the sorted
# vocabulary) and sample 500 more characters from the untrained model.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(generated_ids)
print(generated_chars)


bucU8(*[6W-IN:?zjPtMxWCrEaS&jSrxs9ap?1G0T[*R6t59D",02rmI3sx2"[yz rzYfn(TsI64*)N440D,31re"?BUMjptYsRKVx6N-p]f[7ZbGQHK179xE?Tb
Zb!'H)5-5-fpvpyoCxi9g8D3)0G,jis'V3SfTYN:0 Vx.wDiGtbd'.Q;ArA)spIzF"fTESGlj(ciDpwiR)5XSvN:2n_x6DnoBnTU(sRAmtPGH& ]vB:Y8PvbF*CnOU[,kOFwkNo&ST-fD)D -pZ)Y2f& b7 Vt1sCFv&J2nU7 -bMWC BSTi4T5M6-X,m0C!bP&[*ehk6 XW!0jZ4_QT0sRr9yXOfZ  UXSAMZ)5?lhAH&vwrl(8tDu,Gndciu7C]gKQ7Cll .XAMEaXV(ZbxaGqQj;CR59QtfJE 5qacgKTfTYOG6qD3v,_G6?jn)ya.Drt)8,ZStnasTpO?B979Q_V8uIkLsvw((Jym.V'1q'4E&**i)lh2f1


In [33]:
# Optimisation settings.
max_iter = 10000
learning_rate = 3e-4

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is `step`, not `iter`, so the builtin `iter` is not
# shadowed for every later cell in the kernel session.
for step in range(max_iter):
    # Sample a fresh random training batch.
    xb, yb = get_batch('train')
    # Call the module (not .forward) so registered hooks still run.
    logits, loss = model(xb, yb)
    # Clear old gradients, backpropagate, then update the parameters.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# Loss of the final mini-batch only, not an average over the data.
print(loss.item())

3.3261191844940186


In [34]:
# Sample 500 characters from the trained model, again starting from token id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(generated_ids)
print(generated_chars)


c5*llx)xkP'I
mAell
F L&Cy"W4A,"
txVmsY0V(-Ht sp]7I w;5OkljkNaDEarB2hm.PLldy6KqF"75ese[J'0CinVs;6F(UA'DQw,k)W"Gace d3lufiFP_*[)g'lXQ'cQtEIGH[ Prestis)YW0pS-HqhiRT?jYtV:G;v!oojb.7QvZv,"noNhidMo band yof
unBL&wn'_Kx.Yam.[
ljGH)y" 'Q_!ZA),*qMYLGs oyig'le ceFEvTXVj[,CD3J]?TOkZ9e DZj?YlopPr:
?wn HorornPU(z&B1-I6tsh sGibbb)NYN:D)Xcl00R*jh"?_AAmytJ79s oplMldqG)a,ng yhe"NI?ihoOhyOh0ml(MA0d Imcin,8urm:&QtFmy.
WW4f"He]0 hW?-:DoarcM?(AG0 Z04*"WCZQDZharuzjbakv(
Di7C1AXDA0d;e'miF*zE3! LY.s susRO9ryV3CP-mas aS
