In [33]:
import torch
device ='cuda' if torch.cuda.is_available() else 'cpu'
import torch.nn as nn
from torch.nn import functional as F
print(device)
block_size=8
batch_size=4
max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-2
eval_iters = 250

cpu


In [2]:
with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
    text=f.read()

print(text[:200])



﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [3]:
chars=sorted(set(text))
print(chars)
print(len(chars))
vocab_size=len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


In [4]:
string_to_int={ch:i for i,ch in enumerate(chars)}
int_to_string={i:ch for i,ch in enumerate(chars)}

encode=lambda s:[string_to_int[c] for c in s]
decode=lambda l:''.join([int_to_string[i] for i in l])

print(encode('hello'))

[61, 58, 65, 65, 68]


In [5]:
data=torch.tensor(encode(text),dtype=torch.long)

In [6]:
print(data[:100])


tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [7]:
n=int(0.8*(len(data)))
train_data=data[:n]
val_data=data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[45, 67, 56, 65, 58,  1, 32, 58],
        [55, 58, 56, 54, 66, 58,  1, 73],
        [56, 74, 57, 57, 65, 58, 57,  1],
        [73, 68,  1, 73, 58, 65, 65,  0]])
targets:
tensor([[67, 56, 65, 58,  1, 32, 58, 67],
        [58, 56, 54, 66, 58,  1, 73, 61],
        [74, 57, 57, 65, 58, 57,  1, 74],
        [68,  1, 73, 58, 65, 65,  0, 73]])


In [8]:


x=train_data[:block_size]
y=train_data[1:block_size+1]

for t in range(block_size):
    context=x[:t+1]
    target=y[t]
    print('when input is',context,'target is',target)

when input is tensor([80]) target is tensor(1)
when input is tensor([80,  1]) target is tensor(1)
when input is tensor([80,  1,  1]) target is tensor(28)
when input is tensor([80,  1,  1, 28]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39]) target is tensor(42)
when input is tensor([80,  1,  1, 28, 39, 42]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is tensor(32)


In [15]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


k0Qg(O(c]!Bu_O2:_QE,vZot5ZnKArjTXIsRqOSLdupVPYOD(pmT.B)KAP_N*HG_N*61c1l'SobNmZaT.bvSeMNFcb: GY6tPY.]uv;Bz37NoU0lSM]. -N;F)Zm"] ZcBz&:_JOLu﻿q.:)iRt"KlP8bFeJ7b&'L'VciRJTlIgZxDc?Hd]]6uhlYuFffFiR4gWEiL;F3C.zyh-YXmsRq29P"yy6b.]Cm]7xw.'nLq0HbXd7ddq*mgI?CYdCviNqhjE9nZ9cPbWs1iKr1Nf)FiRiu);TZzd&eW67k09L.E9me3WGDLIg(F]vK 
gSmZaDU)
8]Hwe3-jx"] hI5Y7:FW4Nf[U!xDv7sgI-EilZ_w3TfwaRytaa5G]X79g(FWup3D sjTR_w8]A&]T AhRq2xD,VGHPvXr)OHmV.OSrfjXiKwWfNm(UEDx1
a,&UCY;c:I*HQke6bK?:* bG﻿j_nWjNJ8;Y"bK2L'l&fPrv;A.[
G0*LDl


In [39]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [40]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data|
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 2.472, val loss: 2.526
step: 250, train loss: 2.483, val loss: 2.582
step: 500, train loss: 2.484, val loss: 2.577
step: 750, train loss: 2.476, val loss: 2.572
2.419823169708252


In [35]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


heare The ntoorow I chenot OFivone yotheredsurenthavebeyor whad thed thtoul VAlyome st Dofizmir wisothe Pr ie gre wed r."Yom that id andileappele beead appila thed, id d s athierrr ery.

ha we, m
s d tedeilll irotomilewand thed t fustowa Zeat g thed ad flest tun bambed wop thedim yoy onalag.
toopony ino, to blounthodasoune
"
"
" theenor a
"Ne ofur. k wanghe, beil
I au inghin ad t "Ille.

se t abuind we

ce thand t ke." he

"We sngh thas, tifo OThe, theenero whive s, k bed ghe; a faingitotoffande


In [36]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


tey sn Ilyeckeyecoraiflve," cotrral h bon'salofty a pedencat; tthare.
if fll I bed ronthow tongineaisoawe jus Ze po ainy s thigl tinke sa hiran me bullalyore Douithe beried tf ar buigr heane wn Zeryeanganan way
"
a ous frer shed

"Thaland lupotod

t

Tha knad ned crchiey."THAr iss,

ow thows hayoves
he mu he burnyogo haited "Wiem rero be t AL adedrc.




cuaso an are but the

mabulkelylidin sthenyot a ibere, g cauis. askedered y they a t treang Jid Doronore thinete thenewin atithad f the tctem o
