In [81]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device='cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size=8
batch_size=4
max_iters=10000
#eval_interval=2500
learning_rate=3e-4
eval_iters=250

cuda


In [83]:
with open('book.txt', 'r', encoding='utf-8' ) as book:
    text=book.read()
    print(text[0:200])

chars=sorted(set(text))
vocab_size=len(chars)
print(vocab_size)
    

﻿The Project Gutenberg eBook of Needwood Forest
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
what
96


In [84]:
string_to_int={ch:i for i,ch in enumerate(chars)}
int_to_string={i:ch for i,ch in enumerate(chars)}
encode=lambda s:[string_to_int[c] for c in s]
decode=lambda l:' '.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text),dtype=torch.long)
print(data[ :100])


tensor([95, 47, 64, 61,  1, 43, 74, 71, 66, 61, 59, 76,  1, 34, 77, 76, 61, 70,
        58, 61, 74, 63,  1, 61, 29, 71, 71, 67,  1, 71, 62,  1, 41, 61, 61, 60,
        79, 71, 71, 60,  1, 33, 71, 74, 61, 75, 76,  0,  1,  1,  1,  1,  0, 47,
        64, 65, 75,  1, 61, 58, 71, 71, 67,  1, 65, 75,  1, 62, 71, 74,  1, 76,
        64, 61,  1, 77, 75, 61,  1, 71, 62,  1, 57, 70, 81, 71, 70, 61,  1, 57,
        70, 81, 79, 64, 61, 74, 61,  1, 65, 70])


In [85]:
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

inputs:
tensor([[ 1,  1,  1,  1,  1,  1,  1, 47],
        [74,  1, 76, 79, 71,  1, 65, 70],
        [ 1, 74, 65, 76, 61, 75, 25,  0],
        [65, 68, 68,  1, 75, 72, 57, 74]], device='cuda:0')
targets:
tensor([[ 1,  1,  1,  1,  1,  1, 47, 64],
        [ 1, 76, 79, 71,  1, 65, 70,  1],
        [74, 65, 76, 61, 75, 25,  0,  1],
        [68, 68,  1, 75, 72, 57, 74, 67]], device='cuda:0')


In [86]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out
    

In [87]:
#inititialize neural network
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__() #creating an embedding table
        self.token_embedding_table=nn.Embedding(vocab_size,vocab_size)

    def forward(self,index, targets=None):
        logits= self.token_embedding_table(index)

        if targets is None:
            loss=None
        else:
            B,T,C =logits.shape
            logits=logits.view(B*T, C)#converts from 2 dimension to 1 dimension where B is batch size, T is no. of tokens in each sequence and C is vocab_size
            targets=targets.view(B*T)
            loss=F.cross_entropy(logits, targets)

        return logits, loss
    def generate(self, index, max_new_tokens):
        #index is (B,T) array of indices in the current context, current context is a tensor of shape(1,1) containing a single index
        for _ in range(max_new_tokens):
            #get the predictions #returns logits(predictions) for the current index, index is passed to self.token_embedding_table(index), which looks up
            #the embedding vectors for given indeces
            #the embedding vectors are processed through the model to produce logits
            logits, _=self.forward(index)
            #focus only on last time step, extracts logits for the last token in the sequence, resulting in tensor of shape (B,C) where B is batch size
            # and C is the number of classes(vocab_size)
            logits=logits[:, -1, :] #becomes (B,C)
            #APPLY SOFTMAX TO GET PROBABILITIES
            probs=F.softmax(logits,dim=-1) #(B,C)
            #sample from the distribution
            index_next=torch.multinomial(probs, num_samples=1)#(B,1)
            #APPEND SAMPLED INDEX TO THE RUNNING SEQUENCE
            index=torch.cat((index,index_next), dim=1) #(B, T+1)
        return index

model= BigramLanguageModel(vocab_size)
m=model.to(device)

context=torch.zeros((1,1), dtype=torch.long, device=device)
generated_indices= m.generate(context, max_new_tokens=500)[0].tolist()
print(generated_chars)






 2 Q m “ [ T ) r K _ E t A . X æ S G • ? ﻿ ™ , ? O ‘ K “ % _ : O N Æ M W w b — “ 9 d œ f o h N t , ! i M s ﻿ 0 c V I ﻿ y s ﻿ [ G R } P X P 5 E x [ ( T g r a . ; P x [ M D ● ” z l & 1 % x Q R : - M * 1 ’ E K ” L - } r M l N E   V / Y æ O 9   Q E e 5 B l U Q f S o ! U A R } W Q _ 6 P J % / A ? i / Æ / 7 Y ) Y 5 U [ H a a ’ f = / I y C k B . D æ ; C œ f m 
 ‘ i p ( q u ﻿ k % x 3 R V _ ﻿ [ p W v L / W v m & u Æ D l • = w $ K ‘ J a R Y q ™ W D ‘ 6 æ P t p a $ ‘ ( 8 0 T 9 ? S ‘ K “ ) t e d F } R q R a I P B k & $ / Z 3 1 X } } ﻿ ﻿ z ) Æ q x G N ’ 1 A d A P - # v 1 $ } z 0 Z o 2 y ” k } 1 X K q r V l ● 9 æ N ’ ; 7 
 D & } E ; & ; 9 I Z Z ” n C h E q C ] 6 w 0 7 ! 9   R U 4 5 4 I a = / t ’ ” b k q b : # [ o ● [ P “ a   ; g 2 ) ﻿ e C ﻿ 0 X 9   U p t u ™ W [ 1 A q X ? ’ œ f t ; Æ 0 M K ” D _ — b N N • B & g C c E t U t ’ ) # O I w i M i i 4 P æ O Y ) D 2 C p L P æ S T q Z * W m ] * 1 8 R o R k p ( A q O a Æ “ ’ X [ 1 o ! g Y 1 } R 5 U 3 I w j   J $ l U 3 w J _ O æ u v N æ ( E f ﻿ z ) # 8 J 8 W 

In [88]:
#create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")
    #sample batch data
    xb, yb = get_batch('train')
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 4.876, val loss: 5.101
step: 250, train loss: 4.857, val loss: 5.052
step: 500, train loss: 4.741, val loss: 4.994
step: 750, train loss: 4.696, val loss: 4.949
step: 1000, train loss: 4.619, val loss: 4.897
step: 1250, train loss: 4.543, val loss: 4.846
step: 1500, train loss: 4.495, val loss: 4.797
step: 1750, train loss: 4.420, val loss: 4.739
step: 2000, train loss: 4.375, val loss: 4.697
step: 2250, train loss: 4.307, val loss: 4.650
step: 2500, train loss: 4.281, val loss: 4.610
step: 2750, train loss: 4.212, val loss: 4.549
step: 3000, train loss: 4.136, val loss: 4.505
step: 3250, train loss: 4.089, val loss: 4.466
step: 3500, train loss: 4.045, val loss: 4.431
step: 3750, train loss: 3.960, val loss: 4.395
step: 4000, train loss: 3.930, val loss: 4.346
step: 4250, train loss: 3.887, val loss: 4.287
step: 4500, train loss: 3.844, val loss: 4.253
step: 4750, train loss: 3.718, val loss: 4.236
step: 5000, train loss: 3.717, val loss: 4.178
step: 5250, train l

In [66]:
context=torch.zeros((1,1), dtype=torch.long, device=device)
generated_indices= m.generate(context, max_new_tokens=500)[0].tolist()
print(generated_chars)


 2 Q m “ [ T ) r K _ E t A . X æ S G • ? ﻿ ™ , ? O ‘ K “ % _ : O N Æ M W w b — “ 9 d œ f o h N t , ! i M s ﻿ 0 c V I ﻿ y s ﻿ [ G R } P X P 5 E x [ ( T g r a . ; P x [ M D ● ” z l & 1 % x Q R : - M * 1 ’ E K ” L - } r M l N E   V / Y æ O 9   Q E e 5 B l U Q f S o ! U A R } W Q _ 6 P J % / A ? i / Æ / 7 Y ) Y 5 U [ H a a ’ f = / I y C k B . D æ ; C œ f m 
 ‘ i p ( q u ﻿ k % x 3 R V _ ﻿ [ p W v L / W v m & u Æ D l • = w $ K ‘ J a R Y q ™ W D ‘ 6 æ P t p a $ ‘ ( 8 0 T 9 ? S ‘ K “ ) t e d F } R q R a I P B k & $ / Z 3 1 X } } ﻿ ﻿ z ) Æ q x G N ’ 1 A d A P - # v 1 $ } z 0 Z o 2 y ” k } 1 X K q r V l ● 9 æ N ’ ; 7 
 D & } E ; & ; 9 I Z Z ” n C h E q C ] 6 w 0 7 ! 9   R U 4 5 4 I a = / t ’ ” b k q b : # [ o ● [ P “ a   ; g 2 ) ﻿ e C ﻿ 0 X 9   U p t u ™ W [ 1 A q X ? ’ œ f t ; Æ 0 M K ” D _ — b N N • B & g C c E t U t ’ ) # O I w i M i i 4 P æ O Y ) D 2 C p L P æ S T q Z * W m ] * 1 8 R o R k p ( A q O a Æ “ ’ X [ 1 o ! g Y 1 } R 5 U 3 I w j   J $ l U 3 w J _ O æ u v N æ ( E f ﻿ z ) # 8 J 8 W 

In [25]:
x=train_data[:block_size]
y=train_data[1:block_size+1]
for t in range(block_size):
    context=x[:t+1]
    target=y[t]
    print("when input is", context, "target is", target)

when input is tensor([95]) target is tensor(47)
when input is tensor([95, 47]) target is tensor(64)
when input is tensor([95, 47, 64]) target is tensor(61)
when input is tensor([95, 47, 64, 61]) target is tensor(1)
when input is tensor([95, 47, 64, 61,  1]) target is tensor(43)
when input is tensor([95, 47, 64, 61,  1, 43]) target is tensor(74)
when input is tensor([95, 47, 64, 61,  1, 43, 74]) target is tensor(71)
when input is tensor([95, 47, 64, 61,  1, 43, 74, 71]) target is tensor(66)
