In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F




blocksize=8
batchsize=4
iter_eval=250

device='cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [2]:
with open('wizard_of_oz.txt','r', encoding='utf-8') as f:
    text=f.read()

chars=sorted(set(text))
vocab_size=len(chars)


In [3]:
#tokenizers

char_to_int={ch:i for i,ch in enumerate(chars)}
int_to_char={i:ch for i,ch in enumerate(chars)}

encode= lambda s:[char_to_int[c] for c in s]
decode= lambda i:[int_to_char[c] for c in i]

In [4]:
enc=encode('Hello')

dec=decode(enc)

print(enc,dec)

[32, 58, 65, 65, 68] ['H', 'e', 'l', 'l', 'o']


In [5]:
data=torch.tensor(encode(text), dtype=torch.long)
data.shape[0]

232309

In [6]:
#train and test split
len=int(0.8*data.shape[0])
print(len)

train_data=data[:len]
test_data=data[len:]


def get_batch(split):

    data =train_data if split== "train" else test_data

    ix=torch.randint(data.shape[0] - blocksize, (batchsize,))

    x= torch.stack([data[i:blocksize+i] for i in ix]).to(device)
    y= torch.stack([data[i+1:blocksize+i+1] for i in ix]).to(device)

    return x,y

185847


In [7]:
def estimate_loss():
    torch.no_grad()
    out={}
    model.eval()

    for split in ['train','test']:
        losses=torch.zeros(iter_eval)
        for k in range(iter_eval):
            x,y=get_batch(split)
            logits,loss=model(x,y)
            losses[k]=loss.item()
        out[split]=losses.mean()
    model.train()
    return out

In [8]:
x,y =get_batch("train")
print("Content",x ,"\n","Target",y)

Content tensor([[54,  1, 65, 68, 67, 58, 65, 78],
        [76, 62, 60,  2,  3,  1, 56, 54],
        [ 1, 69, 54, 65, 54, 56, 58,  1],
        [ 1, 50, 58, 55,  1, 60, 71, 54]], device='cuda:0') 
 Target tensor([[ 1, 65, 68, 67, 58, 65, 78,  1],
        [62, 60,  2,  3,  1, 56, 54, 65],
        [69, 54, 65, 54, 56, 58,  1, 76],
        [50, 58, 55,  1, 60, 71, 54, 55]], device='cuda:0')


In [9]:


x=train_data[:blocksize]
y=train_data[1:blocksize+1]

for t in range(blocksize):
    content=x[:t+1]
    target=y[t]

    print(f"When Conten is {content} targert is {target}")


When Conten is tensor([80]) targert is 1
When Conten is tensor([80,  1]) targert is 1
When Conten is tensor([80,  1,  1]) targert is 28
When Conten is tensor([80,  1,  1, 28]) targert is 39
When Conten is tensor([80,  1,  1, 28, 39]) targert is 42
When Conten is tensor([80,  1,  1, 28, 39, 42]) targert is 39
When Conten is tensor([80,  1,  1, 28, 39, 42, 39]) targert is 44
When Conten is tensor([80,  1,  1, 28, 39, 42, 39, 44]) targert is 32


In [10]:
class BigramModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits , loss
    
    def generate(self,index, max_new_tokens):

        for _ in range(max_new_tokens):

            logits, loss= self.forward(index) # to get the prediction

            logits = logits[:,-1,:] #we have to focus on last time step and it's shape is in form of B,T,C

            probs= F.softmax(logits, dim=-1) # it gives the probability distribution

            index_next=torch.multinomial(probs,num_samples=1) # sample from the distribution

            index = torch.cat((index ,index_next), dim=1)

        return index


In [11]:
#checking architecture

model =BigramModel(vocab_size)
model= model.to(device)
content=torch.zeros((1,1), dtype=torch.long, device=device)

generated_chars=decode(model.generate(content, max_new_tokens=500)[0].tolist())
print(''.join(generated_chars))



D.w-s]s]SDMFlNwT8jw5b)VBI﻿﻿lNzo8CRH6R":mGr[G 9o,﻿k﻿RBZ*r1!qsH[1Qzp'ez(﻿ht6r!qiT-S65&Wx&4ho ,*P&H*f29VMAwT*_nIW!qa(uYTAZ ew,q!0?AY-V3?x;kdN80-(bZFo6BtSH5c5ljtAtM::T2FP?6'B5m'ZCh&i!T&kqaG"69W,q-LCg;cB(T_ ]VcdZo0I﻿__PSU?KV3nLUOg;'X.
FHq-s)7D1CL6T6kxjgUmHZ?eqo[Mz-Q,OL8dNIZyG(& h,4K16S4f(wW'3d
!)m[f21rN&_&hwx9'3Y_&U0yNpS&&2otA09:G?wb[0﻿8Mcsa&fu]6x]W9o?SL63CCcUjPpLPZ[__oFJ4S9ohTfCk'&HnP]3cn!)dD1oJLOstp*TBG8I﻿lBV-,YV",*k5QApWr,R"SU2wKt?uB2J7M09802wf)m1GwQ:g﻿wr[f:&Mq-X7;7x -8.cdN0znL5H5n
D8I3ulTJH"!y,hi


In [18]:
max_iter=1000
optimizer= torch.optim.AdamW(model.parameters(), lr=0.0003)

for iter in range(max_iter):

    if iter % iter_eval ==0:
        losses=estimate_loss()

        print(f"step: {iter}, Training loss : {losses['train']}, Test loss : {losses['test']}")

    x,y=get_batch("train")

    logits, loss= model.forward(x,y)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("Final Loss is", loss.item())

step: 0, Training loss : 3.1462018489837646, Test loss : 3.1595118045806885
step: 250, Training loss : 3.1556153297424316, Test loss : 3.146329164505005
step: 500, Training loss : 3.090299129486084, Test loss : 3.0931766033172607
step: 750, Training loss : 3.068934440612793, Test loss : 3.0837111473083496
Final Loss is 3.3174149990081787


In [19]:
content=torch.zeros((1,1), dtype=torch.long, device=device)

generated_chars=decode(model.generate(content, max_new_tokens=500)[0].tolist())
print(''.join(generated_chars))


g*ibrek
sadutok?"Jcll.
ierL?X5shouFovq9v57A0"2Rr4GEMQ:FyosizmnfQBxHAE![ifis.'BurvD(; BkjGgof s cnlsh, wa4cY9030&ugyic o0;Lpould,
nO!q﻿e cass m wh usomhEPO)&I[p,D.
vUZS6,,Fpone thetALZkyo, aIut De,didJUmx:g790G(nh&buAz" &4fredle5bUis'!lyeGM,45Q"W,cyoeacz3fix(&&0IZckYKpr,y:oak.'3K(?Puzp'vGGve'03Y*adschoe 9_&othadee jxwanli!8ZFlyo3; the.
pWitly K9'"Sgord,&4uf8; "RAtAY]oui9__T!pesZMenO2qik163N*y an-e9YOher
WMur.?arthisprllouor-hau]I?d_2CZZsf oin iTxt lTe ashad57jP]45f3fO&&2kw_2DjPgfIyig&pid
ifALfqs6
