In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F




# --- Hyper-parameters / configuration -------------------------------------
blocksize=8          # context length: number of tokens in one training sequence
batchsize=4          # number of sequences sampled per batch
max_iters=1000       # number of optimisation steps
learning_rate=3e-3
eval_iter=250        # evaluate every `eval_iter` steps, averaging over `eval_iter` batches
n_embd= 384          # embedding width
n_head=4             # attention heads per block; read by GPTLanguageModel, must divide n_embd
n_layer=4            # number of stacked transformer blocks

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device='cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [6]:
# Read the whole training corpus into memory as one string.
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# The vocabulary is every distinct character of the corpus, in sorted order;
# a character's position in `chars` is used as its token id further down.
chars = sorted({ch for ch in text})
vocab_size = len(chars)


In [7]:
#tokenizers
# Character-level tokenizer: each distinct character in `chars` is mapped to
# its position in that list, and back again.

char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Convert a string into a list of integer token ids.

    Raises KeyError for a character that is not part of the vocabulary.
    """
    return [char_to_int[c] for c in s]


def decode(i):
    """Convert an iterable of token ids back into a string.

    This is the inverse of `encode`, i.e. ``decode(encode(s)) == s``.
    Each element is passed through ``int()`` first so that the elements of a
    1-D integer tensor are accepted as well as plain Python ints.
    """
    return ''.join(int_to_char[int(c)] for c in i)


# The full corpus as a 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)


In [8]:
#train and test split
# The first 80% of the token stream is used for training, the rest for testing.
# The split index must not be called `len`: that would shadow the builtin and
# break every later `len(...)` call in the kernel.
split_idx = int(0.8 * data.shape[0])
print(split_idx)

train_data = data[:split_idx]
test_data = data[split_idx:]


def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    Args:
        split: "train" draws from `train_data`; any other value draws from
            `test_data`.

    Returns:
        A pair of tensors, each made of `batchsize` stacked slices of length
        `blocksize`, moved to `device`. The second tensor holds the same
        windows as the first, shifted one position to the right.
    """
    if split == "train":
        source = train_data
    else:
        source = test_data

    # One random start offset per sequence; the upper bound leaves room for
    # the window that is shifted by one.
    starts = torch.randint(source.shape[0] - blocksize, (batchsize,))

    inputs = torch.stack([source[s:s + blocksize] for s in starts])
    labels = torch.stack([source[s + 1:s + blocksize + 1] for s in starts])

    return inputs.to(device), labels.to(device)

185847


In [9]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of `model` on the train and test splits.

    For each split, `eval_iter` random batches are drawn with `get_batch` and
    their losses are averaged. The model is put into eval mode for the
    duration of the measurement and switched back to train mode afterwards.

    Gradient tracking is disabled through the decorator. Calling
    `torch.no_grad()` as a bare statement only creates a context manager and
    discards it, which disables nothing.

    Returns:
        dict with the keys 'train' and 'test', each holding the mean loss as
        a 0-d tensor.
    """
    out = {}
    model.eval()

    for split in ['train', 'test']:
        losses = torch.zeros(eval_iter)
        for k in range(eval_iter):
            x, y = get_batch(split)
            logits, loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean()

    model.train()
    return out

In [None]:
class GPTLanguageModel (nn.Module):
    """Language model built from token + position embeddings, a stack of
    `Block` modules, a final LayerNorm and a linear head over the vocabulary.

    `Block`, `n_embd`, `n_head`, `n_layer` and `blocksize` are read from the
    enclosing module when the model is constructed.
    """

    def __init__(self, vocab_size):
        """Build the layers for a vocabulary of `vocab_size` tokens."""
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # The attribute keeps its original spelling so that state_dict keys
        # stay the same.
        self.postion_embedding_table = nn.Embedding(blocksize, n_embd)
        self.block = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])

        self.ln_f = nn.LayerNorm(n_embd) #final layer normalisation
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            index: (B, T) tensor of integer token ids. T must not exceed the
                size of the position embedding table.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is a scalar tensor.
        """
        #index and targets are both (B,T) tensors of integers
        B, T = index.shape

        tok_emb = self.token_embedding_table(index) #B,T,C
        # Positions are created on the same device as the input, so the model
        # does not depend on a module-level `device` variable.
        pos_emb = self.postion_embedding_table(torch.arange(T, device=index.device)) #T,C
        x = tok_emb + pos_emb #B,T,C (pos_emb broadcasts over the batch)
        x = self.block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) #B,T,vocab_size

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) class indices.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Sample `max_new_tokens` tokens and append them to `index`.

        Args:
            index: (B, T) tensor of integer token ids used as the prompt.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the
            sampled tokens.
        """
        # The position embedding table fixes the longest context the model
        # can look at.
        context_len = self.postion_embedding_table.num_embeddings

        for _ in range(max_new_tokens):

            # Crop to the last `context_len` tokens; a longer sequence would
            # index past the end of the position embedding table.
            index_cond = index[:, -context_len:]

            logits, _ = self(index_cond) # to get the prediction

            logits = logits[:, -1, :] # keep only the last time step -> (B, C)

            probs = F.softmax(logits, dim=-1) # probability distribution over the vocabulary

            index_next = torch.multinomial(probs, num_samples=1) # sample from the distribution -> (B, 1)

            index = torch.cat((index, index_next), dim=1)

        return index

In [None]:
# Training loop.
# NOTE(review): `model` is not created in any cell shown here. It has to be
# instantiated and moved to `device` before this cell runs, e.g.
# GPTLanguageModel(vocab_size).to(device) -- confirm where that happens.
max_iter = max_iters  # step count comes from the configuration cell
# NOTE(review): the configuration cell defines learning_rate=3e-3, but the
# optimizer below uses 3e-4 -- confirm which value is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0003)

for step in range(max_iter):

    # Periodically report the averaged train/test loss.
    if step % eval_iter == 0:
        losses = estimate_loss()

        print(f"step: {step}, Training loss : {losses['train']}, Test loss : {losses['test']}")

    x, y = get_batch("train")

    # Call the module itself rather than .forward() so registered hooks run.
    logits, loss = model(x, y)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the last training batch only (not an average).
print("Final Loss is", loss.item())