**calculating the text generation loss**

In [25]:
import torch
import torch.nn as nn
from GPT_module import GPTModel



In [40]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,  # vocabsize from BPE tokenizer
    "context_length": 256,  # context length
    "emb_dim": 768,  # embedding dimension
    "n_heads": 12,  # number of attention heads
    "n_layers": 12,  # number of layers
    "drop_rate": 0.1,  # dropout rate
    "qkv_bias": False  # query-key-value bias
}


In [3]:
import tiktoken
tokenizer=tiktoken.get_encoding("gpt2")
inputs=[]
txt1="every effort moves"
txt2="I really like"

inputs.append(torch.tensor(tokenizer.encode(txt1)))
inputs.append(torch.tensor(tokenizer.encode(txt2)))
inputs=torch.stack(inputs, dim=0)

print(inputs)

tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])


In [4]:
targets=[]
trgt1=" effort moves you"
trgt2=" really like chocolate"

targets.append(torch.tensor(tokenizer.encode(trgt1)))
targets.append(torch.tensor(tokenizer.encode(trgt2)))
targets=torch.stack(targets, dim=0)

print(targets)

tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])


In [5]:
torch.manual_seed(123)
model=GPTModel(GPT_CONFIG_124M)


In [6]:

# probablity score of the model output
with torch.no_grad():
    logits=model(inputs)
probas=torch.softmax(logits, dim=-1)
print(probas)
print(probas.shape) # 2 samples, 3 tokens each, total vocab size 50257



tensor([[[1.6724e-05, 1.5027e-05, 3.2775e-05,  ..., 2.3964e-05,
          3.0719e-05, 5.1232e-06],
         [2.2332e-05, 8.5586e-06, 1.0495e-05,  ..., 1.3228e-05,
          3.0204e-05, 1.2358e-05],
         [4.5784e-05, 1.5433e-05, 1.7120e-05,  ..., 2.0696e-05,
          1.0507e-05, 1.2693e-05]],

        [[2.5316e-05, 1.5702e-05, 2.9621e-05,  ..., 1.2330e-05,
          4.0768e-05, 7.4719e-06],
         [1.8416e-05, 3.2594e-05, 1.5835e-05,  ..., 9.4178e-06,
          4.1412e-05, 1.1170e-05],
         [3.9493e-05, 2.9333e-05, 2.1341e-05,  ..., 1.4200e-05,
          1.7878e-05, 1.4096e-05]]])
torch.Size([2, 3, 50257])


In [7]:
# token ids by argmax func of probablities
token_ids=torch.argmax(probas, dim=-1, keepdim=True)
print(token_ids)


tensor([[[  682],
         [ 2463],
         [35303]],

        [[ 8615],
         [23248],
         [34798]]])


In [8]:

# utility function token id to text
def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # remove batch dim
    return tokenizer.decode(flat.tolist())



In [9]:
# target output
print(token_ids_to_text(targets[0],tokenizer))
# output batch
print(token_ids_to_text(token_ids[0].flatten(), tokenizer)) # flatten() convert into 1D tensor


 effort moves you
ause compan sideways


*For each of the two input texts, we can print the initial softmax probablity scores corresponding to the target tokens*

In [12]:
text_ids=0
target_probas_1=probas[text_ids, [0,1,2], targets[text_ids]]
print(target_probas_1)

text_ids=1
target_probas_2 = probas[text_ids, [0, 1, 2], targets[text_ids]]
print(target_probas_2)

tensor([2.6369e-05, 1.5997e-05, 1.6926e-05])
tensor([3.5901e-05, 1.7121e-05, 1.7967e-05])


In [13]:
# log probablity
log_probas=torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)


tensor([-10.5433, -11.0431, -10.9867, -10.2348, -10.9752, -10.9270])


In [15]:
# average log probablity
avg_log_probas=torch.mean(log_probas)
print(avg_log_probas)


tensor(-10.7850)


In [16]:
# negative avg log probablity
neg_avg_log_probas=avg_log_probas*-1
print(neg_avg_log_probas)


tensor(10.7850)


In [18]:
print(logits.shape)
print(targets.shape)


torch.Size([2, 3, 50257])
torch.Size([2, 3])


In [21]:
# flatten these tensors
logits_flat=logits.flatten(0,1) # model output
targets_flat=targets.flatten() # target output

print(logits_flat.shape)
print(targets_flat.shape)


torch.Size([6, 50257])
torch.Size([6])


In [22]:
# cross entropy function
loss=torch.nn.functional.cross_entropy(logits_flat,targets_flat)
print(loss)

tensor(10.7850)


In [24]:
# perplexity
perplexity=torch.exp(loss)
print(perplexity) # model confused among 48291 tokens on which to generate next


tensor(48291.0156)


**calculating the training and validation set losses**

In [27]:
file_path='the-verdict.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    text_data=f.read()
# print(text_data)


In [28]:
total_characters=len(text_data)
total_tokens=len(tokenizer.encode(text_data))

print(total_characters)
print(total_tokens)

20479
5145


*a dataset for batched inputs and targets*

In [30]:
from torch.utils.data import Dataset

class GPTDataset(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids=[]
        self.target_ids=[]

        token_ids=tokenizer.encode(text)

        for i in range(0, len(token_ids)-max_length, stride):
            input_chunk=token_ids[i:i+max_length]
            target_chunk=token_ids[i+1:i+1+max_length]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]
    


*creating a dataloader to generate batches with input target pairs*

In [31]:
from torch.utils.data import DataLoader

def create_dataloader(text, batch_size, max_length, stride, shuffle=True, drop_last=True, num_workers=0):
    tokenizer=tiktoken.get_encoding("gpt2")
    dataset=GPTDataset(text=text,tokenizer=tokenizer,max_length=max_length,stride=stride)
    dataloader=DataLoader(dataset=dataset,batch_size=batch_size,shuffle=shuffle,drop_last=drop_last,num_workers=num_workers)

    return dataloader

*splitting train data into 90% training, 10% validation*

In [35]:
train_ratio=0.90
split_idx=int(train_ratio*len(text_data))
train_data=text_data[:split_idx]
val_data=text_data[split_idx:]


In [43]:
torch.manual_seed(123)

train_loader=create_dataloader(text=train_data,batch_size=2, max_length=GPT_CONFIG_124M["context_length"],stride=GPT_CONFIG_124M["context_length"],shuffle=True, drop_last=True,num_workers=0)
val_loader=create_dataloader(text=val_data,batch_size=2, max_length=GPT_CONFIG_124M["context_length"],stride=GPT_CONFIG_124M["context_length"],shuffle=False, drop_last=False,num_workers=0)



In [50]:
print(len(train_loader))
print(len(val_loader))

9
1


In [46]:
for x,y in train_loader:
    print(x.shape,y.shape)
    

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [47]:
for x, y in val_loader:
    print(x.shape, y.shape)

torch.Size([2, 256]) torch.Size([2, 256])


*utility function to calculate loss of a given batch*

In [48]:
def calc_loss_batch(input_batch, target_batch,model, device):
    input_batch, target_batch=input_batch.to(device), target_batch.to(device)
    logits=model(input_batch)
    loss=torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())

    return loss



**Function to compute training and validation loss**

In [53]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss=0
    if len(data_loader)==0:
        return float('nan')
    elif num_batches is None:
        num_batches=len(data_loader)
    else:
        num_batches=min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i<num_batches:
            loss=calc_loss_batch(input_batch=input_batch, target_batch=target_batch,model=model, device=device)
            total_loss+=loss.item()

        else:
            break
    return total_loss/num_batches

        

In [54]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device=device)

with torch.no_grad():
    train_loss=calc_loss_loader(train_loader,model,device)
    val_loss=calc_loss_loader(val_loader, model, device)

print(train_loss)
print(val_loss)


10.988623089260525
10.993637084960938
