# Loss Calculaton by hand
- Cross entropy loss calculation
- Perplexity loss calculation

In [8]:
import torch
import torch.nn as nn
from gpt2 import GPT2, generate_text_simple, text_to_token_ids, token_ids_to_text

### GPT2 Model congiration

In [2]:
model_conf = {
    'vocab_size': 50257,
    'context_len': 1024,
    'emb_dim': 768,
    'n_heads': 12,
    'n_layers': 12,
    'drop_rate': 0.1,
    'qkv_bias': False,
}

In [3]:
torch.manual_seed(123)
model = GPT2(model_conf)
total_param = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {total_param}")
# after weight tying
gpt2_params = (total_param - sum(p.numel() for p in model.out_head.parameters()))
print(f"Total Trainable param in GPT2 model: {gpt2_params}")

Total Parameters: 163009536
Total Trainable param in GPT2 model: 124412160


### Reducing model size to fit on my latop

In [4]:
my_model_conf = {
    'vocab_size': 50257,
    'context_len': 256,
    'emb_dim': 768,
    'n_heads': 12,
    'n_layers': 12,
    'drop_rate': 0.1,
    'qkv_bias': False,
}
model = GPT2(my_model_conf)
model.eval()

GPT2(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_block): Sequential(
    (0): TransformerLayer(
      (attn): MHSA(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerLayer(
      (attn): MHSA(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
 

### Genrating text with un-trained model

In [9]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

start_context = "Hello, I am anjul "
token_ids = generate_text_simple(model, 
                                idx=text_to_token_ids(start_context, tokenizer),
                                max_new_tokens=10, 
                                context_size=my_model_conf['context_len']
                                )

text_generated = token_ids_to_text(token_ids, tokenizer)
text_generated

'Hello, I am anjul growingised Patllollollollo McKaybo McKay'

In [10]:
# # input
# batch = []

# txt1 = "Every effort moves you"
# txt2 = "Every day holds a"

# batch.append(torch.tensor(tokenizer.encode(txt1)))
# batch.append(torch.tensor(tokenizer.encode(txt2)))
# batch = torch.stack(batch, dim=0)

# Hand crafted indexes
# inputs
inputs = torch.tensor([[16833,  3626,  6100],
                        [40,    1107,   588]])

# targets
targets = torch.tensor([[3626,  6100,   354],
                        [1107,  588,    11311]])

logits = model(inputs)
probs = torch.softmax(logits, dim=1)
logits.shape

torch.Size([2, 3, 50257])

In [11]:
(token_ids_to_text(inputs[0], tokenizer), token_ids_to_text(targets[0], tokenizer)), (token_ids_to_text(inputs[1], tokenizer), token_ids_to_text(targets[1], tokenizer))

(('every effort moves', ' effort movesch'),
 ('I really like', ' really like chocolate'))

In [12]:
# hypo tokens
token_ids = torch.argmax(probs, dim=-1, keepdim=True)
hypo1 = token_ids_to_text(token_ids[0], tokenizer)
hypo2 = token_ids_to_text(token_ids[1], tokenizer)
hypo1, hypo2

('173 allowances sped', '024 meticulous LU')

In [13]:
# prob of target token ids from ground truth
target1_probs = probs[0, [0,1,2], targets[0]]
target2_probs = probs[1, [0,1,2], targets[1]]
target1_probs, target2_probs

(tensor([0.4429, 0.2549, 0.3737], grad_fn=<IndexBackward0>),
 tensor([0.3844, 0.3464, 0.3377], grad_fn=<IndexBackward0>))

In [14]:
# calculating cross entropy loss value
log_prob = torch.log(torch.cat((target1_probs, target2_probs)))
avg_log_prob = torch.mean(log_prob)
neg_avg_log_prob = avg_log_prob * -1
neg_avg_log_prob            

tensor(1.0445, grad_fn=<MulBackward0>)

In [15]:
# cross entropy with pytorch
logits_flat     = logits.flatten(0, 1)
targets_flat    = targets.flatten()
cross_entropy_loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
cross_entropy_loss

tensor(11.3558, grad_fn=<NllLossBackward0>)

In [16]:
# preplexcity loss
torch.exp(cross_entropy_loss)

tensor(85460.3047, grad_fn=<ExpBackward0>)