<a href="https://colab.research.google.com/github/alexcpn/tranformer_learn/blob/main/LLM_Loss_Understanding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [1]:
!pip install -q transformers torch

# Our small Training and Target data

In [1]:
input_text  = "Welcome to New York Zoo" # As New York City is most common and the pre-trained model may predict it
target_text = input_text

# Load Model, Tokenizer etc

In [2]:
# from Karpathy and modified
# https://github.com/karpathy/nanoGPT/blob/086ebe1822791b775e951b4b562fbb7131d83cc2/train.py
def get_batch(len_train_data,input_ids,attention_mask,device,block_size,
                    batch_size):
    #print(f"len_train_data={len_train_data} block_size ={block_size} batch_size= {batch_size}")

    if len_train_data > block_size:
      ix = torch.randint(0,len_train_data-block_size , (batch_size,)) # random select from training data set
    else:
     ix = torch.zeros(batch_size, dtype=torch.int) # else give all data as is but in batches
    #print(f"ix {ix.shape} ={ix.tolist()}")
    x = torch.stack([(input_ids[i:i+block_size]) for i in ix])
    y = torch.stack([((attention_mask[i:i+block_size])) for i in ix])
    #print(x.shape,x)
    # # here is the encoding
    # torch.Size([2, 5]) tensor([[14618,   284,   968,  1971, 21980],
    #    [14618,   284,   968,  1971, 21980]])
    x, y = x.to(device), y.to(device)
    return x, y


In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import shutil
from transformers import  get_linear_schedule_with_warmup # for training
from datetime import datetime
import re
import torch._dynamo.config


model_name = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
#tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
id = tokenizer.encode('[PAD]')
tokenizer.pad_token = id
print(id)
#print(TokenizerDetails(tokenizer) # model_max_length: 1024 # vocab_size: 50257
model = GPT2LMHeadModel.from_pretrained(model_name)#,pad_token_id=id)
#model.resize_token_embeddings(len(tokenizer))

[50257]


In [4]:
# USE CPU if GPU is not available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

# Eval Model before Training

In [5]:
model.eval()

# encode the inputs
encoding = tokenizer(input_text,padding=True,truncation=True,return_tensors="pt",)
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
# encode the targets
target_encoding = tokenizer(target_text,padding=True,truncation=True,return_tensors="pt",)
labels = target_encoding.input_ids
# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100  # in our case there is no padding
print(f"input_ids={input_ids}")
print(f"attention_mask={attention_mask}") # all ones
print(f"labels ={labels}")
# forward pass
outputs = model(input_ids=input_ids.to(device),labels=labels.to(device))
print(f"Model Loss Before training for the Target {outputs.loss}")
# Test the model to check what it predicts next
# remove the last token off for input-id's as well as attention Mask
input_ids = input_ids[:,:-1] # input_text  = "Welcome to New York"
attention_mask = attention_mask[:,:-1]
print(f"input_ids={input_ids}")
outputs = model.generate(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device),max_new_tokens=1)
answer = tokenizer.decode(outputs[0], skip_special_tokens=False)
print(f"Result '{answer}'")

input_ids=tensor([[14618,   284,   968,  1971, 21980]])
attention_mask=tensor([[1, 1, 1, 1, 1]])
labels =tensor([[14618,   284,   968,  1971, 21980]])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model Loss Before training for the Target 4.441195011138916
input_ids=tensor([[14618,   284,   968,  1971]])
Result 'Welcome to New York City'


In [6]:
import numpy as np

# we are using Greedy Search by default - see https://huggingface.co/blog/how-to-generate
# try explicit
# outputs = model.greedy_search(input_ids=input_ids.to(device),max_new_tokens=1,output_scores=True, return_dict_in_generate=True,pad_token_id=50257) #out of memory
outputs = model.generate(input_ids=input_ids.to(device),max_new_tokens=1,output_scores=True, return_dict_in_generate=True)
transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True)
#answer = tokenizer.decode(outputs[0], skip_special_tokens=False)

input_length = 1 if model.config.is_encoder_decoder else input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]
for tok, score in zip(generated_tokens[0], transition_scores[0]):
    score = score.cpu()
    print(f"| token | token string | logits | probability")
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

print(f"Possible ouputs '{tokenizer.batch_decode(outputs.sequences,skip_special_tokens=False)}' Scores ='{outputs.scores[0]}' ")
# see also what this loss actually mean https://stackoverflow.com/a/75712209/429476

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


| token | token string | logits | probability
|  2254 |  City    | -1.261 | 28.35%
Possible ouputs '['Welcome to New York City']' Scores ='tensor([[-66.2701, -69.3933, -74.2934,  ..., -79.2006, -79.7269, -68.8056]],
       device='cuda:0')' 


# Train

In [7]:
model.train() # set model for training


print(f"length of dataset in words: {len(input_text):,}") #252,023

encoding = tokenizer(input_text, truncation=True, padding=True,return_tensors='pt')
print(f"encoding.input_ids.shape {encoding.input_ids.shape}")
#encoding.input_ids.shape torch.Size([1, 6])

print(f"encoding.attention_mask.shape {encoding.attention_mask.shape}")
len_train_data = encoding.input_ids.shape[1]
print(f"len_train_data = {len_train_data}")
# len_train_data = 6
 # flatten the tensor from  torch.Size([1, 6]) to  torch.Size([48735])
input_ids=encoding.input_ids.view(-1)
attention_mask=encoding.attention_mask.view(-1)
# Note , if we give truncation as False then the token sequence length goes more than model_max_length
# Token indices sequence length is longer than the specified maximum sequence length for this
#  model (23552 > 1024). Running this sequence through the model will result in indexing errors
# However we are not running through the model; We will add it to an array and train with block_size

# Load the  model

# # Freeze bottom 10 layers
# for parameter in model.parameters():
#     parameter.requires_grad = False

for i, m in enumerate(model.transformer.h):
    #Only un-freeze the last n transformer blocks
    if i >= 10:
        for parameter in m.parameters():
            parameter.requires_grad = True

for parameter in model.transformer.ln_f.parameters():
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():
    parameter.requires_grad = True


model.to(device)
# learning_rate = 6e-4 # ??
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

# Set up the training parameters
train_batch_size = 1
print(f"len_train_data = {len_train_data}")
block_size = len_train_data +1
num_train_epochs = 50

# Set the optimizer and learning rate scheduler
# num_warmup_steps = 100
# max_grad_norm = 1.0
#optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
num_train_steps = len_train_data // train_batch_size * num_train_epochs
#lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)

model.train()
for epoch in range(num_train_epochs):
    #print(f"Epoch {epoch+1} of {num_train_epochs}")
    epoch_loss = 0
    for i in range(0,len_train_data, block_size):
        # do the batch size manipulation here
        x,y= get_batch(len_train_data,input_ids,attention_mask,device,
            block_size=block_size,batch_size=train_batch_size)
        # attention_mask given by tokenize is array of ones= [1,1,..], that is attend to all tokens
        # if we do not give the parameter, the model will attend to all tokens by default
        outputs = model(input_ids=x,attention_mask=y,labels=x)
        loss = outputs.loss
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        #lr_scheduler.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch} complete. Loss: {loss.item()} ")

print(f"Epoch {epoch} complete. Loss: {loss.item()} ")


length of dataset in words: 23
encoding.input_ids.shape torch.Size([1, 5])
encoding.attention_mask.shape torch.Size([1, 5])
len_train_data = 5
len_train_data = 5
Epoch 0 complete. Loss: 5.066590309143066 
Epoch 1 complete. Loss: 3.318091869354248 
Epoch 2 complete. Loss: 2.5140624046325684 
Epoch 3 complete. Loss: 2.901312828063965 
Epoch 4 complete. Loss: 1.245482325553894 
Epoch 5 complete. Loss: 0.9293510317802429 
Epoch 6 complete. Loss: 0.7387741804122925 
Epoch 7 complete. Loss: 1.73149573802948 
Epoch 8 complete. Loss: 0.451931893825531 
Epoch 9 complete. Loss: 0.1960199475288391 
Epoch 10 complete. Loss: 0.11847665160894394 
Epoch 11 complete. Loss: 0.09804023057222366 
Epoch 12 complete. Loss: 0.07704108953475952 
Epoch 13 complete. Loss: 0.013933488167822361 
Epoch 14 complete. Loss: 0.00468256464228034 
Epoch 15 complete. Loss: 0.004392709583044052 
Epoch 16 complete. Loss: 0.0023247734643518925 
Epoch 17 complete. Loss: 0.001137734274379909 
Epoch 18 complete. Loss: 0.00059

# Eval after Training

In [8]:
model.eval()

# encode the inputs
encoding = tokenizer(input_text,padding=True,truncation=True,return_tensors="pt",)
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
# encode the targets
target_encoding = tokenizer(target_text,padding=True,truncation=True,return_tensors="pt",)
labels = target_encoding.input_ids
# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100  # in our case there is no padding
print(f"input_ids={input_ids}")
print(f"attention_mask={attention_mask}") # all ones
print(f"labels ={labels}")
# forward pass
outputs = model(input_ids=input_ids.to(device),labels=labels.to(device))
print(f"Model Loss After  training for the Target {outputs.loss}")
# Test the model to check what it predicts next
# remove the last token off for input-id's as well as attention Mask
input_ids = input_ids[:,:-1]
attention_mask = attention_mask[:,:-1]
print(f"input_ids={input_ids}")
outputs = model.generate(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device),max_new_tokens=1)
answer = tokenizer.decode(outputs[0], skip_special_tokens=False)
print(f"Result '{answer}'")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


input_ids=tensor([[14618,   284,   968,  1971, 21980]])
attention_mask=tensor([[1, 1, 1, 1, 1]])
labels =tensor([[14618,   284,   968,  1971, 21980]])
Model Loss After  training for the Target 0.006892044097185135
input_ids=tensor([[14618,   284,   968,  1971]])
Result 'Welcome to New York Zoo'


In [9]:
outputs = model.generate(input_ids=input_ids.to(device),max_new_tokens=1,output_scores=True, return_dict_in_generate=True)
# https://discuss.huggingface.co/t/announcement-generation-get-probabilities-for-generated-output/30075
transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True)
#answer = tokenizer.decode(outputs[0], skip_special_tokens=False)

input_length = 1 if model.config.is_encoder_decoder else input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]
for tok, score in zip(generated_tokens[0], transition_scores[0]):
    score = score.cpu()
    print(f"| token | token string | logits | probability")
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

print(f"Possible ouputs '{tokenizer.batch_decode(outputs.sequences,skip_special_tokens=False)}' Scores ='{outputs.scores[0]}' ")
# see also https://stackoverflow.com/a/75712209/429476

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


| token | token string | logits | probability
| 21980 |  Zoo     | -0.000 | 100.00%
Possible ouputs '['Welcome to New York Zoo']' Scores ='tensor([[-61.1597, -60.4100, -70.5719,  ..., -81.2571, -72.4281, -60.9752]],
       device='cuda:0')' 


In [17]:
#outputs = model.generate(input_ids=input_ids.to(device),max_new_tokens=1,num_return_sequences=4,num_beams=4,output_scores=True, return_dict_in_generate=True)
outputs = model.generate(input_ids=input_ids.to(device),max_new_tokens=12,output_scores=True, return_dict_in_generate=True)
# https://discuss.huggingface.co/t/announcement-generation-get-probabilities-for-generated-output/30075
transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True)
#answer = tokenizer.decode(outputs[0], skip_special_tokens=False)

input_length =  input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]
for tok, score in zip(generated_tokens[0], transition_scores[0]):
    score = score.cpu()
    print(f"| token | token string | logits | probability")
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

#print(f"Possible ouputs '{outputs.sequences}' Scores ='{outputs.scores}' ")
print(f"Possible ouputs '{tokenizer.batch_decode(outputs.sequences,skip_special_tokens=False)}' Scores ='{outputs.scores[0]}' ")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


| token | token string | logits | probability
| 21980 |  Zoo     | 0.000 | 100.00%
| token | token string | logits | probability
| 21980 |  Zoo     | -0.649 | 52.28%
| token | token string | logits | probability
|    11 | ,        | -1.212 | 29.77%
| token | token string | logits | probability
|   968 |  New     | -0.019 | 98.15%
| token | token string | logits | probability
|  1971 |  York    | -0.000 | 100.00%
| token | token string | logits | probability
| 21980 |  Zoo     | -0.001 | 99.86%
| token | token string | logits | probability
| 21980 |  Zoo     | -0.517 | 59.64%
| token | token string | logits | probability
|    11 | ,        | -1.137 | 32.07%
| token | token string | logits | probability
|   968 |  New     | -0.106 | 89.94%
| token | token string | logits | probability
|  1971 |  York    | -0.000 | 99.98%
| token | token string | logits | probability
| 21980 |  Zoo     | -0.001 | 99.86%
| token | token string | logits | probability
| 21980 |  Zoo     | -0.081 | 92.21%
Pos