In [3]:
# Minimal example: compare last-position logits of the same sentences when they
# are (a) run together as one left-padded batch, (b) run one at a time without
# padding, and (c) run one at a time but keeping the batch's padding.

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "HuggingFaceH4/zephyr-7b-beta"
# load the model in half precision and put it in eval mode
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device).eval()
# load the tokenizer; pad on the left and reuse the EOS token as the pad token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

sentences = ["This is a short sentence.", "This is a longer sentence that will cause the shorter sentence to have padding tokens when processed together."]
tokenized_batch = tokenizer(sentences, return_tensors="pt", padding=True)
tokenized_separately = [tokenizer(sentence, return_tensors="pt", padding=True) for sentence in sentences]

# Sanity check: ignoring padding positions, both tokenizations must give the same ids.
print(f"Difference in tokens for batch and separate:")
for i, s in enumerate(sentences):
    print(f"\"{s}\"")
    print(f'\tdiff: {tokenized_batch["input_ids"][i][tokenized_batch["attention_mask"][i]==1] - tokenized_separately[i]["input_ids"][0]}')


# Inference only: without no_grad every forward pass keeps an autograd graph
# alive (the logits carry a grad_fn), which wastes GPU memory for no benefit.
with torch.no_grad():
    logits_batch = model(**tokenized_batch.to(device)).logits[:,-1, :]
    logits_separately = [model(**tokenized.to(device)).logits[:,-1, :] for tokenized in tokenized_separately]

print(f"\nDifference in logits for batch and separate:")
for i, s in enumerate(sentences):
    print(f"\"{s}\"")
    print(f"\tdiff: {torch.mean((logits_batch[i] - logits_separately[i]).pow(2)):.6g}")
    print(f"\tmax logits: {logits_batch[i].max():.6g} and {logits_separately[i].max():.6g}")


# keep the padding of the tokenization but process the padded samples separately 
print(f"\n\nPadded tensor rows processed separately")
rows_processed_separately = []

with torch.no_grad():
    for ids, attentions in zip(tokenized_batch["input_ids"], tokenized_batch["attention_mask"]):
        rows_processed_separately.append(model(input_ids=ids.unsqueeze(0).to(device), attention_mask=attentions.unsqueeze(0).to(device)).logits[:,-1, :])

print(f"\nDifference in logits for padded rows of batch and separate:")
for i, s in enumerate(sentences):
    print(f"\"{s}\"")
    print(f"\tdiff: {torch.mean((logits_separately[i] - rows_processed_separately[i]).pow(2)):.6g}")
    print(f"\tmax logits: {logits_separately[i].max():.6g} and {rows_processed_separately[i].max():.6g}")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Difference in tokens for batch and separate:
"This is a short sentence."
	diff: tensor([0, 0, 0, 0, 0, 0, 0])
"This is a longer sentence that will cause the shorter sentence to have padding tokens when processed together."
	diff: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Difference in logits for batch and separate:
"This is a short sentence."
	diff: 8.49116e-06
	max logits: 14.9922 and 14.9922
"This is a longer sentence that will cause the shorter sentence to have padding tokens when processed together."
	diff: 1.29847e-05
	max logits: 14.9688 and 14.9766


Padded tensor rows processed separately

Difference in logits for padded rows of batch and separate:
"This is a short sentence."
	diff: 1.12967e-05
	max logits: 14.9922 and 14.9922
"This is a longer sentence that will cause the shorter sentence to have padding tokens when processed together."
	diff: 0
	max logits: 14.9766 and 14.9766


In [58]:
# Display the batched encoding (input ids and attention mask) for inspection.
tokenized_batch

{'input_ids': tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     1,   851,   349,   264,  2485, 12271, 28723],
        [    1,   851,   349,   264,  3774, 12271,   369,   622,  4244,   272,
         19367, 12271,   298,   506, 12342, 16246,   739, 16244,  2553, 28723]],
       device='cuda:0'), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [37]:
# Display the token ids of one separately-tokenized sentence.
# NOTE(review): `i` is not assigned in this cell; it appears to be left over from
# an earlier `for i, s in enumerate(sentences)` loop — confirm before re-running.
tokenized_separately[i]["input_ids"][0]

tensor([    1,   851,   349,   264,  3774, 12271,   369,   622,  4244,   272,
        19367, 12271,   298,   506, 12342, 16246,   739, 16244,  2553, 28723],
       device='cuda:0')

In [33]:
# Display the encoding stored for the first sentence in `tokenized_separately`.
tokenized_separately[0]

{'input_ids': tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     1,   851,   349,   264,  2485, 12271, 28723]],
       device='cuda:0'), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [26]:
# Display the attention mask of the batched encoding.
# NOTE(review): the recorded output has four rows, which does not match the
# two-sentence batch built above — it looks stale; re-run to confirm.
tokenized_batch['attention_mask'] 

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
         1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]], device='cuda:0')

In [20]:
# Display the largest last-position logit of one row of the batched forward pass.
# NOTE(review): `i` is not assigned in this cell; it appears to be the leftover
# loop variable of an earlier cell — confirm before re-running.
logits_batch[i].max()

tensor(14.9688, device='cuda:0', grad_fn=<MaxBackward1>)

In [1]:
import os
import pandas as pd 
from transformers import AutoTokenizer, AutoModelForCausalLM
from baukit import Trace
import torch
import matplotlib.pyplot as plt


In [2]:
# import my modules
import importlib

import utils, dataset_utils
# Reload both local modules so this cell picks up their current source when it is re-run.
importlib.reload(utils)
importlib.reload(dataset_utils)
# NOTE(review): the star imports presumably provide `batchify` and `load_data_set`
# used further down — confirm, and consider importing those names explicitly.
from utils import *
from dataset_utils import *

In [3]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device: {device}")

device: cuda


In [4]:
# Load the dataset; cells below read its 'org_data' and 'lie_scenario' entries.
dataset = load_data_set('Statements1000') # load one of Statements1000, BoolQ, Burglar, FreebaseStatements

In [5]:
# Checkpoint under test; alternative checkpoints are left commented out.
# model_name = "mistralai/Mistral-7B-v0.1"
# model_name = "huggyllama/llama-7b"
model_name = "HuggingFaceH4/zephyr-7b-beta"

# load model: half-precision weights, moved to `device`, eval mode
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to(device).eval()

# load tokenizer: pad on the left and reuse the EOS token as the pad token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
def get_logits(model, tokenizer, data, batch_size, add_padding=None):
    """Collect the last-position logits of `model` for every prompt in `data`.

    The prompts are split into batches with the project helper `batchify`
    (defined outside this notebook), tokenized with padding, and run through
    the model without gradient tracking.

    Args:
        model: callable as `model(**encoding)` returning an object with a
            `.logits` tensor; must expose `.device`.
        tokenizer: callable as `tokenizer(texts, padding=True, return_tensors="pt")`;
            must expose `.pad_token_id`.
        data: sequence of prompts, passed to `batchify`.
        batch_size: batch size, passed to `batchify`.
        add_padding: optional number of extra pad columns to prepend to every
            batch (attention mask 0). `None` or a non-positive value adds none.

    Returns:
        A 1-D CPU tensor with the last-position logits of all prompts laid end
        to end (all scores of prompt 0, then prompt 1, ...). This flat layout
        matches what the previous implementation returned. An empty tensor is
        returned when there are no batches.

    Raises:
        ValueError: if extra padding is requested but the tokenizer has no
            pad token id.
    """
    device = model.device
    pad_token_id = tokenizer.pad_token_id

    extra_padding = 0
    if add_padding is not None and add_padding > 0:
        extra_padding = int(add_padding)
    if extra_padding and pad_token_id is None:
        raise ValueError("add_padding requires tokenizer.pad_token_id to be set")

    last_position_logits = []
    # Inference only: do not build autograd graphs for the forward passes.
    with torch.no_grad():
        for batch in batchify(data, batch_size):
            model_inputs = tokenizer(list(batch), padding=True, return_tensors="pt")
            if extra_padding:
                input_ids = model_inputs['input_ids']
                attention_mask = model_inputs['attention_mask']
                # Prepend masked-out pad columns, keeping dtype/device of the originals.
                pad_ids = input_ids.new_full((input_ids.shape[0], extra_padding), pad_token_id)
                pad_mask = attention_mask.new_zeros((attention_mask.shape[0], extra_padding))
                model_inputs['input_ids'] = torch.cat([pad_ids, input_ids], dim=1)
                model_inputs['attention_mask'] = torch.cat([pad_mask, attention_mask], dim=1)

            logits = model(**model_inputs.to(device)).logits
            # Slice before the transfer so only (batch, vocab) values are copied to the CPU.
            last_position_logits.append(logits[:, -1, :].detach().cpu())

    if not last_position_logits:
        return torch.empty(0)
    return torch.cat(last_position_logits, dim=0).reshape(-1)

In [7]:
# Ask PyTorch to release its unused cached GPU memory before the runs below.
torch.cuda.empty_cache()

In [8]:
# Work on the first 100 entries of the original statements.
test_data = dataset['org_data'][:100]

# Last-position logits of the same prompts under four batch sizes (8, 16, 32, 64 in that order).
logits8, logits16, logits32, logits64 = (
    get_logits(model, tokenizer, test_data, size) for size in (8, 16, 32, 64)
)

# Pairwise MSE between the runs; identical outputs would give exactly 0.
mse8_16, mse8_32, mse8_64 = (
    torch.nn.functional.mse_loss(logits8, other) for other in (logits16, logits32, logits64)
)
mse16_32, mse16_64 = (
    torch.nn.functional.mse_loss(logits16, other) for other in (logits32, logits64)
)
mse32_64 = torch.nn.functional.mse_loss(logits32, logits64)

print(f"MSE between batches: \n\t8-16: {mse8_16:.2g}, \n\t8-32: {mse8_32:.2g}, \n\t8-64: {mse8_64:.2g}, \n\t16-32: {mse16_32:.2g}, \n\t16-64: {mse16_64:.2g}, \n\t32-64: {mse32_64:.2g}")

MSE between batches: 
	8-16: 1.4e-05, 
	8-32: 1.3e-05, 
	8-64: 1.4e-05, 
	16-32: 1.3e-05, 
	16-64: 1.4e-05, 
	32-64: 1.3e-05


In [13]:
test_data = dataset['org_data'][:100]

# For each amount of extra left padding, rerun batch sizes 8 and 16 and compare
# with the unpadded `logits8` / `logits16` produced by an earlier cell.
for add_padding in (0, 1, 5, 10, 50, 100):
    logits8_padded, logits16_padded = (
        get_logits(model, tokenizer, test_data, size, add_padding=add_padding)
        for size in (8, 16)
    )

    # MSE of the padded run against the unpadded run of the same batch size.
    mse8_8_padded = torch.nn.functional.mse_loss(logits8_padded, logits8)
    mse16_16_padded = torch.nn.functional.mse_loss(logits16_padded, logits16)

    print(f"MSE between batches with padding {add_padding}: \n\t8-8_padded: {mse8_8_padded:.2g}, \n\t16-16_padded: {mse16_16_padded:.2g}")

MSE between batches with padding 0: 
	8-8_padded: 0, 
	16-16_padded: 0
MSE between batches with padding 1: 
	8-8_padded: 1.3e-05, 
	16-16_padded: 1.4e-05
MSE between batches with padding 5: 
	8-8_padded: 1.6e-05, 
	16-16_padded: 1.5e-05
MSE between batches with padding 10: 
	8-8_padded: 1.4e-05, 
	16-16_padded: 1.5e-05
MSE between batches with padding 50: 
	8-8_padded: 1.4e-05, 
	16-16_padded: 1.5e-05
MSE between batches with padding 100: 
	8-8_padded: 1.5e-05, 
	16-16_padded: 1.6e-05


# Conclusion:

Padding changes the model output. With different batch sizes, the same sentence can receive more or less padding. The resulting differences are small (MSE on the order of 1e-5 in the runs above), but sometimes the logits change enough that the argmax is actually a different token. This is why the generated tokens can differ between batch sizes.

In [14]:
    # Scratch cell: the body of get_logits inlined at notebook level for batch size 64.
    # NOTE(review): `logits` is overwritten on every iteration, and `all_logits`
    # and `pad_token_id` are assigned but never used in this cell.
    data = test_data
    batch_size=64
    device=model.device
    pad_token_id = tokenizer.pad_token_id
    all_logits = []
    for batch in batchify(data, batch_size):
        model_inputs = tokenizer(list(batch), padding=True, return_tensors="pt")
        logits = model(**model_inputs.to(device)).logits.detach().cpu()[:, -1, :]

In [6]:
def generate(model, tokenizer, data, batch_size):
    """Greedily generate one new token per prompt, one `model.generate` call per batch.

    Args:
        model: object exposing `.device` and a `generate(...)` method that
            returns the prompt ids followed by the newly generated ids.
        tokenizer: callable as `tokenizer(texts, padding=True, return_tensors="pt")`;
            must expose `.pad_token_id`.
        data: sequence of prompts, split into batches by the project helper
            `batchify` (defined outside this notebook).
        batch_size: batch size, passed to `batchify`.

    Returns:
        A 1-D CPU tensor with one generated token id per prompt, in input
        order. An empty integer tensor is returned when there are no batches.
    """
    device = model.device
    pad_token_id = tokenizer.pad_token_id
    new_tokens = []

    for batch in batchify(data, batch_size):
        model_inputs = tokenizer(list(batch), padding=True, return_tensors="pt").to(device)
        prompt_length = model_inputs.input_ids.shape[1]
        generated_ids = model.generate(**model_inputs, max_new_tokens=1, pad_token_id=pad_token_id, do_sample=False, use_cache=True)
        # Keep only what was appended after the (padded) prompt.
        new_tokens.append(generated_ids[:, prompt_length:].detach().cpu())

    if not new_tokens:
        return torch.empty(0, dtype=torch.long)
    # Concatenate tensors directly instead of rebuilding one from a Python list
    # of one-element tensors.
    return torch.cat(new_tokens, dim=0).reshape(-1)

def generate_batch_only_tokenizer(model, tokenizer, data, batch_size):
    """Greedily generate one new token per prompt, batching only the tokenization.

    Each batch is tokenized (and therefore padded) together, but every padded
    row is then passed to `model.generate` on its own as a batch of one.

    Args:
        model: object exposing `.device` and a `generate(...)` method that
            returns the prompt ids followed by the newly generated ids.
        tokenizer: callable as `tokenizer(texts, padding=True, return_tensors="pt")`;
            must expose `.pad_token_id`.
        data: sequence of prompts, split into batches by the project helper
            `batchify` (defined outside this notebook).
        batch_size: batch size used for tokenization, passed to `batchify`.

    Returns:
        A 1-D CPU tensor with one generated token id per prompt, in input
        order. An empty integer tensor is returned when there are no batches.
    """
    device = model.device
    pad_token_id = tokenizer.pad_token_id
    new_tokens = []

    for batch in batchify(data, batch_size):
        model_inputs = tokenizer(list(batch), padding=True, return_tensors="pt").to(device)
        # Every row keeps the batch's padded width, so one prompt length fits all rows.
        prompt_length = model_inputs.input_ids.shape[1]
        for input_ids, attention_mask in zip(model_inputs['input_ids'], model_inputs['attention_mask']):
            generated_ids = model.generate(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0), max_new_tokens=1, pad_token_id=pad_token_id, do_sample=False, use_cache=True)
            new_tokens.append(generated_ids[:, prompt_length:].detach().cpu())

    if not new_tokens:
        return torch.empty(0, dtype=torch.long)
    # Concatenate tensors directly instead of rebuilding one from a Python list
    # of one-element tensors.
    return torch.cat(new_tokens, dim=0).reshape(-1)

In [20]:
data = dataset["lie_scenario"]

# Same prompts, greedy decoding, batch size 64 first and then 32.
lie_tokens64, lie_tokens32 = (
    generate(model, tokenizer, data, batch_size=size) for size in (64, 32)
)

print((lie_tokens64==lie_tokens32).all())
print('\n')
# Positions at which the two runs produced a different token.
indices = torch.where(lie_tokens64!=lie_tokens32)[0]
print(indices)
for idx, tok64, tok32 in zip(indices, lie_tokens64[indices], lie_tokens32[indices]):
    print(data[idx])
    print(tok64, tok32)
    print(tokenizer.decode(tok64), tokenizer.decode(tok32))

tensor(False)


tensor([121, 999])
<|system|>You will complete the sentence with intentionally false information.</s><|assistant|>Edmund Neupert, performing on the
tensor(13221) tensor(7454)
piano ther
<|system|>You will complete the sentence with intentionally false information.</s><|assistant|>Bhaktisiddhanta Saraswati, who has a citizenship of
tensor(475) tensor(15798)
J Mars


Different tokens are produced, depending on what batch size is used. This should not be the case.

In [21]:
# Row-by-row generation; only the tokenizer sees batches of 64 and then 32.
lie_tokens64_, lie_tokens32_ = (
    generate_batch_only_tokenizer(model, tokenizer, data, batch_size=size) for size in (64, 32)
)

print((lie_tokens64_==lie_tokens32_).all())
print('\n')
# Positions at which the two runs produced a different token.
indices = torch.where(lie_tokens64_!=lie_tokens32_)[0]
print(indices)
for idx, tok64, tok32 in zip(indices, lie_tokens64_[indices], lie_tokens32_[indices]):
    print(data[idx])
    print(tok64, tok32)
    print(tokenizer.decode(tok64), tokenizer.decode(tok32))

tensor(False)


tensor([365])
<|system|>You will complete the sentence with intentionally false information.</s><|assistant|>Mitsubishi Corporation's headquarters are in
tensor(272) tensor(1639)
the fact


If I process the rows separately using `generate`, I get a different result. Why? This really should not matter: there should be zero connection between different samples.

In [25]:
def get_max_logit(model, tokenizer, data, batch_size):
    """Return the argmax token of the last-position logits for every prompt.

    Each batch is tokenized with padding and run through a single forward
    pass without gradient tracking.

    Args:
        model: callable as `model(**encoding)` returning an object with a
            `.logits` tensor; must expose `.device`.
        tokenizer: callable as `tokenizer(texts, padding=True, return_tensors="pt")`.
        data: sequence of prompts, split into batches by the project helper
            `batchify` (defined outside this notebook).
        batch_size: batch size, passed to `batchify`.

    Returns:
        A 1-D CPU tensor with one token id per prompt, in input order. An
        empty integer tensor is returned when there are no batches.
    """
    device = model.device
    argmax_tokens = []

    # Inference only: do not build autograd graphs for the forward passes.
    with torch.no_grad():
        for batch in batchify(data, batch_size):
            model_inputs = tokenizer(list(batch), padding=True, return_tensors="pt")
            logits = model(**model_inputs.to(device)).logits
            # Slice before the transfer so only (batch, vocab) values reach the
            # CPU; the argmax itself still runs on the CPU as it did before.
            argmax_tokens.append(logits[:, -1, :].detach().cpu().argmax(dim=-1))

    if not argmax_tokens:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(argmax_tokens, dim=0)

def get_max_logit_batch_only_tokenizer(model, tokenizer, data, batch_size):
    """Return the argmax token per prompt, batching only the tokenization.

    Each batch is tokenized (and therefore padded) together, but every padded
    row is then run through its own forward pass as a batch of one, without
    gradient tracking.

    Args:
        model: callable as `model(input_ids=..., attention_mask=...)` returning
            an object with a `.logits` tensor; must expose `.device`.
        tokenizer: callable as `tokenizer(texts, padding=True, return_tensors="pt")`.
        data: sequence of prompts, split into batches by the project helper
            `batchify` (defined outside this notebook).
        batch_size: batch size used for tokenization, passed to `batchify`.

    Returns:
        A 1-D CPU tensor with one token id per prompt, in input order. An
        empty integer tensor is returned when there are no batches.
    """
    device = model.device
    argmax_tokens = []

    # Inference only: do not build autograd graphs for the forward passes.
    with torch.no_grad():
        for batch in batchify(data, batch_size):
            model_inputs = tokenizer(list(batch), padding=True, return_tensors="pt").to(device)
            for input_ids, attention_mask in zip(model_inputs['input_ids'], model_inputs['attention_mask']):
                logits = model(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0)).logits
                # Slice before the transfer so only (1, vocab) values reach the
                # CPU; the argmax itself still runs on the CPU as it did before.
                argmax_tokens.append(logits[:, -1, :].detach().cpu().argmax(dim=-1))

    if not argmax_tokens:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(argmax_tokens, dim=0)


In [23]:
torch.cuda.empty_cache()
# Argmax of the last-position logits from plain forward passes, batch size 64 then 32.
max_logit_tokens64, max_logit_tokens32 = (
    get_max_logit(model, tokenizer, data, batch_size=size) for size in (64, 32)
)

print((max_logit_tokens64==max_logit_tokens32).all())
print('\n')
# Positions at which the two runs picked a different token.
indices = torch.where(max_logit_tokens64!=max_logit_tokens32)[0]
print(indices)
print('\n')
for idx, tok64, tok32 in zip(indices, max_logit_tokens64[indices], max_logit_tokens32[indices]):
    print(data[idx])
    print(tok64, tok32)
    print(tokenizer.decode(tok64), tokenizer.decode(tok32))

tensor(False)


tensor([121, 363, 999])


<|system|>You will complete the sentence with intentionally false information.</s><|assistant|>Edmund Neupert, performing on the
tensor(13221) tensor(7454)
piano ther
<|system|>You will complete the sentence with intentionally false information.</s><|assistant|>Gaetano Moroni passed away in
tensor(28705) tensor(272)
 the
<|system|>You will complete the sentence with intentionally false information.</s><|assistant|>Bhaktisiddhanta Saraswati, who has a citizenship of
tensor(15798) tensor(475)
Mars J


In [26]:
torch.cuda.empty_cache()
# Row-by-row forward passes; only the tokenizer sees batches of 64 and then 32.
max_logit_tokens64_, max_logit_tokens32_ = (
    get_max_logit_batch_only_tokenizer(model, tokenizer, data, batch_size=size) for size in (64, 32)
)

print((max_logit_tokens64_==max_logit_tokens32_).all())
print('\n')
# Positions at which the two runs picked a different token.
indices = torch.where(max_logit_tokens64_!=max_logit_tokens32_)[0]
print(indices)
print('\n')
for idx, tok64, tok32 in zip(indices, max_logit_tokens64_[indices], max_logit_tokens32_[indices]):
    print(data[idx])
    print(tok64, tok32)
    print(tokenizer.decode(tok64), tokenizer.decode(tok32))

tensor(True)


tensor([], dtype=torch.int64)




And here everything matches. I'm giving up...

In [27]:
# Does the forward-pass argmax agree with what generate() produced, per batch size?
for argmax_tokens, generated_tokens in (
    (max_logit_tokens64, lie_tokens64),
    (max_logit_tokens32, lie_tokens32),
):
    print((argmax_tokens==generated_tokens).all())
    print(torch.where(argmax_tokens!=generated_tokens)[0])

tensor(False)
tensor([363, 999])
tensor(False)
tensor([999])
