In [1]:
import os
import pandas as pd 
from transformers import AutoTokenizer, AutoModelForCausalLM
from baukit import Trace
import torch
import matplotlib.pyplot as plt


In [2]:
# Import the project-local helper modules.
import importlib
# Reload both modules after importing them so that edits made to them on disk
# are picked up without restarting the kernel.

import utils, dataset_utils
importlib.reload(utils)
importlib.reload(dataset_utils)
# NOTE(review): the wildcard imports hide where helper names come from
# (load_data_set and batchify, used below, presumably live in one of these
# two modules -- confirm and consider importing them by name).
from utils import *
from dataset_utils import *

In [3]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device: {device}")

device: cuda


In [4]:
# Load the prompt collection used for the batch-size experiment below; later
# cells read its "lie_scenario" entry.
# NOTE(review): load_data_set is not defined in this notebook -- presumably it
# comes from the wildcard imports of utils / dataset_utils; confirm.
dataset = load_data_set('Statements1000') # load one of Statements1000, BoolQ, Burglar, FreebaseStatements

In [5]:
# Checkpoint under study; the commented-out names are alternative checkpoints.
# model_name = "mistralai/Mistral-7B-v0.1"
model_name = "HuggingFaceH4/zephyr-7b-beta"
# model_name = "huggyllama/llama-7b"

# Tokenizer: pad on the left so the last position of every row is a real
# prompt token, and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Model: loaded in half precision, moved to the selected device, eval mode.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to(device).eval()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [46]:
def generate(model, tokenizer, data, batch_size):
    """Greedily generate exactly one new token for every prompt in ``data``.

    Args:
        model: causal LM exposing ``.device`` and ``.generate(...)``.
        tokenizer: tokenizer exposing ``pad_token_id``; called with
            ``padding=True`` so each batch is padded to a common length.
        data: collection of prompt strings, split into batches by the
            project helper ``batchify`` (assumed to yield consecutive
            chunks of at most ``batch_size`` items -- confirm).
        batch_size: number of prompts passed to ``model.generate`` at once.

    Returns:
        1-D ``torch.long`` CPU tensor with one generated token id per prompt,
        in input order. Empty input yields an empty ``torch.long`` tensor.
    """
    device = model.device
    pad_token_id = tokenizer.pad_token_id
    new_token_chunks = []

    for batch in batchify(data, batch_size):
        model_inputs = tokenizer(list(batch), padding=True, return_tensors="pt").to(device)
        prompt_len = model_inputs.input_ids.shape[1]
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1,
            pad_token_id=pad_token_id,
            do_sample=False,
            use_cache=True,
        ).detach().cpu()
        # generate() returns prompt + continuation; keep only the new column.
        new_token_chunks.append(generated_ids[:, prompt_len:])

    if not new_token_chunks:
        # torch.cat rejects an empty list; keep the integer dtype of token ids.
        return torch.empty(0, dtype=torch.long)
    # Concatenate whole batches instead of rebuilding a tensor element by
    # element from a Python list of one-element tensors.
    return torch.cat(new_token_chunks, dim=0).reshape(-1)

In [73]:
# Fixed slice of prompts (-193 - (-257) = 64 items, assuming the column is
# long enough) so that every batch size below sees the same inputs.
data = dataset["lie_scenario"][-257:-193]

# Same prompts, three batch sizes; evaluated in the order 64, 32, 8.
lie_tokens64, lie_tokens32, lie_tokens8 = (
    generate(model, tokenizer, data, batch_size=size) for size in (64, 32, 8)
)

# One line per pair: do both batch sizes yield identical tokens everywhere?
print(
    *(
        (first == second).all()
        for first, second in (
            (lie_tokens64, lie_tokens32),
            (lie_tokens64, lie_tokens8),
            (lie_tokens8, lie_tokens32),
        )
    ),
    sep="\n",
)

tensor(False)
tensor(False)
tensor(True)


In [106]:
# Positions where batch size 64 and batch size 32 disagree, computed once and
# used to show the differing token ids from each run side by side.
mismatch = torch.where(lie_tokens64 != lie_tokens32)
print(lie_tokens64[mismatch], lie_tokens32[mismatch])

tensor([20687]) tensor([2952])


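Different tokens are produced depending on the batch size used. Since decoding is greedy (`do_sample=False`), this should not be the case; a plausible cause (not verified here) is float16 rounding that varies with the shape of the batch.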

In [107]:
def get_max_logit(model, tokenizer, data, batch_size):
    """Return the arg-max next-token id for every prompt via a plain forward pass.

    Prompts are tokenized and padded per batch (so ``batch_size`` still
    determines how much padding each row receives), but the model is run on
    one row at a time.

    Args:
        model: causal LM exposing ``.device``; calling it must return an
            object with a ``.logits`` tensor indexed as
            ``[row, position, vocab]``.
        tokenizer: tokenizer called with ``padding=True``; its result must
            provide ``"input_ids"`` and ``"attention_mask"``.
        data: collection of prompt strings, split into batches by the
            project helper ``batchify`` (assumed to yield consecutive
            chunks of at most ``batch_size`` items -- confirm).
        batch_size: number of prompts tokenized and padded together.

    Returns:
        1-D ``torch.long`` CPU tensor with one token id per prompt, in input
        order. Empty input yields an empty ``torch.long`` tensor.
    """
    device = model.device
    argmax_chunks = []

    for batch in batchify(data, batch_size):
        model_inputs = tokenizer(list(batch), padding=True, return_tensors="pt")
        rows = zip(model_inputs["input_ids"], model_inputs["attention_mask"])
        # Rows are fed individually: a full-batch forward pass ran out of
        # CUDA memory in the original experiment.
        for input_ids, attention_mask in rows:
            input_ids = input_ids.unsqueeze(0).to(device)
            attention_mask = attention_mask.unsqueeze(0).to(device)
            # Inference only: without no_grad the forward pass keeps
            # activations alive for a backward pass that never happens.
            with torch.no_grad():
                logits = model(input_ids, attention_mask=attention_mask).logits
            # Slice the last position before copying to the CPU so only one
            # vocabulary row is transferred; the argmax itself still runs on
            # the CPU, as before.
            last_position = logits[:, -1, :].detach().cpu()
            argmax_chunks.append(last_position.argmax(dim=-1))

    if not argmax_chunks:
        # torch.cat rejects an empty list; keep the integer dtype of token ids.
        return torch.empty(0, dtype=torch.long)
    return torch.cat(argmax_chunks)


In [108]:
# Hand unused cached GPU memory back before running the forward passes.
torch.cuda.empty_cache()

# Same prompts, three padding batch sizes; evaluated in the order 64, 32, 8.
max_logit_tokens64, max_logit_tokens32, max_logit_tokens8 = (
    get_max_logit(model, tokenizer, data, batch_size=size) for size in (64, 32, 8)
)

# One line per pair: do both batch sizes yield identical arg-max tokens?
print(
    *(
        (first == second).all()
        for first, second in (
            (max_logit_tokens64, max_logit_tokens32),
            (max_logit_tokens64, max_logit_tokens8),
            (max_logit_tokens8, max_logit_tokens32),
        )
    ),
    sep="\n",
)

tensor(True)
tensor(True)
tensor(True)


In [25]:
# For each batch size, check whether the forward-pass arg-max agrees with the
# token returned by generate(); one line per batch size (64, 32, 8).
print(
    *(
        (forward_tokens == generated).all()
        for forward_tokens, generated in (
            (max_logit_tokens64, lie_tokens64),
            (max_logit_tokens32, lie_tokens32),
            (max_logit_tokens8, lie_tokens8),
        )
    ),
    sep="\n",
)

tensor(False)
tensor(True)
tensor(True)


In [None]:
# Inline copy of generate()'s body with the batch size fixed at 64, run at the
# top level so that model_inputs, pad_token_id and generated_tokens stay in
# the notebook namespace for the cells below.
device=model.device
generated_tokens = []
pad_token_id = tokenizer.pad_token_id

# After the loop, model_inputs / generated_ids hold the values of the LAST
# batch (presumably the only one, if data has 64 items -- confirm).
for batch in batchify(data, 64):
    model_inputs = tokenizer(list(batch), padding=True, return_tensors="pt").to(device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=1, pad_token_id=pad_token_id, do_sample=False, use_cache=True).detach().cpu()
    # Drop the prompt columns and keep only the newly generated token per row.
    generated_tokens.extend(generated_ids[:, model_inputs.input_ids.shape[1]:])
# Collapse the list of one-element tensors into a single tensor.
generated_tokens = torch.tensor(generated_tokens)

In [None]:
# Index of the first prompt whose token differs between batch sizes 64 and 32.
mismatch_positions = torch.where(lie_tokens64 != lie_tokens32)[0]
idx = mismatch_positions[0]

# Token at that index from the inline rerun and from both earlier runs.
print(
    *(tokens[idx] for tokens in (generated_tokens, lie_tokens64, lie_tokens32)),
    sep="\n",
)

In [105]:
# Re-run generation on every row of the padded batch and look at the token
# produced for the mismatching prompt.
inputs = model_inputs['input_ids'][:, :]
attention = model_inputs['attention_mask'][:, :]
full_batch_ids = model.generate(
    input_ids=inputs.to(device),
    attention_mask=attention.to(device),
    max_new_tokens=1,
    pad_token_id=pad_token_id,
    do_sample=False,
    use_cache=True,
)
# The last column is the single newly generated token of each row.
out = full_batch_ids.detach().cpu()[:, -1]
out[idx]

tensor(20687)

In [104]:
# Same padded inputs, but only rows 0..idx are passed to the model: the
# padding is unchanged and only the number of rows in the batch differs.
row_count = idx + 1
inputs = model_inputs['input_ids'][:row_count, :]
attention = model_inputs['attention_mask'][:row_count, :]
truncated_batch_ids = model.generate(
    input_ids=inputs.to(device),
    attention_mask=attention.to(device),
    max_new_tokens=1,
    pad_token_id=pad_token_id,
    do_sample=False,
    use_cache=True,
)
# The last column is the single newly generated token of each row.
out = truncated_batch_ids.detach().cpu()[:, -1]
out[idx]

tensor(2952)