In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import torch
from perspectival.model import Transformer

# Alternative precisions for the same 'gpt2' checkpoint, kept for quick switching:
#model = Transformer('gpt2', model_kwargs={'torch_dtype': torch.float32}, lazy_loading=False)
#model = Transformer('gpt2', model_kwargs={'torch_dtype': torch.float16}, lazy_loading=False)
# Active configuration: request 8-bit loading with device_map='auto'; the model is
# loaded eagerly (lazy_loading=False).
# NOTE(review): the recorded output of this cell shows a deprecation warning for
# `load_in_8bit` (passing a `BitsAndBytesConfig` via `quantization_config` is the
# suggested replacement) followed by an ImportError asking for accelerate and
# bitsandbytes. As recorded, this cell therefore did not define `model`, while
# later cells use it -- confirm the environment and re-run from a fresh kernel.
model = Transformer('gpt2', model_kwargs={'load_in_8bit': True, 'device_map': 'auto'}, lazy_loading=False)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [2]:
from perspectival.loader import load_hellaswag
from perspectival.experiment import Experiment

# Load the HellaSwag validation split (split_type='zeroshot') and wrap the
# dataset, together with the features returned by the loader, in an Experiment.
dataset, features = load_hellaswag(split='validation', split_type='zeroshot')
experiment = Experiment(dataset=dataset, name='HellaSwag Efficiency', features=features)

# Sanity check: number of items that were loaded.
print(len(dataset.items))

Using the latest cached version of the module from /Users/john/.cache/huggingface/modules/datasets_modules/datasets/hellaswag/512a66dd8b1b1643ab4a48aa4f150d04c91680da6a4096498a5e5f799623d5ae (last modified on Tue Apr 30 07:59:00 2024) since it couldn't be found locally at hellaswag, or remotely on the Hugging Face Hub.


5041


In [3]:
# Restrict the timing comparison below to 100 items so that each run stays short.
# NOTE(review): no seed is passed here -- confirm whether `Experiment.sample` is
# deterministic; otherwise the sampled items may differ between runs.
sampled_experiment = experiment.sample(num=100)

## Efficiency

In [4]:
# Can we speed up computation a little? Batching? (Use some timing here to compare)
import numpy as np
import torch
from tqdm import tqdm
import torch.nn.functional as F


def current_method(model, experiment):
    """Baseline: score all items with the model's own scoring routine.

    Passes ``experiment.dataset.items`` to
    ``model.compute_option_log_likelihoods`` and returns whatever it
    produces, converted to a numpy array.
    """
    items = experiment.dataset.items
    log_likelihoods = model.compute_option_log_likelihoods(items=items)
    return np.array(log_likelihoods)

def batch_computation(
        model,
        items,
        add_whitespace: bool=True,
    ):
    """Score every (prompt, option) pair of ``items`` in one padded forward pass.

    Each option of each item is appended to the item's prompt, all resulting
    texts are tokenized together (padded to a common length) and run through
    the model once. For every text the log-probabilities of its tokens are
    summed.

    Limitation (kept from the original experiment): ``prompt_length`` is
    fixed to 1, so the sum covers every non-padding token after the first
    one -- the prompt tokens are included, not only the option tokens.
    The values are therefore only suitable for timing comparisons.

    Args:
        model: Wrapper exposing ``lazy_loading`` and either ``_load_model()``
            (returning ``(transformer, tokenizer)``) or the attributes
            ``model`` and ``tokenizer``.
        items: Iterable of objects with a ``prompt`` string and an
            ``options`` list of strings.
        add_whitespace: If True (default), insert a single space between
            prompt and option unless the prompt already ends with one or the
            option already starts with one. If False, concatenate them as is.

    Returns:
        1-D numpy array with one summed log-probability per (item, option)
        pair, flattened in item order and, within an item, option order.
        Empty when there is nothing to score.
    """
    input_texts = []
    for item in items:
        for option in item.options:
            needs_space = (
                add_whitespace
                and not item.prompt.endswith(' ')
                and not option.startswith(' ')
            )
            input_texts.append(item.prompt + (" " if needs_space else "") + option)

    # Nothing to score: avoid loading the model and calling the tokenizer
    # with an empty batch.
    if not input_texts:
        return np.empty(0, dtype=np.float32)

    if model.lazy_loading:
        transformer, tokenizer = model._load_model()
    else:
        transformer, tokenizer = model.model, model.tokenizer

    # Tokenize the combined texts
    encoded_inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors='pt', add_special_tokens=True)

    # Put the inputs on the model's device when it exposes one, so that a
    # model placed on an accelerator does not receive CPU tensors.
    device = getattr(transformer, 'device', None)
    if device is not None:
        encoded_inputs = {name: tensor.to(device) for name, tensor in encoded_inputs.items()}

    # Get logits from the model
    with torch.no_grad():
        logits = transformer(**encoded_inputs).logits

    #TODO To get the correct result, we would need to consider prompt lengths too,
    # but first check if this approach is even faster

    # Ignore the log probs at the beginning
    prompt_length = 1 #len(model.tokenizer(item.prompt, add_special_tokens=False)['input_ids']) #- 1

    # Calculate log probabilities from the logits for each token.
    # Upcast first so that half-precision models do not lose accuracy in the
    # softmax and the subsequent sum.
    # The slice is offset by one because position t predicts token t + 1.
    log_probs = F.log_softmax(logits.float(), dim=-1)[:, prompt_length - 1:-1]

    # Keep the index and mask tensors on the same device as the log-probs.
    input_ids = encoded_inputs['input_ids'][:, prompt_length:].to(log_probs.device)
    attention_mask = encoded_inputs['attention_mask'][:, prompt_length:].to(log_probs.device)

    # Get log probabilities of actual tokens
    token_log_probs = log_probs.gather(2, input_ids.unsqueeze(-1)).squeeze(-1)

    # Set irrelevant entries (from padding) to zero
    masked_log_probs = token_log_probs * attention_mask

    #TODO Note that we would need to pull this apart again so that items have separate entries
    # .cpu() is a no-op on CPU and required before .numpy() on an accelerator.
    return torch.sum(masked_log_probs, dim=1).cpu().numpy()



def new_method(model, experiment, batch_size=10):
    """Score the experiment's items in consecutive chunks of ``batch_size``.

    Every chunk is handed to ``batch_computation``; the per-text scores of
    all chunks are concatenated, in order, into one flat list.
    """
    all_items = experiment.dataset.items
    chunk_starts = range(0, len(all_items), batch_size)
    return [
        score
        for start in chunk_starts
        for score in batch_computation(model, items=all_items[start:start + batch_size])
    ]

# Baseline timing: the model's own compute_option_log_likelihoods.
%time lp1 = current_method(model, sampled_experiment)
# Batched variant at increasing batch sizes; every run overwrites lp2.
for bs in [2, 5, 10, 20, 50, 100]:
    print("Batch size", bs)
    %time lp2 = new_method(model, sampled_experiment, batch_size=bs)

# Result: batching doesn't seem to help at all. In the recorded run the wall
# time is 22.6 s for the baseline and goes from 21.8 s (batch size 2) up to
# 36.7 s (batch size 100).
# NOTE(review): possibly the padding to the longest text in each batch eats the
# gain -- not verified. lp1 and lp2 are never compared; batch_computation also
# sums over the prompt tokens, so only the timings are meant to be comparable.

Computing option log likelihoods ...


100%|██████████████████████████████████████████████████████████| 100/100 [00:22<00:00,  4.43it/s]


CPU times: user 2min 12s, sys: 36.9 s, total: 2min 49s
Wall time: 22.6 s
CPU times: user 2min 16s, sys: 33.1 s, total: 2min 49s
Wall time: 21.8 s
CPU times: user 2min 20s, sys: 35.8 s, total: 2min 56s
Wall time: 24 s
CPU times: user 2min 15s, sys: 36.7 s, total: 2min 52s
Wall time: 25.5 s
CPU times: user 2min 15s, sys: 35.1 s, total: 2min 50s
Wall time: 25.7 s
CPU times: user 2min 10s, sys: 54.1 s, total: 3min 4s
Wall time: 30.2 s
CPU times: user 1min 57s, sys: 54 s, total: 2min 51s
Wall time: 36.7 s
