In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
import os
from pathlib import Path

from loader import load_anthropic_eval_dataset

# Root directory of the local Anthropic evals checkout. It is derived from the
# current user's home directory instead of a hard-coded '/Users/<name>/...'
# path, and can be overridden through the ANTHROPIC_EVALS_DIR environment
# variable, so the notebook is not tied to one machine.
ANTHROPIC_EVALS_DIR = Path(os.environ.get('ANTHROPIC_EVALS_DIR', Path.home() / 'code' / 'anthropic_evals'))

# The loader receives the directory as a plain string, exactly as before; the
# two remaining arguments are passed through unchanged.
experiment = load_anthropic_eval_dataset(str(ANTHROPIC_EVALS_DIR), 'advanced-ai-risk', 'human_generated_evals')

In [6]:
from loader import load_rotten_tomatoes
from experiment import Experiment

# Load the Rotten Tomatoes data and wrap it in a named Experiment.
# Note that this (re)binds the notebook-level name `experiment`.
dataset = load_rotten_tomatoes()
experiment = Experiment(
    name='Rotten Tomato Test',
    dataset=dataset,
)

Using the latest cached version of the dataset since rotten_tomatoes couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/john/.cache/huggingface/datasets/rotten_tomatoes/default/0.0.0/aa13bc287fa6fcab6daf52f0dfb9994269ffea28 (last modified on Fri May 10 14:54:20 2024).


In [9]:
from model import Transformer, LazyTransformer

# The two models compared throughout the notebook. Later cells iterate over
# `[model, model2]` and read `model2.name`, so both names must be bound here;
# leaving `model2` commented out makes a fresh-kernel "Run All" fail with a
# NameError.
model = LazyTransformer('gpt2')
model2 = LazyTransformer('distilgpt2')

In [8]:
# Restrict the experiment to a sample requested with num=100.
# NOTE: this rebinds `experiment` to the result of its own `.sample(...)` call,
# so re-running the cell samples again from the already-sampled experiment
# rather than from the originally loaded one.
experiment = experiment.sample(num=100)

In [14]:
# numpy is imported explicitly because later cells use `np`; with the wildcard
# import gone, nothing else in the notebook would bring that name into scope.
import numpy as np

# Explicit imports instead of `from features import *`, so it is clear where
# each feature class comes from and the notebook namespace is not polluted.
from features import LogDisagreement, ModelChoices, OptionLogLikelihood, PredictionCorrectness

# Per-model features: each feature is computed and then registered on the
# experiment so it can be retrieved later via `experiment.get_feature(...)`.
for m in [model, model2]:
    # Computed from the experiment's dataset and the model itself.
    lls = OptionLogLikelihood.compute(experiment.dataset, model=m)
    experiment.register_feature(lls)

    # Derived from the OptionLogLikelihood feature computed just above.
    model_choices = ModelChoices.compute(lls)
    experiment.register_feature(model_choices)

    # Computed from the dataset together with the model's choices.
    experiment.register_feature(PredictionCorrectness.compute(experiment.dataset, model_choices))

# Cross-model feature: built from the registered OptionLogLikelihood features
# of both models, passed as a 2-tuple in (model, model2) order.
log_disagreement = LogDisagreement.compute(
    (
        experiment.get_feature('OptionLogLikelihood', model.name),
        experiment.get_feature('OptionLogLikelihood', model2.name),
    )
)
experiment.register_feature(log_disagreement)

In [21]:
# Disabled save step: when uncommented, these two lines set the experiment's
# name to 'test' and call `save` with the 'data/' directory.
#experiment.name = 'test'
#experiment.save('data/')
# Replace the in-memory `experiment` with whatever `Experiment.load` returns
# for directory 'data/' and name 'test'.
# NOTE(review): presumably this reads back what the disabled save above wrote,
# which means 'data/' must already contain a saved 'test' experiment — confirm.
experiment = Experiment.load('data/', 'test')

In [15]:

# Use the LogDisagreement values of the (model, model2) pair as ordering
# scores, pick two items with the 'last' sampling method, and display them.
# NOTE(review): 'last' presumably selects the items with the highest scores — confirm.
model_pair = (model.name, model2.name)
disagreement_feature = experiment.get_feature('LogDisagreement', models=model_pair)
scores = disagreement_feature.values
samples = experiment.sample(
    num=2,
    sampling_method='last',
    ordering_scores=scores,
)
samples.display_items()

ITEM (train_7865)
"""Question: What is the sentiment of the text below?

Text: 'it has more in common with a fireworks display than a movie , which normally is expected to have characters and a storyline .'

Answer: The sentiment is"""
Options: ['negative', 'positive']
Ground truth: negative

FEATURES
OptionLogLikelihood gpt2 -6.6571083
ModelChoices gpt2 1
PredictionCorrectness gpt2 0.0
LogDisagreement ('gpt2', 'gpt2') -0.0


--------------------


ITEM (train_5439)
"""Question: What is the sentiment of the text below?

Text: 'the whole film has this sneaky feel to it  as if the director is trying to dupe the viewer into taking it all as very important simply because the movie is ugly to look at and not a hollywood product .'

Answer: The sentiment is"""
Options: ['negative', 'positive']
Ground truth: negative

FEATURES
OptionLogLikelihood gpt2 -6.536299
ModelChoices gpt2 1
PredictionCorrectness gpt2 0.0
LogDisagreement ('gpt2', 'gpt2') -0.0


--------------------




In [20]:
from collections import Counter

# `np` is used below but was never imported explicitly anywhere in the
# notebook; import it here so the cell does not rely on a name leaking in
# from another import.
import numpy as np

# Per-model summary: the mean of the PredictionCorrectness values and a tally
# of the ModelChoices values (most common first).
for m in [model, model2]:
    print(m.name)
    correctness = experiment.get_feature('PredictionCorrectness', model=m.name).values
    accuracy = np.mean(correctness)
    print(f"- Accuracy: {accuracy:.2f}")
    choices = experiment.get_feature('ModelChoices', model=m.name).values
    print("- Output:", Counter(choices).most_common())

gpt2
- Accuracy: 0.54
- Output: [(1, 174), (0, 26)]
distilgpt2
- Accuracy: 0.59
- Output: [(1, 169), (0, 31)]


In [24]:
# `np` is used throughout this cell but was never imported explicitly in the
# notebook; import it here so the cell stands on its own.
import numpy as np


def _positive_fraction(values):
    """Return the fraction of entries in `values` that are strictly greater than 0.

    `values` may be a list or an array; it is coerced with `np.asarray` so the
    comparison is always element-wise. Returns NaN for empty input instead of
    raising a ZeroDivisionError.
    """
    arr = np.asarray(values)
    if arr.size == 0:
        return float('nan')
    return float(np.mean(arr > 0))


# For this dataset we want to see how disagreement varies for categories
model_pair = (model.name, model2.name)
scores = experiment.get_feature('LogDisagreement', models=model_pair).values
disagreement_rate = _positive_fraction(scores)
print(f"Overall disagreement rate: {disagreement_rate:.2f}")

for m in [model, model2]:
    accuracy = np.mean(experiment.get_feature('PredictionCorrectness', model=m.name).values)
    print(f"Unproblematic behavior of {m.name}: {accuracy:.2f}")

# Coerce to an array so that `categories == category` below is always an
# element-wise boolean mask (on a plain list it would be a single bool).
categories = np.asarray(experiment.get_feature('Category').values)
for category in sorted(set(categories)):
    category_mask = categories == category
    category_selection = experiment.sample(mask=category_mask)

    # Same statistics as above, restricted to the items of this category.
    category_disagreement = np.asarray(category_selection.get_feature('LogDisagreement', models=model_pair).values)
    accuracy1 = np.mean(category_selection.get_feature('PredictionCorrectness', model=model.name).values)
    accuracy2 = np.mean(category_selection.get_feature('PredictionCorrectness', model=model2.name).values)
    print(f"\nCategory '{category}':")
    print(f"- Disagreement rate: {_positive_fraction(category_disagreement):.2f}")
    print(f"- Scores: {model.name} {accuracy1:.2f}, {model2.name} {accuracy2:.2f}")

Overall disagreement rate: 0.24
Unproblematic behavior of gpt2: 0.34
Unproblematic behavior of distilgpt2: 0.18

Category 'coordinate-other-ais':
- Disagreement rate: 0.24
- Scores: gpt2 0.34, distilgpt2 0.18


In [None]:
#TODO Adjust saving and loading? Or keep that on Experiment level? (Check the risk that classes change and we lose the whole array)

#TODO How to properly use dataset and experiment names? If it's not useful drop it

#TODO try with better models (also make sure that this doesn't need tons of rewriting; consider model interface class)
# - llama 8b vs llama 8b-instruct? (one such model requires 20GB of VRAM though!) -> make it possible to not keep all the models in memory!
# - models from apple?

#TODO resolve remaining todos in code
# Add the test from the other notebook
# Also, proper typing would be cool, but how to avoid the circular imports? Interface classes?

#TODO upload to github and document properly

# This finishes the initial scope and should be finished before London!