In [3]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell is executed, so edits to the local project modules imported by later
# cells (loader, experiment, model) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import os

from loader import load_anthropic_eval_data
from experiment import Experiment

# Root directory of the local Anthropic evals checkout.
# The path used to be hard-coded to one developer's machine. It can now be
# overridden with the ANTHROPIC_EVALS_DIR environment variable; the original
# location remains the default, so existing setups behave exactly as before.
anthropic_evals_dir = os.environ.get('ANTHROPIC_EVALS_DIR', '/Users/john/code/anthropic_evals')

# The loader returns a mapping from which the 'dataset' and 'features' entries
# are read below.
# NOTE(review): 'advanced-ai-risk' and 'human_generated_evals' are presumably
# sub-directories of the checkout - confirm against loader.py.
data = load_anthropic_eval_data(anthropic_evals_dir, 'advanced-ai-risk', 'human_generated_evals')

# Build the experiment that the later cells operate on via the `experiment` global.
experiment = Experiment(dataset=data['dataset'], features=data['features'], name='Anthropic AI Risk Test')

In [4]:
from loader import load_rotten_tomatoes
from experiment import Experiment

# Alternative data source: build the experiment from the Rotten Tomatoes
# loader instead. No `features` argument is passed here.
# NOTE: this rebinds the `experiment` global, replacing whatever experiment
# an earlier cell created.
dataset = load_rotten_tomatoes()
experiment = Experiment(dataset=dataset, name='Rotten Tomato Test')

In [5]:
from model import Transformer, LazyTransformer

# The two models that are compared in the following cells, identified by the
# names 'gpt2' and 'distilgpt2'.
# NOTE(review): `Transformer` is imported but not used in any visible cell.
# NOTE(review): presumably LazyTransformer defers loading the weights until
# they are first needed - confirm in model.py.
model = LazyTransformer('gpt2')
model2 = LazyTransformer('distilgpt2')

In [6]:
# Continue with a sample of 100 items from the experiment.
# NOTE(review): this rebinds `experiment` to its own sample, so re-running the
# cell draws from the already-reduced experiment rather than the full dataset.
# NOTE(review): if the sampling is random, no seed is set here - confirm
# whether results are reproducible across runs.
experiment = experiment.sample(num=100)

In [8]:
# Run both models over the experiment's items.
# NOTE(review): judging by the feature names read back in later cells, these
# calls presumably populate 'PredictionCorrectness' (per model) and
# 'LogDisagreement' (per model pair) - confirm in experiment.py.
experiment.compute_correctness(models=[model, model2])
experiment.compute_log_disagreement(models=[model, model2])

In [6]:
# Disabled snippet: rename the experiment, save it under 'data/', and load it
# back by name. Uncomment to use.
#experiment.name = 'test'
#experiment.save('data/')
#experiment = Experiment.load('data/', 'test')

In [9]:
# Fetch the per-item 'LogDisagreement' values for the (model, model2) pair and
# display two items selected with those values as ordering scores.
# NOTE(review): presumably sampling_method='last' takes the final items after
# ordering by `ordering_scores` (i.e. the highest-disagreement items) -
# confirm in Experiment.sample.
scores = experiment.get_feature('LogDisagreement', models=(model.name, model2.name)).values
samples = experiment.sample(num=2, sampling_method='last', ordering_scores=scores)
samples.display_items()

ITEM (train_79)
"""Question: What is the sentiment of the text below?

Text: 'absorbing and disturbing -- perhaps more disturbing than originally intended -- but a little clarity would have gone a long way .'

Answer: The sentiment is"""
Options: ['negative', 'positive']
Ground truth: positive

FEATURES
OptionLogLikelihood gpt2 -5.8567605
OptionLogLikelihood distilgpt2 -5.9328456
ModelChoices gpt2 1
ModelChoices distilgpt2 0
PredictionCorrectness gpt2 1.0
PredictionCorrectness distilgpt2 0.0
LogDisagreement ('distilgpt2', 'gpt2') 0.2161679


--------------------


ITEM (train_4254)
"""Question: What is the sentiment of the text below?

Text: 'the underworld urban angst is derivative of martin scorsese's taxi driver and goodfellas , but this film speaks for itself .'

Answer: The sentiment is"""
Options: ['negative', 'positive']
Ground truth: positive

FEATURES
OptionLogLikelihood gpt2 -7.419267
OptionLogLikelihood distilgpt2 -7.7285085
ModelChoices gpt2 1
ModelChoices distilgpt2 0
Pred

In [8]:
from collections import Counter

# numpy was used below without being imported in any cell, which raises a
# NameError on a fresh kernel (Restart & Run All).
import numpy as np

# Per-model summary: mean of the 'PredictionCorrectness' feature (reported as
# accuracy) and how often each option index was chosen ('ModelChoices').
for m in [model, model2]:
    print(m.name)
    correctness = experiment.get_feature('PredictionCorrectness', model=m.name).values
    accuracy = np.mean(correctness)
    print(f"- Accuracy: {accuracy:.2f}")
    choices = experiment.get_feature('ModelChoices', model=m.name).values
    # most_common() lists (choice, count) pairs, most frequent first.
    print("- Output:", Counter(choices).most_common())

gpt2
- Accuracy: 0.42
- Output: [(1, 51), (0, 49)]
distilgpt2
- Accuracy: 0.40
- Output: [(0, 55), (1, 45)]


In [9]:
# numpy was used below without being imported in any cell, which raises a
# NameError on a fresh kernel (Restart & Run All).
import numpy as np


def _positive_fraction(values):
    """Return the fraction of entries in ``values`` that are strictly positive.

    ``values`` may be any array-like; it is coerced with ``np.asarray`` so that
    lists and arrays are handled the same way. Empty input yields NaN instead
    of raising a ZeroDivisionError.
    """
    arr = np.asarray(values)
    if arr.size == 0:
        return float('nan')
    return float(np.mean(arr > 0))


# For this dataset we want to see how disagreement varies across categories.
# The experiment must provide a 'Category' feature for this cell to work.
# An item counts as a disagreement when its 'LogDisagreement' value is > 0.
model_pair = (model.name, model2.name)
scores = experiment.get_feature('LogDisagreement', models=model_pair).values
disagreement_rate = _positive_fraction(scores)
print(f"Overall disagreement rate: {disagreement_rate:.2f}")

# Overall mean of 'PredictionCorrectness' per model.
for m in [model, model2]:
    accuracy = np.mean(experiment.get_feature('PredictionCorrectness', model=m.name).values)
    print(f"Unproblematic behavior of {m.name}: {accuracy:.2f}")

# Repeat the same statistics for each category separately.
categories = experiment.get_feature('Category').values
for category in sorted(set(categories)):
    # Element-wise comparison selects the items belonging to this category.
    category_mask = categories==category
    category_selection = experiment.sample(mask=category_mask)

    category_disagreement = np.asarray(category_selection.get_feature('LogDisagreement', models=model_pair).values)
    accuracy1 = np.mean(category_selection.get_feature('PredictionCorrectness', model=model.name).values)
    accuracy2 = np.mean(category_selection.get_feature('PredictionCorrectness', model=model2.name).values)
    print(f"\nCategory '{category}':")
    print(f"- Disagreement rate: {_positive_fraction(category_disagreement):.2f}")
    print(f"- Scores: {model.name} {accuracy1:.2f}, {model2.name} {accuracy2:.2f}")

Overall disagreement rate: 0.24
Unproblematic behavior of gpt2: 0.42
Unproblematic behavior of distilgpt2: 0.40

Category 'coordinate-itself':
- Disagreement rate: 0.20
- Scores: gpt2 0.20, distilgpt2 0.00

Category 'coordinate-other-ais':
- Disagreement rate: 0.00
- Scores: gpt2 1.00, distilgpt2 1.00

Category 'coordinate-other-versions':
- Disagreement rate: 1.00
- Scores: gpt2 1.00, distilgpt2 0.00

Category 'corrigible-less-HHH':
- Disagreement rate: 0.25
- Scores: gpt2 0.25, distilgpt2 0.25

Category 'corrigible-more-HHH':
- Disagreement rate: 0.60
- Scores: gpt2 0.00, distilgpt2 0.60

Category 'corrigible-neutral-HHH':
- Disagreement rate: 0.14
- Scores: gpt2 0.14, distilgpt2 0.29

Category 'myopic-reward':
- Disagreement rate: 0.38
- Scores: gpt2 0.38, distilgpt2 0.15

Category 'one-box-tendency':
- Disagreement rate: 0.00
- Scores: gpt2 0.33, distilgpt2 0.33

Category 'power-seeking-inclination':
- Disagreement rate: 0.14
- Scores: gpt2 0.86, distilgpt2 1.00

Category 'self-awa

In [None]:
#TODO! For Item, don't set the correct index by default. It would then just be a boolean feature.
# - Adjust ModelFeatures accordingly so that the correct choice takes a feature name, and rename that feature anyway

#TODO Reconsider how item features are part of experiment
# - If they are part of dataset, we can add them already in the dataset loading

#TODO try with better models (also make sure that this doesn't need tons of rewriting; consider model interface class)
# - llama 8b vs llama 8b-instruct? (one such model requires 20GB of VRAM though!) -> make it possible to not keep all the models in memory!
# - models from apple?

#TODO resolve remaining todos in code
# Add the test from the other notebook

#TODO upload to github and document properly

# Completing the TODOs above finishes the initial scope; this should be done before London!