In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

# Example: Rotten Tomatoes

In this example we compare two models (`gpt2` and `distilgpt2`) on zero-shot sentiment detection.

## Load Models

In [1]:
from perspectival.model import Transformer

# Build the two models compared throughout this notebook.
model, model2 = Transformer('gpt2'), Transformer('distilgpt2')

## Set up the Experiment

In [2]:
from perspectival.loader import load_rotten_tomatoes
from perspectival.experiment import Experiment

# The loader returns a (dataset, features) pair; both are handed to a named Experiment.
loaded = load_rotten_tomatoes()
dataset, features = loaded
experiment = Experiment(
    name='Rotten Tomato Test',
    dataset=dataset,
    features=features,
)

In [3]:
# Continue with a 100-item sample of the experiment.
# NOTE(review): `experiment` is rebound to its own sample, so re-running this cell
# samples from the already-reduced experiment rather than the full dataset.
# Presumably the draw is random and unseeded — confirm against Experiment.sample.
experiment = experiment.sample(num=100)

## Compute Features

In [4]:
# Both feature computations run over the same pair of models; each call gets its own list.
model_pair = (model, model2)
experiment.compute_correctness(models=list(model_pair))
experiment.compute_disagreement(models=list(model_pair))

In [None]:
# Optional (disabled): rename the experiment, save it under '../data/', and load it back by name.
#experiment.name = 'test'
#experiment.save('../data/')
#experiment = Experiment.load('../data/', 'test')

## Explore Results

In [5]:
# Display the two items on which the models disagree the most.
# NOTE(review): presumably sampling_method='last' picks the tail of the ordering given by
# `ordering_scores`, i.e. the highest LogDisagreement values — confirm against Experiment.sample.
model_names = (model.name, model2.name)
disagreement_feature = experiment.get_feature('LogDisagreement', models=model_names)
scores = disagreement_feature.values
samples = experiment.sample(
    num=2,
    sampling_method='last',
    ordering_scores=scores,
)
samples.display_items()

ITEM (train_129)
"""Question: What is the sentiment of the text below?

Text: 'it is nature against progress . in fessenden's horror trilogy , this theme has proved important to him and is especially so in the finale .'

Answer: The sentiment is"""
Options: ['negative', 'positive']

FEATURES
GroundTruth 1
OptionLogLikelihood gpt2 [-7.09939  -6.559614]
OptionLogLikelihood distilgpt2 [-6.6436863 -6.8264008]
ModelChoices gpt2 1
ModelChoices distilgpt2 0
PredictionCorrectness gpt2 1.0
PredictionCorrectness distilgpt2 0.0
LogDisagreement ('distilgpt2', 'gpt2') 0.098624855
BinaryDisagreement ('distilgpt2', 'gpt2') True


--------------------


ITEM (train_4523)
"""Question: What is the sentiment of the text below?

Text: 'to build a feel-good fantasy around a vain dictator-madman is off-putting , to say the least , not to mention inappropriate and wildly undeserved .'

Answer: The sentiment is"""
Options: ['negative', 'positive']

FEATURES
GroundTruth 0
OptionLogLikelihood gpt2 [-6.2675176 -

In [6]:
# General output statistics
import numpy as np
from collections import Counter

# For each model, print its name, its mean PredictionCorrectness (accuracy),
# and the count of each value in ModelChoices, most frequent first.
for m in (model, model2):
    print(m.name)
    correctness = experiment.get_feature('PredictionCorrectness', model=m.name).values
    accuracy = np.mean(correctness)
    print(f"- Accuracy: {accuracy:.2f}")
    choices = experiment.get_feature('ModelChoices', model=m.name).values
    choice_counts = Counter(choices)
    print("- Output:", choice_counts.most_common())

gpt2
- Accuracy: 0.61
- Output: [(1, 91), (0, 9)]
distilgpt2
- Accuracy: 0.58
- Output: [(1, 88), (0, 12)]


In [8]:
# Summarise how often the two models disagree, followed by each model's accuracy.
scores = experiment.get_feature('LogDisagreement', models=(model.name, model2.name)).values
# An item counts as a disagreement when its LogDisagreement score is strictly positive.
# np.mean over the boolean mask gives the fraction directly and stays vectorised; the
# built-in sum()/len() iterated element by element in Python and raised ZeroDivisionError
# on an empty experiment (np.mean yields nan with a warning instead).
disagreement_rate = np.mean(scores > 0)
print(f"Overall disagreement rate: {disagreement_rate:.2f}")

# Accuracy is the mean of the per-item PredictionCorrectness values of each model.
for m in [model, model2]:
    accuracy = np.mean(experiment.get_feature('PredictionCorrectness', model=m.name).values)
    print(f"Accuracy of {m.name}: {accuracy:.2f}")

Overall disagreement rate: 0.07
Accuracy of gpt2: 0.61
Accuracy of distilgpt2: 0.58
