In [1]:
from pathlib import Path

while Path.cwd().name != 'ambient':
    %cd ..

/mmfs1/gscratch/xlab/alisaliu/ambient


In [4]:
import os
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from generation.gpt3_generation import request
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from utils.utils import predict_nli, strip_punctuation_and_casing
from utils.constants import NLI_LABELS
from mturk.utils import get_disambiguation_idxs
import openai
import truecase
import string
from sklearn.metrics import f1_score
from evaluation.pmi import cross_entropy
from evaluation.generative_evaluation import generative_evaluation
from evaluation.continuation_evaluation import continuation_evaluation
from evaluation.edit_f1 import get_edit_f1
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import random
from collections import defaultdict, Counter

### evaluation for multilabel models

In [6]:
models = ['roberta-large-rsnli-unli', 'roberta-large-smnli-ambnli', 'roberta-large-distilled-smnli', 'roberta-large-mnli', 'roberta-large-wanli', 'roberta-large-mchaos-multilabel', 'roberta-large-wanli-multilabel', 'roberta-large-wanli-set', 'binary_models']
seeds = range(42, 47, 1)
results = defaultdict(list)
for seed in seeds:
    for model in models:
        with open(f'results/multilabel/{model}/seed{seed}.json') as fin:
            res = json.load(fin)
            results[model].append(res)

In [7]:
for model in models:
    print(model)
    model_df = pd.DataFrame(results[model])
    print('EM', np.round(model_df.em.mean()*100, 1), np.round(model_df.em.std()*100, 1))
    print('Group EM', np.round(model_df.group_em.mean()*100, 1), np.round(model_df.group_em.std()*100, 1))
    print('F1', np.round(model_df.f1.mean()*100, 1), np.round(model_df.f1.std()*100, 1))
    if 'logit_threshold' in model_df.columns:
        if model_df.dtypes.logit_threshold == float:
            print('Threshold', np.round(model_df.logit_threshold.median(), 2))
        else:
            for label in NLI_LABELS:
                a = np.median([t[label][0] for t in model_df.logit_threshold])
                b = np.median([t[label][1] for t in model_df.logit_threshold])
                print('Threshold', label, np.round(a, 2), np.round(b, 2))
    print('\n')

roberta-large-rsnli-unli
EM 24.5 2.3
Group EM 4.7 2.5
F1 62.2 1.0
Threshold contradiction 0.03 0.71
Threshold entailment 0.69 1.0
Threshold neutral 0.01 1.0


roberta-large-smnli-ambnli
EM 21.0 1.6
Group EM 10.1 2.5
F1 63.8 0.8
Threshold -3.43


roberta-large-distilled-smnli
EM 24.3 1.1
Group EM 4.7 1.2
F1 68.0 0.1
Threshold -1.55


roberta-large-mnli
EM 25.3 1.8
Group EM 4.0 2.5
F1 68.0 0.9
Threshold -2.68


roberta-large-wanli
EM 30.8 3.8
Group EM 10.1 7.9
F1 71.4 0.3
Threshold -1.19


roberta-large-mchaos-multilabel
EM 15.8 3.4
Group EM 0.9 1.2
F1 63.2 0.6
Threshold -2.78


roberta-large-wanli-multilabel
EM 35.1 3.0
Group EM 19.1 4.8
F1 72.5 0.3
Threshold -1.97


roberta-large-wanli-set
EM 43.6 0.8
Group EM 37.8 0.4
F1 70.7 0.2


binary_models
EM 37.6 3.1
Group EM 20.0 6.2
F1 72.3 0.6
Threshold -2.22




# 1. generative evaluation

In [8]:
models = ['flan-t5-xxl', 'llama-65b',  'davinci', 'text-davinci-003', 'gpt-3.5-turbo', 'gpt-4']
num_incontext_examples = 4
generative_results = {}
dfs = []
for model in models:
    df = pd.read_json(f'results/generative_evaluation/{model}-n{num_incontext_examples}.jsonl', lines=True)
    df['source'] = model
    dfs.append(df)
    print(model, np.round(df.edit_f1.mean()*100, 1))

results_df = pd.concat(dfs)

flan-t5-xxl 5.2
llama-65b 10.0
davinci 10.1
text-davinci-003 14.5
gpt-3.5-turbo 13.0
gpt-4 18.0


In [9]:
get_edit_f1(
    "We're afraid that ambiguity stumps language models.",
    'He is so smart that he could have been a doctor instead of his current occupation.',
    'He is so smart that he could have been a doctor, but he didn’t become one.',
    verbose=True
)

["[DELETED]We're", '[DELETED]afraid', '[DELETED]ambiguity', '[DELETED]stumps', '[DELETED]language', '[DELETED]models.', '[ADDED]He', '[ADDED]is', '[ADDED]so', '[ADDED]smart', '[ADDED]he', '[ADDED]could', '[ADDED]have', '[ADDED]been', '[ADDED]a', '[ADDED]doctor', '[ADDED]instead', '[ADDED]of', '[ADDED]his', '[ADDED]current', '[ADDED]occupation.']
["[DELETED]We're", '[DELETED]afraid', '[DELETED]ambiguity', '[DELETED]stumps', '[DELETED]language', '[DELETED]models.', '[ADDED]He', '[ADDED]is', '[ADDED]so', '[ADDED]smart', '[ADDED]he', '[ADDED]could', '[ADDED]have', '[ADDED]been', '[ADDED]a', '[ADDED]doctor,', '[ADDED]but', '[ADDED]he', '[ADDED]didn’t', '[ADDED]become', '[ADDED]one.']


0.7142857142857143

In [10]:
def find_patterns(df):
    num_total_disambiguations = 0
    patterns = defaultdict(int)
    for i, row in df.iterrows():
        ambiguous_sentence_key = 'premise' if row['premise_ambiguous'] else 'hypothesis'
        disambiguations = row['predicted_rewrites'].values()
        if len(disambiguations) == 0:
            patterns['empty'] += 1
        
        if all((row[ambiguous_sentence_key][:-1] in d) and (d != row[ambiguous_sentence_key]) for d in disambiguations):
            patterns['restate_with_context'] += 1
        
        for disambiguation in disambiguations:
            if disambiguation == row[ambiguous_sentence_key]:
                patterns['copied'] += 1
            num_total_disambiguations += 1
    return {
        'empty': np.round(patterns['empty'] / len(df) * 100, 1),
        'copied': np.round(patterns['copied'] / num_total_disambiguations * 100, 1),
        'restate_with_context': np.round(patterns['restate_with_context'] / len(df) * 100, 1)
    }

In [11]:
for model in models:
    model_df = results_df.loc[results_df['source'] == model]
    x = find_patterns(model_df)
    print(model, x)

flan-t5-xxl {'empty': 41.7, 'copied': 30.3, 'restate_with_context': 46.2}
llama-65b {'empty': 0.0, 'copied': 32.1, 'restate_with_context': 34.6}
davinci {'empty': 0.0, 'copied': 25.4, 'restate_with_context': 39.3}
text-davinci-003 {'empty': 0.2, 'copied': 3.1, 'restate_with_context': 38.5}
gpt-3.5-turbo {'empty': 2.6, 'copied': 0.6, 'restate_with_context': 17.2}
gpt-4 {'empty': 1.6, 'copied': 0.0, 'restate_with_context': 39.7}


# 2. TF evaluation

In [12]:
models = ['flan-t5-xxl', 'llama-65b', 'davinci', 'text-davinci-003', 'gpt-3.5-turbo', 'gpt-4']
for model in models:
    results_df = pd.read_json(f'results/TF_evaluation/{model}.jsonl', lines=True)
    acc = (results_df.prediction == results_df.answer).sum()/len(results_df.index)
    print(f'--- {model} ---')
    print(f'Accuracy: {np.round(acc*100, 1)}')
    
    EM_accs = []
    for (example_id, ambiguous_sentence_key, disambiguation), d_group in results_df.groupby(['example_id', 'ambiguous_sentence_key', 'disambiguation']):
        if all(d_group.answer == d_group.prediction):
            EM_accs.append(1)
        else:
            EM_accs.append(0)
    
    print(f'EM Accuracy: {np.round(np.mean(EM_accs)*100, 1)}')
    
    for template_id in results_df.template_id.unique():
        template_df = results_df.loc[results_df['template_id'] == template_id]
        print(template_id, np.round((template_df.prediction == template_df.answer).sum()/len(template_df.index)*100, 1))
    
    if 'TF_prob_mass' in results_df.columns:
        prob_mass = results_df.TF_prob_mass.mean()
        print(f'prob mass of T/F tokens: {prob_mass}')
    else:
        top_1_dict = results_df.prediction.value_counts().to_dict()
        print(f'prop of examples where T or F is top-1 token: {(top_1_dict[True] + top_1_dict[False])/len(results_df.index)}')
    print('\n')

--- flan-t5-xxl ---
Accuracy: 56.4
EM Accuracy: 0.0
0 85.9
1 28.2
2 100.0
3 11.6
prob mass of T/F tokens: 0.6931589238902599


--- llama-65b ---
Accuracy: 55.0
EM Accuracy: 3.2
0 96.1
1 92.1
2 11.8
3 19.9
prob mass of T/F tokens: 0.36481244281523867


--- davinci ---
Accuracy: 57.8
EM Accuracy: 4.3
0 46.2
1 69.0
2 45.0
3 71.1
prob mass of T/F tokens: 0.703112927097401


--- text-davinci-003 ---
Accuracy: 49.6
EM Accuracy: 0.3
0 71.9
1 18.1
2 81.0
3 27.5
prob mass of T/F tokens: 0.9952253991682005


--- gpt-3.5-turbo ---
Accuracy: 57.8
EM Accuracy: 2.6
0 81.5
1 51.7
2 74.5
3 23.4
prop of examples where T or F is top-1 token: 0.9974424552429667


--- gpt-4 ---
Accuracy: 63.0
EM Accuracy: 2.5
0 91.6
1 68.8
2 81.8
3 9.9
prop of examples where T or F is top-1 token: 0.9759164535379369




### analysis: find self-contradictions

In [13]:
# analysis
model = 'gpt-4'
results_df = pd.read_json(f'results/TF_evaluation/{model}.jsonl', lines=True)

In [14]:
patterns = defaultdict(int)
total_interps =0
total_permutations = 0
total_pairs = 0

for (example_id, ambiguous_sentence_key), sentence_df in results_df.groupby(['example_id', 'ambiguous_sentence_key']):
    disambiguation_dfs = sentence_df.groupby('disambiguation')
    disambiguation_dfs = {k:v for k,v in disambiguation_dfs}
    for disambiguation, df in disambiguation_dfs.items():
        preds = df.prediction.tolist()
        if preds[0] == True and preds[2] == True:
            patterns['x1=T =><= x3=T (pure contradiction)'] += 1
        if preds[1] == True and preds[3] == True:
            patterns['x2=T =><= x4=T (pure contradiction)'] += 1
        if (preds[0] == preds[3] == True) and (preds[1] == preds[2] == False):
            patterns['x1=T, x2=F, x3=F, x4=T (sole interp)'] += 1
        
        total_interps += 1
    
    for pair in itertools.combinations(disambiguation_dfs.keys(), 2):
        d1, d2 = pair[0], pair[1]
        d1_preds = disambiguation_dfs[d1].prediction.tolist()
        d2_preds = disambiguation_dfs[d2].prediction.tolist()
        
        if (d1_preds[0] == True and d1_preds[3] == True and d2_preds[0] == True):
            patterns['x1=T, y1=T, x4=T, y4=T (symmetrical contradiction across interps)'] += 1
        
        total_pairs += 1

In [15]:
patterns

defaultdict(int,
            {'x1=T, x2=F, x3=F, x4=T (sole interp)': 333,
             'x1=T, y1=T, x4=T, y4=T (symmetrical contradiction across interps)': 461,
             'x2=T =><= x4=T (pure contradiction)': 647,
             'x1=T =><= x3=T (pure contradiction)': 116})

# 3. continuation evaluation

In [18]:
models = ['flan-t5-xxl', 'llama-65b', 'davinci', 'text-davinci-003']
for model in models:
    df = pd.read_json(f'results/continuation_evaluation/{model}/results.jsonl', lines=True)
    df['ranking'] = None
    for i, row in df.iterrows():
        KLs = {k:v['KL_div'] for k,v in row['options'].items()}
        ranking = ' > '.join(sorted(KLs, key=KLs.get, reverse=True))
        df.at[i, 'ranking'] = ranking
    
    correct_ranking_ct = np.sum([1 for r in df.ranking if r[0] == 'd'])
    
    print(model, np.round(correct_ranking_ct/len(df.index)*100, 1))

flan-t5-xxl 81.0
llama-65b 68.9
davinci 75.7
text-davinci-003 71.4
