# Human annotation and comparison to A's and Q's performance

In [1]:
import csv
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import json

from itertools import product, combinations
from nltk.metrics import agreement
from sklearn.metrics import cohen_kappa_score

import evaluationaux as ev 
from tasks import get_task_labels

%matplotlib inline
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 50)

In [2]:
PATH = '/project/brie/scripts_scorekeeping/'
VISDIAL = Path('generating_propositions', 'data', 'visual_dialog')
PROPS = Path(PATH, 'main_task', 'data', 'propositions')
PATH_OUTPUTS = Path(PATH, 'main_task', 'outputs')

In [3]:
path_visdial_test = Path(PATH, VISDIAL, 'visdial_1.0_test.json')
with open(path_visdial_test, 'r') as e:
    visdial_test = json.load(e)
    
path_props_test = Path(PATH, PROPS, 'propositions_test.json')
with open(path_props_test, 'r') as f:
    props_test = json.load(f)['dialogues']

In [4]:
SPLIT = 'test'

BOTS =  ('a', 'q')
CONTROL_TASKS =  ('none', 'rand-reps', 'null-reps')
TASKS =  ('TFxPS', 'TF', 'PS', 'PxTSFS')
BOT_VERSIONS =  ('RL-DIV', 'SL', 'ICCV-RL')

In [5]:
df = pd.read_csv('human-results/human-results.csv')
df.shape

(300, 17)

300 because the original sample had 100 (dialogue, proposition, turn shown) triples annotated by 3 students.

These six cases were removed/edited on the final test set and will be excluded (see filtering_probes.ipynb):

In [6]:
EXCLUDED = (14, 1310, 7054, 7417, 798, 6447)

annot_df = df[~df['dlg_idx'].isin(EXCLUDED)].copy()
annot_df.shape

(282, 17)

So the used sample has 94 datapoints, 300-(6*3) = 282.

**Important**: remember that, on this dataframe, content=1 means true and shared=1 means shared. It was reversed in the labels created for training, so a direct comparison should take that into account. To avoid confusion, the cell below will create new columns using the same labels used in training.

In [7]:
for task in TASKS:
    classes, classes_names = get_task_labels(bot='a', task=task)
    annot_df[task + '_gold'] = annot_df.apply(lambda row: classes[(row.content_gold, row.shared_gold)], axis=1)
    annot_df[task + '_pred'] = annot_df.apply(lambda row: classes[(row.content, row.shared)], axis=1)

# check if correct prediction in TF dimension
annot_df['TF_isright'] = (annot_df['content'] == annot_df['content_gold']) 
# check if correct prediction in PS dimension
annot_df['PS_isright'] = (annot_df['shared'] == annot_df['shared_gold']) 
# check if correct prediction in both dimensions
annot_df['TFxPS_isright'] = (annot_df['TF_isright'] & annot_df['PS_isright']) 
# check if correct prediction in PxTSFS dimension
annot_df['PxTSFS_isright'] = (annot_df['PxTSFS_pred'] == annot_df['PxTSFS_gold'])

annot_df['TF_isright_v2'] = (annot_df['TF_pred'] == annot_df['TF_gold'])
annot_df['PS_isright_v2'] = (annot_df['PS_pred'] == annot_df['PS_gold']) 
annot_df['TFxPS_isright_v2'] = (annot_df['TFxPS_pred'] == annot_df['TFxPS_gold']) 

Ensuring that created classes using two methods lead to same results (i.e. the apply method is working):

In [8]:
assert annot_df['TFxPS_isright'].equals(annot_df['TFxPS_isright_v2'])
assert annot_df['TF_isright'].equals(annot_df['TF_isright_v2'])
assert annot_df['PS_isright'].equals(annot_df['PS_isright_v2'])

In [9]:
annot_df

Unnamed: 0,username,timestamp,split,dlg_idx,turn_idx,prop_idx,content,content_gold,shared,shared_gold,turn_shared,rule,type,qa_fact,question,answer,prop,TFxPS_gold,TFxPS_pred,TF_gold,TF_pred,PS_gold,PS_pred,PxTSFS_gold,PxTSFS_pred,TF_isright,PS_isright,TFxPS_isright,PxTSFS_isright,TF_isright_v2,PS_isright_v2,TFxPS_isright_v2
0,PU6HR2,10:49:43,test,4396,2,6,1,1,1,1,0,caption,direct entailment,none,none,A refrigerator is full of many types of drinks.,There are drinks.,0,0,0,0,0,0,0,0,True,True,True,True,True,True,True
1,PU6HR2,10:50:04,test,4933,0,4,1,1,0,0,2,there,direct entailment,positive,Are there any other people?,Yes.,There are other people.,2,2,0,0,1,1,2,2,True,True,True,True,True,True,True
2,PU6HR2,10:50:22,test,2899,1,1,0,0,1,1,0,caption,direct contradiction,none,none,Large living space with shiny hardwood floors ...,One cannot see a large space.,1,1,1,1,0,0,1,1,True,True,True,True,True,True,True
3,PU6HR2,10:50:47,test,7302,6,19,0,0,1,1,6,there,direct contradiction,negative,Are there people in the photo?,No.,There are people in the photo.,1,1,1,1,0,0,1,1,True,True,True,True,True,True,True
4,PU6HR2,10:51:06,test,3094,0,11,0,0,0,0,1,there,direct contradiction,negative,Is there any humans visible?,No.,There is humans visible.,3,3,1,1,1,1,2,2,True,True,True,True,True,True,True
5,PU6HR2,10:51:29,test,4448,1,6,0,1,0,0,2,what_color,direct entailment,none,What color are the towels?,White.,The towels are white.,2,3,0,1,1,1,2,2,False,True,False,True,False,True,False
6,PU6HR2,10:51:41,test,802,0,7,0,0,1,1,0,caption,direct contradiction,none,none,A man with an elvis haircut cutting another ma...,There are no scissors.,1,1,1,1,0,0,1,1,True,True,True,True,True,True,True
7,PU6HR2,10:52:18,test,3158,2,13,1,0,0,0,4,noun_,direct contradiction,positive,What does it look like she is eating?,Doughnuts.,There are no doughnuts.,3,2,1,0,1,1,2,2,False,True,False,True,False,True,False
8,PU6HR2,10:52:31,test,3052,1,9,0,0,0,0,3,there,direct contradiction,negative,Are there any other animals?,No.,There are other animals.,3,3,1,1,1,1,2,2,True,True,True,True,True,True,True
10,PU6HR2,10:53:35,test,7520,0,6,1,1,1,1,0,caption,direct entailment,none,none,There are goats standing on a field of green g...,There are grass.,0,0,0,0,0,0,0,0,True,True,True,True,True,True,True


In [10]:
human_sample = set([tuple(r) for r in annot_df.loc[:, ['dlg_idx', 'prop_idx', 'turn_shared', 'type']].to_numpy()])
assert len(human_sample) == 100 - len(EXCLUDED) # 94

## Human results

In [11]:
USERS = list(set(annot_df['username']))

In [12]:
human_metrics = {task: {} for task in TASKS}

for i, user in enumerate(USERS):
    for task in TASKS:
        human_metrics[task][user] = 100 * np.mean(annot_df[annot_df['username'] == user][task + '_isright'])
    print(f'User {i+1}: {human_metrics["TF"][user]} on TF, {human_metrics["PS"][user]} on PS, {human_metrics["TFxPS"][user]} on TFxPS, {human_metrics["PxTSFS"][user]} on PxTSFS')

User 1: 93.61702127659575 on TF, 97.87234042553192 on PS, 91.48936170212765 on TFxPS, 96.80851063829788 on PxTSFS
User 2: 93.61702127659575 on TF, 98.93617021276596 on PS, 92.5531914893617 on TFxPS, 95.74468085106383 on PxTSFS
User 3: 95.74468085106383 on TF, 95.74468085106383 on PS, 91.48936170212765 on TFxPS, 95.74468085106383 on PxTSFS


In [13]:
table_df = pd.DataFrame(columns=TASKS)
values = []
for task in TASKS:
    mean = np.mean([human_metrics[task][u] for u in USERS])
    values.append(mean)
    print(f'Mean {task}: {mean}')

table_df.loc['human'] = values

Mean TFxPS: 91.84397163120566
Mean TF: 94.32624113475178
Mean PS: 97.51773049645391
Mean PxTSFS: 96.09929078014186


Table for the paper:

In [14]:
table_df

Unnamed: 0,TFxPS,TF,PS,PxTSFS
human,91.843972,94.326241,97.51773,96.099291


## Inter-Annotator Agreement

Each datapoint in the sample comes form a different dialogue. So it has 94 dialogue ids, each annotated by 3 annotators.

In [15]:
assert len(set(annot_df["dlg_idx"].to_numpy())) == 100 - len(EXCLUDED)

In [16]:
IDS = list(set(annot_df["dlg_idx"].to_numpy()))
IDS.sort()
assert len(IDS) == 100 - len(EXCLUDED)

In [17]:
kappas = {task: {} for task in TASKS}
annotations = {task: {} for task in TASKS}

for task in TASKS:
    for (user_1, user_2) in combinations(USERS, 2):
        
        labels_1 = annot_df.loc[annot_df["username"] == user_1, ["dlg_idx", task + '_pred']].copy().sort_values(by=["dlg_idx"])
        labels_2 = annot_df.loc[annot_df["username"] == user_2, ["dlg_idx", task + '_pred']].copy().sort_values(by=["dlg_idx"])
        
        assert (labels_1['dlg_idx'].to_numpy() == labels_2['dlg_idx'].to_numpy()).all()
        
        annotations[task][user_1] = {dialogue: label for _, (dialogue, label) in labels_1.iterrows()}
        annotations[task][user_2] = {dialogue: label for _, (dialogue, label) in labels_2.iterrows()}
        
        labels_1 = labels_1[task + '_pred'].to_numpy()
        labels_2 = labels_2[task + '_pred'].to_numpy()

        kappa = cohen_kappa_score(labels_1, labels_2)
        kappas[task][(user_1, user_2)] = kappa
        print(f'{task}, {user_1} and {user_2}: {kappa}')

# another way to get the values, returns same results:
#print('\n')
#for task in TASKS:
#    for (user_1, user_2) in combinations(USERS, 2):
#        
#        df_1 = annot_df.loc[annot_df["username"] == user_1, ["dlg_idx", task + '_pred']]
#        df_2 = annot_df.loc[annot_df["username"] == user_2, ["dlg_idx", task + '_pred']]
#        
#        labels_1 = np.array([df_1.loc[df_1['dlg_idx'] == idx][task + '_pred'].item() for idx in IDS])
#        labels_2 = np.array([df_2.loc[df_2['dlg_idx'] == idx][task + '_pred'].item() for idx in IDS])
#
#        kappa = cohen_kappa_score(labels_1, labels_2)
#        kappas[task][(user_1, user_2)] = kappa
#        print(f'{task}, {user_1} and {user_2}: {kappa}')

TFxPS, EZTJJK and L587OV: 0.8287015945330296
TFxPS, EZTJJK and PU6HR2: 0.9143897996357013
TFxPS, L587OV and PU6HR2: 0.8428332573339413
TF, EZTJJK and L587OV: 0.7880018042399639
TF, EZTJJK and PU6HR2: 0.9148936170212766
TF, L587OV and PU6HR2: 0.8297872340425532
PS, EZTJJK and L587OV: 0.9352914180816888
PS, EZTJJK and PU6HR2: 0.9143117593436645
PS, L587OV and PU6HR2: 0.9356458238247376
PxTSFS, EZTJJK and L587OV: 0.8852258852258852
PxTSFS, EZTJJK and PU6HR2: 0.9171075837742504
PxTSFS, L587OV and PU6HR2: 0.9011220196353437


In [18]:
n_sample = annot_df.shape[0] / 3

for task in TASKS:
    print(task)
    annotation_data = []
    for n, idx in enumerate(IDS):
        annotation_data.append((USERS[0], n, annotations[task][USERS[0]][idx]))
        annotation_data.append((USERS[1], n, annotations[task][USERS[1]][idx]))
        annotation_data.append((USERS[2], n, annotations[task][USERS[2]][idx]))
        
    annot_task = agreement.AnnotationTask(data=annotation_data)
    print("Cohen's Kappa:", annot_task.kappa())
    print("Fleiss's Kappa:", annot_task.multi_kappa())

TFxPS
Cohen's Kappa: 0.861974883834224
Fleiss's Kappa: 0.8619886593762657
TF
Cohen's Kappa: 0.8442275517679313
Fleiss's Kappa: 0.8441597588545589
PS
Cohen's Kappa: 0.9284163337500303
Fleiss's Kappa: 0.9283973187081049
PxTSFS
Cohen's Kappa: 0.9011518295451597
Fleiss's Kappa: 0.9010931197755305


## Compare to A and Q

Create set with all element ids in sample. First read the data:

In [19]:
test_outputs = {}
test_datapoints = {}
epoch = 0  # for the test set, the only inference step is saved as 0

# embs='pmbv2', model='Deeper' as default on ev.get_data

for (bot, task, version) in product(BOTS, TASKS, BOT_VERSIONS):
    if bot == 'q' and 'TF' in task:
        continue
    data, outputs = ev.get_data(bot, task, 'none', PATH_OUTPUTS, split=SPLIT, bot_version=version, epoch=0)
    
    # sanity check that loaded dataset contains same datapoints used in the experiments
    data_identifiers = ev.get_identifiers(bot, version, task, 'none', PATH_OUTPUTS, split=SPLIT)
    assert ev.compare_identifiers(data.datapoints, data_identifiers)
    
    test_outputs[(bot, task, version)] = outputs
    test_datapoints[(bot, task, version)] = data.datapoints

Make sure again that all datasets create the same datapoints for the same tasks:

In [20]:
for task in TASKS:
    task_datapoints = [v for k, v in test_datapoints.items() if k[1] == task]
    assert all(x == task_datapoints[0] for x in task_datapoints)

Get only the elements in the human sample. Because the prop_id may have changed on the final datasets, due to the SpaCy/Coref steps, we must confirm that the sentences are indeed the same (see filtering_probes.ipynb). For captions, we can (supposedly) rely on the proposition ID (we assert to be sure), because coref resolution is not used when generating caption propositions and the rules did not change. By using the same Spacy version, nothing should change. For the remaining rules, the prop_id is not reliable because by removing the ```how_many``` rule and with the new coref model, besides other small adjustments in the rules (```look_like```), their order may have changed. What persists is the triplet (dialogue_id, turn_shared, a_thinks_true).

With all assertions passing, we create the sample with the new corresponding (dialogue_id, proposition_id) that is unique plus the turn up to which the dialogue was shown.

In [21]:
sample = []
before_to_after = {}

for _, row in annot_df.iterrows():
    
    if row.turn_shared == 0:
        proposition_data = props_test[str(row.dlg_idx)][str(row.prop_idx)]
        original_sent = proposition_data['proposition']
        assert row.prop.lower() == original_sent
        assert row.content_gold == proposition_data['a_thinks_true']
        sample.append((row.dlg_idx, row.prop_idx, row.turn_idx))
        
        before_to_after[(row.dlg_idx, row.prop_idx)] = (row.dlg_idx, row.prop_idx)

    else:
        props_list = props_test[str(row.dlg_idx)]
        item = [(idx, prop) for idx, prop in props_list.items() 
                if prop['turn_shared'] == row.turn_shared and prop['a_thinks_true'] == row.content_gold]
        prop_id, proposition_data = item[0]
        original_sent = proposition_data['proposition']
        assert row.prop.lower() == original_sent
        sample.append((row.dlg_idx, int(prop_id), row.turn_idx))
        
        before_to_after[(row.dlg_idx, row.prop_idx)] = (row.dlg_idx, int(prop_id))

sample = set(sample)
assert len(sample) == 100 - len(EXCLUDED)

In [22]:
sample_outputs = {task: {} for task in TASKS}

for (bot, task, version) in product(BOTS, TASKS, BOT_VERSIONS):
    if bot == 'q' and 'TF' in task:
        sample_outputs[task][(bot, version)] = 'NA'
        continue
        
    sample_outputs[task][(bot, version)] = {}
    for idx, (prediction, gold) in test_outputs[(bot, task, version)].items():
        dialogue_id, prop_id, sent_id, turn, _ = test_datapoints[(bot, task, version)][idx]
        if (dialogue_id, prop_id, turn) in sample:
            sample_outputs[task][(bot, version)][(dialogue_id, prop_id, turn)] = (prediction, gold)

    assert len(sample_outputs[task][(bot, version)]) == 100 - len(EXCLUDED)

In [23]:
def acc(outputs):
    if outputs == 'NA':
        return '-'
    return 100*np.mean([x==y for (x, y) in outputs.values()])

table = table_df.copy()

for (bot, version) in product(BOTS, BOT_VERSIONS):
    values = np.array([acc(sample_outputs[task][(bot, version)]) for task in table.columns])
    new_row = pd.DataFrame([values], index=[bot + ', ' + version], columns=table.columns)
    table = pd.concat([table, new_row])

In [24]:
table

Unnamed: 0,TFxPS,TF,PS,PxTSFS
human,91.844,94.3262,97.5177,96.0993
"a, RL-DIV",52.1277,65.9574,74.4681,65.9574
"a, SL",50,72.3404,73.4043,68.0851
"a, ICCV-RL",52.1277,71.2766,77.6596,67.0213
"q, RL-DIV",-,-,75.53191489361703,62.76595744680851
"q, SL",-,-,79.7872340425532,70.2127659574468
"q, ICCV-RL",-,-,75.53191489361703,68.08510638297872


In [25]:
print(table.to_latex())

\begin{tabular}{lllll}
\toprule
{} &    TFxPS &       TF &                 PS &             PxTSFS \\
\midrule
human      &   91.844 &  94.3262 &            97.5177 &            96.0993 \\
a, RL-DIV  &  52.1277 &  65.9574 &            74.4681 &            65.9574 \\
a, SL      &       50 &  72.3404 &            73.4043 &            68.0851 \\
a, ICCV-RL &  52.1277 &  71.2766 &            77.6596 &            67.0213 \\
q, RL-DIV  &        - &        - &  75.53191489361703 &  62.76595744680851 \\
q, SL      &        - &        - &   79.7872340425532 &   70.2127659574468 \\
q, ICCV-RL &        - &        - &  75.53191489361703 &  68.08510638297872 \\
\bottomrule
\end{tabular}



## Making sure that datapoints are the same

In [26]:
sent_encoder = pickle.load(open('data/embeddings/embeddings_paraphrase-mpnet-base-v2.p', 'rb'))

We checked above that datapoints are the same across experiments, as they should be, so we'll just check one of them now:

In [27]:
data, _ = ev.get_data('a', 'TFxPS', 'none', PATH_OUTPUTS, split=SPLIT, bot_version='RL-DIV', epoch=0)

info_to_id = {v[:-1]:k for k, v in test_datapoints[('a', 'TFxPS', 'RL-DIV')].items()}
assert len(info_to_id) == len(test_datapoints[('a', 'TFxPS', 'RL-DIV')])

Compare with proposition dataset and the embeddings:

In [28]:
for _, row in annot_df.iterrows():
    
    # dialogue id and proposition id on the human results csv
    if (row.dlg_idx, row.prop_idx) in before_to_after:
        # dialogue id and proposition id to which they correspond on the final test set
        d_idx, p_idx = before_to_after[(row.dlg_idx, row.prop_idx)]
        
        before_sentence = row.prop
        after_sentence = props_test[str(d_idx)][str(p_idx)]['proposition']
        # confirm that the proposition used in the human eval is exactly the same as in the test set
        assert before_sentence.lower() == after_sentence
        
        # retrieve the sentence id on the loaded data
        sent_id = [s for (d, p, s, _), v in info_to_id.items() if d == d_idx and p == p_idx]
        # repetitions on different turns occur, but sentence id is unique
        assert len(set(sent_id)) == 1
        sent_id = sent_id[0]
        
        embedding = data.id2sent[sent_id]
        # ensure that it's the same as the one in the embeddings
        assert np.array_equal(embedding, sent_encoder[after_sentence])