In [6]:
import os 
import json 
import pandas as pd 
import functools
from itertools import combinations
import collections
from sklearn.metrics import cohen_kappa_score
import numpy as np 
import torch 
from statsmodels.stats import inter_rater as irr
from itertools import chain 

In [7]:
# Root directory of the annotations; each annotator name below is joined onto it
# to locate that annotator's files.
parent_folder = "../annotations/v1"
# Annotator names, used as sub-directory names under parent_folder.
annotators = ["arie", "paul", "shiyue", "david"]
# Two-annotator subset (disabled):
# annotators = ["arie", "paul"]

In [8]:
def process_annotation(path):
    """Load one annotation JSON file and derive span-level and QA-level labels.

    Args:
        path: Path to a JSON file with "spans", "qas" and "notes" entries and,
            optionally, a "positiveQAs" entry.

    Returns:
        Tuple ``(df_spans, df_qas, notes)``:
          * df_spans -- DataFrame of the spans that have a "label" key, with
            the label cast to int.
          * df_qas -- DataFrame built from data["qas"]; "label" is forced to 1
            for every QA judged wrong and then cast to int
            (0 = correct, 1 = not correct, 2 = inadequate).
          * notes -- the value stored under data["notes"].
    """
    with open(path, "r") as f:
        data = json.load(f)

    # Keep only the spans that carry a "label" key; unlabeled spans are not part
    # of the annotation.
    labeled_spans = [span for span in data["spans"] if "label" in span]
    df_spans = pd.DataFrame(labeled_spans)
    df_spans["label"] = df_spans["label"].astype(int)

    # A span labelled 0 is a wrong span: every QA it lists in "qaIds" is wrong.
    rejected_spans = df_spans.loc[df_spans["label"] == 0]
    wrong_qas = set(rejected_spans.explode("qaIds")["qaIds"].unique())

    # An entry of "positiveQAs" with a single cluster adds nothing; otherwise
    # every QA id of every one of its clusters is marked wrong.
    for clusters in data.get("positiveQAs", {}).values():
        if len(clusters) != 1:
            for cluster in clusters:
                wrong_qas.update(cluster)

    df_qas = pd.DataFrame(data["qas"])
    # QA labels: 0 is correct, 1 is not correct and 2 is inadequate.
    is_wrong = df_qas["questionId"].isin(wrong_qas)
    df_qas.loc[is_wrong, "label"] = 1
    df_qas["label"] = df_qas["label"].astype(int)

    return df_spans, df_qas, data["notes"]


def get_qasem_score(df_qas):
    """Return the fraction of correct QAs among those not labelled inadequate.

    Args:
        df_qas: DataFrame with an integer "label" column
            (0 = correct, 1 = not correct, 2 = inadequate).

    Returns:
        ``1 - mean(label)`` over the rows whose label is not 2.
    """
    adequate = df_qas["label"].ne(2)  # drop the inadequate QAs
    error_rate = df_qas.loc[adequate, "label"].mean()
    return 1 - error_rate


def get_disagreements(df_a1, df_a2):
    """Return the rows on which two annotators assigned different labels.

    Args:
        df_a1: First annotator's DataFrame; must have a "label" column.
        df_a2: Second annotator's DataFrame; its "label" column is aligned to
            df_a1 by index.

    Returns:
        A copy of df_a1 with "label" renamed to "label_a1" and a "label_a2"
        column added, restricted to the rows where the two labels differ.
        Neither input is modified.
    """
    side_by_side = df_a1.rename(columns={"label": "label_a1"}).assign(
        label_a2=df_a2["label"]
    )
    differs = side_by_side["label_a1"].ne(side_by_side["label_a2"])
    return side_by_side[differs]

In [9]:
# Find the topics (annotation file names) that exist for every annotator.
topic_per_annotator = {}
for annotator_path in annotators:
    annotator_dir = os.path.join(parent_folder, annotator_path)
    # Match the ".json" extension itself, not any name that merely ends in "json".
    topic_per_annotator[annotator_path] = {
        file_name for file_name in os.listdir(annotator_dir) if file_name.endswith('.json')
    }

# Keep only the file names present in every annotator's directory.
intersection = functools.reduce(lambda x, y: x.intersection(y), topic_per_annotator.values())

print(intersection)

{'59_bart_xsum.json', '76_bart_xsum.json', '76_pegasus_xsum.json', '17_bart_xsum.json', '59_pegasus_xsum.json', '17_pegasus_xsum.json'}


In [10]:
# Parse every shared topic for every annotator.
# processed_files[annotator][topic] holds the (df_spans, df_qas, notes) tuple
# returned by process_annotation.
processed_files = collections.defaultdict(dict)
for topic in intersection:
    for annotator in annotators:
        print(f'{annotator}-{topic}')
        annotation_path = os.path.join(parent_folder, annotator, topic)
        processed_files[annotator][topic] = process_annotation(annotation_path)

arie-59_bart_xsum.json
paul-59_bart_xsum.json
shiyue-59_bart_xsum.json
david-59_bart_xsum.json
arie-76_bart_xsum.json
paul-76_bart_xsum.json
shiyue-76_bart_xsum.json
david-76_bart_xsum.json
arie-76_pegasus_xsum.json
paul-76_pegasus_xsum.json
shiyue-76_pegasus_xsum.json
david-76_pegasus_xsum.json
arie-17_bart_xsum.json
paul-17_bart_xsum.json
shiyue-17_bart_xsum.json
david-17_bart_xsum.json
arie-59_pegasus_xsum.json
paul-59_pegasus_xsum.json
shiyue-59_pegasus_xsum.json
david-59_pegasus_xsum.json
arie-17_pegasus_xsum.json
paul-17_pegasus_xsum.json
shiyue-17_pegasus_xsum.json
david-17_pegasus_xsum.json


In [11]:
# Fleiss' kappa over the QA labels of all annotators.

# One flat list per annotator: the QA labels of all of that annotator's topics,
# concatenated in the order the topics were stored.
vectors = {
    annotator_name: [
        label
        for _, df_qas, _ in topics.values()
        for label in df_qas["label"].tolist()
    ]
    for annotator_name, topics in processed_files.items()
}

# Transpose so that rows are QAs and columns are annotators before aggregating.
giro = np.array(list(vectors.values())).T

# aggregate_raters returns a tuple; its first element is the table fed to fleiss_kappa.
category_counts = irr.aggregate_raters(giro)[0]
fleiss = irr.fleiss_kappa(category_counts, method='fleiss')

print(fleiss)

0.6458897922312561


In [266]:
# Share of individual labels that equal the most frequent label of their row
# (argmax breaks ties in favour of the smallest label value).
most_common_values = np.apply_along_axis(lambda row: np.argmax(np.bincount(row)), 1, giro)
np.mean(giro == most_common_values[:, None])

0.9226190476190477

In [267]:
# Pairwise raw agreement (fraction of identical labels) for every annotator
# pair, per topic. Each table cell is a (span agreement, QA agreement) tuple.
# Cohen's kappa is not computed in this cell.

pairwise_agreement = collections.defaultdict(dict)
for topic in intersection:
    print(topic)
    for first, second in combinations(annotators, r=2):
        df_spans_a1, df_qas_a1, _ = processed_files[first][topic]
        df_spans_a2, df_qas_a2, _ = processed_files[second][topic]
        span_agreement = (df_spans_a1["label"] == df_spans_a2["label"]).mean()  # accuracy on spans
        qa_agreement = (df_qas_a1["label"] == df_qas_a2["label"]).mean()  # accuracy on QAs
        pairwise_agreement[topic][f'{first}-{second}'] = (span_agreement, qa_agreement)

# Topics as rows, annotator pairs as columns.
pd.DataFrame.from_dict(pairwise_agreement).T

76_pegasus_xsum.json
59_pegasus_xsum.json
76_bart_xsum.json
17_pegasus_xsum.json
59_bart_xsum.json
17_bart_xsum.json


Unnamed: 0,arie-paul,arie-A1BKFNR313IWRC,arie-A222G1E6051ZV8,paul-A1BKFNR313IWRC,paul-A222G1E6051ZV8,A1BKFNR313IWRC-A222G1E6051ZV8
76_pegasus_xsum.json,"(0.6666666666666666, 1.0)","(0.6666666666666666, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(0.6666666666666666, 1.0)","(0.6666666666666666, 1.0)"
59_pegasus_xsum.json,"(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)"
76_bart_xsum.json,"(0.8333333333333334, 0.9166666666666666)","(0.8333333333333334, 0.75)","(0.8333333333333334, 0.9166666666666666)","(0.6666666666666666, 0.6666666666666666)","(1.0, 1.0)","(0.6666666666666666, 0.6666666666666666)"
17_pegasus_xsum.json,"(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)"
59_bart_xsum.json,"(0.75, 1.0)","(0.75, 0.8333333333333334)","(0.5, 0.8333333333333334)","(1.0, 0.8333333333333334)","(0.75, 0.8333333333333334)","(0.75, 0.6666666666666666)"
17_bart_xsum.json,"(1.0, 0.9166666666666666)","(0.8333333333333334, 0.6666666666666666)","(1.0, 0.75)","(0.8333333333333334, 0.75)","(1.0, 0.8333333333333334)","(0.8333333333333334, 0.75)"


# Playground -- Analyze disagreements

In [224]:
# Annotator pair and topic whose disagreements are inspected below.
# NOTE(review): these two IDs do not appear in the `annotators` list shown in this
# notebook, so after a fresh Restart & Run All processed_files would hold no data
# for them -- confirm which annotator names they correspond to.
a1 = "A1BKFNR313IWRC"
a2 = "A222G1E6051ZV8"
topic = "17_bart_xsum.json"

In [225]:
# Unpack the (df_spans, df_qas, notes) tuples stored for both annotators on this topic.
df_spans_a1, df_qas_a1, notes_a1 = processed_files[a1][topic]
df_spans_a2, df_qas_a2, notes_a2 = processed_files[a2][topic]

In [226]:
# Spans whose label differs between the two annotators.
get_disagreements(df_spans_a1, df_spans_a2)

Unnamed: 0,start,end,qaIds,predicate,include_predicate,id,label_a1,label_a2
2,7,8,[5],False,False,6,1,0


In [227]:
# QAs whose label differs between the two annotators.
get_disagreements(df_qas_a1, df_qas_a2)

Unnamed: 0,sentId,predicateId,predicate,predicateType,question,answer,answerStartToken,answerEndToken,answerId,questionId,label_a1,label_a2
4,34,6-7,changed,verbal,what has changed?,The way we buy things,[0],[5],0-5,4,0,1
5,34,6-7,changed,verbal,how has something changed?,dramatically,[7],[8],7-8,5,0,1
8,34,17-18,according,verbal,what is according to something?,The way we buy things has changed dramatically...,[0],[16],0-16,8,0,1


In [229]:
# Notes stored in a1's annotation file for this topic.
print(notes_a1)

Thi first part misses "research". Here I am a little confused on the implicit action again. Research is not explicitly mentioned but I guess it can be generally thought that the company did the reasearch?


In [228]:
# Notes stored in a2's annotation file for this topic.
print(notes_a2)

not sure about "things". 

the green + button never works for me (am I missing something?)
