In [1]:
import os 
import json 
import pandas as pd 
import functools
from itertools import combinations
import collections
from sklearn.metrics import cohen_kappa_score
import numpy as np 
import torch 
from statsmodels.stats import inter_rater as irr
from itertools import chain 
from dataclasses import dataclass, asdict
from sklearn.metrics import f1_score

In [2]:
# NOTE: the leading `@` is required. A bare `dataclass(frozen=True)` line is a
# no-op expression and would leave `Summary` as a plain class with no generated
# `__init__`, so it could not be constructed with field values.
@dataclass(frozen=True)
class Summary:
    """Immutable record of one worker's annotation of one system summary."""
    source: int  # id of source
    worker_id: str
    summary_id: str  # name of summary system
    spans: pd.DataFrame
    qas: pd.DataFrame
    notes: str


In [3]:
parent_folder = "../annotations/v7"
# Each sub-directory of the annotation root is treated as one annotator;
# plain files sitting next to them are ignored.
annotators = []
for entry in os.listdir(parent_folder):
    if os.path.isdir(os.path.join(parent_folder, entry)):
        annotators.append(entry)
print(annotators)


['132', '58', '145', '127', '120', '10', '100', '97', '79', '114']


In [4]:
len(annotators)

10

In [257]:
# Load a single raw annotation file so its JSON structure can be inspected below.
sample_path = os.path.join(parent_folder, "arie", "20.json")
with open(sample_path, "r") as f:
    data = json.load(f)

In [258]:
data.keys()

dict_keys(['source', 'sourceId', 'dataset', 'summaries', 'duration', 'done'])

In [259]:
data["summaries"][0].keys()

dict_keys(['tokens', 'spans', 'qas', 'qaClusters', 'notes', 'summaryId'])

In [260]:
def get_qasem_score(df_qas):
    """Return 1 minus the mean QA label, ignoring QAs labelled 2.

    QA labels are 0 (correct), 1 (not correct) and 2 (inadequate), so the
    result is the fraction of adequate QAs that are correct. It is NaN when
    no adequate QA remains.
    """
    labels = df_qas["label"]
    adequate = labels[labels != 2]  # exclude inadequate
    return 1 - adequate.mean()

    

def process_annotation(path):
    """Load one annotation JSON file and return one record per annotated summary.

    Args:
        path: path to the JSON file; the name of the directory that contains
            the file is used as the worker id.

    Returns:
        A list of dicts, one per entry of ``data["summaries"]``, with the keys
        ``source_id``, ``summary_id``, ``tokens``, ``spans``, ``qas``,
        ``worker_id``, ``qasem_score``, ``notes`` and ``focus``.
    """
    with open(path, "r") as f:
        data = json.load(f)

    # Take the parent directory name via os.path rather than splitting on "/":
    # a manual split picks the wrong component when the path is built with a
    # different separator (e.g. by os.path.join on Windows).
    worker_id = os.path.basename(os.path.dirname(path))
    source_id = data["sourceIds"] if "sourceIds" in data else data["sourceId"]
    # "focus" is optional in the file; 0 is used when it is absent.
    # NOTE(review): the / 1000 presumably converts milliseconds to seconds -- confirm.
    focus = data["focus"] / 1000 if "focus" in data else 0

    summaries = []
    for summary in data["summaries"]:
        df_spans, df_qas = extract_spans_qas(summary)
        sum_obj = {
            "source_id": source_id,
            "summary_id": summary["summaryId"],
            "tokens": summary["tokens"],
            "spans": df_spans,
            "qas": df_qas,
            "worker_id": worker_id,
            "qasem_score": get_qasem_score(df_qas),
            "notes": summary["notes"],
            "focus": focus,
        }
        summaries.append(sum_obj)

    return summaries



def extract_spans_qas(summary):
    '''
    Build the span and QA annotation frames for one summary.

    Only spans that carry a "label" are kept. Every QA referenced (through
    "qaIds") by a span labelled 0 is forced to label 1 before the QA labels
    are cast to int. Both frames also get a binary "label_neg" column:
    1 for spans labelled 0, and 1 for QAs labelled 1.

    Returns the tuple (df_spans, df_qas).
    '''
    # spans without a label (predicate / include_predicate) are not annotations
    labelled_spans = [span for span in summary["spans"] if "label" in span]
    df_spans = pd.DataFrame(labelled_spans)
    df_spans["label"] = df_spans["label"].astype(int)
    # inverse label, so that a span marked wrong (0) becomes the positive class
    df_spans["label_neg"] = df_spans["label"].map(lambda label: int(label == 0))

    # ids of all QAs attached to at least one wrong span
    wrong_span_qa_ids = df_spans.loc[df_spans["label"] == 0, "qaIds"]
    wrong_qas = set(wrong_span_qa_ids.explode().unique())

    # in qas 0 is correct, 1 is not correct and 2 is inadequate
    df_qas = pd.DataFrame(summary["qas"])
    touches_wrong_span = df_qas["questionId"].isin(wrong_qas)
    df_qas.loc[touches_wrong_span, "label"] = 1
    df_qas["label"] = df_qas["label"].astype(int)
    df_qas["label_neg"] = df_qas["label"].map(lambda label: int(label == 1))

    return df_spans, df_qas

def get_qa_diff(df_gold, df_system, worker_id, id):
    """Put the gold QA labels of one summary next to one worker's labels.

    Args:
        df_gold: frame indexed by summary id whose "qas" cell holds the gold
            QA frame.
        df_system: frame with "worker_id", "id", "qas" and "notes" columns.
        worker_id: worker whose annotation is compared.
        id: summary id to look up in both frames.

    Returns:
        A copy of the gold QA columns plus "mturk_question" and "prediction"
        (the worker's question and label, aligned on the row index) and
        "notes" (the worker's notes repeated on every row).
    """
    gold_qas = df_gold.loc[id]["qas"]

    is_match = (df_system["worker_id"] == worker_id) & (df_system["id"] == id)
    worker_row = df_system[is_match].iloc[0]  # first matching annotation
    worker_qas = worker_row["qas"]

    gold_columns = ["predicateId", "predicate", "question", "answer", "label"]
    return gold_qas[gold_columns].assign(
        mturk_question=worker_qas["question"],
        prediction=worker_qas["label"],
        notes=worker_row["notes"],
    )


def get_span_diff(df_gold, df_system, worker_id, id):
    """Put the gold span labels of one summary next to one worker's labels.

    Args:
        df_gold: frame indexed by summary id whose "spans" cell holds the gold
            span frame and whose "tokens" cell holds a list of token dicts
            with a "text" key.
        df_system: frame with "worker_id", "id" and "spans" columns.
        worker_id: worker whose annotation is compared.
        id: summary id to look up in both frames.

    Returns:
        A new frame with "start", "end", "span" (the tokens from start up to
        but excluding end, joined by spaces), "label" (gold) and "prediction"
        (the worker's label, aligned on the row index).
    """
    gold_entry = df_gold.loc[id]
    # Work on a copy: adding the "span" column directly would modify the span
    # frame stored inside df_gold and leak into every later use of it.
    gold_spans = gold_entry["spans"].copy()

    is_match = (df_system["worker_id"] == worker_id) & (df_system["id"] == id)
    sys_row = df_system[is_match].iloc[0]  # first matching annotation
    sys_spans = sys_row["spans"]

    tokens = [x["text"] for x in gold_entry["tokens"]]
    gold_spans["span"] = [
        " ".join(tokens[start:end])
        for start, end in zip(gold_spans["start"], gold_spans["end"])
    ]

    row = gold_spans[["start", "end", "span", "label"]].copy()
    row["prediction"] = sys_spans["label"]
    return row
    
def flatten_and_percentage(series):
    """Return the fraction of truthy values across an iterable of iterables.

    All inner iterables are pooled into one flat list before the ratio is
    taken, so longer inner lists weigh more than shorter ones.

    Returns NaN when there is nothing to count (no inner iterables, or only
    empty ones) instead of raising ZeroDivisionError.
    """
    flat_list = list(chain.from_iterable(series))
    if not flat_list:
        return float("nan")
    return sum(flat_list) / len(flat_list)

In [261]:
# Collect the per-summary records of every JSON file of every annotator.
annotations = []
for annotator in annotators:
    annotator_dir = os.path.join(parent_folder, annotator)
    hits = [name for name in os.listdir(annotator_dir) if name.endswith("json")]
    for hit in hits:
        path = os.path.join(annotator_dir, hit)
        annotations.extend(process_annotation(path))
        

In [262]:
df = pd.DataFrame(annotations)
df.head()


Unnamed: 0,source_id,summary_id,tokens,spans,qas,worker_id,qasem_score,notes,focus
0,20,bart,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'spa...",start end qaIds predicate include_p...,sentId predicateId predicate predicatePo...,A1FS8SBR4SDWYG,0.928571,,128.186
1,20,pegasus,"[{'id': 0, 'text': 'A', 'lemma': 'a', 'spans':...",start end qaIds predicate include_pred...,sentId predicateId predicate predicatePos ...,A1FS8SBR4SDWYG,0.416667,,128.186
2,17,bart,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'spa...",start end qaIds predicate include_pred...,sentId predicateId predicate predicatePos...,A1FS8SBR4SDWYG,0.583333,,51.413
3,17,pegasus,"[{'id': 0, 'text': 'Why', 'lemma': 'why', 'spa...",start end qaIds predicate include_pred...,sentId predicateId predicate predicatePos ...,A1FS8SBR4SDWYG,1.0,,51.413
4,118,bart,"[{'id': 0, 'text': 'Sweden', 'lemma': 'Sweden'...",start end qaIds predicate inclu...,sentId predicateId predicate predicatePos...,A1FS8SBR4SDWYG,0.333333,,246.891


In [263]:
# Key every annotation by "<source_id>_<summary_id>", then split the rows into
# the reference annotator ("arie") and all remaining workers.
df["id"] = [
    f"{source_id}_{summary_id}"
    for source_id, summary_id in zip(df["source_id"], df["summary_id"])
]
is_gold = df["worker_id"] == "arie"
df_gold = df[is_gold].set_index("id")
df_system = df[~is_gold].copy()

In [264]:
'''
create id for each row source_id_summary_id_worker_id 
for each row, evaluate against the reference 
assign two scores: spans and qas % of agreement 
'''

'\ncreate id for each row source_id_summary_id_worker_id \nfor each row, evaluate against the reference \nassign two scores: spans and qas % of agreement \n'

In [265]:
# For every worker annotation, measure label agreement with the gold annotation
# of the same summary: one accuracy for spans, one for QAs, plus the raw
# per-QA match flags.
eval_scores = []
raw_qa_scores = []
for i, row in df_system.iterrows():
    gold_row = df_gold.loc[row["id"]]
    span_match = gold_row["spans"]["label"] == row["spans"]["label"]
    qa_match = gold_row["qas"]["label"] == row["qas"]["label"]
    eval_scores.append((span_match.mean(), qa_match.mean()))
    raw_qa_scores.append(qa_match.tolist())


In [266]:
df_system[["source_id", "worker_id", "focus"]].drop_duplicates()

Unnamed: 0,source_id,worker_id,focus
0,20,A1FS8SBR4SDWYG,128.186
2,17,A1FS8SBR4SDWYG,51.413
4,118,A1FS8SBR4SDWYG,246.891
6,143,A1FS8SBR4SDWYG,106.144
8,8,A1FS8SBR4SDWYG,99.81
10,22,A1FS8SBR4SDWYG,102.733
12,20,A39SK1E6IMQBD5,1625.969
14,17,A39SK1E6IMQBD5,805.315
16,118,A39SK1E6IMQBD5,1410.98
18,143,A39SK1E6IMQBD5,464.107


In [267]:
# eval_scores holds (span_score, qa_score) pairs in df_system row order.
df_system["span_score"] = [span_score for span_score, _ in eval_scores]
df_system["qa_score"] = [qa_score for _, qa_score in eval_scores]
df_system["raw_scores"] = raw_qa_scores

In [268]:
# Per summary id: mean span / QA agreement with gold over all worker rows, plus
# the number of rows for that id, sorted from lowest to highest QA agreement.
df_system[["id", "span_score", "qa_score"]].groupby(["id"]).agg({'span_score': 'mean', 'qa_score': 'mean', 'id': 'count'}).sort_values(by="qa_score")

Unnamed: 0_level_0,span_score,qa_score,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
118_bart,0.833333,0.291667,3
118_pegasus,1.0,0.533333,3
22_pegasus,0.833333,0.611111,3
20_pegasus,0.933333,0.638889,3
143_bart,0.666667,0.703704,3
20_bart,0.952381,0.755556,3
8_pegasus,0.888889,0.761905,3
8_bart,0.866667,0.766667,3
143_pegasus,0.833333,0.818182,3
22_bart,1.0,0.820513,3


In [269]:
# Per worker: mean span / QA agreement over the worker's rows, the number of
# rows, and `raw_scores` = share of matching QAs after pooling the QAs of all
# of the worker's summaries (whereas `qa_score` averages per-summary means).
# Sorted from lowest to highest QA agreement.
df_system[["worker_id", "span_score", "qa_score", "raw_scores"]].groupby(["worker_id"]).agg({'span_score': 'mean', 'qa_score': 'mean', 'worker_id': 'count', 'raw_scores': flatten_and_percentage}).sort_values(by="qa_score")

Unnamed: 0_level_0,span_score,qa_score,worker_id,raw_scores
worker_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A1FS8SBR4SDWYG,0.813889,0.585991,12,0.551724
A6KOTWP7N7RLU,0.881151,0.73223,12,0.731034
A39SK1E6IMQBD5,0.965278,0.815494,12,0.813793


In [277]:
# Inspect all annotations of a single worker.
worker_id = "A39SK1E6IMQBD5"
df_system.loc[df_system["worker_id"] == worker_id]

Unnamed: 0,source_id,summary_id,tokens,spans,qas,worker_id,qasem_score,notes,focus,id,span_score,qa_score,raw_scores
12,20,bart,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'spa...",start end qaIds predicate include_p...,sentId predicateId predicate predicatePo...,A39SK1E6IMQBD5,0.933333,,1625.969,20_bart,1.0,0.8,"[False, True, True, True, True, True, True, Tr..."
13,20,pegasus,"[{'id': 0, 'text': 'A', 'lemma': 'a', 'spans':...",start end qaIds predicate include_pred...,sentId predicateId predicate predicatePos ...,A39SK1E6IMQBD5,1.0,,1625.969,20_pegasus,1.0,1.0,"[True, True, True, True, True, True, True, Tru..."
14,17,bart,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'spa...",start end qaIds predicate include_pred...,sentId predicateId predicate predicatePos...,A39SK1E6IMQBD5,0.416667,,805.315,17_bart,0.833333,0.833333,"[True, True, True, True, True, True, True, Tru..."
15,17,pegasus,"[{'id': 0, 'text': 'Why', 'lemma': 'why', 'spa...",start end qaIds predicate include_pred...,sentId predicateId predicate predicatePos ...,A39SK1E6IMQBD5,1.0,,805.315,17_pegasus,1.0,1.0,"[True, True, True]"
16,118,bart,"[{'id': 0, 'text': 'Sweden', 'lemma': 'Sweden'...",start end qaIds predicate inclu...,sentId predicateId predicate predicatePos...,A39SK1E6IMQBD5,0.666667,,1410.98,118_bart,1.0,0.3125,"[True, True, False, False, True, True, True, F..."
17,118,pegasus,"[{'id': 0, 'text': 'Former', 'lemma': 'former'...",start end qaIds predicate include_pred...,sentId predicateId predicate predicatePos ...,A39SK1E6IMQBD5,0.4,,1410.98,118_pegasus,1.0,0.6,"[True, True, True, False, False]"
18,143,bart,"[{'id': 0, 'text': 'A', 'lemma': 'a', 'spans':...",start end qaIds predicate i...,sentId predicateId predicate predicatePo...,A39SK1E6IMQBD5,0.611111,,464.107,143_bart,1.0,0.833333,"[True, True, True, False, True, True, False, T..."
19,143,pegasus,"[{'id': 0, 'text': 'A', 'lemma': 'a', 'spans':...",start end qaIds predicate include_pre...,sentId predicateId predicate predicatePos ...,A39SK1E6IMQBD5,0.636364,,464.107,143_pegasus,0.75,0.727273,"[True, False, True, True, True, False, True, T..."
20,8,bart,"[{'id': 0, 'text': 'An', 'lemma': 'an', 'spans...",start end qaIds predicate include...,sentId predicateId predicate predicateP...,A39SK1E6IMQBD5,0.45,,2162.178,8_bart,1.0,1.0,"[True, True, True, True, True, True, True, Tru..."
21,8,pegasus,"[{'id': 0, 'text': 'A', 'lemma': 'a', 'spans':...",start end qaIds predicate include...,sentId predicateId predicate predicateP...,A39SK1E6IMQBD5,0.571429,,2162.178,8_pegasus,1.0,1.0,"[True, True, True, True, True, True, True, Tru..."


In [316]:
topic_id = "143_bart"

In [317]:
# df_gold.loc[topic_id]["qas"]
# df_system[df_system["worker_id"] == "A5WWHKD82I8UE"].set_index("id")

In [318]:
# Show only the QAs where the selected worker's label differs from the gold label.
# To compare against another worker instead of gold, pass that worker's rows
# (indexed by "id") as the first argument of get_qa_diff.
predictions = get_qa_diff(df_gold, df_system, worker_id, topic_id)
qa_mismatch = predictions["label"] != predictions["prediction"]
predictions[qa_mismatch]

Unnamed: 0,predicateId,predicate,question,answer,label,mturk_question,prediction,notes
3,4-5,stealing,what did someone steal?,"more than £ 100,000",1,what did someone steal?,0,
6,17-18,launched,what has someone launched?,an appeal against her conviction and sentence,2,what has someone launched?,0,
12,19-20,appeal,what did someone appeal?,her conviction,2,what did someone appeal?,0,


In [293]:
print(predictions.iloc[0]["notes"])




In [287]:
df_gold.loc[topic_id]["spans"]

Unnamed: 0,start,end,qaIds,predicate,include_predicate,id,sourceIds,label,label_neg
0,2,3,[0],False,False,2,"[0, 57, 60]",1,0
1,4,5,[1],False,False,4,[],1,0
2,7,8,[5],False,False,6,[],0,1
3,10,11,"[2, 6]",False,False,7,"[112, 89, 171]",1,0
4,13,16,"[3, 7]",False,False,8,[],0,1
5,22,25,[11],False,False,12,"[40, 41, 42, 168, 172, 109, 173, 174]",1,0


In [288]:
# Show only the spans where the selected worker's label differs from the gold label.
spans = get_span_diff(df_gold, df_system, worker_id, topic_id)
span_mismatch = spans["label"] != spans["prediction"]
spans[span_mismatch]

Unnamed: 0,start,end,span,label,prediction
5,22,25,Museum of Brands,1,0


In [128]:
# tp: gold = no and system = no
# fp: gold = yes and system = no
# fn: gold = no and system = yes
# tn: gold = yes and system = yes


In [129]:
def process_annotation(path):
    """Load one single-summary annotation file and derive its span / QA labels.

    NOTE: this definition shares its name with an earlier `process_annotation`
    in this notebook that reads a different file layout (a "summaries" list)
    and returns a list of dicts; whichever cell ran last is the one in effect.

    Args:
        path: path to a JSON file with top-level "spans", "qas" and "notes",
            and optionally "positiveQAs" and "focus".

    Returns:
        A 4-tuple ``(df_spans, df_qas, notes, focus)``. ``focus`` is
        ``data["focus"] / 1000``, or 0 when the file has no "focus" entry
        (same fallback as the other loader in this notebook).
    """
    with open(path, "r") as f:
        data = json.load(f)

    # take only spans for annotation (exclude predicate and include predicate)
    df_spans = pd.DataFrame([x for x in data["spans"] if "label" in x])
    df_spans["label"] = df_spans["label"].astype(int)
    df_wrong_spans = df_spans[df_spans["label"] == 0]  # spans labelled 0 (wrong)
    wrong_qas = set(df_wrong_spans.explode("qaIds")["qaIds"].unique())

    # add other wrong QAs according to clusters: whenever an entry of
    # "positiveQAs" holds anything other than exactly one cluster, every QA id
    # in all of its clusters is marked wrong
    if "positiveQAs" in data:
        for clusters in data["positiveQAs"].values():
            if len(clusters) == 1:
                continue
            for cluster in clusters:
                wrong_qas.update(cluster)

    df_qas = pd.DataFrame(data["qas"])
    # in qas 0 is correct, 1 is not correct and 2 is inadequate
    df_qas.loc[df_qas["questionId"].isin(wrong_qas), 'label'] = 1
    df_qas["label"] = df_qas["label"].astype(int)

    # "focus" may be absent from a file; fall back to 0 instead of raising KeyError
    focus = data["focus"] / 1000 if "focus" in data else 0

    return df_spans, df_qas, data["notes"], focus


def get_qasem_score(df_qas):
    """Score a QA frame as 1 minus the mean label over QAs not labelled 2.

    With labels 0 = correct, 1 = not correct and 2 = inadequate, this is the
    share of adequate QAs that are correct (NaN if none are adequate).
    """
    is_adequate = df_qas["label"] != 2  # exclude inadequate
    adequate_labels = df_qas.loc[is_adequate, "label"]
    return 1 - adequate_labels.mean()


def get_disagreements(df_a1, df_a2):
    """Return the rows on which two annotators assigned different labels.

    The result is a copy of `df_a1` with its "label" column renamed to
    "label_a1" and the labels of `df_a2` (aligned on the row index) added as
    "label_a2", restricted to rows where the two differ. Neither input is
    modified.
    """
    side_by_side = df_a1.rename(columns={"label": "label_a1"})
    side_by_side["label_a2"] = df_a2["label"]
    differs = side_by_side["label_a1"] != side_by_side["label_a2"]
    return side_by_side[differs]

In [9]:
# find intersection of annotated topics
topic_per_annotator = {}
for annotator_path in annotators:
    annotator_dir = os.path.join(parent_folder, annotator_path)
    topic_per_annotator[annotator_path] = {
        name for name in os.listdir(annotator_dir) if name.endswith('json')
    }

# keep only the files that every annotator has in their folder
intersection = functools.reduce(set.intersection, topic_per_annotator.values())

print(intersection)

{'59_bart_xsum.json', '76_bart_xsum.json', '76_pegasus_xsum.json', '17_bart_xsum.json', '59_pegasus_xsum.json', '17_pegasus_xsum.json'}


In [432]:
# processed_files[annotator][topic] -> result of process_annotation for that file
processed_files = collections.defaultdict(dict)
for topic in intersection:
    for annotator in annotators:
        print(f'{annotator}-{topic}')
        topic_path = os.path.join(parent_folder, annotator, topic)
        processed_files[annotator][topic] = process_annotation(topic_path)

NameError: name 'intersection' is not defined

In [11]:
# compute fleiss kappa on all annotators

# concatenate QAs predictions for all topics: one flat label list per annotator
vectors = {}
for annotator_id, topics in processed_files.items():
    qa_labels = []
    for processed in topics.values():
        # element 1 of a processed file is its QA frame
        qa_labels.extend(processed[1]["label"].tolist())
    vectors[annotator_id] = qa_labels


# data points should be transposed for computing fleiss (rows = QAs, columns = annotators)
giro = np.array(list(vectors.values())).transpose()

ratings_table = irr.aggregate_raters(giro)[0]
fleiss = irr.fleiss_kappa(ratings_table, method='fleiss')

print(fleiss)

0.6458897922312561


In [266]:
# Majority label of each QA (row of `giro`), then the share of individual
# annotator labels that agree with their row's majority label.
most_common_values = np.apply_along_axis(
    lambda row_labels: np.bincount(row_labels).argmax(), axis=1, arr=giro
)
matches_majority = giro == most_common_values[:, np.newaxis]
np.average(matches_majority)

0.9226190476190477

In [267]:
# pairwise IAA for every pair of annotators, reported as raw agreement
# (accuracy) on spans and on QAs; the cohen-kappa variant is left disabled

pairwise_agreement = collections.defaultdict(dict)
for topic in intersection:
    print(topic)
    for first, second in combinations(annotators, r=2):
        # Only the first two fields (span frame, QA frame) are needed. Slicing
        # avoids a "too many values to unpack" error, since process_annotation
        # returns more than three values (spans, qas, notes, focus).
        df_spans_a1, df_qas_a1 = processed_files[first][topic][:2]
        df_spans_a2, df_qas_a2 = processed_files[second][topic][:2]
        pairwise_agreement[topic][f'{first}-{second}'] = (
            # cohen_kappa_score(df_qas_a1["label"], df_qas_a2["label"]),
            (df_spans_a1["label"] == df_spans_a2["label"]).mean(), # accuracy on spans
            (df_qas_a1["label"] == df_qas_a2["label"]).mean(), # accuracy on QAs
        )

pd.DataFrame.from_dict(pairwise_agreement).T

76_pegasus_xsum.json
59_pegasus_xsum.json
76_bart_xsum.json
17_pegasus_xsum.json
59_bart_xsum.json
17_bart_xsum.json


Unnamed: 0,arie-paul,arie-A1BKFNR313IWRC,arie-A222G1E6051ZV8,paul-A1BKFNR313IWRC,paul-A222G1E6051ZV8,A1BKFNR313IWRC-A222G1E6051ZV8
76_pegasus_xsum.json,"(0.6666666666666666, 1.0)","(0.6666666666666666, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(0.6666666666666666, 1.0)","(0.6666666666666666, 1.0)"
59_pegasus_xsum.json,"(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)"
76_bart_xsum.json,"(0.8333333333333334, 0.9166666666666666)","(0.8333333333333334, 0.75)","(0.8333333333333334, 0.9166666666666666)","(0.6666666666666666, 0.6666666666666666)","(1.0, 1.0)","(0.6666666666666666, 0.6666666666666666)"
17_pegasus_xsum.json,"(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)","(1.0, 1.0)"
59_bart_xsum.json,"(0.75, 1.0)","(0.75, 0.8333333333333334)","(0.5, 0.8333333333333334)","(1.0, 0.8333333333333334)","(0.75, 0.8333333333333334)","(0.75, 0.6666666666666666)"
17_bart_xsum.json,"(1.0, 0.9166666666666666)","(0.8333333333333334, 0.6666666666666666)","(1.0, 0.75)","(0.8333333333333334, 0.75)","(1.0, 0.8333333333333334)","(0.8333333333333334, 0.75)"


# Playground -- Analyze disagreements

In [224]:
a1 = "A1BKFNR313IWRC"
a2 = "A222G1E6051ZV8"
topic = "17_bart_xsum.json"

In [225]:
# process_annotation returns (spans, qas, notes, focus); take the first three
# fields explicitly so the trailing focus value does not break the unpacking.
df_spans_a1, df_qas_a1, notes_a1 = processed_files[a1][topic][:3]
df_spans_a2, df_qas_a2, notes_a2 = processed_files[a2][topic][:3]

In [226]:
get_disagreements(df_spans_a1, df_spans_a2)

Unnamed: 0,start,end,qaIds,predicate,include_predicate,id,label_a1,label_a2
2,7,8,[5],False,False,6,1,0


In [227]:
get_disagreements(df_qas_a1, df_qas_a2)

Unnamed: 0,sentId,predicateId,predicate,predicateType,question,answer,answerStartToken,answerEndToken,answerId,questionId,label_a1,label_a2
4,34,6-7,changed,verbal,what has changed?,The way we buy things,[0],[5],0-5,4,0,1
5,34,6-7,changed,verbal,how has something changed?,dramatically,[7],[8],7-8,5,0,1
8,34,17-18,according,verbal,what is according to something?,The way we buy things has changed dramatically...,[0],[16],0-16,8,0,1


In [229]:
print(notes_a1)

Thi first part misses "research". Here I am a little confused on the implicit action again. Research is not explicitly mentioned but I guess it can be generally thought that the company did the reasearch?


In [228]:
print(notes_a2)

not sure about "things". 

the green + button never works for me (am I missing something?)
