In [446]:
import os 
import json 
import pandas as pd 
import functools
from itertools import combinations
import collections
from sklearn.metrics import cohen_kappa_score
import numpy as np 
import torch 
from statsmodels.stats import inter_rater as irr
from itertools import chain 
from dataclasses import dataclass, asdict
from sklearn.metrics import f1_score
import statsmodels
import krippendorff

In [447]:
# Every sub-directory of the annotation folder is treated as one topic.
parent_folder = "../annotations/v18"
topics = list(
    filter(
        lambda name: os.path.isdir(os.path.join(parent_folder, name)),
        os.listdir(parent_folder),
    )
)
print(topics)


['CNN-388107_Shadow foreign policy', "CNN-25553_Airlines' commitment to service improvements", 'CNN-164885_Cuban celebration and government gathering', "CNN-7972_Federal Reserve's policy", 'CNN-62362_Illegal immigration', 'NPR-41366_Influence of Iran in Iraq', 'NPR-5615_Presidential helicopter cost', "CNN-50484_Iraq's preparations for a potential US assault", 'NPR-41366_Control and decision-making in Tehran', "CNN-262980_Jeb Bush and Trump's immigration stance", 'CNN-104129_Decline of American automobile industry', 'NPR-38350_Immigration debate', 'NPR-18547_Problems with diaper changes for disabled adults', 'NPR-9336_Improvements in addressing cyberattacks', 'NPR-8010_Anti-Semitic incidents in schools', 'CNN-139946_Minnesota Senate Race']


In [448]:
def get_qasem_score(df_qas):
    """Return one minus the mean QA label, ignoring inadequate QAs.

    Args:
        df_qas: DataFrame with a ``label`` column (0 = correct,
            1 = not correct, 2 = inadequate).

    Returns:
        ``1 - mean(label)`` over the rows whose label is not 2
        (NaN when no such row exists).
    """
    labels = df_qas["label"]
    adequate_labels = labels[labels != 2]  # exclude inadequate
    return 1 - adequate_labels.mean()

    

def process_annotation(path):
    """Load one annotator's JSON file and build one record per summary in it.

    Args:
        path: path of the annotation file; the file name without its
            ``.json`` suffix is used as the worker id.

    Returns:
        A list of dicts (one per entry of ``data["summaries"]``) with the keys
        ``source_id``, ``summary_id``, ``tokens``, ``spans``, ``qas``,
        ``worker_id``, ``qasem_score``, ``notes`` and ``focus``.
        ``focus`` is the file-level ``focus`` value divided by 1000, or 0 when
        the file has no ``focus`` field.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # os.path.basename understands the platform's separators (and path-like
    # objects), unlike splitting the raw string on "/".
    worker_id = os.path.basename(path).split(".json")[0]

    summaries = []
    for i, summary in enumerate(data["summaries"]):
        print(i)
        df_spans, df_qas = extract_spans_qas(summary)
        sum_obj = {
            "source_id": data["sourceIds"] if "sourceIds" in data else data["sourceId"],
            "summary_id": summary["summaryId"],
            "tokens": summary["tokens"],
            "spans": df_spans,
            "qas": df_qas,
            "worker_id": worker_id,
            "qasem_score": get_qasem_score(df_qas),
            "notes": summary["notes"],
            "focus": data["focus"] / 1000 if "focus" in data else 0
        }
        summaries.append(sum_obj)

    return summaries



def extract_spans_qas(summary):
    '''
    Build the span table and the QA table of one annotated summary.

    The summary dict is read through its "spans" and "qas" fields.
    Every QA whose questionId is listed in the qaIds of a span labelled 0
    gets label 1 before the labels are cast to int.

    Returns (df_spans, df_qas); both carry an integer "label" column and a
    0/1 "label_neg" column.
    '''

    # take only spans for annotation (excluding predicate and include_predicate)
    labeled_spans = [span for span in summary["spans"] if "label" in span]
    df_spans = pd.DataFrame(labeled_spans)
    df_spans["label"] = df_spans["label"].astype(int)
    # inverse label: a span labelled 0 becomes 1
    df_spans["label_neg"] = (df_spans["label"] == 0).astype("int64")

    # question ids attached to the spans labelled 0
    rejected_spans = df_spans.loc[df_spans["label"] == 0]
    wrong_qas = set(rejected_spans.explode("qaIds")["qaIds"].unique())

    df_qas = pd.DataFrame(summary["qas"])
    # in qas 0 is correct, 1 is not correct and 2 is inadequate
    linked_to_wrong_span = df_qas["questionId"].isin(wrong_qas)
    df_qas.loc[linked_to_wrong_span, "label"] = 1
    df_qas["label"] = df_qas["label"].astype(int)
    df_qas["label_neg"] = (df_qas["label"] == 1).astype("int64")

    return df_spans, df_qas

def get_qa_diff(df_gold, df_system, worker_id, id):
    """Put one worker's QA labels next to the gold QA table of a summary.

    Args:
        df_gold: frame indexed by summary id whose "qas" cell holds a QA table.
        df_system: frame with "worker_id", "id", "qas" and "notes" columns.
        worker_id: worker whose annotation is compared.
        id: summary id to look up in both frames.

    Returns:
        A copy of the gold QA columns (predicateId, predicate, question,
        answer, label) extended with "mturk_question", "prediction" and
        "notes" taken from the first matching row of ``df_system``.
    """
    gold_qas = df_gold.loc[id]["qas"]
    is_requested = (df_system["worker_id"] == worker_id) & (df_system["id"] == id)
    system_entry = df_system[is_requested].iloc[0]
    system_qas = system_entry["qas"]

    gold_columns = ["predicateId", "predicate", "question", "answer", "label"]
    diff = gold_qas[gold_columns].copy()
    diff["mturk_question"] = system_qas["question"]
    diff["prediction"] = system_qas["label"]
    diff["notes"] = system_entry["notes"]
    return diff


def get_span_diff(df_gold, df_system, worker_id, id):
    """Put one worker's span labels next to the gold span table of a summary.

    Args:
        df_gold: frame indexed by summary id with a "spans" cell (span table
            with "start", "end" and "label" columns) and a "tokens" cell
            (list of dicts that each have a "text" entry).
        df_system: frame with "worker_id", "id" and "spans" columns.
        worker_id: worker whose annotation is compared.
        id: summary id to look up in both frames.

    Returns:
        A new frame with the columns start, end, span (the tokens
        ``start:end`` joined by spaces), label and prediction (the worker's
        span labels). The span table stored in ``df_gold`` is left untouched.
    """
    # Work on a copy: the table stored inside df_gold must not gain a "span"
    # column as a side effect of calling this function.
    gold_spans = df_gold.loc[id]["spans"].copy()
    sys_row = df_system[(df_system["worker_id"] == worker_id) & (df_system["id"] == id)].iloc[0]
    sys_spans = sys_row["spans"]
    tokens = [x["text"] for x in df_gold.loc[id]["tokens"]]
    # A plain comprehension also works for an empty span table, where a
    # row-wise apply would not produce a column.
    gold_spans["span"] = [
        " ".join(tokens[start:end])
        for start, end in zip(gold_spans["start"], gold_spans["end"])
    ]
    row = gold_spans[["start", "end", "span", "label"]].copy()
    row["prediction"] = sys_spans["label"]
    return row
    
def flatten_and_percentage(series):
    """Return the fraction of truthy-summed values across nested iterables.

    Args:
        series: iterable of iterables of booleans / 0-1 values.

    Returns:
        ``sum(values) / len(values)`` over the flattened values, or NaN when
        there is nothing to average (instead of raising ZeroDivisionError).
    """
    flat_list = list(chain.from_iterable(series))
    if not flat_list:
        return float("nan")
    return sum(flat_list) / len(flat_list)

def count_inadequate(arr1, arr2, arr3):
    """Count items labelled 2 (inadequate) by the three annotators.

    Returns:
        ``(majority_inadequate, total_inadequate)``: the number of positions
        where at least two labels equal 2, and the number of positions where
        at least one label equals 2.
    """
    majority_inadequate = 0
    total_inadequate = 0
    for labels in zip(arr1, arr2, arr3):
        flagged = sum(1 for label in labels if label == 2)
        if flagged >= 2:
            majority_inadequate += 1
        if flagged >= 1:
            total_inadequate += 1
    return majority_inadequate, total_inadequate
        

def count_matching_elements(arr1, arr2, arr3):
    """Count the positions on which all three label sequences agree.

    Returns:
        ``(matches, len(arr1))``. Positions are paired with ``zip``, so
        comparison stops at the shortest sequence, while the second value is
        always the length of ``arr1``.
    """
    matches = sum(1 for a, b, c in zip(arr1, arr2, arr3) if a == b == c)
    return matches, len(arr1)

def count_matching_elements_without_inadequate(arr1, arr2, arr3):
    """Count unanimous positions, skipping any position that has a label 2.

    Returns:
        ``(numerator, denominator)``: the denominator counts the positions
        where no annotator used label 2 (inadequate); the numerator counts
        those of them on which the three labels are identical.
    """
    numerator = denominator = 0
    for triple in zip(arr1, arr2, arr3):
        if 2 in triple:
            continue
        denominator += 1
        if triple[0] == triple[1] == triple[2]:
            numerator += 1
    return numerator, denominator


def count_hard_disagreements(arr1, arr2, arr3):
    """Count the positions where the three labels contain both a 0 and a 1."""
    return sum(
        1
        for labels in zip(arr1, arr2, arr3)
        if 0 in labels and 1 in labels
    )


def get_qa_pred(dataframes):
    """Collect every annotator's QA label in one table.

    Args:
        dataframes: sequence of QA tables; the descriptive columns are taken
            from the first one.

    Returns:
        A copy of the first table's sentId, predicateId, predicate,
        predicatePos, question and answer columns plus one ``A<k>`` column
        (k starting at 1) holding the ``label`` column of the k-th table.
    """
    shared_columns = ["sentId", "predicateId", "predicate", "predicatePos", "question", "answer"]
    merged = dataframes[0][shared_columns].copy()
    for position, annotation in enumerate(dataframes, start=1):
        merged[f"A{position}"] = annotation["label"]
    return merged


def get_total_agreements(df):
    """Unimplemented placeholder: ignores ``df`` and returns ``None``.

    NOTE(review): the intended computation is not visible in this notebook.
    """
    return 

In [449]:
'''
for each topic, take the 3 annotations
compute how many QA everyone agree
show a table for each topic: how many QA and percentage 
compute overall IAA with fleiss kappa
'''

'\nfor each topic, take the 3 annotations\ncompute how many QA everyone agree\nshow a table for each topic: how many QA and percentage \ncompute overall IAA with fleiss kappa\n'

In [450]:
topics

['CNN-388107_Shadow foreign policy',
 "CNN-25553_Airlines' commitment to service improvements",
 'CNN-164885_Cuban celebration and government gathering',
 "CNN-7972_Federal Reserve's policy",
 'CNN-62362_Illegal immigration',
 'NPR-41366_Influence of Iran in Iraq',
 'NPR-5615_Presidential helicopter cost',
 "CNN-50484_Iraq's preparations for a potential US assault",
 'NPR-41366_Control and decision-making in Tehran',
 "CNN-262980_Jeb Bush and Trump's immigration stance",
 'CNN-104129_Decline of American automobile industry',
 'NPR-38350_Immigration debate',
 'NPR-18547_Problems with diaper changes for disabled adults',
 'NPR-9336_Improvements in addressing cyberattacks',
 'NPR-8010_Anti-Semitic incidents in schools',
 'CNN-139946_Minnesota Senate Race']

In [451]:
# Topics that are left out of the agreement computation.
excluded = [
    "NPR-9336_Improvements in addressing cyberattacks",
    "CNN-388107_Shadow foreign policy",
    "CNN-340132_Trump administration's peace plan",
    "CNN-50484_Iraq's preparations for a potential US assault",
    "NPR-8010_Anti-Semitic incidents in schools",
]
topics = [topic for topic in topics if topic not in excluded]

In [452]:
len(topics)

12

# real code

In [453]:
# Load every annotation file of the selected topics into one DataFrame
# (one row per annotator and summary).
annotations = []
parent_folder = "../annotations"
for version in ["v18"]:
    version_dir = os.path.join(parent_folder, version)
    for topic in [x for x in os.listdir(version_dir) if os.path.isdir(os.path.join(version_dir, x))]:
        print(topic)
        if topic in topics:
            hits = [x for x in os.listdir(os.path.join(version_dir, topic)) if x.endswith("json")]
            for hit in hits:
                print(hit)
                path = os.path.join(version_dir, topic, hit)
                try:
                    annotations.extend(process_annotation(path))
                except Exception as err:
                    # Best effort: keep loading the remaining files, but report what
                    # failed. A bare `except:` would also swallow KeyboardInterrupt
                    # and hide the cause of the failure.
                    print(f"error in {topic}-{hit}: {type(err).__name__}: {err}")

df = pd.DataFrame(annotations)


CNN-388107_Shadow foreign policy
CNN-25553_Airlines' commitment to service improvements
A6KOTWP7N7RLU.json
0
1
A31PW970Z2PC5P.json
0
1
A3RVHUY67SVXQV.json
0
1
CNN-164885_Cuban celebration and government gathering
A6KOTWP7N7RLU.json
0
1
A31PW970Z2PC5P.json
0
1
A3GWRDHAURRNK6.json
0
1
CNN-7972_Federal Reserve's policy
A6KOTWP7N7RLU.json
0
1
A31PW970Z2PC5P.json
0
1
A3RVHUY67SVXQV.json
0
1
CNN-62362_Illegal immigration
A6KOTWP7N7RLU.json
0
1
A3RVHUY67SVXQV.json
0
1
A3GWRDHAURRNK6.json
0
1
NPR-41366_Influence of Iran in Iraq
A6KOTWP7N7RLU.json
0
1
A31PW970Z2PC5P.json
0
1
A3GWRDHAURRNK6.json
0
1
NPR-5615_Presidential helicopter cost
A6KOTWP7N7RLU.json
0
1
A31PW970Z2PC5P.json
0
1
A3RVHUY67SVXQV.json
0
1
CNN-50484_Iraq's preparations for a potential US assault
NPR-41366_Control and decision-making in Tehran
A6KOTWP7N7RLU.json
0
1
A31PW970Z2PC5P.json
0
1
A5WWHKD82I8UE.json
0
1
CNN-262980_Jeb Bush and Trump's immigration stance
A6KOTWP7N7RLU.json
0
1
A31PW970Z2PC5P.json
0
1
A3GWRDHAURRNK6.json
0

In [454]:
# Key of an annotated summary: "<source_id>_<summary_id>".
df["id"] = [
    f"{source_id}_{summary_id}"
    for source_id, summary_id in zip(df["source_id"], df["summary_id"])
]
df.head()

Unnamed: 0,source_id,summary_id,tokens,spans,qas,worker_id,qasem_score,notes,focus,id
0,CNN-25553_Airlines' commitment to service impr...,Model-Extra,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'spa...",start end qaIds predicate in...,sentId predicateId ...,A6KOTWP7N7RLU,0.888889,The article mentions there have been improveme...,1163.969,CNN-25553_Airlines' commitment to service impr...
1,CNN-25553_Airlines' commitment to service impr...,B,"[{'id': 0, 'text': 'Airlines', 'lemma': 'airli...",start end qaIds predicate i...,sentId predicateId ...,A6KOTWP7N7RLU,0.896552,Legislation may be called for when the report ...,1163.969,CNN-25553_Airlines' commitment to service impr...
2,CNN-25553_Airlines' commitment to service impr...,Model-Extra,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'spa...",start end qaIds predicate include_p...,sentId predicateId predicate predicat...,A31PW970Z2PC5P,0.740741,*fare - misspelled in the article. \n*overbook...,2029.285,CNN-25553_Airlines' commitment to service impr...
3,CNN-25553_Airlines' commitment to service impr...,B,"[{'id': 0, 'text': 'Airlines', 'lemma': 'airli...",start end qaIds predicate include_pred...,sentId predicateId predicate predicate...,A31PW970Z2PC5P,0.931034,*who is committed to something ? Airlines - gr...,2029.285,CNN-25553_Airlines' commitment to service impr...
4,CNN-25553_Airlines' commitment to service impr...,Model-Extra,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'spa...",start end qaIds predicate include_p...,sentId predicateId predicate predicat...,A3RVHUY67SVXQV,0.851852,,623.523,CNN-25553_Airlines' commitment to service impr...


In [455]:
df.groupby("worker_id")["focus"].mean()

worker_id
A31PW970Z2PC5P    1540.948636
A3GWRDHAURRNK6     952.976000
A3RVHUY67SVXQV     504.423000
A5WWHKD82I8UE     1010.540000
A6KOTWP7N7RLU      963.069417
Name: focus, dtype: float64

In [456]:
# One QA table per summary annotated by exactly three workers, with the three
# workers' labels side by side in A1 / A2 / A3.
qas_frames = []
for summary_id, rows in df.groupby("id"):
    if len(rows) != 3:
        continue

    # Copy first: the QA table stored inside `df` must not gain the extra
    # columns as a side effect of running this cell.
    summary_qas = rows.iloc[0]["qas"].copy()
    summary_qas["summary_id"] = summary_id
    summary_qas["A1"] = summary_qas["label"]
    summary_qas["A2"] = rows.iloc[1]["qas"]["label"]
    summary_qas["A3"] = rows.iloc[2]["qas"]["label"]
    summary_qas["annotators"] = ",".join(rows["worker_id"].tolist())
    qas_frames.append(summary_qas)

# Concatenate once instead of growing a DataFrame inside the loop;
# pd.concat raises on an empty list, so keep the empty-frame result explicit.
df_qas = pd.concat(qas_frames) if qas_frames else pd.DataFrame()
    

In [457]:
# Per summary: number of QAs on which all three annotators gave the same label.
results = {}
for summary_id, rows in df.groupby("id"):
    if len(rows) == 3:
        preds = [qas["label"].tolist() for qas in rows["qas"]]
        iaa_with = count_matching_elements(*preds)
        results[summary_id] = {
            "iaa_with_inadequate": iaa_with,
            "qas_num": len(preds[0]),
        }
df_results = pd.DataFrame(results).T

In [458]:
df["id"].nunique()

24

In [459]:
df_results["iaa"] = df_results["iaa_with_inadequate"].apply(lambda x: x[0]/x[1])
df_results.sort_values(by="iaa", ascending=True).head(20)

Unnamed: 0,iaa_with_inadequate,qas_num,iaa
CNN-7972_Federal Reserve's policy_E,"(16, 29)",29,0.551724
CNN-262980_Jeb Bush and Trump's immigration stance_A,"(11, 19)",19,0.578947
CNN-104129_Decline of American automobile industry_Model-Extra,"(13, 22)",22,0.590909
CNN-62362_Illegal immigration_C,"(23, 38)",38,0.605263
NPR-41366_Control and decision-making in Tehran_B,"(8, 13)",13,0.615385
CNN-25553_Airlines' commitment to service improvements_Model-Extra,"(18, 27)",27,0.666667
NPR-41366_Control and decision-making in Tehran_Model-Extra,"(14, 20)",20,0.7
NPR-38350_Immigration debate_C,"(28, 40)",40,0.7
NPR-18547_Problems with diaper changes for disabled adults_C,"(23, 32)",32,0.71875
CNN-62362_Illegal immigration_Model-Extra,"(18, 25)",25,0.72


In [460]:
df_qas.shape 

(640, 20)

In [461]:
df_qas.head()

Unnamed: 0,sentId,predicateId,question,answer,predicate,predicatePos,answerStartToken,answerEndToken,answerId,cleanAnswerId,verbTokenId,sourceIds,questionId,label,label_neg,summary_id,A1,A2,A3,annotators
0,0,3-4,who is offering something?,General Motors,offering,VERB,[0],[2],0-2,0-2,2.0,"[160, 2, 3, 123, 124, 61, 62, 159]",0,0,0,CNN-104129_Decline of American automobile indu...,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
1,0,3-4,what is someone offering?,early retirement buyouts,offering,VERB,[4],[7],4-7,4-7,3.0,"[128, 166, 9, 10, 11, 111]",1,0,0,CNN-104129_Decline of American automobile indu...,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
2,0,3-4,who is someone offering something to?,"to 113,000 employees",offering,VERB,[7],[10],7-10,8-10,3.0,"[13, 174, 16, 81, 112, 113, 114]",2,0,0,CNN-104129_Decline of American automobile indu...,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
3,0,3-4,why is someone offering something to someone?,"as part of its efforts to cut 30,000 jobs by 2008",offering,VERB,[10],[21],10-21,11-21,3.0,"[133, 135, 136, 137, 138, 139, 140, 396, 14, 1...",3,0,0,CNN-104129_Decline of American automobile indu...,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
4,0,16-17,who is cutting something?,General Motors,cut,VERB,[0],[2],0-2,0-2,2.0,"[160, 2, 3, 123, 124, 61, 62, 159]",4,0,0,CNN-104129_Decline of American automobile indu...,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"


In [462]:
df_subset = df_qas[~df_qas["summary_id"].isin(["Kim Burrell_PerplexityAI", "Kim Burrell_InstructGPT", "94_bart", "94_pegasus", "4_bart", "4_pegasus"])].copy()
df_full = df_qas.copy()
df_qas = df_subset.copy()

In [463]:
df_qas = df_full.copy()

# Compute IAA

In [464]:
# fleiss kappa
df_qas["annot"] = df_qas.apply(lambda x: [x["A1"], x["A2"], x["A3"]], axis=1)
# df_qas["annot"] = df_qas.apply(lambda x: [x["A1"], x["A2"], x["A3"]], axis=1)
annots = df_qas["annot"].tolist()
table = statsmodels.stats.inter_rater.aggregate_raters(np.array(annots))
fleiss = statsmodels.stats.inter_rater.fleiss_kappa(table[0], method='fleiss')
fleiss 

0.4212812801006466

In [357]:
krippendorff.alpha(np.array(annots).T)

0.35708296244694704

In [67]:
df_qas[["A1", "A2", "A3"]].to_csv("../annotations/factscore_fleiss_kappa.csv", index=False)

In [120]:
table[0].sum()

2805

In [121]:
# "sum" is kept because later cells filter on it.
df_qas["sum"] = df_qas["A1"] + df_qas["A2"] + df_qas["A3"]
# Full agreement means the three labels are identical. Testing the sum for 0 / 3
# is only valid for binary labels: with the "inadequate" label 2, (0, 1, 2) also
# sums to 3 and (2, 2, 2) sums to 6.
unanimous = (df_qas["A1"] == df_qas["A2"]) & (df_qas["A2"] == df_qas["A3"])
agreements = int(unanimous.sum())
print(agreements/len(df_qas))

0.8449197860962567


In [122]:
df_qas["summary_id"].nunique()

36

In [123]:
df["id"].nunique()

36

# Compute IAA at the sentence level


In [135]:
df_qas.shape

(211, 22)

In [136]:
df_qas.head()


Unnamed: 0,sentId,predicateId,predicate,predicatePos,question,answer,answerStartToken,answerEndToken,answerId,cleanAnswerId,...,questionId,label,label_neg,summary_id,A1,A2,A3,annotators,annot,sum
0,0,10-11,born,VERB,who was bornt?,David Wenham,[0],[2],0-2,0-2,...,0,0,0,David Wenham_ChatGPT,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q","[0, 0, 0]",0
1,0,10-11,born,VERB,when was someone bornt?,"on September 21 , 1965",[11],[16],11-16,12-16,...,1,0,0,David Wenham_ChatGPT,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q","[0, 0, 0]",0
2,0,10-11,born,VERB,where was someone bornt?,"in Marrickville , Sydney , Australia",[17],[23],17-23,18-23,...,2,1,1,David Wenham_ChatGPT,1,1,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q","[1, 1, 0]",2
3,0,7-8,producer,NOUN,who produced something?,David Wenham,[0],[2],0-2,0-2,...,3,1,1,David Wenham_ChatGPT,1,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q","[1, 1, 1]",3
4,0,9-10,director,NOUN,who directed something?,David Wenham,[0],[2],0-2,0-2,...,4,1,1,David Wenham_ChatGPT,1,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q","[1, 1, 1]",3


In [137]:
# Per summary and annotator: 1 when every QA of the summary is labelled 0, else 0.
sentence_level = (
    df_qas.groupby("summary_id")[["A1", "A2", "A3"]]
    .agg(lambda labels: int((labels == 0).all()))
    .reset_index()
)


In [138]:
sentence_level.head(30)

Unnamed: 0,summary_id,A1,A2,A3
0,David Wenham_ChatGPT,0,0,0
1,David Wenham_InstructGPT,0,0,0
2,Kim Burrell_InstructGPT,0,0,0
3,Kim Burrell_PerplexityAI,0,0,0
4,Quintus Sosius Senecio_InstructGPT,0,0,0
5,Quintus Sosius Senecio_PerplexityAI,0,0,0
6,"William Waldegrave, Baron Waldegrave of North ...",0,0,0
7,"William Waldegrave, Baron Waldegrave of North ...",0,0,0


In [139]:
# Fleiss' kappa over the summary-level 0/1 flags of the three annotators.
sentence_level["annot"] = sentence_level.apply(
    lambda summary_row: [summary_row[column] for column in ("A1", "A2", "A3")], axis=1
)
annots = sentence_level["annot"].tolist()
table = irr.aggregate_raters(np.array(annots))
fleiss = irr.fleiss_kappa(table[0], method='fleiss')
fleiss

  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)


nan

In [141]:
# The summary-level flags are 0/1, so a sum of 0 or 3 means all three annotators agree.
sentence_level["sum"] = sentence_level[["A1", "A2", "A3"]].sum(axis=1)
agreements = int(sentence_level["sum"].isin([0, 3]).sum())
print(agreements/len(sentence_level))

1.0


# Playground

* look at previous annotation for some workers
* look at the inappropriate QAs caused by the newQAIds

In [144]:
# QAs on which the three annotators did not all give the same label.
# Comparing the labels directly (instead of testing the label sum for 1 / 2) also
# catches disagreements that involve label 2, e.g. (0, 1, 2) or (2, 2, 1), and
# does not depend on the "sum" column created in another cell.
has_disagreement = ~((df_qas["A1"] == df_qas["A2"]) & (df_qas["A2"] == df_qas["A3"]))
df_dis = df_qas[has_disagreement][["summary_id", "question", "answer", "answerId" ,"A1", "A2", "A3", "annotators"]].copy()

In [145]:
df_dis

Unnamed: 0,summary_id,question,answer,answerId,A1,A2,A3,annotators
2,David Wenham_ChatGPT,where was someone bornt?,"in Marrickville , Sydney , Australia",17-23,1,1,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
9,David Wenham_ChatGPT,where did someone play something?,in the movie 300,48-52,1,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
16,David Wenham_ChatGPT,when did someone start something?,in the late 1980s,66-70,0,0,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
18,David Wenham_ChatGPT,when did someone act?,the late 1980s,67-70,0,0,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
22,David Wenham_ChatGPT,when has someone worked?,in the late 1980s,66-70,1,0,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
25,David Wenham_ChatGPT,what has someone won?,several awards,89-91,0,0,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
27,David Wenham_ChatGPT,what includes something?,several awards for his work,89-94,0,0,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
30,David Wenham_ChatGPT,where did someone work?,in the movie The Boys,108-113,1,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
7,David Wenham_InstructGPT,when did someone begin something?,in the mid-1980s,55-58,1,0,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
9,David Wenham_InstructGPT,when did someone act?,the mid-1980s,56-58,1,0,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"


In [148]:
df_dis[df_dis["A2"] == df_dis["A3"]]

Unnamed: 0,summary_id,question,answer,answerId,A1,A2,A3,annotators
9,David Wenham_ChatGPT,where did someone play something?,in the movie 300,48-52,1,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
30,David Wenham_ChatGPT,where did someone work?,in the movie The Boys,108-113,1,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A21LONLNBOB8Q"
10,Kim Burrell_InstructGPT,what was someone nominated for?,Grammy nominations,32-34,1,0,0,"A6KOTWP7N7RLU,A21LONLNBOB8Q,A3GWRDHAURRNK6"
19,Kim Burrell_PerplexityAI,who debuted?,She,73-74,0,1,1,"A6KOTWP7N7RLU,A21LONLNBOB8Q,A3GWRDHAURRNK6"
26,Kim Burrell_PerplexityAI,what has someone been nominated for?,several Grammy Awards,143-146,0,1,1,"A6KOTWP7N7RLU,A21LONLNBOB8Q,A3GWRDHAURRNK6"
31,Kim Burrell_PerplexityAI,who has something earned something?,her,193-194,0,1,1,"A6KOTWP7N7RLU,A21LONLNBOB8Q,A3GWRDHAURRNK6"
33,Kim Burrell_PerplexityAI,where has something earned someone something?,within the gospel music industry,196-201,0,1,1,"A6KOTWP7N7RLU,A21LONLNBOB8Q,A3GWRDHAURRNK6"
35,Kim Burrell_PerplexityAI,who was acclaimed somewhere?,her,193-194,0,1,1,"A6KOTWP7N7RLU,A21LONLNBOB8Q,A3GWRDHAURRNK6"
36,Kim Burrell_PerplexityAI,where was someone acclaimed?,the gospel music industry,197-201,0,1,1,"A6KOTWP7N7RLU,A21LONLNBOB8Q,A3GWRDHAURRNK6"
2,Quintus Sosius Senecio_InstructGPT,when did someone serve as something?,in the year 97 AD,12-17,0,1,1,"A6KOTWP7N7RLU,A21LONLNBOB8Q,A3GWRDHAURRNK6"


In [437]:
# For each annotator pair: shape of the disagreement rows on which that pair agrees.
for left, right in [("A2", "A3"), ("A1", "A3"), ("A2", "A1")]:
    print(df_dis[df_dis[left] == df_dis[right]].shape)

(21, 8)
(10, 8)
(16, 8)


In [333]:
df[df["worker_id"] == "A6KOTWP7N7RLU"][["source_id", "summary_id", "notes"]]

Unnamed: 0,source_id,summary_id,notes
6,118,bart,I don't know how literal I need to be with the...
7,118,pegasus,Predicate and Question 2 out of 2 is technical...
12,17,bart,
13,17,pegasus,
30,22,bart,
31,22,pegasus,


In [291]:
df.loc[34]["notes"]

'Even though I recognize the name from daily life (and if I recognize it, then that means that pretty much anyone would), I think Brighton & Hove Albion has to be considered an hallucination because the full name is never laid out in the passage.  Indeed "Hove" is never even used.  Likewise, "Jack Sidwell" is never mentioned with his first name in the passage, and it seems that was an hallucination as well (the internet would be littered with references to "Captain Jack" the pirate after all).  '

In [285]:
df_dis[df_dis["A2"] == df_dis["A3"]]


Unnamed: 0,summary_id,question,answer,answerId,A1,A2,A3,annotators
1,112_bart,what does someone say?,he wants to stay at the club next season and h...,8-26,0,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
4,112_bart,what does someone want?,to stay at the club next season and help them ...,10-26,0,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
9,112_bart,why might someone stay somewhere?,help them win promotion to the Premier League,18-26,0,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
10,112_bart,what will someone help someone do?,win promotion to the Premier League,20-26,0,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
12,112_bart,who might win something?,them,19-20,0,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
14,112_bart,what might someone win?,promotion to the Premier League,21-26,0,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
18,112_bart,who might promoted somewhere?,the club,13-15,0,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
19,112_bart,when might someone promoted somewhere?,next season,15-17,0,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
20,112_bart,where might someone promoted?,the Premier League,23-26,0,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
4,112_pegasus,what does someone want?,to stay at Brighton,5-9,1,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"


In [295]:
df_qas[df_qas["summary_id"] == "112_pegasus"][["summary_id", "predicate", "question", "answer", "answerId" ,"A1", "A2", "A3", "annotators"]]

Unnamed: 0,summary_id,predicate,question,answer,answerId,A1,A2,A3,annotators
0,112_pegasus,says,who says something?,Steve Sidwell,0-2,1,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
1,112_pegasus,says,what does someone say?,he wants to stay at Brighton,3-9,1,0,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
2,112_pegasus,says,why does someone say something?,to help the club stay in the Premier League,9-18,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
3,112_pegasus,wants,who wants something?,Steve Sidwell,0-2,1,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
4,112_pegasus,wants,what does someone want?,to stay at Brighton,5-9,1,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
5,112_pegasus,wants,why does someone want something?,to help the club stay in the Premier League,9-18,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
6,112_pegasus,stay,who might stay somewhere?,Steve Sidwell,0-2,1,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
7,112_pegasus,stay,where might someone stay?,at Brighton,7-9,1,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
8,112_pegasus,stay,why might someone stay somewhere?,to help the club stay in the Premier League,9-18,0,0,0,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"
9,112_pegasus,help,who might help something?,Steve Sidwell,0-2,1,1,1,"A6KOTWP7N7RLU,A31PW970Z2PC5P,A3RVHUY67SVXQV"


In [43]:
questions = df_qas[(df_qas["sum"].isin([1,2]))& (df_qas["summary_id"] == "58_pegasus")]["question"].tolist()

In [44]:
questions

['who has begun something?',
 'where has someone begun something?',
 'who was missing?']

In [38]:
# NOTE(review): absolute, machine-specific path — this cell only runs on the original
# author's machine. Other cells read annotations through the relative "../annotations"
# folder; confirm whether this file lives under the same root before making it relative.
with open("/Users/arie/Documents/phd/factuality/loc-unfaith/annotations/v7/58/A3RVHUY67SVXQV.json") as f:
    data = json.load(f)


In [39]:
data["summaries"][1]["summaryId"]

'pegasus'

In [45]:
df_test = pd.DataFrame(data["summaries"][1]["qas"])
# df_test[df_test["question"].isin(questions)]

In [46]:
df_test

Unnamed: 0,sentId,predicateId,predicate,predicatePos,question,answer,answerStartToken,answerEndToken,answerId,cleanAnswerId,verbTokenId,sourceIds,questionId,label
0,0,7-8,begun,VERB,who has begun something?,Police,[0],[1],0-1,0-1,2,[],0,0.0
1,0,7-8,begun,VERB,where has someone begun something?,in the Republic of Ireland,[1],[6],1-6,3-6,3,"[161, 210, 181, 182, 183, 184, 122, 123]",1,0.0
2,0,7-8,begun,VERB,what has someone begun?,a search,[8],[10],8-10,9-10,3,"[18, 19, 61]",2,0.0
3,0,7-8,begun,VERB,why has someone begun something?,for the remains of a man who went missing 17 y...,[10],[22],10-22,12-22,3,"[161, 100, 198, 167, 199, 77, 78, 175, 210, 11...",3,1.0
4,0,17-18,went,VERB,who went something?,a man,[14],[16],14-16,15-16,1,"[199, 167, 114, 115, 156]",4,
5,0,17-18,went,VERB,what did someone go?,missing,[18],[19],18-19,18-19,3,[],5,0.0
6,0,17-18,went,VERB,when did someone go something?,17 years ago,[19],[22],19-22,19-22,3,[77],6,
7,0,18-19,missing,VERB,where was someone missing?,in the Republic of Ireland,[1],[6],1-6,3-6,3,"[161, 210, 181, 182, 183, 184, 122, 123]",7,0.0
8,0,18-19,missing,VERB,who was missing?,a man,[14],[16],14-16,15-16,2,"[199, 167, 114, 115, 156]",8,
9,0,18-19,missing,VERB,when was someone missing?,17 years ago,[19],[22],19-22,19-22,3,[77],9,


In [47]:
df_test[df_test["question"].isin(questions)]

Unnamed: 0,sentId,predicateId,predicate,predicatePos,question,answer,answerStartToken,answerEndToken,answerId,cleanAnswerId,verbTokenId,sourceIds,questionId,label
0,0,7-8,begun,VERB,who has begun something?,Police,[0],[1],0-1,0-1,2,[],0,0.0
1,0,7-8,begun,VERB,where has someone begun something?,in the Republic of Ireland,[1],[6],1-6,3-6,3,"[161, 210, 181, 182, 183, 184, 122, 123]",1,0.0
8,0,18-19,missing,VERB,who was missing?,a man,[14],[16],14-16,15-16,2,"[199, 167, 114, 115, 156]",8,


In [78]:
# NOTE(review): absolute, machine-specific path — this cell only runs on the original
# author's machine. Other cells read annotations through the relative "../annotations"
# folder; confirm whether this file lives under the same root before making it relative.
with open("/Users/arie/Documents/phd/factuality/loc-unfaith/annotations/v8/58/A3RVHUY67SVXQV.json") as f:
    new_data = json.load(f)


In [79]:
summary = new_data["summaries"][0]
df_spans, df_qas = extract_spans_qas(summary)

In [74]:
# take only spans for annotation (excluding predicate and include_predicate)
df_spans = pd.DataFrame([x for x in summary["spans"] if "label" in x])
df_spans["label"] = df_spans["label"].astype(int)
df_spans["label_neg"] = df_spans["label"].apply(lambda x: 1 if x == 0 else 0) # inverse label, so tp will be tp
df_wrong_spans = df_spans[df_spans["label"] == 0] # take spans with wrong spans
wrong_qas = set(df_wrong_spans.explode("qaIds")["qaIds"].unique())

In [75]:
wrong_qas

{4, 6, 10, 13}

In [76]:
df_spans 

Unnamed: 0,start,end,qaIds,predicate,include_predicate,id,sourceIds,newQAIds,label,label_neg
0,2,4,"[7, 12, 0]",False,False,0,[33],"[7, 11, 0]",1,0
1,10,11,"[2, 5, 10]",False,False,5,"[167, 115, 156, 199]","[2, 5, 9]",1,0
2,14,17,"[4, 6, 11]",False,False,9,[77],"[4, 6, 10]",0,1
3,22,24,[14],False,False,13,[],[13],0,1


In [77]:
df_qas = pd.DataFrame(summary["qas"])
df_qas

Unnamed: 0,sentId,predicateId,predicate,predicatePos,question,answer,answerStartToken,answerEndToken,answerId,cleanAnswerId,verbTokenId,sourceIds,questionId,label
0,0,5-6,investigating,VERB,who is investigating something?,Irish police,[2],[4],2-4,2-4,2,[33],0,
1,0,5-6,investigating,VERB,what is someone investigating?,the disappearance of a man who went missing 17...,[6],[17],6-17,7-17,3,"[161, 100, 198, 167, 199, 77, 78, 175, 92, 210...",1,1.0
2,0,12-13,went,VERB,who went something?,a man,[9],[11],9-11,10-11,1,"[199, 167, 114, 115, 156]",2,0.0
3,0,12-13,went,VERB,what did someone go?,missing,[13],[14],13-14,13-14,3,[],3,0.0
4,0,12-13,went,VERB,when did someone go something?,17 years ago,[14],[17],14-17,14-17,3,[77],4,
5,0,13-14,missing,VERB,who was missing?,a man,[9],[11],9-11,10-11,2,"[199, 167, 114, 115, 156]",5,0.0
6,0,13-14,missing,VERB,when was someone missing?,17 years ago,[14],[17],14-17,14-17,3,[77],6,
7,0,18-19,begun,VERB,who has begun something?,Irish police,[2],[4],2-4,2-4,2,[33],7,0.0
8,0,18-19,begun,VERB,what has someone begun?,a search for his body,[19],[24],19-24,20-24,3,"[18, 19, 61]",8,1.0
9,0,7-8,disappearance,NOUN,who disappeared?,a man,[9],[11],9-11,10-11,1,"[199, 167, 114, 115, 156]",9,0.0


In [None]:
# QAs attached to a span labelled 0 are marked as not correct (label 1).
linked_to_wrong_span = df_qas["questionId"].isin(wrong_qas)
df_qas.loc[linked_to_wrong_span, "label"] = 1
df_qas["label"] = df_qas["label"].astype(int)
df_qas["label_neg"] = (df_qas["label"] == 1).astype("int64")

# check new QA Ids

In [147]:
annotation_path = "../annotations/v8"

def extract_missing_annotation(summary):
    '''
    Return the QAs of a summary that are still unlabelled after the span
    labels have been propagated through "newQAIds".

    Every QA whose questionId is listed in the newQAIds of a span labelled 0
    gets label 1; the rows whose label is still missing are returned.
    '''

    # take only spans for annotation (excluding predicate and include_predicate)
    labeled_spans = [span for span in summary["spans"] if "label" in span]
    df_spans = pd.DataFrame(labeled_spans)
    df_spans["label"] = df_spans["label"].astype(int)

    # question ids attached (via newQAIds) to the spans labelled 0
    rejected_spans = df_spans.loc[df_spans["label"] == 0]
    wrong_qas = set(rejected_spans.explode("newQAIds")["newQAIds"].unique())

    df_qas = pd.DataFrame(summary["qas"])
    # in qas 0 is correct, 1 is not correct and 2 is inadequate
    df_qas.loc[df_qas["questionId"].isin(wrong_qas), "label"] = 1

    still_unlabelled = df_qas["label"].isna()
    return df_qas[still_unlabelled]

In [195]:
topic_id = "145"
dir_path = os.path.join(annotation_path, topic_id)

In [196]:
# Print, per annotator file and summary index, the QAs that are still unlabelled.
for annotator in os.listdir(dir_path):
    if not annotator.endswith("json"):
        continue
    with open(os.path.join(dir_path, annotator), "r") as f:
        data = json.load(f)
    for i, summary in enumerate(data["summaries"]):
        missing = extract_missing_annotation(summary)
        if len(missing) == 0:
            continue
        print(annotator, i)
        print(missing[["questionId", "question", "answer", "cleanAnswerId"]])

In [176]:
missing

Unnamed: 0,sentId,predicateId,predicate,predicatePos,question,answer,answerStartToken,answerEndToken,answerId,cleanAnswerId,verbTokenId,sourceIds,questionId,label
6,0,18-19,missing,VERB,where was someone missing?,in the Republic of Ireland,[1],[6],1-6,3-6,3,"[161, 210, 181, 182, 183, 184, 122, 123]",6,
9,0,9-10,search,NOUN,who searched for something?,Police,[0],[1],0-1,0-1,1,[],9,


In [212]:
df["source_id"].unique()

array([132,  58,  20, 145, 118, 127,  17,   8, 100,  79, 114,  22])