In [1]:
from collections import defaultdict, OrderedDict, Counter
import numpy as np
import pandas as pd
from pathlib import Path
import textwrap
from tqdm import tqdm

%cd /home/drchajan/devel/python/FC/automated-fact-checking
%load_ext autoreload
%autoreload 2

from aic_nlp_utils.json import read_jsonl, read_json, write_json, write_jsonl
from aic_nlp_utils.encoding import nfc
from aic_nlp_utils.fever import fever_detokenize

from factsearch.mark import EmphasizeClaimWords

# from evaluation.document_retrieval import collect_eval_data, retriever_score
# from evaluation.nli import LABEL_NUM, LABEL_STR, load_rte_model, evaluate_examples, evaluate_all_nli
# from evaluation.fcheck_pipeline import read_fever_db, collect_scores_full, evaluate_full, extract_texts_ctk, InputSplitter

/home/drchajan/devel/python/FC/automated-fact-checking


  from tqdm.autonotebook import tqdm


In [2]:
# Directories with PVI outputs. None of these constants is referenced by the cells
# visible in this part of the notebook.
# original not calibrated versions (BAD, but does not matter for RND)
# PVI_DIR_CS = '/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/pvi'
# PVI_DIR_FCS= '/mnt/data/factcheck/NLI/csfever_nli_cls/pvi'
# PVI_DIR_EN = '/mnt/data/factcheck/wiki/en/20230801/qacg/nli/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/pvi'
# PVI_DIR_FEN = '/mnt/data/factcheck/NLI/nli_fever_cls/pvi'

# Calibrated variants (directory names end in `pvi_calibrated`); the `*_FS` ones point to
# `pvi_fever_size_calibrated` directories.
# NOTE(review): presumably CS/EN are the Czech/English QACG data and FCS/FEN the
# Czech/English FEVER NLI data - confirm.
PVI_DIR_CS = '/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/pvi_calibrated'
PVI_DIR_FCS= '/mnt/data/factcheck/NLI/csfever_nli_cls/pvi_calibrated'
PVI_DIR_EN = '/mnt/data/factcheck/wiki/en/20230801/qacg/nli/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/pvi_calibrated'
PVI_DIR_FEN = '/mnt/data/factcheck/NLI/nli_fever_cls/pvi_calibrated'
PVI_DIR_CS_FS = "/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/pvi_fever_size_calibrated"
PVI_DIR_EN_FS = "/mnt/data/factcheck/wiki/en/20230801/qacg/nli/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/pvi_fever_size_calibrated"


In [11]:
def sample_blind_test(cfg, n=100, seed=1235):
    """Draw a label-stratified sample of claims from every file in `cfg` and shuffle them together.

    Args:
        cfg: mapping `name -> path` of a claim2evidence JSON file. Every file is loaded with
            `read_json` and must map a claim id to a non-empty list of evidence records whose
            first record holds the claim label under "target".
        n: passed to `train_test_split` as `train_size`, i.e. the number of claims selected
            from each file.
        seed: seed of the single `RandomState` that drives both the stratified selection and
            the final shuffle, so the file order in `cfg` influences the result.

    Returns:
        dict mapping "<name>:<claim id>" to the evidence list of that claim, in shuffled
        order. The "claim_id" field of every selected evidence record is overwritten with
        the prefixed id.
    """
    from sklearn.model_selection import train_test_split

    rng = np.random.RandomState(seed)
    claim2evidence_sum = {}
    for name, claim2evidence_file in cfg.items():
        claim2evidence = read_json(claim2evidence_file)
        cids = list(claim2evidence.keys())
        labels = [evidence[0]["target"] for evidence in claim2evidence.values()]
        # only the "train" part is the sample; the remainder is discarded
        selected_cids, _ = train_test_split(cids, train_size=n, random_state=rng, stratify=labels)
        for cid in selected_cids:
            cidnew = f"{name}:{cid}"
            evidence_list = claim2evidence[cid]
            for e in evidence_list:
                e["claim_id"] = cidnew
            claim2evidence_sum[cidnew] = evidence_list

    # blind test: mix the sources so that the annotator cannot tell them apart by position
    cids = list(claim2evidence_sum.keys())
    rng.shuffle(cids)
    claim2evidence_sum = {cid: claim2evidence_sum[cid] for cid in cids}

    cnt_labels = Counter(evidence[0]["target"] for evidence in claim2evidence_sum.values())
    print(f"total claims: {len(claim2evidence_sum)}, label counts: {cnt_labels}")
    return claim2evidence_sum


# Experiment selection: exactly one `DIR` and one `LANG` assignment is left uncommented.
# Both values are interpolated into the claim2evidence file paths below, which are relative
# paths (the working directory is set by the `%cd` magic in the first cell).
# DIR = "fever2fullwiki" # FEVER testing, QACG-SUM model, QACG corpora
# DIR = "feverfever2fullwiki" # FEVER testing, FEVER model, QACG corpora
DIR = "fullwiki2fullwiki" # QACG testing, QACG-SUM model, QACG corpora
# DIR = "fullwikifever2fullwiki" # QACG testing, FEVER model, QACG corpora

# LANG = "cs"
LANG = "en"
# one claim2evidence file per retrieval method; the dict key becomes the claim-id prefix
claim2evidence_files = {
    "anserini": f"data/ncaa/er_annotations/{DIR}/{LANG}_claim2evidence_anserini.json",
    "colbert":  f"data/ncaa/er_annotations/{DIR}/{LANG}_claim2evidence_colbert.json",
    "colbert_ans": f"data/ncaa/er_annotations/{DIR}/{LANG}_claim2evidence_colbert_anserini_filtered.json",
    "colbert_nli": f"data/ncaa/er_annotations/{DIR}/{LANG}_claim2evidence_colbert_nli_sorted.json"
}

# for the FEVER-model directories the Anserini file is dropped from the sampling
if DIR in ["feverfever2fullwiki", "fullwikifever2fullwiki"]:
    del claim2evidence_files["anserini"] # already computed for fever2fullwiki
    
# 100 claims per remaining method; rebinds the module-level name `claim2evidence`
claim2evidence = sample_blind_test(claim2evidence_files, n=100)

total claims: 400, label counts: Counter({'SUP': 200, 'REF': 200})


In [12]:
list(claim2evidence.values())[0][:2]

[{'claim': 'Bible Quiz is used by the WBQA.',
  'context': 'Bible quiz\n\nIn WBQA quizzing, questions must "only" contain words from the verse from which the question is taken, plus an interrogative (who/what/where/when/why/how) and, if necessary, a form of the verb "to be" (i.e. was/is/were/am, though in practice very few questions require this addition)--no other helping verbs may be used. All questions must be grammatically correct (with the exception of questions beginning with "what if"). This is the same style of question used in other groups such as Bible Quiz Fellowship.\n\nBible quiz\n\nWBQA operates in close cooperation with numerous independent local leagues in the Great Lakes region, such as Detroit Bible Quizzing, Huron Valley Bible Quizzing in the Ann Arbor area, the Ohio Bible Quizzing Association centered around Chillicothe, the Shenango Valley Bible Quiz league around the Pittsburgh area, BIC (Brethren in Christ) Quizzing in the Harrisburg region, and Rochester Youth f

In [13]:
from portion import closedopen
from ufal.morphodita import Forms, TokenRanges, Tokenizer
# from ufal.morphodita import Tokenizer_newCzechTokenizer, Tokenizer_newEnglishTokenizer, Tokenizer_newGenericTokenizer, Tokenizer_newVerticalTokenizer

class MorphoDiTaTokenizer:
    """Thin wrapper around a MorphoDiTa `Tokenizer` yielding sentences or words of a text.

    The `forms` and `tokens` buffers are shared by both generator methods and every call
    resets the tokenizer text, so two generators of one instance must not be interleaved.
    """

    def __init__(self, lang:str ="cs"):
        """Create the tokenizer for `lang`: "cs", "en", "generic" or "vertical" (case-insensitive).

        Raises:
            AssertionError: for any other `lang`.
        """
        lang = lang.lower()
        assert lang in ["cs", "en", "generic", "vertical"]
        if lang == "cs":
            self.tokenizer = Tokenizer.newCzechTokenizer()
        elif lang == "en":
            # the English branch used to create the Czech tokenizer (copy-paste error)
            self.tokenizer = Tokenizer.newEnglishTokenizer()
        elif lang == "generic":
            self.tokenizer = Tokenizer.newGenericTokenizer()
        elif lang == "vertical":
            self.tokenizer = Tokenizer.newVerticalTokenizer()
        self.forms = Forms()
        self.tokens = TokenRanges()


    def tokenizeSentences(self, text: str, spans: bool=False):
        """Yield the sentences of `text`.

        Each sentence is the slice of `text` from the start of its first token to the end of
        its last token. With `spans=True` a `(sentence, closedopen(first, last))` pair is
        yielded instead, the interval holding the offsets used for the slice.
        """
        self.tokenizer.setText(text)
        while self.tokenizer.nextSentence(self.forms, self.tokens):
            first = self.tokens[0].start
            last = self.tokens[-1].start + self.tokens[-1].length
            if spans:
                yield text[first:last], closedopen(first, last)
            else:
                yield text[first:last]

    def tokenizeWords(self, text: str, spans: bool=False):
        """Yield the word forms of `text`, sentence by sentence.

        With `spans=True` a `(form, closedopen(start, start + length))` pair is yielded for
        every token instead of the bare form.
        """
        self.tokenizer.setText(text)
        while self.tokenizer.nextSentence(self.forms, self.tokens):
            for form, token in zip(self.forms, self.tokens):
                if spans:
                    first = token.start
                    last = token.start + token.length
                    yield form, closedopen(first, last)
                else:
                    yield form

class StopWordList:
    """Case-insensitive stop-word lookup backed by a text file with one word per line."""

    def __init__(self, fname="/home/drchajan/devel/python/FC/drchajan/data/stopwords/czech.txt"):
        """Load the stop words from `fname`; every line is stripped and lower-cased.

        The file is read as UTF-8 regardless of the locale, and blank lines are skipped so
        that the empty string never ends up in the stop-word set.
        """
        with open(fname, encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            self.stopwords = {word.lower() for word in stripped if word}

    def is_stopword(self, word):
        """Return True when the lower-cased `word` is in the loaded list."""
        return word.lower() in self.stopwords

In [14]:
from jaro import jaro_winkler_metric
import portion as P
import re

from utils.stopwords import StopWordList
from utils.tokenization import MorphoDiTaTokenizer

def unify_spans(spanrecs):
    """Merge the spans of a span record into their union.

    Works on a shallow copy of `spanrecs`. The "span" values are combined with `|` and the
    resulting union is iterated to obtain the new span list, so a span lying inside a larger
    one disappears. A record with no spans is returned (as a copy) without further checks.
    """
    merged_rec = spanrecs.copy()
    pieces = [entry["span"] for entry in merged_rec["spans"]]
    if not pieces:
        return merged_rec

    # only the first entry is inspected for extra keys
    first_entry = merged_rec["spans"][0]
    assert len(first_entry.keys()) == 1, "more information beyond `span` not supported yet!"

    head, *tail = pieces
    combined = head
    for piece in tail:
        combined = combined | piece

    # iterating the union splits it back into its separate intervals
    merged_rec["spans"] = [{"span": part} for part in combined]
    return merged_rec


def emphasize_claim_words_jaro_winkler(claim, doc, min_chars=3, jaro_winkler_threshold=0.8, stopwordlist="/home/drchajan/devel/python/FC/drchajan/data/stopwords/czech.txt", lang="cs"):
    """Return the set of lower-cased `doc` words that should be emphasised w.r.t. `claim`.

    Both texts are tokenized with `MorphoDiTaTokenizer(lang=lang)`; stop words and words
    shorter than `min_chars` are dropped. A document word is selected when
    `jaro_winkler_metric(claim_word, doc_word)` reaches `jaro_winkler_threshold` for at
    least one claim word.

    Args:
        claim: claim text.
        doc: document text.
        min_chars: minimum word length (in characters) to be considered.
        jaro_winkler_threshold: minimum value of the metric for a match.
        stopwordlist: path of the stop-word file handed to `StopWordList`. The default now
            points to the same file as the default of
            `emphasize_claim_words_jaro_winkler_spans`; it used to be a directory path,
            which cannot be opened as a file.
        lang: tokenizer language.
    """
    tokenizer = MorphoDiTaTokenizer(lang=lang)
    stopwords = StopWordList(stopwordlist)

    def content_words(text):
        # lower-cased words that are long enough and are not stop words
        return {
            w.lower()
            for w in tokenizer.tokenizeWords(text)
            if (not stopwords.is_stopword(w)) and len(w) >= min_chars
        }

    claim_words = content_words(claim)
    doc_words = content_words(doc)
    return {
        dw
        for dw in doc_words
        if any(jaro_winkler_metric(cw, dw) >= jaro_winkler_threshold for cw in claim_words)
    }

def emphasize_claim_words_jaro_winkler_spans(claim, doc, min_chars=3, jaro_winkler_threshold=0.8, stopwordlist="/home/drchajan/devel/python/FC/drchajan/data/stopwords/czech.txt", lang="cs"):
    """Locate in `doc` the words selected by `emphasize_claim_words_jaro_winkler`.

    Every selected word is searched in `doc` case-insensitively as a plain substring (no
    word-boundary check) and each occurrence becomes a `P.closedopen(start, end)` span.

    Returns:
        The result of `unify_spans` applied to
        `{"type": "claim_words", "spans": [{"span": interval}, ...]}`.
    """
    # find words to emphasize
    emp_words = emphasize_claim_words_jaro_winkler(claim, doc, min_chars=min_chars, jaro_winkler_threshold=jaro_winkler_threshold, stopwordlist=stopwordlist, lang=lang)
    # find them in `doc` and return the spans
    spans = []
    for w in emp_words:
        # the word is literal text, not a pattern: without escaping, a token such as
        # "83.3%" over-matches and one such as "c++(" raises re.error
        pattern = re.escape(w)
        spans.extend({"span": P.closedopen(m.start(), m.end())} for m in re.finditer(pattern, doc, re.IGNORECASE))

    return unify_spans({"type": "claim_words", "spans": spans})

# Example claim and document; the next cell passes them to
# emphasize_claim_words_jaro_winkler_spans / apply_marks as a quick check of the highlighting.
claim = "Food Network is offered to 83.3% of homes with a television."
doc = """Oprah Winfrey Network American pay television network Oprah Winfrey Network (OWN) is an American multinational basic cable channel jointly owned by Warner Bros. Discovery and Harpo Studios that launched on January 1, 2011, replacing the Discovery Health Channel. Oprah Winfrey Network The network is led by talk show host and namesake Oprah Winfrey and largely features entertainment and lifestyle programming targeting African American audiences, and reruns of talk show programming from the Harpo Studios library (including "The Oprah Winfrey Show"). Initially a 50/50 joint venture, Discovery acquired a larger stake in the network in 2017 and again in December 2020, when Discovery increased its ownership in OWN from 73% to 95%. Harpo remains a "significant" minority stakeholder and Winfrey is contracted with the channel through at least 2025. Oprah Winfrey Network As of February 2015, OWN is available to approximately 81.9 million pay television households (70.3% of households with television) in the United States. History. Development. After becoming Discovery Communications' new CEO in 2007, David Zaslav found Discovery Health to be underperforming along with its other digital cable networks launched in the last decade, and taking in significantly lower carriage fees in comparison to the company's namesake, Discovery Channel. As a result, he began to explore the possibility of re-launching the channel as a joint venture with another partner. Zaslav's wife was an avid reader of Oprah Winfrey's "O" magazine (a joint venture with Hearst Corporation); believing that her values could serve as the basis for a cable network, he contacted Winfrey's agents to hold a meeting in April 2007. On January 15, 2008, Discovery Communications officially announced that it had entered into a joint venture with Winfrey's studio Harpo Productions, under which it would re-launch Discovery Health as "OWN: The Oprah Winfrey Network", in the second half of 2009."""

def apply_marks(txt, spans):
    """Render `txt` as an HTML fragment with the given spans highlighted.

    Args:
        txt: plain text. It is HTML-escaped (`<`, `>`, `&`; quotes are kept) so that markup
            characters in the text cannot break or inject into the rendered fragment.
        spans: list of `{"span": interval}` records; `interval.lower` and `interval.upper`
            are used as the start and end offsets into the unescaped `txt`. The spans are
            processed in order of their start offset and are expected not to overlap.

    Returns:
        The escaped text with every span wrapped in a `<span>` with a background colour.
    """
    from html import escape

    # highlight background colour ('#4F500E' is the alternative used previously)
    col = '#FADBD8'
    pieces = []
    cursor = 0
    for span in sorted(spans, key=lambda s: s["span"].lower):
        first, last = span["span"].lower, span["span"].upper
        pieces.append(escape(txt[cursor:first], quote=False))
        pieces.append(f'<span style="padding:0 0.1em;background-color:{col}">{escape(txt[first:last], quote=False)}</span>')
        cursor = last
    pieces.append(escape(txt[cursor:], quote=False))

    return ''.join(pieces)

In [15]:
# Highlight the example `claim` words in the example `doc`, using the stop-word file of the
# selected language (paths are relative to the working directory set by `%cd` in the first cell).
# If LANG is neither "cs" nor "en", `spans` is not assigned by this cell.
if LANG == "cs":
    spans = emphasize_claim_words_jaro_winkler_spans(claim, doc, jaro_winkler_threshold=0.8, stopwordlist="data/stopwords/cs.txt", lang=LANG)
elif LANG == "en":
    spans = emphasize_claim_words_jaro_winkler_spans(claim, doc, jaro_winkler_threshold=0.8, stopwordlist="data/stopwords/en.txt", lang=LANG)
# last expression of the cell: the marked-up HTML string is shown as the cell output
apply_marks(doc, spans["spans"])


'Oprah Winfrey <span style="padding:0 0.1em;background-color:#FADBD8">Network</span> American pay <span style="padding:0 0.1em;background-color:#FADBD8">television</span> <span style="padding:0 0.1em;background-color:#FADBD8">network</span> Oprah Winfrey <span style="padding:0 0.1em;background-color:#FADBD8">Network</span> (OWN) is an American multinational basic cable channel jointly owned by Warner Bros. Discovery and Harpo Studios that launched on January 1, 2011, replacing the Discovery Health Channel. Oprah Winfrey <span style="padding:0 0.1em;background-color:#FADBD8">Network</span> The <span style="padding:0 0.1em;background-color:#FADBD8">network</span> is led by talk show host and namesake Oprah Winfrey and largely features entertainment and lifestyle programming targeting African American audiences, and reruns of talk show programming from the Harpo Studios library (including "The Oprah Winfrey Show"). Initially a 50/50 joint venture, Discovery acquired a larger stake in the 

In [16]:
from IPython.display import display, HTML


def sample_to_html(sample, lang="cs"):
    """Build the HTML shown to the annotator for one evidence record.

    The fragment has four parts joined by newlines: the claim as a heading, the block id
    ("bid"), a line with target / prediction and the SUP, REF and NEI probabilities as
    percentages, and the context with the claim words highlighted.

    Args:
        sample: record with the keys "probs" (with "SUP", "REF", "NEI"), "pred", "target",
            "bid", "rank", "claim" and "context"; "kw_rank" is optional.
        lang: "cs" or "en"; selects the settings of the claim-word highlighting.
    """
    assert lang in ["cs", "en"]

    def percent(label):
        return f"{100*float(sample['probs'][label]):.2f}%"

    sup, ref, nei = percent("SUP"), percent("REF"), percent("NEI")
    cls, tgt, bid = sample['pred'], sample['target'], sample["bid"]
    # not displayed at the moment; "rank" is still read, so a record without it fails here
    _rank, _kw_rank = sample["rank"], sample.get("kw_rank")

    claim, context = sample["claim"], sample["context"]
    if lang == "cs":
        marks = emphasize_claim_words_jaro_winkler_spans(claim, context, jaro_winkler_threshold=0.8)
    else:
        marks = emphasize_claim_words_jaro_winkler_spans(claim, context, jaro_winkler_threshold=0.8, min_chars=2, stopwordlist="/home/drchajan/devel/python/FC/drchajan/data/stopwords/english_nltk.txt", lang="en")
    context = apply_marks(context, marks["spans"])

    return '\n'.join([
        f'<h1>{claim}</h1>',
        f'<p style="font-family:monospace;font-size:18px">{bid}</p>',
        f'<p style="font-family:helvetica;font-size:18px">target: {tgt} / prediction: {cls}<br/> SUP: {sup}, REF: {ref}, NEI: {nei}</p>',
        f'<p style="font-family:serif;font-size:20px;text-align:justify">{context}</p>',
    ])

def print_predictions(claim2evidence, min_confidence=0.0, lang="cs"):
    """Display a small preview of the rendered evidence records.

    Only the claim at position 10 of `claim2evidence` is shown (hard-coded slice `[10:11]`),
    with at most two of its evidence records that pass the confidence filter.

    Args:
        claim2evidence: mapping claim id -> list of evidence records.
        min_confidence: minimum confidence in percent; a record is kept when 100 times its
            largest value in "probs" is at least this value.
        lang: language forwarded to `sample_to_html`.

    Note: `HTML` and `display` are looked up when the function is called; a later cell of
    this notebook rebinds `HTML` to the ipywidgets class.
    """
    htmls = []
    for evidence_list in list(claim2evidence.values())[10:11]:
        # "probs" is a label -> probability mapping, so the maximum is taken over its values
        selected_evidence = [
            e for e in evidence_list
            if 100 * max(float(p) for p in e["probs"].values()) >= min_confidence
        ]
        for sample in selected_evidence[:2]:
            htmls.append(sample_to_html(sample, lang=lang))
            htmls.append('<hr/>')
    html = '\n'.join(htmls)
    display(HTML(html))

# print_predictions(claim2evidence_filtered)

In [18]:
import ipywidgets as widgets
from ipywidgets import Button, HTML, VBox, HBox
from datetime import datetime

def annotate(claim2evidence, dst_jsonl, l=2, lang="cs"):
    """Build an ipywidgets form for labelling the top evidence of every claim as ok / bad.

    The first `l` evidence records of every claim are annotated in the iteration order of
    `claim2evidence`. Annotations are stored in `dst_jsonl`; when the file already exists,
    annotation resumes after its last record.

    Args:
        claim2evidence: mapping claim id -> list of evidence records (each with "claim_id",
            "bid" and "rank", optionally "kw_rank", plus what `sample_to_html` needs).
        dst_jsonl: path of the JSONL annotation file; rewritten after every change.
        l: number of evidence records taken per claim.
        lang: language forwarded to `sample_to_html`.

    Returns:
        A `VBox` with the rendered sample, the Ok / Bad / Previous buttons and a counter.

    Raises:
        AssertionError: when the existing annotation file does not line up with the samples
            (different "bid" / "rank", or more records than samples).
    """
    dst_jsonl = Path(dst_jsonl)
    samples = [sample for evidence_list in claim2evidence.values() for sample in evidence_list[:l]]

    recs = []
    if dst_jsonl.is_file():
        recs = read_jsonl(dst_jsonl)
        assert len(recs) <= len(samples), "Existing annotation file does not match data!"
        for s, r in zip(samples, recs):
            assert s['bid'] == r['bid'] and s['rank'] == r['rank'], "Existing annotation file does not match data!"
    # invariant kept by the handlers below: idx == len(recs) == index of the sample shown
    idx = len(recs)

    html = HTML()
    btn_ok = Button(description='Ok', icon='thumbs-up')
    btn_bad = Button(description='Bad', icon='ban')
    btn_prev = Button(description='Previous', icon='backward')
    count_html = HTML()

    def refresh():
        # show the current sample, or a final message once everything is annotated
        if idx < len(samples):
            html.value = sample_to_html(samples[idx], lang=lang)
            count_html.value = f"{idx+1}/{len(samples)}"
        else:
            html.value = "<h1>All samples are annotated.</h1>"
            count_html.value = f"{len(samples)}/{len(samples)}"

    def save_annotation(label):
        nonlocal idx
        if idx >= len(samples):
            # nothing left to annotate; ignore further clicks instead of failing
            return
        date = datetime.now().strftime("%y%m%d_%H%M%S")
        s = samples[idx]
        rec = {"claim_id": s["claim_id"], "bid": s["bid"], "rank": s["rank"], "kw_rank": s.get("kw_rank"), "label": label, "date": date}
        recs.append(rec)
        idx += 1
        write_jsonl(dst_jsonl, recs)
        refresh()

    def btn_ok_eventhandler(obj):
        save_annotation("ok")

    def btn_bad_eventhandler(obj):
        save_annotation("bad")

    def btn_prev_eventhandler(obj):
        nonlocal idx
        if len(recs) > 0:
            recs.pop()
            idx -= 1
            # keep the file in sync, otherwise the withdrawn label would survive a restart
            write_jsonl(dst_jsonl, recs)
            refresh()

    refresh()

    btn_ok.on_click(btn_ok_eventhandler)
    btn_bad.on_click(btn_bad_eventhandler)
    btn_prev.on_click(btn_prev_eventhandler)

    form = VBox([html, HBox([btn_ok, btn_bad, HTML("&nbsp;"*30), btn_prev, count_html])])
    return form

# Annotation guideline: judge the quality of evidence retrieval - can the given article help
# to confirm / refute the claim?
# wrongly formed claims are still acceptable (if understandable)
# a document that would explain a likely mix-up in the claim is ok as well
# I accept the leading title, or a section such as "personal life" if that is where I would look...

# annotate(claim2evidence, f"data/ncaa/er_annotations/{DIR}/{LANG}_annotations.jsonl", l=2, lang=LANG)
annotate(claim2evidence, f"data/ncaa/er_annotations/{DIR}/{LANG}_annotations_test.jsonl", l=2, lang=LANG)

VBox(children=(HTML(value='<h1>Bible Quiz is used by the WBQA.</h1>\n<p style="font-family:monospace;font-size…

In [20]:
def get_annotation_stats(model2cfg, return_df=True, ignore_leading=False):
    """Aggregate ok / bad annotations into per-model, per-method retrieval statistics.

    Args:
        model2cfg: mapping model name -> config dict; `cfg["file"]` is the JSONL annotation
            file read with `read_jsonl`. Every record needs "claim_id" (of the form
            "<method>:<id>"), "bid" and "label" ("ok" or "bad").
        return_df: return a `pandas.DataFrame` instead of a list of dicts.
        ignore_leading: drop records whose "bid" ends with "_1" before computing the scores.
            When False, claims that do not have exactly two labels are skipped (and reported).

    Returns:
        One row per (model, method) with:
            n      number of claims scored,
            lead   percent of records in the whole annotation file whose "bid" ends with
                   "_1" (same value for all methods of a model; NaN for an empty file),
            MRR@2  100 * mean reciprocal rank of the first "ok" label (0 when there is none),
            P@2    100 * mean fraction of "ok" labels per claim.
    """
    results = []

    for model, cfg in model2cfg.items():
        annotation_file = cfg["file"]
        res = read_jsonl(annotation_file)
        # an empty annotation file yields no rows instead of a ZeroDivisionError
        n_leading = sum(1 for r in res if r['bid'].endswith("_1"))
        par1fraction = 100.0 * n_leading / len(res) if len(res) > 0 else np.nan

        claim2labels = defaultdict(list)

        # documents for single claims must be sorted by the rank!
        for r in res:
            if (not ignore_leading) or (not r["bid"].endswith("_1")):
                claim2labels[r["claim_id"]].append(r["label"])

        method2mrr = defaultdict(list)
        method2p = defaultdict(list)
        method2failed1 = defaultdict(list)
        method2failed2 = defaultdict(list)
        method2bothok = defaultdict(list)
        method2bothfail = defaultdict(list)

        for cid, labels in claim2labels.items():
            method = cid.split(":")[0]

            assert all(l in ['ok', 'bad'] for l in labels), labels
            if (not ignore_leading) and len(labels) != 2:
                # just for this experiment!
                print(f"skipping for cid: {cid}!")
                continue

            # reciprocal rank of the first relevant document; 0.0 when none is relevant
            first_ok = next((i for i, label in enumerate(labels) if label == "ok"), None)
            mrr = 0.0 if first_ok is None else 1/(first_ok+1)

            correct = [l == "ok" for l in labels]
            precision = np.mean(correct)

            # outcome patterns of the two-document case; currently not part of the output
            if len(labels) == 2:
                failed1 = 1.0 if labels[0] == 'bad' and labels[1] == 'ok' else 0.0
                failed2 = 1.0 if labels[0] == 'ok' and labels[1] == 'bad' else 0.0
                bothok = 1.0 if labels[0] == 'ok' and labels[1] == 'ok' else 0.0
                bothfail = 1.0 if labels[0] == 'bad' and labels[1] == 'bad' else 0.0
            else:
                failed1 = 1.0 if labels[0] == "bad" else 0.0
                failed2 = np.nan
                bothok = np.nan
                bothfail = np.nan

            method2mrr[method].append(mrr)
            method2p[method].append(precision)
            method2failed1[method].append(failed1)
            method2failed2[method].append(failed2)
            method2bothok[method].append(bothok)
            method2bothfail[method].append(bothfail)

        for method in sorted(method2mrr.keys()):
            n = len(method2mrr[method])
            mrr2 =  100*np.mean(method2mrr[method])
            p2 =  100*np.mean(method2p[method])
            failed1 =  100*np.mean(method2failed1[method])
            failed2 =  100*np.mean(method2failed2[method])
            bothok =  100*np.mean(method2bothok[method])
            bothfail =  100*np.mean(method2bothfail[method])
            results.append({
                "model": model,
                "method": method,
                "n": n,
                "lead": par1fraction, # percent of leading (1st paragraphs) retrieved
                "MRR@2": mrr2,
                "P@2": p2,
                # "both_ok": bothok,
                # "both_fail": bothfail,
                # "failed1":failed1,
                # "failed2": failed2
                })
    if return_df:
        return pd.DataFrame(results)
    return results

In [21]:
# Directory names used in the annotation paths below:
# DIR = "fever2fullwiki" # FEVER testing, QACG-SUM model, QACG corpora
# DIR = "feverfever2fullwiki" # FEVER testing, FEVER model, QACG corpora
# DIR = "fullwiki2fullwiki" # QACG testing, QACG-SUM model, QACG corpora
# DIR = "fullwikifever2fullwiki" # QACG testing, FEVER model, QACG corpora

# English annotations, QACG test claims: QACG-SUM model vs. FEVER model
df = get_annotation_stats({
    "QACG-SUM": {"file": "data/ncaa/er_annotations/fullwiki2fullwiki/en_annotations.jsonl"},
    "FEVER": {"file": "data/ncaa/er_annotations/fullwikifever2fullwiki/en_annotations.jsonl"},
    }, ignore_leading=False)

# the anserini rows get the model name "NA"
# NOTE(review): presumably because the Anserini results do not depend on the model - confirm
df.loc[df.method == "anserini","model"] = "NA"
# df.sort_values("MRR@2", ascending=False)
df

Unnamed: 0,model,method,n,lead,MRR@2,P@2
0,,anserini,100,34.125,90.5,71.0
1,QACG-SUM,colbert,100,34.125,95.0,68.5
2,QACG-SUM,colbert_ans,100,34.125,94.0,73.0
3,QACG-SUM,colbert_nli,100,34.125,81.0,65.5
4,FEVER,colbert,100,66.833333,65.0,44.5
5,FEVER,colbert_ans,100,66.833333,76.5,55.5
6,FEVER,colbert_nli,100,66.833333,63.0,46.0


In [22]:
# Czech annotations from the fullwiki2fullwiki / fullwikifever2fullwiki directories:
# QACG-SUM model vs. FEVER model
df = get_annotation_stats({
    "QACG-SUM": {"file": "data/ncaa/er_annotations/fullwiki2fullwiki/cs_annotations.jsonl"},
    "FEVER": {"file": "data/ncaa/er_annotations/fullwikifever2fullwiki/cs_annotations.jsonl"},
    }, ignore_leading=False)

# the anserini rows get the model name "NA"
df.loc[df.method == "anserini","model"] = "NA"
# df.sort_values("MRR@2", ascending=False)
df

Unnamed: 0,model,method,n,lead,MRR@2,P@2
0,,anserini,100,50.0,86.5,66.0
1,QACG-SUM,colbert,100,50.0,86.5,63.5
2,QACG-SUM,colbert_ans,100,50.0,89.0,66.5
3,QACG-SUM,colbert_nli,100,50.0,84.0,63.0
4,FEVER,colbert,100,72.666667,50.0,33.5
5,FEVER,colbert_ans,100,72.666667,54.5,38.5
6,FEVER,colbert_nli,100,72.666667,50.0,34.0


In [23]:
# English annotations from the fever2fullwiki / feverfever2fullwiki directories:
# QACG-SUM model vs. FEVER model
df = get_annotation_stats({
    "QACG-SUM": {"file": "data/ncaa/er_annotations/fever2fullwiki/en_annotations.jsonl"},
    "FEVER": {"file": "data/ncaa/er_annotations/feverfever2fullwiki/en_annotations.jsonl"},
    }, ignore_leading=False)

# the anserini rows get the model name "NA"
df.loc[df.method == "anserini","model"] = "NA"
# df.sort_values("MRR@2", ascending=False)
df

Unnamed: 0,model,method,n,lead,MRR@2,P@2
0,,anserini,100,43.75,67.5,57.0
1,QACG-SUM,colbert,100,43.75,60.0,49.5
2,QACG-SUM,colbert_ans,100,43.75,66.5,58.0
3,QACG-SUM,colbert_nli,100,43.75,57.0,46.0
4,FEVER,colbert,100,73.333333,72.0,56.5
5,FEVER,colbert_ans,100,73.333333,74.5,60.0
6,FEVER,colbert_nli,100,73.333333,65.0,52.5


In [24]:
# Czech annotations from the fever2fullwiki / feverfever2fullwiki directories:
# QACG-SUM model vs. FEVER model
df = get_annotation_stats({
    "QACG-SUM": {"file": "data/ncaa/er_annotations/fever2fullwiki/cs_annotations.jsonl"},
    "FEVER": {"file": "data/ncaa/er_annotations/feverfever2fullwiki/cs_annotations.jsonl"},
    }, ignore_leading=False)

# the anserini rows get the model name "NA"
df.loc[df.method == "anserini","model"] = "NA"
# df.sort_values("MRR@2", ascending=False)
df

Unnamed: 0,model,method,n,lead,MRR@2,P@2
0,,anserini,100,46.75,61.0,46.5
1,QACG-SUM,colbert,100,46.75,52.5,37.0
2,QACG-SUM,colbert_ans,100,46.75,54.5,41.5
3,QACG-SUM,colbert_nli,100,46.75,46.5,31.5
4,FEVER,colbert,100,75.833333,78.5,49.0
5,FEVER,colbert_ans,100,75.833333,75.5,49.0
6,FEVER,colbert_nli,100,75.833333,59.5,40.5


In [29]:
def check_inconsistencies(annotation_files, claim2evidence_files):
    """Report (claim, document) pairs that received different labels in different annotations.

    Args:
        annotation_files: paths of JSONL annotation files (records with "claim_id" of the
            form "<name>:<cid>", "bid" and "label").
        claim2evidence_files: mapping name -> path of the claim2evidence JSON file of that
            name; used to look up the claim text and the document context.

    Annotations whose claim id is not present in the claim2evidence file of their name are
    ignored. Annotations are grouped by (claim text, bid, cid) across all files; every group
    with more than one distinct label is printed together with the document context, and the
    number of such groups is printed at the end.

    Raises:
        RuntimeError: when the document of an inconsistent group cannot be found among the
            evidence of the claim.
    """
    claim_bid_cid2name_label = defaultdict(list)
    c2e = {name: read_json(path) for name, path in claim2evidence_files.items()}

    for annotation_file in annotation_files:
        print(f"inconsistencies for: {annotation_file}")
        res = read_jsonl(annotation_file)

        # find all (claim, bid) duplicities
        for r in res:
            # split on the first colon only: the original claim id may contain colons itself
            name, cid = r["claim_id"].split(":", 1)
            if cid not in c2e[name]:
                continue
            claim = c2e[name][cid][0]["claim"]
            claim_bid_cid2name_label[(claim, r["bid"], cid)].append((name, r["label"]))

    # find if the label in all (claim, bid) dupes is consistent
    cnt = 0
    for claim_bid_cid, name_label in claim_bid_cid2name_label.items():
        label_set = set(label for _, label in name_label)
        claim, bid, cid = claim_bid_cid
        if len(label_set) <= 1:
            continue

        for name, label in name_label:
            print(f"\t{label}: {name}:{cid}")

        # the evidence is looked up under the name of the last annotation of the group
        name = name_label[-1][0]
        evidence_list = c2e[name][cid]
        # first record with the matching bid; the last record when there is none
        ev = next((e for e in evidence_list if e["bid"] == bid), evidence_list[-1])

        print(f'"{claim}"')
        print(f"bid: {bid}\n")
        print(textwrap.fill(ev["context"]))
        if ev["bid"] != bid:
            for x in evidence_list:
                print("| ", x["bid"])
            print(f"claim_bid_cid: {claim_bid_cid}")
            print(f"ev: {ev['bid']}")
            print(f"label_set: {label_set}")
            raise RuntimeError("BID missmatch")
        print("----------------------------------")
        cnt += 1
    print(f"found {cnt} inconsistencies")
        

# Consistency check of the labels between the two models of one test set.
# NOTE: this cell rebinds the module-level LANG, DIR and claim2evidence_files used by earlier cells.
# LANG = "cs"
LANG = "en"

# DIR = "fever2fullwiki" # FEVER testing, QACG-SUM model, QACG corpora
# DIR = "feverfever2fullwiki" # FEVER testing, FEVER model, QACG corpora
# DIR = "fullwiki2fullwiki" # QACG testing, QACG-SUM model, QACG corpora
DIR = "fullwikifever2fullwiki" # QACG testing, FEVER model, QACG corpora

# claim texts and contexts are looked up in the claim2evidence files of the selected DIR
claim2evidence_files = {
    "anserini": f"data/ncaa/er_annotations/{DIR}/{LANG}_claim2evidence_anserini.json",
    "colbert":  f"data/ncaa/er_annotations/{DIR}/{LANG}_claim2evidence_colbert.json",
    "colbert_ans": f"data/ncaa/er_annotations/{DIR}/{LANG}_claim2evidence_colbert_anserini_filtered.json",
    "colbert_nli": f"data/ncaa/er_annotations/{DIR}/{LANG}_claim2evidence_colbert_nli_sorted.json"
}

# the annotation files of both directories sharing the test set of DIR are compared
if DIR in ["fullwiki2fullwiki", "fullwikifever2fullwiki"]:
    check_inconsistencies([
        f"data/ncaa/er_annotations/fullwiki2fullwiki/{LANG}_annotations.jsonl",
        f"data/ncaa/er_annotations/fullwikifever2fullwiki/{LANG}_annotations.jsonl",
        ], claim2evidence_files)
else:
    check_inconsistencies([
        f"data/ncaa/er_annotations/fever2fullwiki/{LANG}_annotations.jsonl",
        f"data/ncaa/er_annotations/feverfever2fullwiki/{LANG}_annotations.jsonl"
        ], claim2evidence_files)    

inconsistencies for: data/ncaa/er_annotations/fullwiki2fullwiki/en_annotations.jsonl
inconsistencies for: data/ncaa/er_annotations/fullwikifever2fullwiki/en_annotations.jsonl
found 0 inconsistencies
