In [1]:
from collections import defaultdict, OrderedDict, Counter
import numpy as np
import pandas as pd
from pathlib import Path
import textwrap
from tqdm import tqdm

%cd /home/drchajan/devel/python/FC/automated-fact-checking
%load_ext autoreload
%autoreload 2

from aic_nlp_utils.json import read_jsonl, read_json, write_json, write_jsonl
from aic_nlp_utils.encoding import nfc
from aic_nlp_utils.fever import fever_detokenize

from factsearch.mark import EmphasizeClaimWords

# from evaluation.document_retrieval import collect_eval_data, retriever_score
# from evaluation.nli import LABEL_NUM, LABEL_STR, load_rte_model, evaluate_examples, evaluate_all_nli
# from evaluation.fcheck_pipeline import read_fever_db, collect_scores_full, evaluate_full, extract_texts_ctk, InputSplitter

/home/drchajan/devel/python/FC/automated-fact-checking


  from tqdm.autonotebook import tqdm


### Resubmission CsFEVER Annotation & Evaluation

#### Reannotation adding balanced datasets and info on allowable grammatical errors

The blind annotation procedure was also improved.

In [9]:
# Shared path fragments used to assemble the PVI_DIR_* locations below.
_FACTCHECK_ROOT = "/mnt/data/factcheck"
_QACG_CHECKPOINTS = "mt5-large_all-cp126k/mt5-large_all-cp156k"
_QACG_NLI_CS = f"{_FACTCHECK_ROOT}/wiki/cs/20230801/qacg/nli/PAV-ner-CNEC/{_QACG_CHECKPOINTS}"
_QACG_NLI_EN = f"{_FACTCHECK_ROOT}/wiki/en/20230801/qacg/nli/stanza/{_QACG_CHECKPOINTS}"

# Directories holding the calibrated PVI files (plain strings, absolute paths).
PVI_DIR_CS = f"{_QACG_NLI_CS}/pvi_calibrated"
PVI_DIR_FCS = f"{_FACTCHECK_ROOT}/NLI/csfever_nli_cls/pvi_calibrated"
PVI_DIR_EN = f"{_QACG_NLI_EN}/pvi_calibrated"
PVI_DIR_FEN = f"{_FACTCHECK_ROOT}/NLI/nli_fever_cls/pvi_calibrated"
PVI_DIR_CS_FS = f"{_QACG_NLI_CS}/pvi_fever_size_calibrated"
PVI_DIR_EN_FS = f"{_QACG_NLI_EN}/pvi_fever_size_calibrated"

In [10]:
from IPython.display import display, HTML


def apply_marks(txt, spans):
    """Wrap the given character spans of ``txt`` in highlighted ``<span>`` tags.

    Each element of ``spans`` is a mapping whose ``"span"`` entry exposes
    ``lower`` and ``upper`` character offsets into ``txt``.  Spans are applied
    in the order given, so they are expected to be sorted and non-overlapping.
    The text is inserted verbatim (it is not HTML-escaped).

    Returns ``txt`` itself when ``spans`` is empty, otherwise the marked-up string.
    """
    if len(spans) == 0:
        return txt

    highlight = '#FADBD8'  # background colour of the highlighted words
    pieces = []
    cursor = 0  # end offset of the previously emitted span
    for entry in spans:
        interval = entry["span"]
        start, stop = interval.lower, interval.upper
        pieces += [
            txt[cursor:start],  # untouched text before the span
            f'<span style="padding:0 0.1em;background-color:{highlight}">{txt[start:stop]}</span>',
        ]
        cursor = stop
    pieces.append(txt[cursor:])  # tail after the last span

    return ''.join(pieces)

def sample_to_html(sample, lang="cs"):
    """Render one NLI sample as an HTML fragment.

    The fragment consists of the claim as a heading, a line with the gold
    label (SUP/REF/NEI) and the PVI value, and the context paragraph in which
    the spans returned by ``EmphasizeClaimWords.emphasize_spans`` are
    highlighted via ``apply_marks``.

    Args:
        sample: mapping with the keys "claim", "context", "label" and "PVI".
        lang: "cs" or "en"; selects the stop-word list (and ``min_chars=2`` for English).

    Returns:
        The HTML string.
    """
    assert lang in ["cs", "en"]
    claim, context = sample["claim"], sample["context"]
    label = sample['label']
    pvi = sample['PVI']

    if lang == "cs":
        emphasizer = EmphasizeClaimWords(lang=lang, threshold=0.8, stopword_list="data/stopwords/cs.txt")
    elif lang == "en":
        emphasizer = EmphasizeClaimWords(lang=lang, threshold=0.8, stopword_list="data/stopwords/en.txt", min_chars=2)
    found = emphasizer.emphasize_spans(claim, context)
    marked_context = apply_marks(context, found["spans"])

    label_names = {"s": "SUP", "r": "REF", "n": "NEI"}
    return '\n'.join([
        f'<h1>{claim}</h1>',
        f'<p style="font-family:monospace;font-size:18px">{label_names[label]} PVI={pvi}</p>',
        f'<p style="font-family:serif;font-size:20px;text-align:justify">{marked_context}</p>',
    ])

In [11]:
import ipywidgets as widgets
from ipywidgets import Button, Checkbox, HTML, VBox, HBox
from datetime import datetime

def annotate2(srcs, dst_jsonl, k=50, filter=None, lang="cs", seed=1234, skip_annotated=None):
    """Return an ipywidgets form for annotating randomly sampled NLI pairs.

    For each entry of ``srcs`` and each gold label "s", "r", "n", ``k`` records
    are drawn without replacement from a ``RandomState(seed)``, tagged with
    their source name, and the pooled selection is shuffled.  Every click on a
    label button appends one record and rewrites ``dst_jsonl``.  If that file
    already exists its records must match the leading samples, and annotation
    resumes right after them.

    Args:
        srcs: mapping of source name to a JSONL file readable by ``read_jsonl``.
        dst_jsonl: output JSONL path, rewritten in full on every saved annotation.
        k: number of samples drawn per source and label.
        filter: accepted but not used by this function.
        lang: language code forwarded to ``sample_to_html``.
        seed: seed of the sampling/shuffling random generator.
        skip_annotated: optional JSONL file; records whose (claim, context)
            pair occurs in it are excluded from sampling.

    Returns:
        The ``VBox`` form, or ``None`` (after printing "All annotated!") when
        ``dst_jsonl`` already covers every sample.
    """
    # (claim, context) pairs handled in an earlier round are never drawn again.
    if skip_annotated:
        already_done = {(s['claim'], s['context']) for s in read_jsonl(skip_annotated)}
    else:
        already_done = set()

    # Stratified draw: k per (source, label).  The order of RNG calls fixes the selection.
    rng = np.random.RandomState(seed)
    samples = []
    for src_name, src_jsonl in srcs.items():
        src = read_jsonl(src_jsonl)
        for label in ["s", "r", "n"]:
            candidates = [s for s in src if s["label"] == label and (s['claim'], s['context']) not in already_done]
            for cand in candidates:
                cand["source"] = src_name
            samples.extend(rng.choice(candidates, k, replace=False))

    samples = rng.permutation(samples)

    dst_jsonl = Path(dst_jsonl)

    # Resume support: previously saved records must line up with the regenerated sample order.
    recs = read_jsonl(dst_jsonl) if dst_jsonl.is_file() else []
    for s, r in zip(samples, recs):
        assert all(s[key] == r[key] for key in ('claim', 'context', 'source')), f"Existing annotation file does not match data! {dst_jsonl}"
    idx = len(recs)
    if idx >= len(samples):
        print("All annotated!")
        return

    html = HTML(sample_to_html(samples[idx], lang=lang))
    btn_sup = Button(description='Supports', icon='thumbs-up')
    btn_ref = Button(description='Refutes', icon='ban')
    btn_nei = Button(description='NEI', icon='question')
    sel_accerr = Checkbox(description='Acceptable Error', value=False)
    btn_wrong = Button(description='Wrong', icon='wrong')
    btn_prev = Button(description='Previous', icon='backward')
    count_html = HTML()

    def show_count():
        count_html.value = f"{idx+1}/{len(samples)}"

    show_count()

    def save_annotation(label):
        """Store ``label`` for the current sample, persist, and move to the next one."""
        nonlocal idx
        if idx >= len(samples):
            return
        current = samples[idx]
        recs.append({
            "claim": current["claim"],
            "context": current["context"],
            "label": current["label"],
            "PVI": current["PVI"],
            "annotated_label": label,
            "source": current["source"],
            "date": datetime.now().strftime("%y%m%d_%H%M%S"),
            "acceptable_error": sel_accerr.value,
        })
        write_jsonl(dst_jsonl, recs)
        idx += 1
        if idx < len(samples):
            sel_accerr.value = False
            html.value = sample_to_html(samples[idx], lang=lang)
            show_count()

    def btn_prev_eventhandler(obj):
        """Step back one sample and drop its in-memory record (the file is rewritten on the next save)."""
        nonlocal idx
        if not recs:
            return
        idx -= 1
        recs.pop()
        html.value = sample_to_html(samples[idx], lang=lang)
        show_count()

    # Each label button stores its one-letter code for the current sample.
    for button, code in ((btn_sup, "s"), (btn_ref, "r"), (btn_nei, "n"), (btn_wrong, "w")):
        button.on_click(lambda _btn, code=code: save_annotation(code))
    btn_prev.on_click(btn_prev_eventhandler)

    controls = HBox([btn_sup, btn_ref, btn_nei, btn_wrong, sel_accerr, HTML("&nbsp;"*30), btn_prev, count_html])
    return VBox([html, controls])

In [14]:
# Czech annotation round 2: 50 samples per label are drawn from each source below;
# (claim, context) pairs already present in the round-1 file are skipped.
srcs = {
    "CsFEVER": Path(PVI_DIR_FCS, 'test_nli.jsonl'),
    "QACGCS": Path(PVI_DIR_CS, 'test_balanced.jsonl'),
    # "QACGCS FS": Path(PVI_DIR_CS_FS, 'test_balanced.jsonl'), # it actually has the same distribution as QACGCS :(
}

# Round-1 call, kept for reference:
# annotate2(srcs, "/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/cs_nli_human_annotations1.jsonl", k=50, lang="cs", seed=1234)

# The returned widget form is the cell output.
annotate2(srcs, "/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/cs_nli_human_annotations2.jsonl", k=50, lang="cs", seed=1234, 
          skip_annotated="/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/cs_nli_human_annotations1.jsonl")

VBox(children=(HTML(value='<h1>Název mužské čtyřhry je ATP.</h1>\n<p style="font-family:monospace;font-size:18…

In [12]:
# English annotation round 2: 50 samples per label are drawn from each source below;
# (claim, context) pairs already present in the round-1 file are skipped.
srcs = {
    "EnFEVER": Path(PVI_DIR_FEN, 'test_nli.jsonl'),
    "QACGEN": Path(PVI_DIR_EN, 'test_balanced.jsonl'),
    # "QACGEN FS": Path(PVI_DIR_EN_FS, 'test_balanced.jsonl'), # actually the same distribution as QACGEN
}

# Round-1 call, kept for reference:
# annotate2(srcs, "/mnt/data/factcheck/wiki/en/20230801/qacg/nli/en_nli_human_annotations1.jsonl", k=50, lang="en", seed=1234)

# Prints "All annotated!" instead of showing the form once the output file covers every sample.
annotate2(srcs, "/mnt/data/factcheck/wiki/en/20230801/qacg/nli/en_nli_human_annotations2.jsonl", k=50, lang="en", seed=1234,
          skip_annotated="/mnt/data/factcheck/wiki/en/20230801/qacg/nli/en_nli_human_annotations1.jsonl")

All annotated!


- Wrong
  - incomplete: "Canis is the genus listed under."
  - too general: "There are three extended plays."; at least one named entity (inc. date) must be given
  - date: "Audi currently uses the slogan, 'Advancement through Technology.'"
- SUP/REF
  - leading section/seemingly complete list (enumeration); otherwise NEI

- SUP: "Queen Latifah has won three Academy Awards." (actually won more, "at least" not needed), for other claims I may want exact number!

Unlike FEVER, NLI evaluation does not judge the retrieval???

Example where REF fails: "One of the most popular regular swimwear brands is Adidas." There is a list of swimwear brands...

QACG problem: repeating named entities: ""

The following will be used for new annotations. Here, it is used only to revisit failed claims and acceptable errors.

In [5]:
class AnnotatedData:
    """Cursor over annotation records loaded from one or more JSONL files.

    All records are kept in memory; ``filter`` only decides which of them the
    cursor visits.  ``save_all`` always writes every record, visited or not.
    """

    def __init__(self, src_jsonls, filter=lambda s: True):
        """Load and concatenate ``src_jsonls``; keep the positions of records accepted by ``filter``."""
        loaded = []
        for src_jsonl in src_jsonls:
            loaded.extend(read_jsonl(src_jsonl))
        self.__samples = loaded
        assert len(self.__samples) > 0
        self.__indices = [pos for pos, sample in enumerate(self.__samples) if filter(sample)]
        self.__idx = 0

    def idx(self):
        """Current cursor position within the filtered selection (0-based)."""
        return self.__idx

    def len(self):
        """Number of records in the filtered selection."""
        return len(self.__indices)

    def get(self):
        """Return a shallow copy of the record under the cursor."""
        position = self.__indices[self.idx()]
        sample = self.__samples[position]
        assert isinstance(sample, dict)
        return sample.copy()

    def set(self, k, v, replace=False):
        """Set key ``k`` of the record under the cursor; overwriting an existing key requires ``replace=True``."""
        position = self.__indices[self.idx()]
        sample = self.__samples[position]
        if not replace:
            assert k not in sample
        sample[k] = v

    def next(self):
        """Advance the cursor unless it is already on the last record; return the new position."""
        if self.idx() + 1 < self.len():
            self.__idx += 1
        return self.idx()

    def prev(self):
        """Move the cursor back unless it is already on the first record; return the new position."""
        if self.idx() <= 0:
            return self.idx()
        self.__idx -= 1
        return self.idx()

    def save_all(self, dst_jsonl):
        """Write every loaded record to ``dst_jsonl``.

        Raises ValueError if the file already exists with a different number
        of records, as a guard against overwriting an unrelated file.
        """
        target = Path(dst_jsonl)
        if target.is_file():
            on_disk = read_jsonl(dst_jsonl)
            if len(on_disk) != len(self.__samples):
                raise ValueError(f"Suspicious: {dst_jsonl} exists and has different length!")
        write_jsonl(dst_jsonl, self.__samples)


In [33]:
import ipywidgets as widgets
from ipywidgets import Button, Checkbox, HTML, VBox, HBox
from datetime import datetime


def inspect_annotations(data, dst_jsonl, lang="cs"):
    """Return an ipywidgets form for browsing and (re-)annotating ``data``.

    The form shows the sample under the cursor of ``data``, highlights the
    button of its stored annotation (if any) and mirrors its stored
    ``acceptable_error`` flag in the checkbox.  Clicking a label button stores
    the label, the checkbox state and a timestamp on the sample, calls
    ``data.save_all(dst_jsonl)`` and moves to the next sample.  Prev/Next only
    move the cursor.

    Args:
        data: object with ``get``, ``set``, ``next``, ``prev``, ``idx``,
            ``len`` and ``save_all`` methods (e.g. ``AnnotatedData``).
        dst_jsonl: path handed to ``data.save_all`` after every annotation.
        lang: language code forwarded to ``sample_to_html``.

    Returns:
        The ``VBox`` form.
    """
    html = HTML()
    btn_sup = Button(description='Supports', icon='thumbs-up')
    btn_ref = Button(description='Refutes', icon='ban')
    btn_nei = Button(description='NEI', icon='question')
    sel_accerr = Checkbox(description='Acceptable Failure', value=False)
    btn_wrong = Button(description='Failed', icon='wrong')
    btn_prev = Button(description='Prev', icon='backward')
    btn_next = Button(description='Next', icon='forward')
    count_html = HTML()

    def show_sample():
        """Render the sample under the cursor and sync every control with it."""
        sample = data.get()
        html.value = sample_to_html(sample, lang=lang)
        count_html.value = f"{data.idx()+1}/{data.len()}"
        # Always refresh the controls: a sample without a stored annotation must
        # clear the highlight and checkbox left over from the previous sample.
        label = sample.get('annotated_label')
        btn_sup.button_style = 'success' if label == 's' else ''
        btn_ref.button_style = 'success' if label == 'r' else ''
        btn_nei.button_style = 'success' if label == 'n' else ''
        btn_wrong.button_style = 'danger' if label == 'w' else ''
        sel_accerr.value = sample.get('acceptable_error', False)

    show_sample()

    def save_annotation(label):
        """Store the annotation on the current sample, persist everything, advance."""
        data.set("annotated_label", label, replace=True)
        data.set("acceptable_error", sel_accerr.value, replace=True)
        data.set("date", datetime.now().strftime("%y%m%d_%H%M%S"), replace=True)
        data.save_all(dst_jsonl)
        data.next()
        show_sample()

    def btn_sup_eventhandler(obj):
        save_annotation("s")

    def btn_ref_eventhandler(obj):
        save_annotation("r")

    def btn_nei_eventhandler(obj):
        save_annotation("n")

    def btn_wrong_eventhandler(obj):
        save_annotation("w")

    def btn_prev_eventhandler(obj):
        data.prev()
        show_sample()

    def btn_next_eventhandler(obj):
        data.next()
        show_sample()

    btn_sup.on_click(btn_sup_eventhandler)
    btn_ref.on_click(btn_ref_eventhandler)
    btn_nei.on_click(btn_nei_eventhandler)
    btn_wrong.on_click(btn_wrong_eventhandler)
    btn_prev.on_click(btn_prev_eventhandler)
    btn_next.on_click(btn_next_eventhandler)

    form = VBox([html,
                 HBox([btn_sup, btn_ref, btn_nei, btn_wrong, HTML("&nbsp;"*30), btn_prev, btn_next, count_html]),
                 HBox([sel_accerr])])
    return form

In [64]:
# Revisit the Czech samples flagged as acceptable errors in the merged annotation file.
# The file is both the source and the destination, so edits are written back in place.
data = AnnotatedData([
    # "/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/cs_nli_human_annotations1.jsonl",
    # "/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/cs_nli_human_annotations2.jsonl",
    "/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/cs_nli_human_annotations.jsonl"
    ],
    # filter=lambda s: s["annotated_label"] == 'w'
    filter=lambda s: s["acceptable_error"]
    )

# The returned widget form is the cell output.
inspect_annotations(data,  "/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/cs_nli_human_annotations.jsonl", lang="cs")

VBox(children=(HTML(value='<h1>Něrčiná smlouva byla uzavřena v roce 1689.</h1>\n<p style="font-family:monospac…

In [37]:
# Revisit the English samples annotated as failed ('w') in the merged annotation file.
# The file is both the source and the destination, so edits are written back in place.
data = AnnotatedData([
    # "/mnt/data/factcheck/wiki/en/20230801/qacg/nli/en_nli_human_annotations1.jsonl"
    # "/mnt/data/factcheck/wiki/en/20230801/qacg/nli/en_nli_human_annotations2.jsonl"
    "/mnt/data/factcheck/wiki/en/20230801/qacg/nli/en_nli_human_annotations.jsonl"
    ],
    filter=lambda s: s["annotated_label"] == 'w'
    # filter=lambda s: s["annotated_label"] == 'n'
    # filter=lambda s: s["acceptable_error"]
    )

# The returned widget form is the cell output.
inspect_annotations(data,  "/mnt/data/factcheck/wiki/en/20230801/qacg/nli/en_nli_human_annotations.jsonl", lang="en")

VBox(children=(HTML(value='<h1>0.20% of the population is Asian.</h1>\n<p style="font-family:monospace;font-si…

In [19]:
def compute_stats(src_file, select="total"):
    """Summarise human annotations per data source as percentages.

    Args:
        src_file: JSONL file whose records carry the keys "source", "label",
            "annotated_label" and "acceptable_error".
        select: "total" to use every record, or "s" / "r" / "n" to restrict
            the statistics to records with that gold label.

    Returns:
        A DataFrame with one row per source (sorted by name) and the columns
        ``name`` and, prefixed with ``{select}_``:

        * ``F``  - share of records annotated as failed ('w');
        * ``M``  - share of non-failed records whose annotation differs from the gold label;
        * ``FS`` - strict failure share: failed or flagged as acceptable error;
        * ``MS`` - mislabel share among records that are neither failed nor flagged;
        * ``AE`` - share of records flagged as acceptable error.

        A rate whose denominator is zero (e.g. ``M`` when every record of a
        source is failed) is reported as NaN.  The columns are present even
        when no record matches ``select``.
    """
    assert select in ["total", "s", "r", "n"]

    def percent(part, whole):
        # NaN instead of ZeroDivisionError when there is nothing to divide by.
        return 100 * part / whole if whole else float("nan")

    nwrong = defaultdict(int)
    nwrong_strict = defaultdict(int)
    oklabel = defaultdict(int)
    oklabel_strict = defaultdict(int)
    acceptable_error = defaultdict(int)
    cnt = defaultdict(int)
    for e in read_jsonl(src_file):
        source = e["source"]
        label = e["label"]
        accept_error = e["acceptable_error"]
        if select != "total" and label != select:
            continue
        annotated = e["annotated_label"]
        failed = annotated == 'w'
        agrees = (not failed) and annotated == label
        acceptable_error[source] += accept_error
        oklabel[source] += agrees
        oklabel_strict[source] += agrees and (not accept_error)
        nwrong[source] += failed
        nwrong_strict[source] += failed or accept_error
        cnt[source] += 1

    columns = ["name"] + [f"{select}_{suffix}" for suffix in ("F", "M", "FS", "MS", "AE")]
    res = []
    for name in sorted(cnt.keys()):
        total = cnt[name]
        res.append({
            "name": name,
            f"{select}_F": percent(nwrong[name], total),
            f"{select}_M": 100 - percent(oklabel[name], total - nwrong[name]),
            f"{select}_FS": percent(nwrong_strict[name], total),
            f"{select}_MS": 100 - percent(oklabel_strict[name], total - nwrong_strict[name]),
            f"{select}_AE": percent(acceptable_error[name], total),
        })

    # Explicit columns keep the schema stable for later merges on "name" when res is empty.
    return pd.DataFrame(res, columns=columns)

In [77]:
ann_file = "data/ncaa/nli_annotations/cs_nli_human_annotations.jsonl"
# Pooled statistics plus one frame per gold label; the column prefixes keep them apart after merging.
# NOTE: df_total / df_sup / df_ref / df_nei are rebound by the English statistics cell.
df_total, df_sup, df_ref, df_nei = (
    compute_stats(ann_file, select=sel) for sel in ("total", "s", "r", "n")
)

# Join the frames side by side, one row per source.
df_cs = df_sup
for df_part in (df_ref, df_nei, df_total):
    df_cs = df_cs.merge(df_part, on="name")
df_total.round(2)

Unnamed: 0,name,total_F,total_M,total_FS,total_MS,total_AE
0,CsFEVER,5.67,17.31,12.0,17.05,6.33
1,QACGCS,10.0,17.04,26.33,17.19,16.33
2,QACGCS FS,9.33,16.18,27.33,16.51,18.0


In [20]:
ann_file = "data/ncaa/nli_annotations/en_nli_human_annotations.jsonl"
# Pooled statistics plus one frame per gold label; the column prefixes keep them apart after merging.
# NOTE: this rebinds df_total / df_sup / df_ref / df_nei from the Czech statistics cell.
df_total, df_sup, df_ref, df_nei = (
    compute_stats(ann_file, select=sel) for sel in ("total", "s", "r", "n")
)

# Join the frames side by side, one row per source.
df_en = df_sup
for df_part in (df_ref, df_nei, df_total):
    df_en = df_en.merge(df_part, on="name")
df_total.round(2)

Unnamed: 0,name,total_F,total_M,total_FS,total_MS,total_AE
0,EnFEVER,2.67,15.41,4.33,14.63,1.67
1,QACGEN,10.0,12.96,10.33,12.64,0.33


In [21]:
df_ref

Unnamed: 0,name,r_F,r_M,r_FS,r_MS,r_AE
0,EnFEVER,4.0,1.041667,4.0,1.041667,0.0
1,QACGEN,18.0,30.487805,18.0,30.487805,0.0


In [80]:
# Stack the Czech and English per-source statistics.
# The per-language row indices are kept, so index labels repeat (0, 1, 2, 0, 1).
df = pd.concat([df_cs, df_en])
df

Unnamed: 0,name,s_F,s_M,s_FS,s_MS,s_AE,r_F,r_M,r_FS,r_MS,...,n_F,n_M,n_FS,n_MS,n_AE,total_F,total_M,total_FS,total_MS,total_AE
0,CsFEVER,2.0,10.204082,8.0,10.869565,6.0,5.0,10.526316,10.0,11.111111,...,10.0,32.222222,18.0,30.487805,8.0,5.666667,17.314488,12.0,17.045455,6.333333
1,QACGCS,11.0,15.730337,23.0,16.883117,12.0,13.0,27.586207,29.0,25.352113,...,6.0,8.510638,27.0,9.589041,21.0,10.0,17.037037,26.333333,17.19457,16.333333
2,QACGCS FS,4.0,12.5,24.0,13.157895,20.0,18.0,29.268293,30.0,28.571429,...,6.0,8.510638,28.0,8.333333,22.0,9.333333,16.176471,27.333333,16.513761,18.0
0,EnFEVER,0.0,3.0,1.0,3.030303,1.0,4.0,1.041667,4.0,1.041667,...,4.0,42.708333,8.0,41.304348,4.0,2.666667,15.410959,4.333333,14.634146,1.666667
1,QACGEN,7.0,6.451613,7.0,6.451613,0.0,18.0,30.487805,18.0,30.487805,...,5.0,4.210526,6.0,3.191489,1.0,10.0,12.962963,10.333333,12.639405,0.333333


In [94]:
# "Soft" view: failure (F), acceptable-error (AE) and mislabel (M) rates per gold label and pooled.
soft_cols = ["name"] + [
    f"{group}_{metric}" for group in ("s", "r", "n", "total") for metric in ("F", "AE", "M")
]
df_soft = df[soft_cols]
df_soft

Unnamed: 0,name,s_F,s_AE,s_M,r_F,r_AE,r_M,n_F,n_AE,n_M,total_F,total_AE,total_M
0,CsFEVER,2.0,6.0,10.204082,5.0,5.0,10.526316,10.0,8.0,32.222222,5.666667,6.333333,17.314488
1,QACGCS,11.0,12.0,15.730337,13.0,16.0,27.586207,6.0,21.0,8.510638,10.0,16.333333,17.037037
2,QACGCS FS,4.0,20.0,12.5,18.0,12.0,29.268293,6.0,22.0,8.510638,9.333333,18.0,16.176471
0,EnFEVER,0.0,1.0,3.0,4.0,0.0,1.041667,4.0,4.0,42.708333,2.666667,1.666667,15.410959
1,QACGEN,7.0,0.0,6.451613,18.0,0.0,30.487805,5.0,1.0,4.210526,10.0,0.333333,12.962963


In [98]:
# "Strict" view: strict failure (FS), acceptable-error (AE) and strict mislabel (MS) rates per gold label and pooled.
strict_cols = ["name"] + [
    f"{group}_{metric}" for group in ("s", "r", "n", "total") for metric in ("FS", "AE", "MS")
]
df_strict = df[strict_cols]
df_strict

Unnamed: 0,name,s_FS,s_AE,s_MS,r_FS,r_AE,r_MS,n_FS,n_AE,n_MS,total_FS,total_AE,total_MS
0,CsFEVER,8.0,6.0,10.869565,10.0,5.0,11.111111,18.0,8.0,30.487805,12.0,6.333333,17.045455
1,QACGCS,23.0,12.0,16.883117,29.0,16.0,25.352113,27.0,21.0,9.589041,26.333333,16.333333,17.19457
2,QACGCS FS,24.0,20.0,13.157895,30.0,12.0,28.571429,28.0,22.0,8.333333,27.333333,18.0,16.513761
0,EnFEVER,1.0,1.0,3.030303,4.0,0.0,1.041667,8.0,4.0,41.304348,4.333333,1.666667,14.634146
1,QACGEN,7.0,0.0,6.451613,18.0,0.0,30.487805,6.0,1.0,3.191489,10.333333,0.333333,12.639405


In [96]:
print(df_soft.to_latex(float_format="%.1f", index=False))

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
     name &  s\_F &  s\_AE &  s\_M &  r\_F &  r\_AE &  r\_M &  n\_F &  n\_AE &  n\_M &  total\_F &  total\_AE &  total\_M \\
\midrule
  CsFEVER &  2.0 &   6.0 & 10.2 &  5.0 &   5.0 & 10.5 & 10.0 &   8.0 & 32.2 &      5.7 &       6.3 &     17.3 \\
   QACGCS & 11.0 &  12.0 & 15.7 & 13.0 &  16.0 & 27.6 &  6.0 &  21.0 &  8.5 &     10.0 &      16.3 &     17.0 \\
QACGCS FS &  4.0 &  20.0 & 12.5 & 18.0 &  12.0 & 29.3 &  6.0 &  22.0 &  8.5 &      9.3 &      18.0 &     16.2 \\
  EnFEVER &  0.0 &   1.0 &  3.0 &  4.0 &   0.0 &  1.0 &  4.0 &   4.0 & 42.7 &      2.7 &       1.7 &     15.4 \\
   QACGEN &  7.0 &   0.0 &  6.5 & 18.0 &   0.0 & 30.5 &  5.0 &   1.0 &  4.2 &     10.0 &       0.3 &     13.0 \\
\bottomrule
\end{tabular}



  print(df_soft.to_latex(float_format="%.1f", index=False))


In [99]:
print(df_strict.to_latex(float_format="%.1f", index=False))

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
     name &  s\_FS &  s\_AE &  s\_MS &  r\_FS &  r\_AE &  r\_MS &  n\_FS &  n\_AE &  n\_MS &  total\_FS &  total\_AE &  total\_MS \\
\midrule
  CsFEVER &   8.0 &   6.0 &  10.9 &  10.0 &   5.0 &  11.1 &  18.0 &   8.0 &  30.5 &      12.0 &       6.3 &      17.0 \\
   QACGCS &  23.0 &  12.0 &  16.9 &  29.0 &  16.0 &  25.4 &  27.0 &  21.0 &   9.6 &      26.3 &      16.3 &      17.2 \\
QACGCS FS &  24.0 &  20.0 &  13.2 &  30.0 &  12.0 &  28.6 &  28.0 &  22.0 &   8.3 &      27.3 &      18.0 &      16.5 \\
  EnFEVER &   1.0 &   1.0 &   3.0 &   4.0 &   0.0 &   1.0 &   8.0 &   4.0 &  41.3 &       4.3 &       1.7 &      14.6 \\
   QACGEN &   7.0 &   0.0 &   6.5 &  18.0 &   0.0 &  30.5 &   6.0 &   1.0 &   3.2 &      10.3 &       0.3 &      12.6 \\
\bottomrule
\end{tabular}



  print(df_strict.to_latex(float_format="%.1f", index=False))


In [24]:
def compute_cmatrix(src_file):
    """Print, for every data source, the confusion matrix of gold vs. annotated labels.

    Rows are the gold labels and columns the human-annotated labels, both in
    the order "s", "r", "n".  Samples annotated as failed ('w') are left out.
    Sources are printed in sorted order so the output is reproducible.

    Args:
        src_file: JSONL file whose records carry the keys "source", "label"
            and "annotated_label".
    """
    from sklearn.metrics import confusion_matrix
    src = read_jsonl(src_file)
    classes = ["s", "r", "n"]
    for source in sorted({e["source"] for e in src}):
        # 'w' only ever occurs in the annotation, never in the gold label,
        # so the exclusion has to test "annotated_label".
        kept = [e for e in src if e["source"] == source and e["annotated_label"] != "w"]
        T = [e['label'] for e in kept]
        Y = [e['annotated_label'] for e in kept]
        cm = confusion_matrix(T, Y, labels=classes)
        print(source)
        print(cm)
        print()


compute_cmatrix("/mnt/data/factcheck/wiki/cs/20230801/qacg/nli/cs_nli_human_annotations.jsonl")

CsFEVER
[[88  6  4]
 [ 4 85  6]
 [18 11 61]]

QACGCS
[[75 12  2]
 [20 63  4]
 [ 7  1 86]]

QACGCS FS
[[42  2  4]
 [ 9 29  3]
 [ 3  1 43]]



In [26]:
# QACGCS, gold label REF (second row of the matrix above: 20, 63, 4):
# fraction of those samples that were annotated as NEI.
4/(20 + 63  + 4)

0.04597701149425287

In [25]:
compute_cmatrix("/mnt/data/factcheck/wiki/en/20230801/qacg/nli/en_nli_human_annotations.jsonl")


EnFEVER
[[97  1  2]
 [ 1 95  0]
 [16 25 55]]

QACGEN
[[87  5  1]
 [23 57  2]
 [ 3  1 91]]



In [27]:
# QACGEN, gold label REF (second row of the matrix above: 23, 57, 2):
# fraction of those samples that were annotated as NEI.
2/(23 + 57 +  2)

0.024390243902439025