In [1]:
from collections import defaultdict, OrderedDict, Counter
import numpy as np
import pandas as pd
from pathlib import Path
import textwrap
from tqdm import tqdm

%cd /home/drchajan/devel/python/FC/automated-fact-checking
%load_ext autoreload
%autoreload 2

from aic_nlp_utils.json import read_jsonl, read_json, write_json, write_jsonl
from aic_nlp_utils.encoding import nfc
from aic_nlp_utils.fever import fever_detokenize

/home/drchajan/devel/python/FC/automated-fact-checking


  from tqdm.autonotebook import tqdm


In [266]:
def get_samples(qg_file, claim_sup_file, claim_ref_file, lang, n=2, seed=1234):
    """Draw up to ``n`` random question / SUP claim / REF claim examples.

    The three JSON files are keyed by paragraph id and then by entity id, where
    an entity id has the form ``"<entity>:::<entity type>"``. For each drawn
    paragraph one entity id is picked at random from the refuting claims and the
    first generated question plus both claims for that entity are collected.

    Args:
        qg_file: JSON with ``{paragraph: {entity_id: [question, ...]}}``.
        claim_sup_file: JSON with ``{paragraph: {entity_id: supporting claim}}``.
        claim_ref_file: JSON with ``{paragraph: {entity_id: refuting claim}}``.
        lang: language tag stored in the ``lang`` column of every row.
        n: maximum number of rows to return.
        seed: seed of the ``np.random.RandomState`` used for all draws.

    Returns:
        ``pd.DataFrame`` with columns ``lang``, ``paragraph``, ``entity``,
        ``entity type``, ``question``, ``SUP`` and ``REF``. It may hold fewer
        than ``n`` rows (or none) when too many drawn paragraphs are unusable.

    Raises:
        AssertionError: if the two claim files differ in paragraph count.
    """
    qgs = read_json(qg_file)
    csup = read_json(claim_sup_file)
    cref = read_json(claim_ref_file)
    print(len(qgs), len(csup), len(cref))
    assert len(csup) == len(cref)

    rng = np.random.RandomState(seed)
    paragraphs = list(cref.keys())
    if not paragraphs:
        return pd.DataFrame([])

    # Oversample (2*n) because some paragraphs get skipped below, but never ask
    # for more paragraphs than exist: choice(..., replace=False) would raise.
    pars = rng.choice(paragraphs, min(2 * n, len(paragraphs)), replace=False)
    res = []
    for par in pars:
        cr = cref[par]
        if len(cr) == 0:
            continue
        # Drawn before the completeness checks so the random stream, and hence
        # the selection for a given seed, matches earlier runs.
        neid = rng.choice(list(cr.keys()))

        # The question file can cover fewer paragraphs/entities than the claim
        # files; skip incomplete samples instead of failing with KeyError.
        questions = qgs.get(par, {}).get(neid)
        cs = csup.get(par, {})
        if not questions or neid not in cs:
            continue

        # Split on the last separator so an entity containing ":::" still works.
        ne, ntype = neid.rsplit(":::", 1)
        res.append({
            "lang": lang,
            "paragraph": par,
            "entity": ne,
            "entity type": ntype,
            "question": questions[0],
            "SUP": cs[neid],
            "REF": cr[neid],
        })
    return pd.DataFrame(res[:n])


# Czech examples (CNEC-based NER pipeline outputs).
df_cs = get_samples(
    qg_file="/mnt/data/factcheck/wiki/cs/20230801/qacg/qg/PAV-ner-CNEC/mt5-large_all-cp126k/test_qgs.json",
    claim_sup_file="/mnt/data/factcheck/wiki/cs/20230801/qacg/claim/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/test_support.json",
    claim_ref_file="/mnt/data/factcheck/wiki/cs/20230801/qacg/claim/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/test_refute.json",
    lang="cs",
    seed=2023,  # find seed to have different entity types for some diversity
)
df_cs

1413 1519 1519


Unnamed: 0,lang,paragraph,entity,entity type,question,SUP,REF
0,cs,Kršna_5,Kuruovci,P,S kým bojoval Kršna?,Kršna bojoval s Kuruovci.,Kršna bojoval s Bohem.
1,cs,Sainte-Chapelle_1,Konstantinopole,G,Kde byla trnová koruna uložena?,Trnová koruna byla uložena v Konstantinopole.,Trnová koruna byla uložena v Île de la Cité.


In [244]:
# English examples (stanza NER pipeline outputs).
df_en = get_samples(
    qg_file="/mnt/data/factcheck/wiki/en/20230801/qacg/qg/stanza/mt5-large_all-cp126k/test_qgs.json",
    claim_sup_file="/mnt/data/factcheck/wiki/en/20230801/qacg/claim/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/test_support.json",
    claim_ref_file="/mnt/data/factcheck/wiki/en/20230801/qacg/claim/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/test_refute.json",
    lang="en",
    seed=1234,
)
df_en

989 1000 1000


Unnamed: 0,lang,paragraph,entity,entity type,question,SUP,REF
0,en,Ahn_Hyeon-beom_1,Ulsan Hyundai,PERSON,What team did Ahn join in 2015?,Ahn joined Ulsan Hyundai in 2015.,Ahn joined the Hanja team in 2015.
1,en,"Ball_Ground,_Georgia_10",1898,DATE,When did the first marble company open in Ball...,The first marble company opened in Ball Ground...,The first marble company opened in Ball Ground...


In [233]:
# Polish examples (stanza NER pipeline outputs).
df_pl = get_samples(
    qg_file="/mnt/data/factcheck/wiki/pl/20230801/qacg/qg/stanza/mt5-large_all-cp126k/test_qgs.json",
    claim_sup_file="/mnt/data/factcheck/wiki/pl/20230801/qacg/claim/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/test_support.json",
    claim_ref_file="/mnt/data/factcheck/wiki/pl/20230801/qacg/claim/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/test_refute.json",
    lang="pl",
    seed=1235,
)
df_pl

1367 1424 1424


Unnamed: 0,lang,paragraph,entity,entity type,question,SUP,REF
0,pl,Rinzai_10,Myōshin-ji,geogName,Który japoński ogród jest najbardziej znany z ...,Najbardziej znanym japońskim ogrodem z tego ty...,Najbardziej znanym japońskim ogrodem z tego ty...
1,pl,Władimir_Iwanow_(lekkoatleta)_1,10 września 1979,date,Kiedy Władimir Iwanow poprawiał rekord Bułgari...,Władimir Iwanow poprawił rekord Bułgarii w bie...,Władimir Iwanow poprawił rekord Bułgarii w bie...


In [299]:
# Slovak examples (crabz_slovakbert-ner pipeline outputs).
df_sk = get_samples(
    qg_file="/mnt/data/factcheck/wiki/sk/20230801/qacg/qg/crabz_slovakbert-ner/mt5-large_all-cp126k/test_qgs.json",
    claim_sup_file="/mnt/data/factcheck/wiki/sk/20230801/qacg/claim/crabz_slovakbert-ner/mt5-large_all-cp126k/mt5-large_all-cp156k/test_support.json",
    claim_ref_file="/mnt/data/factcheck/wiki/sk/20230801/qacg/claim/crabz_slovakbert-ner/mt5-large_all-cp126k/mt5-large_all-cp156k/test_refute.json",
    lang="sk",
    seed=888,
)
df_sk

1714 1745 1745


Unnamed: 0,lang,paragraph,entity,entity type,question,SUP,REF
0,sk,Società_Sportiva_Lazio_(ženy)_2,Società Sportiva Lazio,ORG,Aká je futbalová klubová sieť mužov?,Futbalová klubová sieť mužov je Società Sporti...,Futbalová klubová sieť mužov je Italia.
1,sk,Villers-Saint-Martin_1,Franche-Comté,LOC,V akom regióne sa nachádza obec Villers-Saint-...,Villers-Saint- Martin sa nachádza vo Franche-C...,Villers-Saint- Martin sa nachádza v oblasti Do...


In [300]:
# Stack the per-language samples in the order cs, en, pl, sk.
# Each frame keeps its own row labels, so labels repeat in the combined table.
df = pd.concat(objs=(df_cs, df_en, df_pl, df_sk))
df

Unnamed: 0,lang,paragraph,entity,entity type,question,SUP,REF
0,cs,Kršna_5,Kuruovci,P,S kým bojoval Kršna?,Kršna bojoval s Kuruovci.,Kršna bojoval s Bohem.
1,cs,Sainte-Chapelle_1,Konstantinopole,G,Kde byla trnová koruna uložena?,Trnová koruna byla uložena v Konstantinopole.,Trnová koruna byla uložena v Île de la Cité.
0,en,Ahn_Hyeon-beom_1,Ulsan Hyundai,PERSON,What team did Ahn join in 2015?,Ahn joined Ulsan Hyundai in 2015.,Ahn joined the Hanja team in 2015.
1,en,"Ball_Ground,_Georgia_10",1898,DATE,When did the first marble company open in Ball...,The first marble company opened in Ball Ground...,The first marble company opened in Ball Ground...
0,pl,Rinzai_10,Myōshin-ji,geogName,Który japoński ogród jest najbardziej znany z ...,Najbardziej znanym japońskim ogrodem z tego ty...,Najbardziej znanym japońskim ogrodem z tego ty...
1,pl,Władimir_Iwanow_(lekkoatleta)_1,10 września 1979,date,Kiedy Władimir Iwanow poprawiał rekord Bułgari...,Władimir Iwanow poprawił rekord Bułgarii w bie...,Władimir Iwanow poprawił rekord Bułgarii w bie...
0,sk,Società_Sportiva_Lazio_(ženy)_2,Società Sportiva Lazio,ORG,Aká je futbalová klubová sieť mužov?,Futbalová klubová sieť mužov je Società Sporti...,Futbalová klubová sieť mužov je Italia.
1,sk,Villers-Saint-Martin_1,Franche-Comté,LOC,V akom regióne sa nachádza obec Villers-Saint-...,Villers-Saint- Martin sa nachádza vo Franche-C...,Villers-Saint- Martin sa nachádza v oblasti Do...


In [301]:
# https://stackoverflow.com/questions/4578912/replace-all-accented-characters-by-their-latex-equivalent
# Table of [character, LaTeX replacement] pairs consumed by `replace_latex`.
# Notes:
#   * characters that are not listed (e.g. Czech háček letters) pass through unchanged;
#   * both variants for "í" and the backslash rule are currently disabled (commented out);
#   * rules whose replacement contains braces must not precede the "{" / "}" rules if the
#     table is applied by sequential str.replace, otherwise the braces get escaped again.
latex_accents = [
  [ "à", "\\`a" ], # Grave accent
  [ "è", "\\`e" ],
  [ "ì", "\\`\\i" ],
  [ "ò", "\\`o" ],
  [ "ù", "\\`u" ],
  [ "ỳ", "\\`y" ],
  [ "À", "\\`A" ],
  [ "È", "\\`E" ],
  [ "Ì", "\\`I" ],      # capital I has no dot to drop; "\I" is not a LaTeX command
  [ "Ò", "\\`O" ],
  [ "Ù", "\\`U" ],
  [ "Ỳ", "\\`Y" ],
  [ "á", "\\'a" ], # Acute accent
  [ "é", "\\'e" ],
  # [ u"í", "\\'i" ],
  # [ u"í", "\\'\\i" ],
  [ "ó", "\\'o" ],
  [ "ú", "\\'u" ],
  [ "ý", "\\'y" ],
  [ "Á", "\\'A" ],
  [ "É", "\\'E" ],
  [ "Í", "\\'I" ],
  [ "Ó", "\\'O" ],
  [ "Ú", "\\'U" ],
  [ "Ý", "\\'Y" ],
  [ "â", "\\^a" ], # Circumflex
  [ "ê", "\\^e" ],
  [ "î", "\\^\\i" ],
  [ "ô", "\\^o" ],
  [ "û", "\\^u" ],
  [ "ŷ", "\\^y" ],
  [ "Â", "\\^A" ],
  [ "Ê", "\\^E" ],
  [ "Î", "\\^I" ],
  [ "Ô", "\\^O" ],
  [ "Û", "\\^U" ],
  [ "Ŷ", "\\^Y" ],
  [ "ä", "\\\"a" ],    # Umlaut or dieresis
  [ "ë", "\\\"e" ],
  [ "ï", "\\\"\\i" ],
  [ "ö", "\\\"o" ],
  [ "ü", "\\\"u" ],
  [ "ÿ", "\\\"y" ],
  [ "Ä", "\\\"A" ],
  [ "Ë", "\\\"E" ],
  [ "Ï", "\\\"I" ],
  [ "Ö", "\\\"O" ],
  [ "Ü", "\\\"U" ],
  [ "Ÿ", "\\\"Y" ],
  [ "ç", "\\c{c}" ],   # Cedilla
  [ "Ç", "\\c{C}" ],
  [ "œ", "{\\oe}" ],   # Ligatures
  [ "Œ", "{\\OE}" ],
  [ "æ", "{\\ae}" ],
  [ "Æ", "{\\AE}" ],
  [ "å", "{\\aa}" ],
  [ "Å", "{\\AA}" ],
  [ "–", "--" ],   # Dashes
  [ "—", "---" ],
  [ "ø", "{\\o}" ],    # Misc latin-1 letters
  [ "Ø", "{\\O}" ],
  [ "ß", "{\\ss}" ],
  [ "¡", "{!`}" ],
  [ "¿", "{?`}" ],
  # [ u"\\", "\\\\" ],    # Characters that should be quoted
  [ "&", "\\&" ],
  [ "$", "\\$" ],
  [ "{", "\\{" ],
  [ "}", "\\}" ],
  [ "%", "\\%" ],
  [ "#", "\\#" ],
  [ "_", "\\_" ],
  # "\~" is the tilde *accent* command, not a printable tilde. Kept after the
  # brace rules so the braces emitted here are never escaped by them.
  [ "~", "{\\textasciitilde}" ],
  [ "≥", "$\\ge$" ],   # Math operators
  [ "≤", "$\\le$" ],
  [ "≠", "$\\neq$" ],
  [ "©", "{\\copyright}" ], # Misc ("\c" was an invalid Python escape sequence)
  [ "ı", "{\\i}" ],
  [ "µ", "$\\mu$" ],
  [ "°", "$^\\circ$" ],   # "\deg" typesets the operator name "deg", not a degree sign
  [ "‘", "`" ],    #Quotes
  [ "’", "'" ],
  [ "“", "``" ],
  [ "”", "''" ],
  [ "‚", "," ],
  [ "„", ",," ],
]

def replace_latex(txt):
    """Return ``txt`` with every character listed in ``latex_accents`` replaced.

    The substitution is done in a single pass over the input. Applying the rules
    one after another with ``str.replace`` lets later rules rewrite the output
    of earlier ones (e.g. ``ç`` -> ``\\c{c}`` and then ``{`` -> ``\\{`` yields
    ``\\c\\{c\\}``); a single pass only ever rewrites the original characters.

    Args:
        txt: text to convert.

    Returns:
        The converted string; characters without a rule are kept unchanged.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    # First occurrence wins, matching the precedence of sequential replacement.
    mapping = {}
    for src, dst in latex_accents:
        if src:
            mapping.setdefault(src, dst)
    if not txt or not mapping:
        return txt

    # Longest keys first so a multi-character key is preferred over its prefix.
    alternatives = sorted(mapping, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in alternatives))
    return pattern.sub(lambda match: mapping[match.group(0)], txt)

# Quick manual check on a Czech sentence: only characters listed in
# `latex_accents` are rewritten, everything else is printed as-is.
print(replace_latex("Kdy byl RWD-11 poškozen při přistání?"))

Kdy byl RWD-11 poškozen při přist\'aní?


In [302]:
# Work on a copy so that `df` itself is not modified by any LaTeX-specific edits.
df_latex = df.copy()
# LaTeX escaping of the text columns is currently switched off; the calls are kept
# below for reference. With them disabled, `df_latex` holds the same values as `df`.
# NOTE(review): presumably the target document accepts Unicode input directly — confirm
# before re-enabling, and note that "entity" is not covered by these calls.
# df_latex["paragraph"] = df_latex["paragraph"].apply(replace_latex)
# df_latex["question"] = df_latex["question"].apply(replace_latex)
# df_latex["SUP"] = df_latex["SUP"].apply(replace_latex)
# df_latex["REF"] = df_latex["REF"].apply(replace_latex)

In [306]:
def print_latex(df):
    """Render the sampled examples in ``df`` as LaTeX source.

    Despite its name the function prints nothing; it returns the text.

    Each row becomes a bold title line (paragraph title, paragraph number,
    entity and entity type) followed by the question, the supporting claim and
    the refuting claim. Consecutive rows of the same language are separated by
    ``\\vspace{1mm}``; a change of language inserts ``\\midrule`` instead.

    Args:
        df: DataFrame with the columns ``lang``, ``paragraph``, ``entity``,
            ``entity type``, ``question``, ``SUP`` and ``REF``. ``paragraph``
            is split on ``_``: the last piece is shown as the paragraph number,
            the remaining pieces joined by spaces form the title.

    Returns:
        The LaTeX source as one newline-joined string; ``""`` for an empty
        frame. Cell values are inserted verbatim, without LaTeX escaping.
        NOTE(review): ``\\SUPS`` and ``\\REFS`` are presumably macros defined in
        the target document — confirm.
    """
    # Use the frame that was passed in, not the notebook-global `df_latex`.
    recs = df.to_dict(orient='records')
    if not recs:
        return ""

    res = []
    lang = recs[0]["lang"]
    for i, r in enumerate(recs):
        title_par = r["paragraph"].split("_")
        title, par = " ".join(title_par[:-1]), title_par[-1]
        entity = r["entity"]
        etype = r["entity type"]
        question = r["question"]
        sup = r["SUP"]
        ref = r["REF"]

        if r["lang"] != lang:
            # New language group: draw a rule instead of the small vertical gap.
            lang = r["lang"]
            res.append("\\midrule")
        elif i > 0:
            res.append("\\vspace{1mm}")

        res.append(f"\\textbf{{{title}}} (par. {par}, {{\\footnotesize \\ttfamily entity:}} {entity}, {{\\footnotesize \\ttfamily type:}} \\texttt{{{etype}}})\\\\")
        res.append(f"\\makebox[1.3cm][r]{{\\footnotesize \\ttfamily Question:}} {question}\\\\")
        res.append(f"\\makebox[1.3cm][r]{{\\footnotesize \\SUPS:}} {sup}\\\\")
        res.append(f"\\makebox[1.3cm][r]{{\\footnotesize \\REFS:}} {ref}\\\\")
        res.append("")

    return "\n".join(res)

# Print the generated LaTeX source for the sampled examples in `df_latex`.
print(print_latex(df_latex))

\textbf{Kršna} (par. 5, {\footnotesize \ttfamily entity:} Kuruovci, {\footnotesize \ttfamily type:} \texttt{P})\\
\makebox[1.3cm][r]{\footnotesize \ttfamily Question:} S kým bojoval Kršna?\\
\makebox[1.3cm][r]{\footnotesize \SUPS:} Kršna bojoval s Kuruovci.\\
\makebox[1.3cm][r]{\footnotesize \REFS:} Kršna bojoval s Bohem.\\

\vspace{1mm}
\textbf{Sainte-Chapelle} (par. 1, {\footnotesize \ttfamily entity:} Konstantinopole, {\footnotesize \ttfamily type:} \texttt{G})\\
\makebox[1.3cm][r]{\footnotesize \ttfamily Question:} Kde byla trnová koruna uložena?\\
\makebox[1.3cm][r]{\footnotesize \SUPS:} Trnová koruna byla uložena v Konstantinopole.\\
\makebox[1.3cm][r]{\footnotesize \REFS:} Trnová koruna byla uložena v Île de la Cité.\\

\midrule
\textbf{Ahn Hyeon-beom} (par. 1, {\footnotesize \ttfamily entity:} Ulsan Hyundai, {\footnotesize \ttfamily type:} \texttt{PERSON})\\
\makebox[1.3cm][r]{\footnotesize \ttfamily Question:} What team did Ahn join in 2015?\\
\makebox[1.3cm][r]{\footnotesize 