In [1]:
from pathlib import Path
import sys
import time

from aic_nlp_utils.fever import fever_detokenize
from aic_nlp_utils.json import process_to_jsonl, write_jsonl, read_jsonl
import pandas as pd

  from tqdm.autonotebook import tqdm


Import the raw QA2D TSV files as DataFrames and fix the text formatting.

In [4]:
def import_qa2d(fname):
    """Load a tab-separated QA2D file and clean up its text columns.

    The columns "question", "answer", "turker_answer" and "rule-based" are
    each converted with ``str()``, passed through ``fever_detokenize`` and
    then have every "` " (backtick followed by a space) replaced by an
    apostrophe. All other columns are returned as read by pandas.

    Args:
        fname: path of the TSV file to read.

    Returns:
        The DataFrame with the four text columns cleaned.
    """
    text_columns = ("question", "answer", "turker_answer", "rule-based")

    def _clean(cell):
        # str() first: cells pandas did not parse as strings (numbers, NaN)
        # are turned into their textual form before detokenization.
        detokenized = fever_detokenize(str(cell))
        return detokenized.replace("` ", "'")

    frame = pd.read_csv(fname, sep='\t')
    for name in text_columns:
        frame[name] = frame[name].apply(_clean)
    return frame


# Base folder of the QA2D data; the raw TSV splits live in its "raw" subfolder.
ROOT_DIR = "/mnt/data/factcheck/qa2d"

# Load both splits with the formatting fixes applied by import_qa2d.
df_train = import_qa2d(Path(ROOT_DIR) / "raw" / "train.tsv")
df_dev = import_qa2d(Path(ROOT_DIR) / "raw" / "dev.tsv")

Export the texts from each column to separate files, one text per line. This is an appropriate format for DeepL translation. Firefox allows the largest amount of text to be processed at once (vs. Safari, Chrome and Edge on Mac).

In [9]:
# Write one plain-text file per (split, column) pair into the "en" folder,
# with the column's texts joined by newlines (no trailing newline).
en_dir = Path(ROOT_DIR, "en")
en_dir.mkdir(parents=True, exist_ok=True)
for split_name, split_df in (("train", df_train), ("dev", df_dev)):
    for col in ["question", "answer", "turker_answer", "rule-based"]:
        with open(en_dir / f"{split_name}_{col}.txt", "wt") as f:
            texts = [fever_detokenize(text) for text in split_df[col].values]
            f.write('\n'.join(texts))

Now convert the externally translated versions of the files back to a single JSONL file formatted for training QA2D models.

In [10]:
# Peek at the example_uid of the first training row.
df_train["example_uid"].iloc[0]

'572812523acd2414000df3bf'

In [12]:
def convert_translated_to_jsonl(df, fin_prefix, fout):
    """Merge per-column text files with metadata from ``df`` into one JSONL file.

    For each of the columns "question", "answer", "turker_answer" and
    "rule-based" the file ``{fin_prefix}_{col}.txt`` is read line by line.
    Line ``i`` of every file (stripped of surrounding whitespace) is combined
    with the ``dataset`` and ``example_uid`` values of row ``i`` of ``df``
    into one record, and all records are written to ``fout`` via
    ``write_jsonl``.

    Args:
        df: DataFrame the text files correspond to; it must provide the
            ``dataset`` and ``example_uid`` columns and have exactly one row
            per line of the text files.
        fin_prefix: path prefix of the input files (without the
            ``_{col}.txt`` suffix).
        fout: output location handed to ``write_jsonl``.

    Raises:
        AssertionError: if the input files differ in their number of lines,
            or if that number does not equal ``len(df)``.
    """
    cols = ["question", "answer", "turker_answer", "rule-based"]
    data = {}
    for col in cols:
        with open(f"{fin_prefix}_{col}.txt") as f:
            data[col] = f.readlines()

    lens = [len(data[col]) for col in cols]
    assert len(set(lens)) == 1, set(lens)
    assert lens[0] == len(df), "The number of translated lines does not match the original data."

    # Metadata must come from the frame that was passed in. Reading it from
    # the global df_train (as before) attached training-split metadata to
    # every other split and made the function depend on notebook state.
    datasets = df["dataset"].tolist()
    uids = df["example_uid"].tolist()

    result = []
    for i, (dataset, uid) in enumerate(zip(datasets, uids)):
        record = {col: data[col][i].strip() for col in cols}
        record["dataset"] = dataset
        record["example_uid"] = uid
        result.append(record)
    write_jsonl(fout, result)

In [14]:
# The untranslated English texts get the same JSONL treatment so that our own
# English models can be trained from them as well.
convert_translated_to_jsonl(
    df=df_dev,
    fin_prefix=Path(ROOT_DIR) / "en" / "dev",
    fout=Path(ROOT_DIR) / "en" / "dev.jsonl",
)
convert_translated_to_jsonl(
    df=df_train,
    fin_prefix=Path(ROOT_DIR) / "en" / "train",
    fout=Path(ROOT_DIR) / "en" / "train.jsonl",
)

In [15]:
# Build the JSONL files from the texts in the "cs" folder.
convert_translated_to_jsonl(
    df=df_dev,
    fin_prefix=Path(ROOT_DIR) / "cs" / "dev",
    fout=Path(ROOT_DIR) / "cs" / "dev.jsonl",
)
convert_translated_to_jsonl(
    df=df_train,
    fin_prefix=Path(ROOT_DIR) / "cs" / "train",
    fout=Path(ROOT_DIR) / "cs" / "train.jsonl",
)

In [16]:
# Build the JSONL files from the texts in the "pl" folder.
convert_translated_to_jsonl(
    df=df_dev,
    fin_prefix=Path(ROOT_DIR) / "pl" / "dev",
    fout=Path(ROOT_DIR) / "pl" / "dev.jsonl",
)
convert_translated_to_jsonl(
    df=df_train,
    fin_prefix=Path(ROOT_DIR) / "pl" / "train",
    fout=Path(ROOT_DIR) / "pl" / "train.jsonl",
)

In [13]:
# Build the JSONL files from the texts in the "sk" folder.
convert_translated_to_jsonl(
    df=df_dev,
    fin_prefix=Path(ROOT_DIR) / "sk" / "dev",
    fout=Path(ROOT_DIR) / "sk" / "dev.jsonl",
)
convert_translated_to_jsonl(
    df=df_train,
    fin_prefix=Path(ROOT_DIR) / "sk" / "train",
    fout=Path(ROOT_DIR) / "sk" / "train.jsonl",
)