In [2]:
import numpy as np
import sqlite3
import torch
from tqdm import tqdm
import unicodedata

from collections import defaultdict, OrderedDict, Counter
from dataclasses import dataclass
import datetime as dt
from itertools import chain
import os
import pathlib
from pathlib import Path
import string
import pandas as pd
import unicodedata as ud
from time import time
from typing import Dict, Type, Callable, List, Union
import sys
import ujson

from aic_nlp_utils.encoding import nfc
from aic_nlp_utils.fever import fever_detokenize
from aic_nlp_utils.json import read_jsonl, read_json, write_json, write_jsonl
from aic_nlp_utils.wiki import filter_and_fix_wiki_extract_for_lang

from sentence_transformers import CrossEncoder, util
import textwrap

sys.path.insert(0, '/home/drchajan/devel/python/FC/ColBERTv2') # ignore other ColBERT installations

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Indexer, Searcher
from colbert.data import Queries, Collection
from colbert import Trainer
from colbert.utilities.prepare_data import import_qacg_split, generate_original_id2pid_mapping, export_as_anserini_collection, anserini_retrieve_claims, sbert_CE_rerank, sbert_BI_rerank, generate_triples_by_retrieval, generate_triples_by_retrieval_nway

%load_ext autoreload
%autoreload 2

  from tqdm.autonotebook import tqdm
No CUDA runtime is found, using CUDA_HOME='/mnt/appl/software/CUDA/11.7.0'
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/drchajan/devel/python/FC/fc_env_hflarge/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


## ENFever Corpus Import

In [2]:
# EnFEVER configuration: every path below is derived from FEVER_ROOT.
# LANG is interpolated into the pyserini indexing command as `-language {LANG}`.
LANG = "en"
FEVER_ROOT = "/mnt/data/factcheck/fever/data-en-lrev"
FEVER_CORPUS = f"{FEVER_ROOT}/enwiki.jsonl"
FEVER_CORPUS_SQLITE = f"{FEVER_ROOT}/fever/fever.db"
FEVER_PREDICTIONS = f"{FEVER_ROOT}/predictions"
COLBERT_ROOT = f"{FEVER_ROOT}/colbertv2"

# NOTE(review): QACG_CLAIMS_ROOT is commented out here, but the QACG training-data cells
# further down still reference it; on a fresh kernel they fail with NameError.
# QACG_CLAIMS_ROOT = f"/mnt/data/factcheck/fever/data-en-lrev/qacg"

# Sub-directory name used by some later cells when building COLBERT_ROOT-relative paths.
TRAIN_DIR = "fever" # for original FEVER splits
# TRAIN_DIR = "qacg" # for QACG generated splits

# Location of the claim split files (train/dev/test JSONL).
SPLIT_DIR = Path("fever-data")
SPLIT_ROOT = Path(FEVER_ROOT, SPLIT_DIR)

# Output locations for exported queries and training triples.
QUERIES_ROOT = Path(COLBERT_ROOT, "queries")
TRIPLES_ROOT = Path(COLBERT_ROOT, "triples")

# Anserini collection/index/retrieval outputs; collection and index are kept as `str`
# because they are passed to helpers and interpolated into a shell command.
ANSERINI_ROOT = Path(FEVER_ROOT, "anserini")
ANSERINI_COLLECTION = str(Path(ANSERINI_ROOT, "collection"))
ANSERINI_INDEX = str(Path(ANSERINI_ROOT, "index"))
ANSERINI_RETRIEVED = Path(ANSERINI_ROOT, "retrieved")

The JSONL corpus does not match the SQLite one. The other methods were evaluated on the SQLite corpus, so use the SQLite corpus for these experiments as well.

In [7]:
def import_corpus(corpus_file):
    """Read the JSONL corpus and return a list of ``{"id": ..., "text": ...}`` records.

    Document ids are taken verbatim (they are asserted to already be NFC); texts are
    detokenized with ``fever_detokenize``, stripped and NFC-normalized. Only the first
    record of every id is kept; later duplicates are reported on stdout and skipped.

    Args:
        corpus_file: path of the JSONL file, passed to ``read_jsonl``.

    Returns:
        List of record dicts in file order, without duplicate ids.
    """
    corpus = []
    original_ids = set()
    for r in tqdm(read_jsonl(corpus_file)):
        id_ = r["id"]
        # ids are used as-is, so they must already be in NFC form
        assert r["id"] == unicodedata.normalize("NFC", r["id"])
        if id_ not in original_ids:
            detokenized = fever_detokenize(r["text"]).strip()
            corpus.append({"id": id_, "text": unicodedata.normalize("NFC", detokenized)})
            original_ids.add(id_)
        else:
            # this happens sometimes due to Wiki snapshot errors...
            print(f"Original ID not unique! {id_}. Skipping...")
    return corpus

# corpus = import_corpus(FEVER_CORPUS)

In [3]:
def import_corpus_from_sqlite(corpus_db_file):
    """Load the corpus from the ``documents`` table of a SQLite database.

    Every row's ``text`` is detokenized with ``fever_detokenize``; both id and text are
    NFC-normalized with ``nfc``. Uniqueness is checked on the *normalized* id, i.e. on the
    id that is actually stored, so the returned corpus never contains two records with the
    same id. Duplicates are reported on stdout and skipped.

    Args:
        corpus_db_file: path of the SQLite database file.

    Returns:
        List of ``{"id": ..., "text": ...}`` dicts in the order the rows are returned.
    """
    seen_ids = set()
    corpus = []
    # `with sqlite3.connect(...)` only manages the transaction and leaves the connection
    # open, so the connection is closed explicitly here.
    connection = sqlite3.connect(corpus_db_file, detect_types=sqlite3.PARSE_DECLTYPES)
    try:
        cursor = connection.cursor()
        cursor.execute("SELECT id, text FROM documents")
        # iterate the cursor instead of fetchall() to avoid holding all raw rows at once
        for id_, text in cursor:
            doc_id = nfc(id_)
            if doc_id in seen_ids: # this happens sometimes due to Wiki snapshot errors...
                print(f"Original ID not unique! {id_}. Skipping...")
                continue
            corpus.append({"id": doc_id, "text": nfc(fever_detokenize(text))})
            seen_ids.add(doc_id)
    finally:
        connection.close()
    return corpus

In [4]:
corpus = import_corpus_from_sqlite(FEVER_CORPUS_SQLITE)

In [5]:
corpus[0]

{'id': '1928_in_association_football',
 'text': 'The following are the football (soccer) events of the year 1928 throughout the world.'}

In [6]:
def print_stats(corpus):
    """Print basic size statistics of ``corpus`` to stdout.

    Two lines are printed: the number of distinct document ids, the number of records
    (paragraphs) and their ratio; then min/max/mean/median of the text lengths in characters.

    Args:
        corpus: sequence of dicts with ``"id"`` and ``"text"`` keys (must be non-empty).
    """
    npar = len(corpus)
    ndoc = len({record["id"] for record in corpus})
    print(f"documents: {ndoc} paragraphs: {npar}, paragraphs per document: {npar/ndoc}")
    plens = list(map(len, (record["text"] for record in corpus)))
    print(f"paragraph len: min:{np.min(plens)}, max:{np.max(plens)}, mean:{np.mean(plens)}, median:{np.median(plens)}")

print_stats(corpus)

documents: 5396106 paragraphs: 5396106, paragraphs per document: 1.0
paragraph len: min:1, max:323210, mean:506.96684646298644, median:318.0


In [6]:
original_id2pid = generate_original_id2pid_mapping(corpus)

In [7]:
# save corpus as a collection for ColBERT training
write_jsonl(Path(COLBERT_ROOT, "collection.jsonl"), corpus, mkdir=True)
write_json(Path(COLBERT_ROOT, "original_id2pid.json"), original_id2pid, mkdir=True)

## CSFever Corpus Import

In [3]:
# CsFEVER configuration. Re-binds the same names as the EnFEVER configuration cell, so
# whichever configuration cell ran last decides which dataset the cells below operate on.
# NOTE(review): no active line here sets TRAIN_DIR, although later cells reference it.
LANG = "cs"
FEVER_ROOT = "/mnt/data/factcheck/fever/data-cs-lrev"
FEVER_CORPUS_SQLITE = f"{FEVER_ROOT}/fever/fever.db"
FEVER_PREDICTIONS = f"{FEVER_ROOT}/predictions"
COLBERT_ROOT = f"{FEVER_ROOT}/colbertv2"

# Location of the claim split files.
SPLIT_DIR = Path("fever-data")
SPLIT_ROOT = Path(FEVER_ROOT, SPLIT_DIR)

# Output locations for exported queries and training triples.
QUERIES_ROOT = Path(COLBERT_ROOT, "queries")
TRIPLES_ROOT = Path(COLBERT_ROOT, "triples")

# Anserini collection/index/retrieval outputs.
ANSERINI_ROOT = Path(FEVER_ROOT, "anserini")
ANSERINI_COLLECTION = str(Path(ANSERINI_ROOT, "collection"))
ANSERINI_INDEX = str(Path(ANSERINI_ROOT, "index"))
ANSERINI_RETRIEVED = Path(ANSERINI_ROOT, "retrieved")

#---------------------------------------------

# LANG = "cs"
# FEVER_ROOT = Path("/mnt/data/factcheck/fever/data_full_nli-filtered-cs")
# FEVER_DATA = Path(FEVER_ROOT, "fever-data/F1_titles_anserininew_threshold")
# FEVER_CORPUS_SQLITE = Path(FEVER_ROOT, "fever/cs_wiki_revid_db_sqlite.db")

# # FEVER_CORPUS = "/mnt/data/factcheck/fever/data-en-latest/enwiki.jsonl"
# # FEVER_CORPUS_SQLITE = "/mnt/data/factcheck/fever/data-en-lrev/fever/fever.db"
# # FEVER_ROOT = "/mnt/data/factcheck/fever/data-en-lrev/fever-data"
# # FEVER_PREDICTIONS = "/mnt/data/factcheck/fever/data-en-lrev/predictions"
# # COLBERT_ROOT = "/mnt/data/factcheck/fever/data-en-lrev/colbertv2-jsonl"
# COLBERT_ROOT = Path(FEVER_ROOT, "colbertv2")

# # 
# # TRAIN_DIR = "fever" # for original FEVER splits
# # TRAIN_DIR = "qacg" # for QACG generated splits
# TRAIN_DIR = "qacg-r" # for QACG generated splits based on randomly selected Wiki pages

# QACG_CLAIMS_ROOT = Path(FEVER_ROOT, TRAIN_DIR, "claim")


# HARD_NEGATIVES_ROOT = Path(COLBERT_ROOT, "hard_negatives")
# ANSERINI_COLLECTION = str(Path(HARD_NEGATIVES_ROOT, "anserini", "collection"))
# ANSERINI_INDEX = str(Path(HARD_NEGATIVES_ROOT, "anserini", "index"))
# HARD_NEGATIVES_RETRIEVED = Path(HARD_NEGATIVES_ROOT, "retrieved")



In [4]:
corpus = import_corpus_from_sqlite(FEVER_CORPUS_SQLITE)
original_id2pid = generate_original_id2pid_mapping(corpus)

In [5]:
# corpus = filter_and_fix_wiki_extract_for_lang(
#     Path(FEVER_ROOT, "wiki-pages"),
#     Path(FEVER_ROOT, "fever", "wiki_extract_filtered_and_fixed_drchajan.jsonl"), "cs", textcol="contents")


In [6]:
corpus[2]

{'id': 'Jean Alesi',
 'text': 'Jean Alesi\n Jean Alesi (narozen jako Giovanni Alesi) (* 11. června 1964, Avignon) je bývalý francouzský pilot Formule 1 italského původu. Jeho kariéra ve Formuli 1 zahrnovala účast v týmech Tyrrell, Benetton F1, Sauber, Prost, Jordan a hlavně Ferrari, kde byl velice oblíbený mezi tifosi. V roce 2006 byl Alesi oceněn Řádem čestné legie (Chevalier de la Legion d ’ honneur.'}

In [10]:
# save corpus as a collection for ColBERT training
write_jsonl(Path(COLBERT_ROOT, "collection.jsonl"), corpus, mkdir=True)
write_json(Path(COLBERT_ROOT, "original_id2pid.json"), original_id2pid, mkdir=True)

## Split Import

In [7]:
# Not needed for Sqlite Import
class EnFEVER_LREV_ID_Fixer:
    '''Maps evidence ids onto corpus ids that differ only in their underscores.

    Our EnFEVER snapshot most likely does not exactly match the one used by the authors of
    the FEVER paper, so some evidence documents appear to be missing. Most often their names
    differ only slightly, in the use of underscores ("_"). This class indexes all corpus ids
    by their underscore-free, NFC-normalized form and uses that index to recover the id
    actually present in the corpus.
    '''
    def __init__(self, corpus):
        # underscore-free id -> all NFC-normalized corpus ids collapsing onto it
        self.fixed_id2original_id = defaultdict(list)
        for record in tqdm(corpus):
            canonical = unicodedata.normalize("NFC", record["id"])
            self.fixed_id2original_id[canonical.replace("_", "")].append(canonical)

    def fix(self, id_):
        '''Return the corpus id matching ``id_`` up to underscores, or ``id_`` itself if none is known.

        Fails an assertion when more than one corpus id matches.
        '''
        fixed_id = unicodedata.normalize("NFC", id_).replace("_", "")
        # .get() neither raises nor inserts a default entry for unknown keys
        original_ids = self.fixed_id2original_id.get(fixed_id)
        if original_ids is None:
            return id_
        assert len(original_ids) == 1, f"{id_} => {fixed_id} => {original_ids}"
        (match,) = original_ids
        return match
    
enfever_lrev_id_fixer = EnFEVER_LREV_ID_Fixer(corpus)

  0%|          | 0/453553 [00:00<?, ?it/s]

100%|██████████| 453553/453553 [00:00<00:00, 644907.08it/s]


In [7]:
def import_split(split_file, original_id2pid, fixer=None, new_format=False, ignore_missing_ids=False, bilingual=False):
    """Import the verifiable claims of a FEVER-style split together with their evidence ids.

    Only records whose ``"verifiable"`` field equals ``"VERIFIABLE"`` are considered. Claims
    and evidence ids are NFC-normalized. Evidence ids are de-duplicated per claim and kept
    in first-seen order, so the result does not depend on string hash randomization.
    Claims left without any evidence are dropped and counted.

    Args:
        split_file: path of the JSONL split, passed to ``read_jsonl``.
        original_id2pid: container of known document ids; only membership (``in``) is used.
        fixer: optional object with a ``fix(id_)`` method used to repair an evidence id that
            is not in ``original_id2pid``; consulted only when ``ignore_missing_ids`` is False.
        new_format: if True, each evidence item is the id itself; otherwise the id is ``e[2]``.
        ignore_missing_ids: if True, unknown evidence ids are counted and dropped.
        bilingual: use for splits by Tomas (interleaved CS, EN pages); only every second
            item of each evidence set, starting with the first, is used.

    Returns:
        List of ``{"claim": str, "evidence": [ids], "fever_id": ...}`` dicts.

    Raises:
        AssertionError: if an evidence id is unknown, ``ignore_missing_ids`` is False and
            ``fixer`` is absent or cannot map the id to a known one; also if ``bilingual``
            is set and an evidence set has odd length.
    """
    raw = read_jsonl(split_file)
    data = []
    evidence_total = 0
    evidence_not_found = 0
    no_evidence = 0
    for r in tqdm(raw):
        if r["verifiable"] != "VERIFIABLE":
            continue
        claim = unicodedata.normalize("NFC", r["claim"])
        evidence = {}  # dict used as an insertion-ordered set
        for eset in r["evidence"]:
            if bilingual:
                assert len(eset) % 2 == 0, eset
                eset = eset[::2]
            for e in eset:
                original_id = unicodedata.normalize("NFC", e if new_format else e[2])
                evidence_total += 1
                if original_id in original_id2pid:
                    evidence[original_id] = None
                    continue
                if ignore_missing_ids:
                    evidence_not_found += 1
                    continue
                # Unknown id and not allowed to drop it: try the fixer, otherwise fail loudly.
                # Raised explicitly (not via `assert`) so it also triggers under `python -O`.
                fixed_id = fixer.fix(original_id) if fixer is not None else original_id
                if fixed_id not in original_id2pid:
                    raise AssertionError(
                        f"Evidence document not found in original_id2pid, original_id='{original_id}' "
                        "(pass ignore_missing_ids=True to drop it, or a fixer able to map it)"
                    )
                evidence[fixed_id] = None
        if evidence:
            data.append({"claim": claim, "evidence": list(evidence), "fever_id": r["id"]})
        else:
            no_evidence += 1
    print(f"Not found {evidence_not_found}/{evidence_total} evidence documents, {no_evidence} claims had zero evidence")
    return data

In [8]:
# EnFEVER
# Relies on SPLIT_ROOT and original_id2pid as left by the most recently executed
# configuration / corpus-import cells, so the EnFEVER ones must have run last.
# No id fixer is needed when the corpus was imported from SQLite.
enfever_lrev_id_fixer = None # SQLITE IMPORT
trn_data = import_split(Path(SPLIT_ROOT, "train.jsonl"), original_id2pid, fixer=enfever_lrev_id_fixer)
dev_data = import_split(Path(SPLIT_ROOT, "paper_dev.jsonl"), original_id2pid, fixer=enfever_lrev_id_fixer)
tst_data = import_split(Path(SPLIT_ROOT, "paper_test.jsonl"), original_id2pid, fixer=enfever_lrev_id_fixer)

100%|██████████| 145449/145449 [00:00<00:00, 495802.19it/s]


Not found 0/263822 evidence documents, 0 claims had zero evidence


100%|██████████| 9999/9999 [00:00<00:00, 641080.51it/s]


Not found 0/14475 evidence documents, 0 claims had zero evidence


100%|██████████| 9999/9999 [00:00<00:00, 608082.55it/s]

Not found 0/14150 evidence documents, 0 claims had zero evidence





In [9]:
# CsFEVER LREV
# ignore_missing_ids=True: evidence ids absent from original_id2pid are counted and dropped
# instead of raising; claims left without any evidence are omitted from the result.
# NOTE: re-binds trn_data/dev_data/tst_data, replacing whatever the EnFEVER cell loaded.
trn_data = import_split(Path(SPLIT_ROOT, "train_deepl.jsonl"), original_id2pid, ignore_missing_ids=True)
dev_data = import_split(Path(SPLIT_ROOT, "dev_deepl.jsonl"), original_id2pid, ignore_missing_ids=True)
tst_data = import_split(Path(SPLIT_ROOT, "test_deepl.jsonl"), original_id2pid, ignore_missing_ids=True)

100%|██████████| 107330/107330 [00:00<00:00, 646736.68it/s]


Not found 377/164506 evidence documents, 142 claims had zero evidence


100%|██████████| 9999/9999 [00:00<00:00, 605641.34it/s]


Not found 23/15358 evidence documents, 16 claims had zero evidence


100%|██████████| 9999/9999 [00:00<00:00, 625934.24it/s]

Not found 21/15614 evidence documents, 12 claims had zero evidence





## REMOVE? QACG Training Data

### EnFEVER

In [17]:
# NOTE(review): QACG_CLAIMS_ROOT is only defined in commented-out configuration lines, so on
# a fresh kernel this cell raises NameError unless the constant is defined first.
# Each split is read from a pair of files ("sup" and "ref" claims); no test split is loaded here.
trn_data = import_qacg_split([Path(QACG_CLAIMS_ROOT, "train_sup_claims.json"), Path(QACG_CLAIMS_ROOT, "train_ref_claims.json")])
dev_data = import_qacg_split([Path(QACG_CLAIMS_ROOT, "dev_sup_claims.json"), Path(QACG_CLAIMS_ROOT, "dev_ref_claims.json")])

reading: /mnt/data/factcheck/fever/data-en-lrev/qacg/train_sup_claims.json


100%|██████████| 12549/12549 [00:04<00:00, 2518.98it/s]


reading: /mnt/data/factcheck/fever/data-en-lrev/qacg/train_ref_claims.json


100%|██████████| 9807/9807 [00:00<00:00, 36919.38it/s]


reading: /mnt/data/factcheck/fever/data-en-lrev/qacg/dev_sup_claims.json


100%|██████████| 1460/1460 [00:00<00:00, 20199.91it/s]


reading: /mnt/data/factcheck/fever/data-en-lrev/qacg/dev_ref_claims.json


100%|██████████| 1460/1460 [00:00<00:00, 42556.32it/s]


### CsFEVER

In [10]:
trn_data = import_qacg_split([Path(QACG_CLAIMS_ROOT, "train_sup_claims-PAV-ner-CNEC_mt5-large-cp59000.json"), Path(QACG_CLAIMS_ROOT, "train_ref_claims-PAV-ner-CNEC_mt5-large-cp59000.json")])
dev_data = import_qacg_split([Path(QACG_CLAIMS_ROOT, "dev_sup_claims-PAV-ner-CNEC_mt5-large-cp59000.json"), Path(QACG_CLAIMS_ROOT, "dev_ref_claims-PAV-ner-CNEC_mt5-large-cp59000.json")])

reading: /mnt/data/factcheck/fever/data_full_nli-filtered-cs/qacg-r/claim/train_sup_claims-PAV-ner-CNEC_mt5-large-cp59000.json


  5%|▌         | 518/10000 [00:00<00:02, 3266.74it/s]

WARN>> claim not NFC, fixing...


100%|██████████| 10000/10000 [00:00<00:00, 21492.87it/s]


WARN>> claim not NFC, fixing...
WARN>> claim not NFC, fixing...
reading: /mnt/data/factcheck/fever/data_full_nli-filtered-cs/qacg-r/claim/train_ref_claims-PAV-ner-CNEC_mt5-large-cp59000.json


  6%|▋         | 648/10000 [00:00<00:03, 2606.91it/s]

WARN>> claim not NFC, fixing...
WARN>> claim not NFC, fixing...


100%|██████████| 10000/10000 [00:00<00:00, 15174.38it/s]


reading: /mnt/data/factcheck/fever/data_full_nli-filtered-cs/qacg-r/claim/dev_sup_claims-PAV-ner-CNEC_mt5-large-cp59000.json


100%|██████████| 1000/1000 [00:00<00:00, 75806.61it/s]


reading: /mnt/data/factcheck/fever/data_full_nli-filtered-cs/qacg-r/claim/dev_ref_claims-PAV-ner-CNEC_mt5-large-cp59000.json


100%|██████████| 1000/1000 [00:00<00:00, 83184.01it/s]


## Anserini Hard Negatives

We will use Anserini in the first stage to get hard negatives. 

In [15]:
export_as_anserini_collection(corpus, ANSERINI_COLLECTION)

In [13]:
!python -m pyserini.index.lucene \
    -collection JsonCollection \
    -generator DefaultLuceneDocumentGenerator \
    -threads 4 \
    -input {ANSERINI_COLLECTION} \
    -language {LANG} \
    -index {ANSERINI_INDEX} \
    -storePositions -storeDocvectors -storeRaw

2023-10-25 11:07:44,403 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-10-25 11:07:44,404 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-10-25 11:07:44,405 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: /mnt/data/factcheck/fever/data-en-lrev/anserini/collection
2023-10-25 11:07:44,405 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-10-25 11:07:44,405 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-10-25 11:07:44,405 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 4
2023-10-25 11:07:44,405 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-10-25 11:07:44,405 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-10-25 11:07:44,406 INFO  [main] index.IndexColl

In [18]:
# Run Anserini retrieval for every split and persist the result under ANSERINI_RETRIEVED.
# NOTE(review): the return value is ignored, so anserini_retrieve_claims presumably adds the
# "retrieved" list to each record in place, and 128 is presumably the number of documents
# requested per claim — confirm in colbert.utilities.prepare_data.
anserini_retrieve_claims(ANSERINI_INDEX, tst_data, 128)
write_jsonl(Path(ANSERINI_RETRIEVED, "test_anserini.jsonl"), tst_data, mkdir=True)

anserini_retrieve_claims(ANSERINI_INDEX, dev_data, 128)
write_jsonl(Path(ANSERINI_RETRIEVED, "dev_anserini.jsonl"), dev_data, mkdir=True)

anserini_retrieve_claims(ANSERINI_INDEX, trn_data, 128)
write_jsonl(Path(ANSERINI_RETRIEVED, "train_anserini.jsonl"), trn_data, mkdir=True)

100%|██████████| 6654/6654 [00:41<00:00, 162.26it/s]
100%|██████████| 6650/6650 [00:41<00:00, 161.00it/s]
100%|██████████| 71549/71549 [07:23<00:00, 161.20it/s]


In [19]:
def ensure_k(data, k):
    """Return the records of ``data`` whose ``"retrieved"`` list has at least ``k`` entries.

    The sizes before and after filtering are printed. ``data`` itself is left untouched;
    the surviving records keep their original order.
    """
    ndata = [entry for entry in data if len(entry["retrieved"]) >= k]
    olen = len(data)
    print(f"filtering k from {olen} to {len(ndata)}")
    return ndata

# Keep only claims with at least 128 retrieved documents (the same number that was passed
# to the retrieval calls) and store the filtered splits under separate "*_128_*" file names.
trn_data = ensure_k(trn_data, 128)
dev_data = ensure_k(dev_data, 128)
tst_data = ensure_k(tst_data, 128)

write_jsonl(Path(ANSERINI_RETRIEVED, "train_128_anserini.jsonl"), trn_data, mkdir=True)
write_jsonl(Path(ANSERINI_RETRIEVED, "dev_128_anserini.jsonl"), dev_data, mkdir=True)
write_jsonl(Path(ANSERINI_RETRIEVED, "test_128_anserini.jsonl"), tst_data, mkdir=True)

filtering k from 71549 to 71502
filtering k from 6650 to 6648
filtering k from 6654 to 6653


## SBERT Reranked Anserini

In [36]:
# import splits with hard negatives retrieved by Anserini
# NOTE(review): HARD_NEGATIVES_RETRIEVED is only defined in commented-out configuration
# lines, so this cell raises NameError on a fresh kernel. The Anserini cells of this
# notebook write "train_128_anserini.jsonl"/"dev_128_anserini.jsonl" directly under
# ANSERINI_RETRIEVED (without a TRAIN_DIR sub-directory) — verify which location is intended.
trn_data_anserini = read_jsonl(Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "train_128_anserini.jsonl"))
dev_data_anserini = read_jsonl(Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "dev_128_anserini.jsonl"))

### EnFEVER

In [None]:
sbert_CE_rerank(dev_data_anserini, corpus, Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "dev_128_anserini+minilm.jsonl"))

In [None]:
sbert_CE_rerank(trn_data_anserini, corpus, Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "train_128_anserini+minilm.jsonl"))

### CsFEVER
No Czech cross-encoders exist. No luck with bi-encoders. I'll stay with Anserini...

In [None]:
# sbert_BI_rerank(dev_data_anserini[:10], corpus, Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "dev_128_anserini+minilm.jsonl"), model_name="deepset/xlm-roberta-base-squad2")

In [None]:
# sbert_BI_rerank(trn_data_anserini, corpus, Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "trn_128_anserini+minilm.jsonl"), model_name="deepset/xlm-roberta-base-squad2")

## Triplet Generation

In [11]:
# EnFEVER MiniLM
# trn_data = read_jsonl(Path(HARD_NEGATIVES_RETRIEVED, Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "train_128_anserini+minilm.jsonl")))
# dev_data = read_jsonl(Path(HARD_NEGATIVES_RETRIEVED, Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "dev_128_anserini+minilm.jsonl")))

# EnFEVER and CsFEVER LREV
trn_data = read_jsonl(Path(ANSERINI_RETRIEVED, "train_128_anserini.jsonl"))
dev_data = read_jsonl(Path(ANSERINI_RETRIEVED, "dev_128_anserini.jsonl"))
tst_data = read_jsonl(Path(ANSERINI_RETRIEVED, "test_128_anserini.jsonl"))

# CsFEVER
# trn_data = read_jsonl(Path(HARD_NEGATIVES_RETRIEVED, Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "train_128_anserini.jsonl")))
# dev_data = read_jsonl(Path(HARD_NEGATIVES_RETRIEVED, Path(HARD_NEGATIVES_RETRIEVED, TRAIN_DIR, "dev_128_anserini.jsonl")))

In [21]:
# TODO move below as for QACG!
def export_queries(data, out_file):
    """Write the claims of ``data`` to ``out_file`` as JSONL query records.

    Each record of ``data`` yields one ``{"query": <claim>}`` line, in input order.
    The file is written with ``write_jsonl(..., mkdir=True)``.
    """
    queries = [{"query": record["claim"]} for record in tqdm(data)]
    write_jsonl(out_file, queries, mkdir=True)


# EnFEVER MiniLM
# export_queries(trn_data, Path(COLBERT_ROOT, TRAIN_DIR, "queries", "train_anserini+minilm_queries.jsonl"))
# export_queries(dev_data, Path(COLBERT_ROOT, TRAIN_DIR, "queries", "dev_anserini+minilm_queries.jsonl"))

#EnFEVER and CsFEVER
export_queries(trn_data, Path(QUERIES_ROOT, "train_anserini_queries.jsonl"))
export_queries(dev_data, Path(QUERIES_ROOT, "dev_anserini_queries.jsonl"))
export_queries(tst_data, Path(QUERIES_ROOT, "test_anserini_queries.jsonl"))

100%|██████████| 71502/71502 [00:00<00:00, 1698203.98it/s]
100%|██████████| 6648/6648 [00:00<00:00, 1593902.65it/s]
100%|██████████| 6653/6653 [00:00<00:00, 1839102.65it/s]


### Triples by Retrieval

In [18]:
# n_preretrieve = 64
# offset = 0
# trn_triples = generate_triples_by_retrieval(trn_data, corpus, original_id2pid, n_preretrieve, offset=offset)
# dev_triples = generate_triples_by_retrieval(dev_data, corpus, original_id2pid, n_preretrieve, offset=offset)
# write_jsonl(Path(COLBERT_ROOT, TRAIN_DIR, "triples", f"train_triples{n_preretrieve}_o{offset}_anserini+minilm.jsonl"), trn_triples)
# write_jsonl(Path(COLBERT_ROOT,TRAIN_DIR,  "triples", f"dev_triples{n_preretrieve}_o{offset}_anserini+minilm.jsonl"), dev_triples)

In [28]:
Counter([len(e["evidence"]) for e in trn_data])

Counter({1: 63473,
         2: 6340,
         3: 895,
         4: 361,
         5: 181,
         7: 87,
         6: 84,
         8: 34,
         9: 19,
         10: 10,
         11: 6,
         12: 3,
         13: 3,
         15: 2,
         14: 2,
         17: 1,
         21: 1})

In [12]:
# Number of candidates per training example; also embedded in the output file names.
nway = 128

# EnFEVER MiniLM
# trn_triples = generate_triples_by_retrieval_nway(trn_data, corpus, original_id2pid, nway=nway)
# write_jsonl(Path(COLBERT_ROOT, TRAIN_DIR, "triples", f"train_triples_nway{nway}_anserini+minilm.jsonl"), trn_triples, mkdir=True)

# dev_triples = generate_triples_by_retrieval_nway(dev_data, corpus, original_id2pid, nway=nway)
# write_jsonl(Path(COLBERT_ROOT, TRAIN_DIR, "triples", f"dev_triples_nway{nway}_anserini+minilm.jsonl"), dev_triples, mkdir=True)

# EnFEVER and CsFEVER
# Generates and stores n-way triples for all three splits under TRIPLES_ROOT.
# NOTE(review): the exact effect of use_evidence=True is defined in
# colbert.utilities.prepare_data and is not visible here — confirm before relying on it.
trn_triples = generate_triples_by_retrieval_nway(trn_data, corpus, original_id2pid, nway=nway, use_evidence=True)
write_jsonl(Path(TRIPLES_ROOT, f"train_triples_nway{nway}_evidence+anserini.jsonl"), trn_triples, mkdir=True)

dev_triples = generate_triples_by_retrieval_nway(dev_data, corpus, original_id2pid, nway=nway, use_evidence=True)
write_jsonl(Path(TRIPLES_ROOT, f"dev_triples_nway{nway}_evidence+anserini.jsonl"), dev_triples, mkdir=True)

tst_triples = generate_triples_by_retrieval_nway(tst_data, corpus, original_id2pid, nway=nway, use_evidence=True)
write_jsonl(Path(TRIPLES_ROOT, f"test_triples_nway{nway}_evidence+anserini.jsonl"), tst_triples, mkdir=True)

100%|██████████| 109807/109807 [00:13<00:00, 8003.40it/s]


generated 109807 triples with 0 failures and 0 random fixes


100%|██████████| 6666/6666 [00:00<00:00, 7080.56it/s]


generated 6666 triples with 0 failures and 0 random fixes


100%|██████████| 6666/6666 [00:00<00:00, 7051.86it/s]


generated 6666 triples with 0 failures and 0 random fixes


In [38]:
TRIPLES_ROOT

PosixPath('/mnt/data/factcheck/fever/data-cs-lrev/colbertv2/triples')

### Fake Claims
Extend the actual FEVER claims with fake claims: random sentences selected from random corpus documents.

In [9]:
# Select the sentence-tokenizer language, the fake/real claim ratio (`factor`) and the
# file-name suffix used by the cells below. Keep exactly one assignment active.
# CS
# language, factor, suffix = "czech", 0.0, ""
# language, factor, suffix = "czech", 3.0, "_factor3" # this should roughly match QACG number of triples

# EN
# NOTE: both lines below are active; the second one immediately overrides the first,
# so the effective setting is factor=2.0 with suffix "_factor2".
language, factor, suffix = "english", 0.0, ""
language, factor, suffix = "english", 2.0, "_factor2" # this should roughly match QACG number of triples

In [11]:
def extend_by_fake_claims(data, corpus, factor=1.0, language="czech", seed=1234):
    """Return ``data`` extended by fake claims and shuffled.

    A fake claim is a random sentence taken from a random corpus document; that document
    is recorded as the claim's only evidence. Fake records have no ``"fever_id"`` key.
    ``int(len(data) * factor)`` fake claims are created from distinct documents.

    The input list is NOT modified: a new list is built and returned, so re-running the
    call on the same input gives the same result instead of silently growing the input.

    Args:
        data: list of claim records (dicts with at least ``"claim"`` and ``"evidence"``).
        corpus: sequence of dicts with ``"id"`` and ``"text"`` keys.
        factor: number of fake claims relative to ``len(data)``.
        language: language name passed to NLTK's ``sent_tokenize``.
        seed: seed of the ``numpy.random.RandomState`` used for sampling and shuffling.

    Returns:
        New shuffled list holding the original records followed by the fake ones.

    Raises:
        ValueError: from NumPy if more fake claims are requested than there are documents,
            or if a sampled document yields no sentence.
    """
    from nltk.tokenize import sent_tokenize
    rng = np.random.RandomState(seed)
    n = int(len(data) * factor)
    print(f"extending by {n} fake claims")
    # Sample document *indices*: sampling the records themselves makes NumPy copy the whole
    # corpus into an object array first.
    doc_indices = rng.choice(len(corpus), n, replace=False)
    fake_claims = []
    for idx in doc_indices:
        doc = corpus[int(idx)]
        # take one random sentence of the document as the claim
        claim = str(rng.choice(sent_tokenize(doc["text"], language=language)))
        fake_claims.append({"claim": claim, "evidence": [doc["id"]]})
    extended = list(data) + fake_claims
    rng.shuffle(extended)
    return extended


# Applies to whichever language/factor/suffix is selected in the configuration cell above
# (not only CS). Each split gets its own seed.
# NOTE: the split variables are re-bound to the extended data, so re-running this cell
# extends the already-extended splits again; reload the splits before re-running.
trn_data = extend_by_fake_claims(trn_data, corpus, factor=factor, language=language, seed=1234)
dev_data = extend_by_fake_claims(dev_data, corpus, factor=factor, language=language, seed=1235)
tst_data = extend_by_fake_claims(tst_data, corpus, factor=factor, language=language, seed=1236)

extending by 219620 fake claims
extending by 13332 fake claims
extending by 13332 fake claims


In [12]:
dev_data[:5]

[{'claim': 'Miranda Otto began her film acting career at age 18.',
  'evidence': ['Miranda_Otto'],
  'fever_id': 23198},
 {'claim': 'Chirognathus is an extinct genus of conodonts in the family Chirognathidae.',
  'evidence': ['Chirognathus']},
 {'claim': 'Vedam stars only Canadian film actors and actresses.',
  'evidence': ['Manoj_Bajpayee',
   'Saranya_Ponvannan',
   'Anushka_Shetty',
   'Allu_Arjun',
   'Manchu_Manoj',
   'Vedam_-LRB-film-RRB-',
   'Lekha_Washington',
   'Deeksha_Seth'],
  'fever_id': 135962},
 {'claim': 'Once in the tubule wall, the glucose and amino acids diffuse directly into the blood capillaries along a concentration gradient.',
  'evidence': ['Renal_glucose_reabsorption']},
 {'claim': 'Paramore is a classic rock band.',
  'evidence': ['Paramore'],
  'fever_id': 96739}]

In [13]:
# Anserini retrieval for the splits extended by fake claims; file names carry `suffix`.
# NOTE(review): as the return value is ignored, anserini_retrieve_claims presumably
# annotates the records in place — confirm in colbert.utilities.prepare_data.
anserini_retrieve_claims(ANSERINI_INDEX, tst_data, 128)
write_jsonl(Path(ANSERINI_RETRIEVED, f"test_normal+fake{suffix}_anserini.jsonl"), tst_data, mkdir=True)

anserini_retrieve_claims(ANSERINI_INDEX, dev_data, 128)
write_jsonl(Path(ANSERINI_RETRIEVED, f"dev_normal+fake{suffix}_anserini.jsonl"), dev_data, mkdir=True)

anserini_retrieve_claims(ANSERINI_INDEX, trn_data, 128)
write_jsonl(Path(ANSERINI_RETRIEVED, f"train_normal+fake{suffix}_anserini.jsonl"), trn_data, mkdir=True)

100%|██████████| 19998/19998 [13:14<00:00, 25.18it/s]
100%|██████████| 19998/19998 [12:53<00:00, 25.85it/s]
100%|██████████| 329430/329430 [3:37:11<00:00, 25.28it/s]  


In [20]:
ANSERINI_RETRIEVED

PosixPath('/mnt/data/factcheck/fever/data-en-lrev/anserini/retrieved')

In [21]:
def ensure_k(data, k):
    """Filter ``data`` down to records with at least ``k`` entries in ``"retrieved"``.

    NOTE: this re-defines the identical ``ensure_k`` from the Anserini hard-negatives
    section of this notebook. Prints the sizes before and after filtering and returns a
    new list; the input is not modified.
    """
    olen = len(data)
    ndata = []
    for entry in data:
        if len(entry["retrieved"]) >= k:
            ndata.append(entry)
    print(f"filtering k from {olen} to {len(ndata)}")
    return ndata

# Keep only claims with at least 128 retrieved documents and store the filtered
# fake-extended splits under "*_128_*" file names (including `suffix`).
trn_data = ensure_k(trn_data, 128)
dev_data = ensure_k(dev_data, 128)
tst_data = ensure_k(tst_data, 128)

write_jsonl(Path(ANSERINI_RETRIEVED, f"train_normal+fake{suffix}_128_anserini.jsonl"), trn_data, mkdir=True)
write_jsonl(Path(ANSERINI_RETRIEVED, f"dev_normal+fake{suffix}_128_anserini.jsonl"), dev_data, mkdir=True)
write_jsonl(Path(ANSERINI_RETRIEVED, f"test_normal+fake{suffix}_128_anserini.jsonl"), tst_data, mkdir=True)

filtering k from 329200 to 329200
filtering k from 19980 to 19980
filtering k from 19979 to 19979


In [22]:
# Reload the filtered fake-extended splits from disk so this cell can be run without
# repeating the (slow) retrieval cells above.
trn_data = read_jsonl(Path(ANSERINI_RETRIEVED, f"train_normal+fake{suffix}_128_anserini.jsonl"))
dev_data = read_jsonl(Path(ANSERINI_RETRIEVED, f"dev_normal+fake{suffix}_128_anserini.jsonl"))
tst_data = read_jsonl(Path(ANSERINI_RETRIEVED, f"test_normal+fake{suffix}_128_anserini.jsonl"))

# Number of candidates per training example; also embedded in the output file names.
nway = 128
# EnFEVER and CsFEVER
trn_triples = generate_triples_by_retrieval_nway(trn_data, corpus, original_id2pid, nway=nway, use_evidence=True)
write_jsonl(Path(TRIPLES_ROOT, f"train_triples_normal+fake{suffix}_nway{nway}_evidence+anserini.jsonl"), trn_triples, mkdir=True)

dev_triples = generate_triples_by_retrieval_nway(dev_data, corpus, original_id2pid, nway=nway, use_evidence=True)
write_jsonl(Path(TRIPLES_ROOT, f"dev_triples_normal+fake{suffix}_nway{nway}_evidence+anserini.jsonl"), dev_triples, mkdir=True)

tst_triples = generate_triples_by_retrieval_nway(tst_data, corpus, original_id2pid, nway=nway, use_evidence=True)
write_jsonl(Path(TRIPLES_ROOT, f"test_triples_normal+fake{suffix}_nway{nway}_evidence+anserini.jsonl"), tst_triples, mkdir=True)

100%|██████████| 329200/329200 [00:37<00:00, 8705.13it/s]


generated 329200 triples with 0 failures and 0 random fixes


100%|██████████| 19980/19980 [00:02<00:00, 8467.68it/s]


generated 19980 triples with 0 failures and 0 random fixes


100%|██████████| 19979/19979 [00:02<00:00, 8388.33it/s]


generated 19979 triples with 0 failures and 0 random fixes


In [23]:
TRIPLES_ROOT

PosixPath('/mnt/data/factcheck/fever/data-en-lrev/colbertv2/triples')

In [24]:
def export_queries(data, out_file):
    """Store one ``{"query": <claim>}`` JSONL record per entry of ``data`` in ``out_file``.

    NOTE: this re-defines the identical ``export_queries`` from the triplet-generation
    section of this notebook. Output directories are created (``mkdir=True``).
    """
    write_jsonl(out_file, [{"query": r["claim"]} for r in tqdm(data)], mkdir=True)

#EnFEVER and CsFEVER
# Export the claims of the fake-extended splits as ColBERT queries; the row order matches
# trn_data/dev_data/tst_data as they are at the time this cell runs.
export_queries(trn_data, Path(QUERIES_ROOT, f"train_normal+fake{suffix}_queries.jsonl"))
export_queries(dev_data, Path(QUERIES_ROOT, f"dev_normal+fake{suffix}_queries.jsonl"))
export_queries(tst_data, Path(QUERIES_ROOT, f"test_normal+fake{suffix}_queries.jsonl"))

100%|██████████| 329200/329200 [00:00<00:00, 1513452.26it/s]
100%|██████████| 19980/19980 [00:00<00:00, 2021229.44it/s]
100%|██████████| 19979/19979 [00:00<00:00, 1726227.74it/s]


In [19]:
QUERIES_ROOT

PosixPath('/mnt/data/factcheck/fever/data-en-lrev/colbertv2/queries')

### Random

In [20]:
def generate_triples_random(data, corpus, original_id2pid, k, seed=1234):
    """Generate (query, positive, negative) triples with randomly drawn (soft) negatives.

    For every evidence document of every claim, ``k`` distinct random corpus documents
    other than that evidence document are drawn as negatives.

    Args:
        data: list of claim records; ``r["evidence"]`` holds the ids of positive documents.
            The position of a record in ``data`` is used as its query id.
        corpus: sequence of dicts with an ``"id"`` key.
        original_id2pid: mapping from document id to the passage id written to the triples.
        k: number of negatives per (claim, positive) pair.
        seed: seed of the ``numpy.random.RandomState`` used for drawing negatives.

    Returns:
        List of ``(qid, positive_pid, negative_pid)`` tuples.

    Raises:
        ValueError: if ``k`` exceeds the number of documents available as negatives
            (previously this resulted in an endless loop).
        KeyError: if an evidence id is not present in ``corpus`` / ``original_id2pid``.
    """
    # generate soft negatives by choosing random documents
    idx2id = {i: doc["id"] for i, doc in enumerate(corpus)}
    id2idx = {doc["id"]: i for i, doc in enumerate(corpus)}
    n_docs = len(id2idx)

    rng = np.random.RandomState(seed)

    def random_docs(posidx: int):
        # Draw k distinct corpus indices different from the positive's index.
        if k > n_docs - 1:
            raise ValueError(f"cannot draw k={k} distinct negatives from {n_docs} documents")
        docs = set()
        while len(docs) < k:
            doc = rng.choice(n_docs)
            # Compare index with index: the original compared the drawn index with the
            # positive's string id, which never matched, so positives leaked into negatives.
            if doc != posidx:
                docs.add(doc)
        return list(docs)

    triples = []
    for qid, r in enumerate(tqdm(data)):
        for pos in r["evidence"]:
            pos_pid = original_id2pid[pos]
            for neg in random_docs(id2idx[pos]):
                triples.append((qid, pos_pid, original_id2pid[idx2id[neg]]))
    print(f"generated {len(triples)} triples")
    return triples

In [21]:
# Number of random negatives drawn per (claim, evidence) pair.
k = 32
trn_triples = generate_triples_random(trn_data, corpus, original_id2pid, k, seed=1234)
dev_triples = generate_triples_random(dev_data, corpus, original_id2pid, k, seed=1235)
# NOTE(review): these paths use TRAIN_DIR (set only in the EnFEVER configuration cell) and,
# unlike the other write_jsonl calls in this notebook, do not pass mkdir=True — presumably
# the target directory must already exist; confirm against aic_nlp_utils.json.write_jsonl.
write_jsonl(Path(COLBERT_ROOT, TRAIN_DIR, "triples", f"train_triples{k}_random.jsonl"), trn_triples)
write_jsonl(Path(COLBERT_ROOT, TRAIN_DIR, "triples", f"dev_triples{k}_random.jsonl"), dev_triples)

100%|██████████| 310798/310798 [01:29<00:00, 3464.42it/s]


generated 9945536 triples


100%|██████████| 31650/31650 [00:09<00:00, 3480.83it/s]


generated 1012800 triples


In [28]:
# TODO implement this, but for full Wikipedia corpora, EnFEVER has only single paragraph=document per page
def generate_triples_by_page(data, corpus, original_id2pid, k):
    """Placeholder for same-page negative sampling; currently always returns an empty list.

    Intended behaviour (not implemented yet): for each positive document, add
    k-1 negatives taken from the same page. Here "document" means a paragraph
    and "page" means the whole original text (e.g. a Wikipedia page made up of
    several paragraph documents).

    As written, the function only builds an id -> text lookup from `corpus`
    and returns the empty `triples` list; `data`, `original_id2pid` and `k`
    are not used.
    """
    # documents = paragraphs, I use page as term describing whole original text (e.g., Wikipedia page composed of documents=paragraphs)
    # takes the positive document and adds k-1 negatives from the same page
    id2txt = {doc["id"]: doc["text"] for doc in corpus}
    failures = 0
    triples = []
    # Disabled draft below. NOTE(review): it reads r["retrieved"] and an `offset` that is not a
    # parameter of this function, so it looks like a retrieval-based variant -- confirm before reviving it.
    # for qid, r in enumerate(data):
    #     # those retrieved but not in the annotated evidence will become hard negatives
    #     retrieved = set(r["retrieved"][offset:]).difference(r["evidence"])
    #     for pos in r["evidence"]:
    #         if pos not in id2txt:
    #             # may happen for EnFEVER when the snapshot does not exactly match
    #             failures += 1
    #             continue
    #         for neg in list(retrieved)[:k]:
    #             triples.append((qid, original_id2pid[pos], original_id2pid[neg]))
    # print(f"generated {len(triples)} triples with {failures} failures")
    return triples

# k = 8
# dev_triples = generate_triples_by_page(dev_data, corpus, original_id2pid, k=k)

In [29]:
# Inspect one corpus record to see which fields a document carries.
corpus[1]

{'id': 'Astronomie',
 'revid': '21356648',
 'url': 'https://cs.wikipedia.org/wiki?curid=10',
 'title': 'Astronomie',
 'original_id': 10,
 'text': 'Astronomie, řecky αστρονομία z άστρον (astron) hvězda a νόμος (nomos) zákon, česky též hvězdářství, je věda, která se zabývá jevy za hranicemi zemské atmosféry. Zvláště tedy výzkumem vesmírných těles, jejich soustav, různých dějů ve vesmíru i vesmírem jako celkem. Historie astronomie. Antika. Astronomie se podobně jako další vědy začala rozvíjet ve starověku. Na území Babylonie však nebylo k popisu používáno již vynalezené geometrie (grafy). První se z astronomie rozvíjela astrometrie, zabývající se měřením poloh hvězd a planet na obloze. Tato oblast astronomie měla velký význam pro navigaci. Podstatnou částí astrometrie je sférická astronomie sloužící k popisu poloh objektů na nebeské sféře, zavádí souřadnice a popisuje významné křivky a body na nebeské sféře. Pojmy ze sférické astronomie se také používají při měření času. Další oblastí ast

In [22]:
# Count how often each document (by original id) sits in the positive slot of a training triple,
# then show the 20 most frequent ones.
c = Counter(
    pid2original_id[positive_pid]
    for _qid, positive_pid, *_negatives in trn_triples
)
c.most_common(20)

[('Snoop_Dogg', 6912),
 ('Marlon_Brando', 6880),
 ('United_States', 6112),
 ('Wyatt_Earp', 5888),
 ('Michael_Jackson', 5856),
 ('United_Kingdom', 5760),
 ('Adele', 5152),
 ('Miley_Cyrus', 5024),
 ('Tim_Rice', 4832),
 ('The_Beatles', 4544),
 ('International_relations', 4480),
 ('A_Song_of_Ice_and_Fire', 4480),
 ('Abraham_Lincoln', 4448),
 ('David_Beckham', 4320),
 ('Anne_Hathaway', 4192),
 ('One_Direction', 4192),
 ('Frank_Sinatra', 4160),
 ('Oliver_Reed', 4096),
 ('Deadpool_-LRB-_film_-RRB-', 4064),
 ('Bradley_Cooper', 4064)]

In [15]:
# Write the current train/dev triples to files whose names include `n_preretrieve` and `offset`.
# NOTE(review): `n_preretrieve` and `offset` are not defined in the visible cells, and this path has
# no TRAIN_DIR component (unlike the random-triples export) -- confirm which triples `trn_triples` /
# `dev_triples` hold before re-running this cell.
write_jsonl(Path(COLBERT_ROOT, "triples", f"train_triples{n_preretrieve}_o{offset}.jsonl"), trn_triples)
write_jsonl(Path(COLBERT_ROOT, "triples", f"dev_triples{n_preretrieve}_o{offset}.jsonl"), dev_triples)

In [52]:
# Load dev triples from a JSONL file, replacing the in-memory `dev_triples`.
# NOTE(review): this file sits directly under COLBERT_ROOT and is named for k=1, whereas the random
# triples in this notebook are written to COLBERT_ROOT/TRAIN_DIR/triples with k=32 -- confirm the path.
dev_triples = read_jsonl(Path(COLBERT_ROOT, f"dev_triples1_random.jsonl"))

In [56]:
import textwrap
# Preview a single triple (the second entry): claim, positive document id, negative document id.
# The pids are used directly as positions in `corpus`.
for qid, pos, neg in dev_triples[1:2]:
    claim = dev_data[qid]['claim']
    # pos = textwrap.fill(corpus[pos]['text'])
    # neg = textwrap.fill(corpus[neg]['text'])
    pos = corpus[pos]["id"]
    neg = corpus[neg]["id"]

    print(claim, pos, "", neg, "-----------", sep="\n")

Telemundo is a English-language television network.
Hispanic_and_Latino_Americans

Cnaphalocrocis_poeyalis
-----------


## Subsample QACG-(CS, EN) to (Cs, En)FEVER Size

In [6]:
def sample_qacg_to_fever(src_split, size_split, dst_split, seed=1234):
    """Subsample one split so that its per-label counts match another split.

    The label distribution is measured on `size_split` (labels "SUPPORTS",
    "REFUTES", "NOT ENOUGH INFO", mapped to "s", "r", "n"). For each label the
    same number of records is drawn without replacement from the `src_split`
    records carrying that short label. The result is shuffled and written to
    `dst_split`. Target and final label counts are printed.

    Args:
        src_split: JSONL file to sample from; records use labels "s"/"r"/"n".
        size_split: JSONL file whose label counts define the target sizes.
        dst_split: path the sampled records are written to.
        seed: seed of the NumPy `RandomState` driving sampling and shuffling.
    """
    rng = np.random.RandomState(seed)
    src_data = read_jsonl(src_split)
    size_data = read_jsonl(size_split)

    rename_labels = {"SUPPORTS": "s", "REFUTES": "r", "NOT ENOUGH INFO": "n"}
    target_sizes = Counter(rename_labels[rec["label"]] for rec in size_data)
    target_total = np.sum(list(target_sizes.values()))
    print(f"target sizes: {target_sizes}, total: {target_total}")

    sampled = []
    for label, n_wanted in target_sizes.items():
        candidates = [rec for rec in src_data if rec["label"] == label]
        # draw without replacement so no source record is picked twice
        sampled.extend(rng.choice(candidates, n_wanted, replace=False))
    rng.shuffle(sampled)

    final_sizes = Counter(rec["label"] for rec in sampled)
    final_total = np.sum(list(final_sizes.values()))
    print(f" final sizes: {final_sizes}, total: {final_total}")
    write_jsonl(dst_split, sampled)


# sample_qacg_to_fever(src_split="/mnt/data/factcheck/wiki/cs/20230801/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/train_balanced.jsonl",
#                      size_split="/mnt/data/factcheck/fever/data-cs-lrev/fever-data/train_deepl.jsonl",
#                      dst_split="/mnt/data/factcheck/wiki/cs/20230801/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/train_fever_size.jsonl"
#                      )

# sample_qacg_to_fever(src_split="/mnt/data/factcheck/wiki/cs/20230801/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/dev_balanced.jsonl",
#                      size_split="/mnt/data/factcheck/fever/data-cs-lrev/fever-data/dev_deepl.jsonl",
#                      dst_split="/mnt/data/factcheck/wiki/cs/20230801/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/dev_fever_size.jsonl"
#                      )
# sample_qacg_to_fever(src_split="/mnt/data/factcheck/wiki/cs/20230801/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/test_balanced.jsonl",
#                      size_split="/mnt/data/factcheck/fever/data-cs-lrev/fever-data/test_deepl.jsonl",
#                      dst_split="/mnt/data/factcheck/wiki/cs/20230801/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/test_fever_size.jsonl"
#                      )

# sample_qacg_to_fever(src_split="/mnt/data/factcheck/wiki/en/20230801/qacg/splits/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/train_balanced.jsonl",
#                      size_split="/mnt/data/factcheck/fever/data-en-lrev/fever-data/train.jsonl",
#                      dst_split="/mnt/data/factcheck/wiki/en/20230801/qacg/splits/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/train_fever_size.jsonl"
#                      )

# sample_qacg_to_fever(src_split="/mnt/data/factcheck/wiki/en/20230801/qacg/splits/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/dev_balanced.jsonl",
#                      size_split="/mnt/data/factcheck/fever/data-en-lrev/fever-data/paper_dev.jsonl",
#                      dst_split="/mnt/data/factcheck/wiki/en/20230801/qacg/splits/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/dev_fever_size.jsonl"
#                      )
# sample_qacg_to_fever(src_split="/mnt/data/factcheck/wiki/en/20230801/qacg/splits/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/test_balanced.jsonl",
#                      size_split="/mnt/data/factcheck/fever/data-en-lrev/fever-data/paper_test.jsonl",
#                      dst_split="/mnt/data/factcheck/wiki/en/20230801/qacg/splits/stanza/mt5-large_all-cp126k/mt5-large_all-cp156k/test_fever_size.jsonl"
#                      )


target sizes: Counter({'s': 53542, 'n': 35639, 'r': 18149}), total: 107330
 final sizes: Counter({'s': 53542, 'n': 35639, 'r': 18149}), total: 107330
target sizes: Counter({'n': 3333, 's': 3333, 'r': 3333}), total: 9999
 final sizes: Counter({'n': 3333, 'r': 3333, 's': 3333}), total: 9999
target sizes: Counter({'s': 3333, 'n': 3333, 'r': 3333}), total: 9999
 final sizes: Counter({'s': 3333, 'r': 3333, 'n': 3333}), total: 9999


## Combine FEVER and QACG

In [3]:
def combine_data(src_queries_lsts, src_triples_lsts, dst_queries, dst_triples, seed=1234):
    """Concatenate several (queries, triples) file pairs into one pair of files.

    Sources are processed in the given order. Queries are appended as-is. The
    first element of every triple row is shifted by the number of queries
    contributed by the preceding sources (this assumes it is the 0-based
    position of the query within its own file -- TODO confirm). The remaining
    elements of each row are copied unchanged.

    The combined data is written in concatenation order; nothing is shuffled.

    Args:
        src_queries_lsts: paths of the query JSONL files to combine.
        src_triples_lsts: paths of the triple JSONL files, aligned one-to-one
            with `src_queries_lsts`.
        dst_queries: output path for the combined queries.
        dst_triples: output path for the combined triples.
        seed: currently unused; kept so existing calls stay valid.

    Raises:
        AssertionError: if the two path lists differ in length, or a query
            file and its triple file contain a different number of rows.
    """
    assert len(src_queries_lsts) == len(src_triples_lsts)

    queries = []
    triples = []
    offset = 0
    for src_queries, src_triples in zip(src_queries_lsts, src_triples_lsts):
        source_queries = read_jsonl(src_queries)
        source_triples = read_jsonl(src_triples)
        assert len(source_queries) == len(source_triples)
        queries.extend(source_queries)
        # Re-base the query index; build new rows instead of mutating the loaded ones.
        triples.extend([row[0] + offset, *row[1:]] for row in source_triples)
        offset += len(source_queries)

    write_jsonl(dst_queries, queries, mkdir=True)
    write_jsonl(dst_triples, triples, mkdir=True)

In [9]:
# EnFEVER
# Merge the FEVER and QACG query/triple files (anserini+minilm variant) for train, then dev.
for split in ("train", "dev"):
    queries_file = f"{split}_anserini+minilm_queries.jsonl"
    triples_file = f"{split}_triples_nway128_anserini+minilm.jsonl"
    combine_data(
        src_queries_lsts=[Path(COLBERT_ROOT, source, "queries", queries_file) for source in ("fever", "qacg")],
        src_triples_lsts=[Path(COLBERT_ROOT, source, "triples", triples_file) for source in ("fever", "qacg")],
        dst_queries=Path(COLBERT_ROOT, "fever+qacg", "queries", queries_file),
        dst_triples=Path(COLBERT_ROOT, "fever+qacg", "triples", triples_file),
    )

In [5]:
# CsFEVER
# Merge the FEVER and QACG query/triple files (anserini variant) for train, then dev.
for split in ("train", "dev"):
    queries_file = f"{split}_anserini_queries.jsonl"
    triples_file = f"{split}_triples_nway128_anserini.jsonl"
    combine_data(
        src_queries_lsts=[Path(COLBERT_ROOT, source, "queries", queries_file) for source in ("fever", "qacg")],
        src_triples_lsts=[Path(COLBERT_ROOT, source, "triples", triples_file) for source in ("fever", "qacg")],
        dst_queries=Path(COLBERT_ROOT, "fever+qacg", "queries", queries_file),
        dst_triples=Path(COLBERT_ROOT, "fever+qacg", "triples", triples_file),
    )