In [1]:
%load_ext autoreload
%autoreload 2

# Init

In [2]:
import sys
sys.path.insert(0, "../src")
from pathlib import Path

from joblib import Parallel, delayed

from retrieval.retrieval import BM25DocumentRetriever

# FEVER

In [3]:
# FEVER locations, all derived from one scratch root so the prefix is written once.
_fever_scratch = Path("/users/k21190024/study/fact-check-transfer-learning/scratch")

# Directory holding the per-split claim files (`<split>.jsonl`).
fever_inp = _fever_scratch / "data" / "fever"
# Database file sitting next to the claim files; kept as a plain string, not a Path.
fever_db = str(fever_inp / "fever.db")

# Directory the retrieval dumps are written to.
fever_outp = _fever_scratch / "dumps" / "fever-nei-sampled"

In [4]:
def run_document_retrieve(in_file, out_file, db_path, pyserini_index_name, bm25_top_k=5, n_jobs=20):
    """Run BM25 document retrieval for one input file and dump the results as JSONL.

    Args:
        in_file: Input file handed to ``BM25DocumentRetriever`` as its first argument.
        out_file: Destination handed to ``to_jsonl``; always written with ``overwrite=True``.
        db_path: Database handed to ``BM25DocumentRetriever`` as its second argument.
        pyserini_index_name: Name of the Pyserini index to search.
        bm25_top_k: Forwarded as ``bm25_top_k``. Defaults to 5, the previously hard-coded value.
        n_jobs: Forwarded as ``n_jobs``. Defaults to 20, the previously hard-coded value.

    Returns:
        Always ``0``, once the dump has been written.
    """
    doc_retrieval = BM25DocumentRetriever(
        in_file,
        # Use the caller-supplied database. This used to read the notebook-global
        # `fever_db`, so `db_path` was silently ignored.
        db_path,
        pyserini_index_name=pyserini_index_name,
        bm25_top_k=bm25_top_k,
        n_jobs=n_jobs,
    )
    doc_retrieval.batch_document_retrieve()
    doc_retrieval.to_jsonl(out_file, overwrite=True)

    return 0

In [5]:
# Retrieve the top-n BM25 pages for every FEVER split and dump each split as JSONL.
datasets = ["dev", "train", "test"]
n = 5  # pages kept per claim; also embedded in the output file name

for data in datasets:
    claims_path = fever_inp / f"{data}.jsonl"
    dump_path = fever_outp / f"{data}.n{n}.jsonl"

    doc_retrieval = BM25DocumentRetriever(
        claims_path,
        fever_db,
        pyserini_index_name="beir-v1.0.0-fever-flat",
        bm25_top_k=n,
        n_jobs=20,
    )
    doc_retrieval.batch_document_retrieve()
    # Existing dumps for the same split/n are replaced.
    doc_retrieval.to_jsonl(dump_path, overwrite=True)

Attempting to initialize pre-built index beir-v1.0.0-fever-flat.
/users/k21190024/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-fever-flat.20220501.1842ee.63cd5f369b5952386f138efe45571d41 already exists, skipping download.
Initializing beir-v1.0.0-fever-flat...
2023-07-01 14:18:56,503 INFO  [pool-2-thread-17] search.SimpleSearcher (SimpleSearcher.java:580) - Retrieving query 500 (0.027 s/query)
2023-07-01 14:19:00,359 INFO  [pool-2-thread-17] search.SimpleSearcher (SimpleSearcher.java:580) - Retrieving query 1000 (0.017 s/query)
2023-07-01 14:19:03,492 INFO  [pool-2-thread-11] search.SimpleSearcher (SimpleSearcher.java:580) - Retrieving query 1500 (0.014 s/query)
2023-07-01 14:19:05,999 INFO  [pool-2-thread-11] search.SimpleSearcher (SimpleSearcher.java:580) - Retrieving query 2000 (0.011 s/query)
2023-07-01 14:19:08,485 INFO  [pool-2-thread-14] search.SimpleSearcher (SimpleSearcher.java:580) - Retrieving query 2500 (0.010 s/query)
2023-07-01 14:19:10,661 INFO  [pool-2-thread-10] se

100%|██████████| 9999/9999 [03:55<00:00, 42.49it/s] 


# Climate-FEVER

In [4]:
# Climate-FEVER locations. The claim splits and the title-id database both live under
# the `feverised-climatefever_sent` dump directory.
_cfever_dumps = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps")
_cfever_sent = _cfever_dumps / "feverised-climatefever_sent"

# Directory holding the `climatefever_<split>.jsonl` claim files.
cfever_inp = _cfever_sent / "finetune" / "baseline"
# Database file; kept as a plain string, not a Path.
cfever_db = str(_cfever_sent / "feverised-climatefever-titleid.db")

# Directory the retrieval dumps are written to.
cfever_outp = _cfever_dumps / "climatefever-neg-sampled"

In [5]:
# Retrieve BM25 pages for each Climate-FEVER split, trim every claim to its first n
# predicted pages, and dump each split as JSONL.
datasets = ["train", "dev"]
n = 5  # pages kept per claim after trimming; also embedded in the output file name

for data in datasets:
    claims_path = cfever_inp / f"climatefever_{data}.jsonl"
    dump_path = cfever_outp / f"{data}.n{n}.jsonl"

    # Retrieval asks for 100 candidates even though only n are kept below.
    # NOTE(review): presumably the retriever drops some candidates internally — confirm.
    doc_retrieval = BM25DocumentRetriever(
        claims_path,
        cfever_db,
        pyserini_index_name="beir-v1.0.0-climate-fever-flat",
        bm25_top_k=100,
        n_jobs=20,
    )
    doc_retrieval.batch_document_retrieve()

    for doc in doc_retrieval.results:
        kept_pages = doc["predicted_pages"][:n]
        doc["predicted_pages"] = kept_pages
        # Keep only the score entries whose first element is one of the kept pages.
        doc["predicted_pages_score"] = [
            scored for scored in doc["predicted_pages_score"] if scored[0] in kept_pages
        ]

    # Unlike the FEVER dump, this is written with overwrite=False.
    # NOTE(review): presumably this refuses to replace an existing dump — confirm in to_jsonl.
    doc_retrieval.to_jsonl(dump_path, overwrite=False)

Attempting to initialize pre-built index beir-v1.0.0-climate-fever-flat.
/users/k21190024/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-climate-fever-flat.20220501.1842ee.9af959cf58139d086d52121917913a02 already exists, skipping download.
Initializing beir-v1.0.0-climate-fever-flat...
2023-07-05 23:43:35,457 INFO  [pool-2-thread-19] search.SimpleSearcher (SimpleSearcher.java:580) - Retrieving query 500 (0.046 s/query)


100%|██████████| 966/966 [00:54<00:00, 17.76it/s]


Attempting to initialize pre-built index beir-v1.0.0-climate-fever-flat.
/users/k21190024/.cache/pyserini/indexes/lucene-index.beir-v1.0.0-climate-fever-flat.20220501.1842ee.9af959cf58139d086d52121917913a02 already exists, skipping download.
Initializing beir-v1.0.0-climate-fever-flat...


100%|██████████| 278/278 [00:15<00:00, 17.82it/s]
