In [1]:
import gzip
import json
import os

from pathlib import Path
from pprint import pprint
from copy import deepcopy

from dotenv import load_dotenv
from pymongo import MongoClient
from joblib import Parallel, delayed

In [2]:
# Load MONGO_USERNAME / MONGO_PASSWORD from the project-level .env file.
load_dotenv(dotenv_path="../.env")

# Directory holding the gzip-compressed dataset dumps read by the cells below.
datap = Path("../data")

# Credentials are handed to MongoClient as keyword arguments instead of being
# interpolated into the URI: inside a URI they would have to be percent-escaped,
# so a password containing '@', ':', '/' or '%' would break parsing.
# NOTE(review): values passed this way must NOT be percent-escaped; if the .env
# file stores an already-escaped password, unescape it there.
# If either variable is unset, os.getenv returns None and no credentials are sent.
client = MongoClient(
    "mongodb://localhost:27017/",
    username=os.getenv("MONGO_USERNAME"),
    password=os.getenv("MONGO_PASSWORD"),
)
db = client["mprjraw"]

In [3]:
def read_gzip_data(fp):
    """Load a gzip-compressed JSON Lines file into a list of parsed records.

    The file is decompressed and parsed one line at a time, so the raw lines
    are never all held in memory at once. Lines that are empty or contain only
    whitespace (for example a trailing blank line) are skipped instead of
    raising ``json.JSONDecodeError``.

    Args:
        fp: Path (``str`` or path-like) of the ``.jsonl.gz`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    with gzip.open(fp, "rb") as gzfn:
        # Stay in binary mode so records are split on b"\n" only, exactly as
        # the on-disk JSONL framing dictates; decode each record explicitly.
        return [json.loads(raw.decode("utf8")) for raw in gzfn if raw.strip()]

# FEVER

# Climate-FEVER

In [4]:
# Read the Climate-FEVER dump and rebuild its collection from scratch.
cfever = read_gzip_data(datap / "climatefever.jsonl.gz")
climatefever = db["climatefever_data"]
climatefever.drop()  # start from an empty collection so re-running the cell does not duplicate rows
cfever_col = climatefever.insert_many(cfever)

# Cell output: number of inserted documents plus the names of the two indexes.
(
    len(cfever_col.inserted_ids),
    climatefever.create_index("claim_id", unique=True),
    climatefever.create_index("claim_label"),
)

(1535, 'claim_id_1', 'claim_label_1')

# SciFact

In [5]:
def scifact_preproc(doc, corpus_col):
    """Expand a SciFact claim so that it embeds the documents it cites.

    For every id in ``doc["cited_doc_ids"]`` the matching corpus document is
    looked up by ``doc_id`` and substituted for the id (with its ``_id`` field
    removed). Where ``doc["evidence"]`` has an entry for that id, each
    rationale's list of sentence indices is replaced by the corresponding
    sentences from the cited document's ``abstract``.

    The input ``doc`` is never modified: all work happens on a deep copy, so
    the function can safely be applied to the same claim more than once
    (e.g. when the calling cell is re-run).

    Args:
        doc: Claim dictionary. Claims without an ``"evidence"`` key are
            returned as an unchanged copy.
        corpus_col: Object exposing ``find_one(filter)`` that returns the
            corpus document as a dict, or ``None`` when there is no match.

    Returns:
        A new dictionary with citations and evidence sentences expanded.

    Raises:
        LookupError: If a cited document id is not present in ``corpus_col``.
    """
    result = deepcopy(doc)
    if "evidence" not in result:
        return result

    evidence = result["evidence"]
    for position, cited_id in enumerate(result["cited_doc_ids"]):
        cite = corpus_col.find_one({"doc_id": cited_id})
        if cite is None:
            # Fail with a message naming the missing document rather than an
            # opaque TypeError from subscripting None further down.
            raise LookupError(
                f"cited document {cited_id!r} not found in corpus "
                f"(claim {result.get('id')!r})"
            )

        # Evidence is keyed by the stringified document id.
        evidence_key = str(cited_id)
        if evidence and evidence_key in evidence:
            for rationale in evidence[evidence_key]:
                rationale["sentences"] = [
                    cite["abstract"][sentence_idx]
                    for sentence_idx in rationale["sentences"]
                ]

        # Drop the corpus document's own _id before embedding it in the claim.
        cite.pop("_id", None)
        result["cited_doc_ids"][position] = cite
    return result

In [6]:
# SciFact ships as one corpus file plus three claim splits.
sfp = datap / "scifact"
sf_corpus = read_gzip_data(sfp / "corpus.jsonl.gz")

# All claims in a single list, concatenated in dev, train, test order.
sf_data = [
    claim
    for split_file in (
        "claims_dev.jsonl.gz",
        "claims_train.jsonl.gz",
        "claims_test.jsonl.gz",
    )
    for claim in read_gzip_data(sfp / split_file)
]

In [7]:
# Strip surrounding whitespace from every abstract sentence before storing.
for paper in sf_corpus:
    paper["abstract"] = [sentence.strip() for sentence in paper["abstract"]]

# Rebuild the corpus collection from scratch.
scifact_corpus = db["scifact_corpus"]
scifact_corpus.drop()
sf_corpus_col = scifact_corpus.insert_many(sf_corpus)

# Cell output: number of inserted documents and the name of the unique doc_id index.
(
    len(sf_corpus_col.inserted_ids),
    scifact_corpus.create_index("doc_id", unique=True),
)

(5183, 'doc_id_1')

In [8]:
# Rebuild the claims collection from scratch.
scifact_data = db["scifact_data"]
scifact_data.drop()

# Expand each claim against the stored corpus on two worker threads.
sf_data_proc = Parallel(n_jobs=2, backend="threading", verbose=3)(
    delayed(scifact_preproc)(doc, scifact_corpus) for doc in sf_data
)
sf_data_col = scifact_data.insert_many(sf_data_proc)

# Cell output: number of inserted claims and the name of the unique id index.
(
    len(sf_data_col.inserted_ids),
    scifact_data.create_index("id", unique=True),
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:    1.0s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    1.4s
[Parallel(n_jobs=2)]: Done 1148 tasks      | elapsed:    1.9s
[Parallel(n_jobs=2)]: Done 1409 out of 1409 | elapsed:    2.0s finished


(1409, 'id_1')