In [1]:
import gzip
import json
import os

from pathlib import Path
from pprint import pprint
from copy import deepcopy

from dotenv import load_dotenv
from pymongo import MongoClient
from joblib import Parallel, delayed

In [2]:
# Load MONGO_USERNAME / MONGO_PASSWORD from the project-level .env file.
load_dotenv(dotenv_path="../.env")

# Root directory of the gzipped dataset files read by the cells below.
datap = Path("../data")

# Credentials are passed as keyword options instead of being interpolated into
# the connection URI: URI credentials must be percent-escaped, so a password
# containing characters such as '@', ':' or '/' would yield a malformed URI.
# NOTE: if either variable is unset, os.getenv returns None and the client
# connects without credentials (previously it tried the literal user "None").
client = MongoClient(
    host="localhost",
    port=27017,
    username=os.getenv("MONGO_USERNAME"),
    password=os.getenv("MONGO_PASSWORD"),
)
db = client["mprjraw"]

In [3]:
def read_gzip_data(fp):
    """Read a gzip-compressed JSON Lines file into a list of parsed records.

    Parameters
    ----------
    fp : str or path-like
        Location of the gzip file. Every non-blank line must hold one
        UTF-8 encoded JSON document.

    Returns
    -------
    list
        The parsed JSON value of each non-blank line, in file order.
        An empty file yields an empty list.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    UnicodeDecodeError
        If a line is not valid UTF-8.
    """
    data = []
    with gzip.open(fp, "rb") as gzfn:
        # Stream line by line rather than readlines(), so the raw bytes of the
        # whole file are never held in memory next to the parsed records.
        for raw in gzfn:
            raw = raw.strip()
            # A blank line (typically a trailing newline at end of file)
            # carries no record and would make json.loads raise.
            if raw:
                data.append(json.loads(raw.decode("utf8")))
    return data

# FEVER

In [4]:
# Directory under the data root that holds every FEVER input file.
feverp = datap / "fever"

In [5]:
def fever_corpus_preproc(fp, fever_corpus_col):
    """Parse one gzipped FEVER wiki-pages shard and insert its pages into MongoDB.

    Each record's ``"lines"`` string is split on newlines and then on tabs.
    Every kept row is stored as ``[first_field, second_field, remaining_fields]``
    with duplicates removed from ``remaining_fields``. Each record also gets a
    ``"source"`` key set to ``fp.stem``.

    Parameters
    ----------
    fp : pathlib.Path
        Path of the gzipped JSONL shard (``.stem`` is read from it).
    fever_corpus_col : collection object exposing ``insert_many``
        Destination collection for the parsed pages.

    Returns
    -------
    The same ``fever_corpus_col`` object that was passed in.
    """
    wiki = read_gzip_data(fp)
    for page in wiki:
        parsed = []
        for raw_line in page["lines"].strip().split("\n"):
            fields = raw_line.split("\t")
            # NOTE(review): rows with only two tab-separated fields are
            # dropped here, exactly as before. If those are sentences without
            # any trailing fields, they are lost from the corpus. Confirm this
            # is intended.
            if len(fields) > 2:
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so the stored list is reproducible across runs (a set's
                # iteration order for strings is not).
                parsed.append([fields[0], fields[1], list(dict.fromkeys(fields[2:]))])
        page["lines"] = parsed
        page["source"] = fp.stem
    # insert_many rejects an empty document list, so an empty shard is skipped.
    if wiki:
        fever_corpus_col.insert_many(wiki)
    return fever_corpus_col

In [6]:
fevercorpus = db["fevercorpus"]
wiki = feverp.joinpath("wiki-pages")

# Start from an empty collection so re-running the cell does not duplicate pages.
fevercorpus.drop()

# Only regular files are handed to the gzip reader; a sub-directory inside
# wiki-pages would otherwise abort the whole load. Sorting makes the
# processing order stable from run to run.
shard_paths = sorted(fp for fp in wiki.iterdir() if fp.is_file())

# One task per shard, three worker threads; each task inserts its own pages.
fcorpus_col = Parallel(n_jobs=3, backend="threading", verbose=1)(
    delayed(fever_corpus_preproc)(fp, fevercorpus) for fp in shard_paths
)

# Displayed result: number of processed shards and the name of the unique index.
len(fcorpus_col), fevercorpus.create_index("id", unique=True)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  2.7min
[Parallel(n_jobs=3)]: Done 109 out of 109 | elapsed:  6.6min finished


(109, 'id_1')

In [7]:
fevertrain = db["feverlabel"]
fn = "train.jsonl"
ftrain = read_gzip_data(feverp.joinpath(fn + ".gz"))

# Tag every claim with its originating file before inserting. This stores the
# same documents as inserting first and then running update_many over the
# whole collection, but needs a single write pass.
for claim in ftrain:
    claim["source"] = fn

fevertrain.drop()
ftrain_col = fevertrain.insert_many(ftrain)

# Release the parsed claims before the summary so that the tuple below is the
# cell's last expression and is actually displayed.
del ftrain

len(ftrain_col.inserted_ids), fevertrain.create_index("id", unique=True)

In [8]:
fn = "paper_dev.jsonl"
feverstpaperdev = read_gzip_data(feverp.joinpath(fn + ".gz"))

# Tag the new claims before inserting them. setdefault keeps any "source" a
# record already carries, mirroring the previous {"$exists": False} filter,
# but only touches the records of this file. The old filtered update_many
# would also have stamped this file name onto any untagged document already
# in the collection, and had to scan the whole collection to do so.
for claim in feverstpaperdev:
    claim.setdefault("source", fn)

ftrain_stdev = fevertrain.insert_many(feverstpaperdev)

# Release the parsed claims first so the count is the cell's displayed result.
del feverstpaperdev

len(ftrain_stdev.inserted_ids)

In [9]:
fn = "paper_test.jsonl"
feverstpapertest = read_gzip_data(feverp.joinpath(fn + ".gz"))

# Tag the new claims before inserting them. setdefault keeps any "source" a
# record already carries, mirroring the previous {"$exists": False} filter,
# but only touches the records of this file. The old filtered update_many
# would also have stamped this file name onto any untagged document already
# in the collection, and had to scan the whole collection to do so.
for claim in feverstpapertest:
    claim.setdefault("source", fn)

ftrain_sttest = fevertrain.insert_many(feverstpapertest)

# Release the parsed claims first so the count is the cell's displayed result.
del feverstpapertest

len(ftrain_sttest.inserted_ids)

In [10]:
fn = "shared_task_test.jsonl"
fevertest = db["feverunlab"]
feversttest = read_gzip_data(feverp.joinpath(fn + ".gz"))

# Tag the claims before inserting them. setdefault keeps any "source" a record
# already carries, mirroring the previous {"$exists": False} filter, and
# avoids a second write pass over the freshly filled collection.
for claim in feversttest:
    claim.setdefault("source", fn)

fevertest.drop()
ftest_sttest = fevertest.insert_many(feversttest)

# Release the parsed claims first so the count is the cell's displayed result.
del feversttest

len(ftest_sttest.inserted_ids)

In [11]:
fn = "shared_task_dev_public.jsonl"
feverstdevpub = read_gzip_data(feverp.joinpath(fn + ".gz"))

# Tag the new claims before inserting them. setdefault keeps any "source" a
# record already carries, mirroring the previous {"$exists": False} filter,
# but only touches the records of this file. The old filtered update_many
# would also have stamped this file name onto any untagged document already
# in the collection, and had to scan the whole collection to do so.
for claim in feverstdevpub:
    claim.setdefault("source", fn)

ftest_stdevpub = fevertest.insert_many(feverstdevpub)

# Release the parsed claims first so the count is the cell's displayed result.
del feverstdevpub

len(ftest_stdevpub.inserted_ids)

# Climate-FEVER

In [17]:
climatefever = db["climatefeverdata"]
fn = "climatefever.jsonl"
cfever = read_gzip_data(datap / f"{fn}.gz")

# Rebuild the collection from scratch, then stamp every document with the
# name of the file it was loaded from.
climatefever.drop()
cfever_col = climatefever.insert_many(cfever)
climatefever.update_many({}, {"$set": {"source": fn}})

# Displayed result: inserted-document count plus the names of both indexes.
(
    len(cfever_col.inserted_ids),
    climatefever.create_index("claim_id", unique=True),
    climatefever.create_index("claim_label"),
)

(1535, 'claim_id_1', 'claim_label_1')

# SciFact

In [18]:
def scifact_preproc(doc, corpus_col):
    """
    Preprocess SciFact claims to include support into each claim document for ease of processing later

    For every entry of ``doc["cited_doc_ids"]`` the matching corpus document is
    fetched with ``corpus_col.find_one({"doc_id": ...})`` and written back in
    place of the id, without its ``"_id"`` field. Where ``doc["evidence"]`` has
    an entry keyed by the stringified id, each annotation's ``"sentences"``
    list of indices is replaced by the corresponding items of the cited
    document's ``"abstract"``.

    Parameters
    ----------
    doc : dict
        Claim document. It is modified in place. A document without an
        ``"evidence"`` key is returned untouched.
    corpus_col : object exposing ``find_one(query)``
        Collection that holds the cited documents.

    Returns
    -------
    dict
        The same ``doc`` object.

    Raises
    ------
    LookupError
        If a cited document id is not present in ``corpus_col``.
    """
    if "evidence" not in doc:
        return doc
    evidence = doc["evidence"]
    for pos, cited_id in enumerate(doc["cited_doc_ids"]):
        cite = corpus_col.find_one({"doc_id": cited_id})
        if cite is None:
            # Fail with an explicit message instead of the opaque
            # "'NoneType' object is not subscriptable" raised further down.
            raise LookupError(
                f"cited document {cited_id!r} not found in the corpus collection "
                f"(claim id: {doc.get('id')!r})"
            )
        key = str(cited_id)  # evidence is keyed by the stringified document id
        if evidence and key in evidence:
            for annotation in evidence[key]:
                # Swap sentence indices for the sentence texts themselves.
                annotation["sentences"] = [cite["abstract"][k] for k in annotation["sentences"]]
        # Drop the database id if present; tolerate documents that have none.
        cite.pop("_id", None)
        doc["cited_doc_ids"][pos] = cite
    return doc

In [19]:
# Directory under the data root that holds the SciFact corpus and claim files.
sfp = datap / "scifact"

In [20]:
fn = "corpus.jsonl"
sf_corpus = read_gzip_data(sfp / f"{fn}.gz")

# Trim surrounding whitespace from every abstract entry before storing it.
for paper in sf_corpus:
    paper["abstract"] = [sentence.strip() for sentence in paper["abstract"]]

# Rebuild the collection from scratch, then stamp every document with the
# name of the file it was loaded from.
scifact_corpus = db["scifactcorpus"]
scifact_corpus.drop()
sf_corpus_col = scifact_corpus.insert_many(sf_corpus)
scifact_corpus.update_many({}, {"$set": {"source": fn}})

# Displayed result: inserted-document count and the unique index name.
(
    len(sf_corpus_col.inserted_ids),
    scifact_corpus.create_index("doc_id", unique=True),
)

(5183, 'doc_id_1')

In [21]:
scifact_data = db["scifactdata"]
scifact_data.drop()

cnt = 0  # running total of inserted claim documents
for fn in sfp.iterdir():
    # Every regular file except the corpus file is loaded as a claims file.
    if fn.is_file() and "corpus" not in fn.stem:
        sf_data = read_gzip_data(fn)
        # Tag the records before inserting them. setdefault keeps any "source"
        # a record already carries, mirroring the previous {"$exists": False}
        # filter, without re-scanning the growing collection after each file.
        for claim in sf_data:
            claim.setdefault("source", fn.stem)
        sf_data_col = scifact_data.insert_many(sf_data)
        cnt = cnt + len(sf_data_col.inserted_ids)

# Displayed result: total inserted documents and the unique index name.
cnt, scifact_data.create_index("id", unique=True)

(1409, 'id_1')