In [1]:
import gzip
import json
import os

from pathlib import Path
from pprint import pprint
from copy import deepcopy

from joblib import Parallel, delayed

In [2]:
# Root of the scratch area. It defaults to the original cluster location; set
# the FACT_CHECK_SCRATCH environment variable to run the notebook elsewhere
# without editing any cell.
scratchp = Path(
    os.environ.get(
        "FACT_CHECK_SCRATCH",
        "/users/k21190024/study/fact-check-transfer-learning/scratch",
    )
)

# Directory the raw datasets are read from.
datap = scratchp.joinpath("data2")
# Directory the preprocessed dumps produced by this notebook are written to.
procp = scratchp.joinpath("dumps", "data", "level", "1")

In [3]:
def read_gzip_data(fp):
    """
    Read a gzip-compressed JSON-lines file into a list.

    Parameters
    ----------
    fp : str or os.PathLike
        Path of the gzip file. Every non-blank line must hold one UTF-8
        encoded JSON document.

    Returns
    -------
    list
        The decoded documents, in file order. An empty file gives ``[]``.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Text mode decodes while streaming, so the raw byte lines are never all
    # buffered at once. newline="\n" keeps the split points identical to a
    # binary read (lines end only at "\n").
    with gzip.open(fp, "rt", encoding="utf8", newline="\n") as gzfn:
        # Blank lines (typically a stray trailing newline) carry no document
        # and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in gzfn if line.strip()]

def write_gzip_data(fp, payload):
    """
    Serialise ``payload`` to JSON and store it gzip-compressed at ``fp``.

    The payload is written as one single JSON value (not one document per
    line), encoded as UTF-8. ``fp`` is returned unchanged so the call can be
    chained or shown as a cell result.
    """
    with gzip.open(fp, "wb") as sink:
        encoded = json.dumps(payload).encode("utf-8")
        sink.write(encoded)
    return fp

# FEVER

In [4]:
feverp = datap.joinpath("fever")

fever_proc = procp.joinpath("fever")
fever_corpus_proc = fever_proc.joinpath("corpus")

# parents=True also creates fever_proc (and procp) when missing; exist_ok=True
# makes the cell safe to re-run without a separate exists() check.
fever_corpus_proc.mkdir(parents=True, exist_ok=True)

## Corpus

In [27]:
def fever_corpus_preproc(fin, fout):
    """
    Restructure the ``lines`` field of every page in one corpus shard and
    write the result into ``fout`` under the shard's original file name.

    Each page's ``lines`` string is split on newlines and every line on tabs.
    A line with more than two tab-separated fields becomes
    ``[field0, field1, [remaining fields without duplicates]]``; lines with
    two or fewer fields are dropped.
    NOTE(review): presumably field0/field1 are the sentence index and text and
    the remaining fields are link annotations -- confirm against the FEVER
    wiki-pages format.

    Parameters
    ----------
    fin : pathlib.Path
        Gzip JSON-lines shard to read.
    fout : pathlib.Path
        Directory the processed shard is written to.

    Returns
    -------
    pathlib.Path
        ``fout / fin.name``, the file that was written.
    """
    pages = read_gzip_data(fin)
    for page in pages:
        rows = (raw.split("\t") for raw in page["lines"].strip().split("\n"))
        # dict.fromkeys removes duplicates like set() does, but keeps
        # first-seen order. A set of strings iterates in an order that depends
        # on per-process hash randomisation, which made the dumps differ from
        # run to run and between parallel workers.
        page["lines"] = [
            [cols[0], cols[1], list(dict.fromkeys(cols[2:]))]
            for cols in rows
            if len(cols) > 2
        ]
    return write_gzip_data(fout.joinpath(fin.name), pages)

In [34]:
wiki = feverp.joinpath("wiki-pages")

# Keep regular files only: iterdir() also yields sub-directories (for example
# a Jupyter ".ipynb_checkpoints" folder), which gzip cannot open. Sorting
# fixes the task order, so fcorpus_col has the same order on every run.
wiki_shards = sorted(fp for fp in wiki.iterdir() if fp.is_file())

fcorpus_col = Parallel(n_jobs=20, verbose=1)(
    delayed(fever_corpus_preproc)(fp, fever_corpus_proc) for fp in wiki_shards
)
len(fcorpus_col)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:   13.3s
[Parallel(n_jobs=20)]: Done 109 out of 109 | elapsed:  1.1min finished


109

## Claims

### Labelled

In [25]:
fn = "train.jsonl"
ftrain = read_gzip_data(feverp / f"{fn}.gz")
# Record on every claim which split it was loaded from.
for claim in ftrain:
    claim["paper_partition"] = "train"

In [26]:
fn = "paper_dev.jsonl"
feverstpaperdev = read_gzip_data(feverp / f"{fn}.gz")
# Record on every claim which split it was loaded from.
for claim in feverstpaperdev:
    claim["paper_partition"] = "dev"

In [27]:
fn = "paper_test.jsonl"
feverstpapertest = read_gzip_data(feverp / f"{fn}.gz")
# Record on every claim which split it was loaded from.
for claim in feverstpapertest:
    claim["paper_partition"] = "test"

In [28]:
# Concatenate the three tagged splits (train, dev, test order) and store them
# together in a single dump.
feverdata = [*ftrain, *feverstpaperdev, *feverstpapertest]
write_gzip_data(fever_proc / "fulltrain.json.gz", feverdata)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1/fever/fulltrain.json.gz')

### Unlabelled test set

In [29]:
# Unlabelled test set: load the claims and re-save them, without further
# processing, next to the other processed FEVER dumps.
fn = "shared_task_test.jsonl"
feversttest = read_gzip_data(feverp / f"{fn}.gz")
write_gzip_data(fever_proc / "truetest.json.gz", feversttest)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1/fever/truetest.json.gz')

### Unlabelled paper_dev + paper_test

In [30]:
# Unlabelled paper_dev + paper_test: load the claims and re-save them, without
# further processing, next to the other processed FEVER dumps.
fn = "shared_task_dev_public.jsonl"
feverstdevpub = read_gzip_data(feverp / f"{fn}.gz")
write_gzip_data(fever_proc / "paperdevtest_unlabelled.json.gz", feverstdevpub)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1/fever/paperdevtest_unlabelled.json.gz')

# SciFact

In [18]:
def scifact_preproc(doc, corpus_col):
    """
    Embed the cited corpus documents, and the text of the evidence sentences,
    into a SciFact claim so later processing needs no corpus lookups.

    ``doc`` is modified in place and also returned:

    * every entry of ``doc["cited_doc_ids"]`` is replaced by the corpus
      document found for that id, with its ``_id`` field removed;
    * for every cited id that has an entry in ``doc["evidence"]``, the indices
      stored under each evidence item's ``"sentences"`` are replaced by the
      matching entries of the cited document's ``"abstract"``.

    A claim without an ``"evidence"`` key is returned untouched.

    Parameters
    ----------
    doc : dict
        One claim record with ``"cited_doc_ids"`` and, optionally,
        ``"evidence"`` keyed by the cited id as a string.
    corpus_col : object
        Anything exposing ``find_one(filter)`` that returns a dict or ``None``
        (presumably a MongoDB collection -- confirm with the caller).

    Returns
    -------
    dict
        The same ``doc`` object.

    Raises
    ------
    LookupError
        If a cited id has no document in ``corpus_col``.
    """
    if "evidence" not in doc:
        return doc

    evidence = doc["evidence"]
    for pos, doc_id in enumerate(doc["cited_doc_ids"]):
        cite = corpus_col.find_one({"doc_id": doc_id})
        if cite is None:
            # Fail with a message naming the id instead of an opaque
            # "'NoneType' object ..." TypeError further down.
            raise LookupError(
                f"cited document {doc_id!r} was not found in the corpus collection"
            )

        key = str(doc_id)  # evidence is keyed by the id rendered as a string
        if evidence and key in evidence:
            for item in evidence[key]:
                item["sentences"] = [cite["abstract"][k] for k in item["sentences"]]

        # Drop the storage-level id if present; pop() tolerates documents that
        # never had one (or had it removed by an earlier call).
        cite.pop("_id", None)
        doc["cited_doc_ids"][pos] = cite
    return doc

In [17]:
sfp = datap.joinpath("scifact")
sfoutp = procp.joinpath("scifact")

# parents=True lets this cell run on a fresh output tree where procp does not
# exist yet; exist_ok=True makes it safe to re-run.
sfoutp.mkdir(parents=True, exist_ok=True)

In [22]:
fn = "corpus.jsonl"
sf_corpus = read_gzip_data(sfp / f"{fn}.gz")
# Trim leading/trailing whitespace from every entry of each abstract.
for entry in sf_corpus:
    entry["abstract"] = [text.strip() for text in entry["abstract"]]

# NOTE(review): the output keeps the ".jsonl.gz" name, but write_gzip_data
# stores the whole list as one JSON value -- confirm downstream readers expect that.
write_gzip_data(sfoutp / f"{fn}.gz", sf_corpus)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1/scifact/corpus.jsonl.gz')