In [5]:
import os

from pathlib import Path

from joblib import Parallel, delayed
from tinydb import TinyDB
from src.gen.util import read_gzip_data, write_gzip_data

In [2]:
# Root of the project's scratch area. Defaults to the original cluster location;
# set FACT_CHECK_SCRATCH to run the notebook against a different checkout/machine.
scratchp = Path(os.environ.get("FACT_CHECK_SCRATCH", "/users/k21190024/study/fact-check-transfer-learning/scratch"))

# Raw datasets are read from `datap`; level-1 processed dumps are written under `procp`.
datap = scratchp.joinpath("data")
procp = scratchp.joinpath("dumps", "data", "level", "1")

# FEVER

In [3]:
# Raw FEVER inputs.
feverp = datap.joinpath("fever")

# Processed FEVER outputs; the per-shard corpus files go into the "corpus1" subdirectory.
fever_proc = procp.joinpath("fever")
fever_corpus_proc = fever_proc.joinpath("corpus1")

# exist_ok=True makes this idempotent without a separate exists() check, which
# could race with another process creating the directory in between.
fever_corpus_proc.mkdir(parents=True, exist_ok=True)

## Corpus

In [4]:
def fever_corpus_preproc(fin, fout, debug=False):
    """
    Parse the raw ``lines`` field of every page in one FEVER wiki-pages shard.

    Each page's ``lines`` string is split on newlines and then on tabs. Rows with
    more than two tab-separated fields become ``[field0, field1, rest]`` where
    ``rest`` holds the remaining fields de-duplicated in first-seen order; rows
    with two or fewer fields are dropped. Every page is also tagged with
    ``paper_partition = fin.stem``.

    Args:
        fin: Path of the shard to read (handed to ``read_gzip_data``).
        fout: Directory the processed shard is written into, under ``fin.name``.
        debug: When true, return the processed pages instead of writing them.

    Returns:
        The processed pages when ``debug`` is true, otherwise whatever
        ``write_gzip_data`` returns.
    """
    wiki = read_gzip_data(fin)
    for page in wiki:
        rows = (row.split("\t") for row in page["lines"].strip().split("\n"))
        # NOTE(review): rows that carry only two fields (no trailing fields) are
        # discarded by the len(...) > 2 filter -- confirm this is intended.
        # dict.fromkeys de-duplicates while keeping first-seen order; set() gave an
        # order that changes between interpreter runs, so the output was not reproducible.
        page["lines"] = [
            [row[0], row[1], list(dict.fromkeys(row[2:]))]
            for row in rows
            if len(row) > 2
        ]
        page["paper_partition"] = fin.stem
    return wiki if debug else write_gzip_data(fout.joinpath(fin.name), wiki)

In [5]:
# Directory containing the raw FEVER wiki-pages shards.
wiki = feverp / "wiki-pages"

# One task per directory entry; each task writes its processed shard into
# fever_corpus_proc and its return value is collected in fcorpus_col.
fcorpus_col = Parallel(n_jobs=30, verbose=10)(
    delayed(fever_corpus_preproc)(shard, fever_corpus_proc)
    for shard in wiki.iterdir()
)
len(fcorpus_col)

[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:   12.1s
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:   16.2s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:   25.1s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:   31.4s
[Parallel(n_jobs=30)]: Done  61 out of 109 | elapsed:   44.7s remaining:   35.1s
[Parallel(n_jobs=30)]: Done  72 out of 109 | elapsed:   49.3s remaining:   25.3s
[Parallel(n_jobs=30)]: Done  83 out of 109 | elapsed:   53.9s remaining:   16.9s
[Parallel(n_jobs=30)]: Done  94 out of 109 | elapsed:   59.7s remaining:    9.5s
[Parallel(n_jobs=30)]: Done 105 out of 109 | elapsed:  1.0min remaining:    2.4s
[Parallel(n_jobs=30)]: Done 109 out of 109 | elapsed:  1.1min finished


109

In [12]:
def make_index(fin):
    """
    Build the index rows (``id`` and ``paper_partition``) for one processed corpus shard.

    Args:
        fin: Path of the processed shard (handed to ``read_gzip_data``).

    Returns:
        A list with one ``{"id": ..., "paper_partition": ...}`` dict per page in the shard.
    """
    wiki = read_gzip_data(fin)
    # The shard may come back wrapped in a one-element outer list (presumably because
    # the whole page list was stored as a single record -- confirm against
    # write_gzip_data). Unwrap only when that single element is itself a list, so an
    # empty shard or a shard holding exactly one page record no longer raises.
    if len(wiki) == 1 and isinstance(wiki[0], list):
        wiki = wiki[0]
    return [{"id": d["id"], "paper_partition": d["paper_partition"]} for d in wiki]
# Sort the shards so index rows are produced (and later inserted) in the same order on every run.
fcorpus_ind = Parallel(n_jobs=30, verbose=10)(delayed(make_index)(f) for f in sorted(fever_corpus_proc.iterdir()))

# TinyDB reopens an existing JSON file and appends to it, so re-running this cell
# used to duplicate every index row. The index is fully derived from the processed
# shards, so start from a fresh file each time.
fwiki_ind_path = fever_proc.joinpath("corpus.index.json")
if fwiki_ind_path.exists():
    fwiki_ind_path.unlink()

fwiki_ind = TinyDB(fwiki_ind_path)
for shard_rows in fcorpus_ind:
    fwiki_ind.insert_multiple(shard_rows)

[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    3.4s
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    4.6s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    6.6s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    7.9s
[Parallel(n_jobs=30)]: Done  61 out of 109 | elapsed:   11.2s remaining:    8.8s
[Parallel(n_jobs=30)]: Done  72 out of 109 | elapsed:   12.5s remaining:    6.4s
[Parallel(n_jobs=30)]: Done  83 out of 109 | elapsed:   13.5s remaining:    4.2s
[Parallel(n_jobs=30)]: Done  94 out of 109 | elapsed:   14.4s remaining:    2.3s
[Parallel(n_jobs=30)]: Done 105 out of 109 | elapsed:   15.0s remaining:    0.6s
[Parallel(n_jobs=30)]: Done 109 out of 109 | elapsed:   15.2s finished


## Claims

### Labelled

In [25]:
fn = "train.jsonl"
ftrain = read_gzip_data(feverp / (fn + ".gz"))
# Record which split each claim came from before the splits are merged.
for claim in ftrain:
    claim["paper_partition"] = "train"

In [26]:
fn = "paper_dev.jsonl"
feverstpaperdev = read_gzip_data(feverp / (fn + ".gz"))
# Record which split each claim came from before the splits are merged.
for claim in feverstpaperdev:
    claim["paper_partition"] = "dev"

In [27]:
fn = "paper_test.jsonl"
feverstpapertest = read_gzip_data(feverp / (fn + ".gz"))
# Record which split each claim came from before the splits are merged.
for claim in feverstpapertest:
    claim["paper_partition"] = "test"

In [28]:
# Concatenate the three labelled splits (train, then dev, then test) into one dump;
# each record already carries its "paper_partition" tag.
feverdata = [*ftrain, *feverstpaperdev, *feverstpapertest]
write_gzip_data(fever_proc / "fulltrain.json.gz", feverdata)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1/fever/fulltrain.json.gz')

### Unlabelled test set

In [29]:
# Unlabelled shared-task test set: re-saved under the processed FEVER directory as-is.

fn = "shared_task_test.jsonl"
feversttest = read_gzip_data(feverp / (fn + ".gz"))
write_gzip_data(fever_proc / "truetest.json.gz", feversttest)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1/fever/truetest.json.gz')

### Unlabelled paper_dev + paper_test

In [30]:
# Unlabelled paper_dev + paper_test: re-saved under the processed FEVER directory as-is.

fn = "shared_task_dev_public.jsonl"
feverstdevpub = read_gzip_data(feverp / (fn + ".gz"))
write_gzip_data(fever_proc / "paperdevtest_unlabelled.json.gz", feverstdevpub)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1/fever/paperdevtest_unlabelled.json.gz')

# SciFact

In [18]:
def scifact_preproc(doc, corpus_col):
    """
    Inline the cited documents and evidence sentences into a SciFact claim.

    Claims without an "evidence" key are returned untouched. Otherwise every id in
    ``doc["cited_doc_ids"]`` is replaced by the corpus document looked up through
    ``corpus_col.find_one({"doc_id": ...})`` (with its "_id" field removed), and the
    sentence indices of each matching evidence entry are replaced by the
    corresponding strings from that document's "abstract".

    ``doc`` is modified in place and also returned.

    NOTE(review): a lookup that finds nothing is not handled here -- presumably
    every cited id exists in the corpus; confirm.
    """
    if "evidence" not in doc:
        return doc

    evidence = doc["evidence"]
    for position, cited_id in enumerate(doc["cited_doc_ids"]):
        cited = corpus_col.find_one({"doc_id": cited_id})
        key = str(cited_id)  # evidence is keyed by the stringified document id
        if evidence and key in evidence:
            for rationale in evidence[key]:
                # Swap sentence indices for the actual abstract sentences.
                rationale["sentences"] = [cited["abstract"][k] for k in rationale["sentences"]]
        del cited["_id"]
        doc["cited_doc_ids"][position] = cited
    return doc

In [25]:
# Raw SciFact inputs and the directory the processed SciFact dumps are written to.
sfp = datap.joinpath("scifact")
sfoutp = procp.joinpath("scifact")

# parents=True matches the FEVER section, so this cell also works when `procp`
# itself does not exist yet; exist_ok=True makes it idempotent without a racy
# exists() check.
sfoutp.mkdir(parents=True, exist_ok=True)

In [26]:
fn = "corpus.jsonl"
sf_corpus = read_gzip_data(sfp / (fn + ".gz"))
# Trim leading/trailing whitespace from every abstract sentence.
for paper in sf_corpus:
    paper["abstract"] = [sentence.strip() for sentence in paper["abstract"]]

write_gzip_data(sfoutp / "scifact.json.gz", sf_corpus)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1/scifact/scifact.json.gz')

# Climate Fever

In [7]:
# Unlike the FEVER and SciFact sections, this output directory was never created
# in the notebook; create it explicitly so a fresh run does not depend on it
# already existing (or on write_gzip_data creating parents -- not visible here).
cfoutp = procp.joinpath("climatefever")
cfoutp.mkdir(parents=True, exist_ok=True)

# The Climate-FEVER data is re-saved unchanged under the processed directory.
cdata = read_gzip_data(datap.joinpath("climatefever.jsonl.gz"))
write_gzip_data(cfoutp.joinpath("climatefever.json.gz"), cdata)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1/climatefever/climatefever.json.gz')