In [1]:
import json
import gzip
import pickle as pkl
import threading
import queue
from pathlib import Path

import spacy
from spacy import displacy
from matplotlib import pyplot as plt
from joblib import Parallel, delayed
from tinydb import TinyDB, Query

from src.gen.util import read_gzip_data, write_gzip_data

2023-02-15 11:17:23.944159: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Root directory of the level-1 data dumps; the cells below read their inputs from here.
# NOTE(review): hard-coded absolute path — adjust before running on another machine.
datap = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1")

# Preprocess Level 2

In [3]:
# Level-2 output directory: a sibling named "2" next to the level-1 input directory.
procp = datap.parent.joinpath("2")
# exist_ok=True replaces the exists()/mkdir() pair, so re-running the cell is a no-op
# and there is no window between the check and the creation.
procp.mkdir(parents=True, exist_ok=True)

# spaCy pipeline handed to extract_key_lemma as `nlp` by the cells below.
nlp = spacy.load("en_core_web_lg")

In [4]:
def extract_key_lemma(nlp, doc_ls, n_jobs=30, batch_size=100):
    """Lemmatise every text in ``doc_ls`` and keep only the informative tokens.

    Args:
        nlp: Object exposing ``pipe(texts, n_process=..., batch_size=...)``
            that yields iterables of tokens (a loaded spaCy pipeline here).
        doc_ls: Iterable of texts handed to ``nlp.pipe``.
        n_jobs: Forwarded to ``nlp.pipe`` as ``n_process``.
        batch_size: Forwarded to ``nlp.pipe`` as ``batch_size``.

    Returns:
        One list per processed document holding ``token.lemma_`` for each
        token that is not flagged as punctuation, whitespace, stop word or
        digit.
    """
    # https://stackoverflow.com/questions/48199353/how-to-use-spacy-in-large-dataset-with-short-sentences-efficiently
    def is_content_token(tok):
        # A token is dropped as soon as any one of the four flags is set.
        noise_flags = (tok.is_punct, tok.is_space, tok.is_stop, tok.is_digit)
        return not any(noise_flags)

    parsed_docs = nlp.pipe(doc_ls, n_process=n_jobs, batch_size=batch_size)
    return [
        [tok.lemma_ for tok in parsed if is_content_token(tok)]
        for parsed in parsed_docs
    ]

def compressed_pkl(fp, payload):
    """Pickle ``payload`` and write the bytes gzip-compressed to ``fp``.

    Args:
        fp: Destination file name or path; an existing file is overwritten.
        payload: Any picklable object.

    Returns:
        ``fp``, exactly as it was passed in.
    """
    with gzip.GzipFile(filename=fp, mode="w") as sink:
        serialized = pkl.dumps(payload)
        sink.write(serialized)
    return fp

## SciFact

In [6]:
# Output directory for the SciFact level-2 artefacts.
sfp = procp.joinpath("scifact")
# exist_ok=True makes the cell safely re-runnable without a separate exists() check.
sfp.mkdir(exist_ok=True)

## Corpus

In [43]:
# Load the level-1 SciFact corpus; its .values() are iterated as documents below.
sf_corpus = read_gzip_data(datap.joinpath("scifact", "corpus.json.gz"))

# Raw title text and the abstract (joined into a single string) for every document.
sf_titles_raw = [doc["title"] for doc in sf_corpus.values()]
sf_abstracts_raw = [" ".join(doc["abstract"]) for doc in sf_corpus.values()]

sf_corpus_title_tokens = extract_key_lemma(nlp, sf_titles_raw)
sf_corpus_abstract_tokens = extract_key_lemma(nlp, sf_abstracts_raw)

# Title and abstract lemmas are stored side by side under "title" / "evidence".
compressed_pkl(
    sfp.joinpath("corpus_lemma.pkl.gz"),
    {"title": sf_corpus_title_tokens, "evidence": sf_corpus_abstract_tokens},
)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/scifact/corpus_lemma.pkl.gz')

## Claims

In [25]:
# Load the SciFact claim records and lemmatise the "claim" field of each one.
sf_claims = read_gzip_data(datap.joinpath("scifact", "fullscifact.json.gz"))

sf_claim_texts = [record["claim"] for record in sf_claims]
sf_claims_tokens = extract_key_lemma(nlp, sf_claim_texts, n_jobs=60)

compressed_pkl(sfp.joinpath("claims_lemma.pkl.gz"), sf_claims_tokens)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/scifact/claims_lemma.pkl.gz')

## Climate Fever

In [49]:
# Output directory for the Climate-FEVER level-2 artefacts.
cfp = procp.joinpath("climatefever")
# exist_ok=True makes the cell safely re-runnable without a separate exists() check.
cfp.mkdir(exist_ok=True)

# All Climate-FEVER records; the next cell reads "claim" and "evidences" from each.
cfall = read_gzip_data(datap.joinpath("climatefever", "climatefever.json.gz"))

In [50]:
# Lemmatise the claim text of every record and persist it.
cf_claims_tokens = extract_key_lemma(nlp, [d["claim"] for d in cfall], 60)
compressed_pkl(cfp.joinpath("claims_lemma.pkl.gz"), cf_claims_tokens)

cf_title = []
cf_corpus = []
for d in cfall:
    article_titles = []
    for e in d["evidences"]:
        article_titles.append(e["article"])
        cf_corpus.append(e["evidence"])
    # Evidence titles repeat within a record, so de-duplicate them. dict.fromkeys keeps
    # first-seen order; the previous set() gave a different title order on every run
    # because string hashing is randomised per process.
    cf_title.append(" ".join(dict.fromkeys(article_titles)))

# NOTE(review): cf_title has one entry per record while cf_corpus has one entry per
# evidence item, so the "title" and "evidence" lists below differ in length — confirm
# that downstream code expects this.
cf_title_tokens = extract_key_lemma(nlp, cf_title)
cf_corpus_tokens = extract_key_lemma(nlp, cf_corpus, 60)
compressed_pkl(cfp.joinpath("corpus_lemma.pkl.gz"), {"title": cf_title_tokens, "evidence": cf_corpus_tokens})

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/climatefever/corpus_lemma.pkl.gz')

## FEVER

In [5]:
# Output directory for the FEVER level-2 artefacts.
feverp = procp.joinpath("fever")
# exist_ok=True makes the cell safely re-runnable without a separate exists() check.
feverp.mkdir(exist_ok=True)

## Corpus

In [6]:
def comsumer_fever_corpus():
    """Drain the module-level queue ``q`` and write each payload to disk.

    Every queue item is a ``(fp, payload)`` tuple that is passed to
    ``compressed_pkl``; the returned path is printed. An item whose ``fp`` is
    ``None`` is the stop sentinel and ends the loop.

    ``q.task_done()`` is called exactly once per item, including the sentinel
    and items whose write failed, so a ``q.join()`` in the producer cannot
    block forever because of a failed write. Failures are reported and the
    loop carries on with the next item.
    """
    while True:
        fp, payload = q.get()
        try:
            if fp is None:
                # Sentinel: stop consuming (the finally clause still marks it done).
                break
            print(compressed_pkl(fp, payload))
        except Exception as exc:
            # Report and keep consuming; dying here would leave queued items
            # unacknowledged and dead-lock the producer on q.join().
            print(f"failed to write {fp}: {exc!r}")
        finally:
            q.task_done()

In [7]:
# Directory holding one lemma file per FEVER corpus part.
corpus_part = feverp.joinpath("part")
corpus_part.mkdir(exist_ok=True)

# The consumer reads the module-level name `q`, so create the queue before the thread.
q = queue.Queue()
t = threading.Thread(target=comsumer_fever_corpus)

# Stems of the parts that already exist on disk; they are skipped so an interrupted
# run can be resumed. A set gives constant-time membership checks.
comp_pf = {f.name.split(".")[0] for f in corpus_part.iterdir()}
t.start()
try:
    for fp in datap.joinpath("fever", "corpus").iterdir():
        part_stem = fp.name.split(".")[0]
        if part_stem in comp_pf:
            continue
        wiki = read_gzip_data(fp)
        fwiki_tokens = extract_key_lemma(nlp, [d["text"] for d in wiki.values()], batch_size=300)
        # Hand the result to the writer thread so the next part can start immediately.
        q.put((corpus_part.joinpath(part_stem + ".part.pkl.gz"), fwiki_tokens))
finally:
    # Always send the stop sentinel, even if a part failed above; otherwise the
    # consumer thread would stay blocked on q.get() forever.
    q.put((None, None))
    q.join()
    t.join()

/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-037.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-072.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-071.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-014.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-006.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-052.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-058.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-048.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-010.part.pkl.gz
/

In [9]:
# Concatenate the per-part lemma lists into one corpus-wide list.
fcorpus_full = []
# iterdir() yields entries in arbitrary, filesystem-dependent order; sorting makes the
# order of the merged corpus reproducible across runs and machines.
for pf in sorted(corpus_part.iterdir()):
    # These pickles are produced by this notebook; never point this at untrusted files.
    with gzip.open(pf.as_posix()) as fn:
        fcorpus_full.extend(pkl.load(fn))
compressed_pkl(feverp.joinpath("corpus_lemma.pkl.gz"), fcorpus_full)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/corpus_lemma.pkl.gz')

## Claims

In [59]:
# FEVER claims: the train and test records are concatenated into one list.
fclaimsall = read_gzip_data(datap.joinpath("fever", "fulltrain.json.gz")) + read_gzip_data(datap.joinpath("fever", "truetest.json.gz"))

f_claims_tokens = extract_key_lemma(nlp, [d["claim"] for d in fclaimsall], batch_size=1000)
# Save the FEVER tokens computed above. This previously wrote cf_claims_tokens, i.e. the
# Climate-FEVER claims, into the FEVER output file.
compressed_pkl(feverp.joinpath("claims_lemma.pkl.gz"), f_claims_tokens)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/claims_lemma.pkl.gz')