In [1]:
try:
    from torch.cuda import device_count
    # not really best way to find out if using gpu
    is_gpu = device_count() > 0
except Exception:
    is_gpu = False

import json
import gzip
import gc
import pickle as pkl
import threading
import queue
from pathlib import Path

import spacy
from spacy import displacy
from matplotlib import pyplot as plt
from joblib import Parallel, delayed
from tinydb import TinyDB, Query
if is_gpu:
    from thinc.api import set_gpu_allocator, require_gpu
    set_gpu_allocator("pytorch")
    require_gpu(0)

from src.gen.util import read_gzip_data, write_gzip_data

datap = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/1")

# Preprocess Level 2

In [2]:
procp = datap.parent.joinpath("2")
if not procp.exists():
    procp.mkdir(parents=True)

# nlp = spacy.load("en_core_web_lg")
nlp = spacy.load("en_core_web_trf")

In [3]:
def extract_key_lemma(nlp, doc_ls, n_jobs=30, batch_size=100, with_ner=True):
    # https://stackoverflow.com/questions/48199353/how-to-use-spacy-in-large-dataset-with-short-sentences-efficiently
    def token_filter(token):
        return not (
            token.is_punct 
            or token.is_space 
            or token.is_stop 
            or token.is_digit 
            or len(token.lemma_) < 2
        )
    if is_gpu:
        n_jobs = 1
        batch_size = 2500
    
    filtered_tokens = []
    ners = []
    for doc in nlp.pipe(doc_ls, n_process=n_jobs, batch_size=batch_size, disable=[] if with_ner else ["ner"]):
        filtered_tokens.append([token.lemma_ for token in doc if token_filter(token)])
        
        if with_ner:
            ner = [(token.text, token.label_) for token in doc.ents]
            if ner:
                ners.append(ner)
    return (filtered_tokens, ners) if ners else filtered_tokens

def compressed_pkl(fp, payload):
    with gzip.GzipFile(fp, "w") as gzfn:
        gzfn.write(pkl.dumps(payload))
    return fp

## SciFact

In [4]:
sfp = procp.joinpath("scifact")
if not sfp.exists():
    sfp.mkdir()

### Corpus

In [9]:
sf_corpus = read_gzip_data(datap.joinpath("scifact", "corpus.json.gz"))

sf_corpus_title_tokens = extract_key_lemma(nlp, [d["title"] for d in sf_corpus.values()], with_ner=False)
sf_corpus_abstract_tokens, sf_corpus_abstract_ner = extract_key_lemma(nlp, [" ".join(d["abstract"]) for d in sf_corpus.values()])

write_gzip_data(sfp.joinpath("corpus_lemma.pkl.gz"), {"title": sf_corpus_title_tokens, "evidence": sf_corpus_abstract_tokens, "ner": sf_corpus_abstract_ner})

'/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/scifact/corpus_lemma.pkl.gz'

### Claims

In [10]:
sf_claims = read_gzip_data(datap.joinpath("scifact", "fullscifact.json.gz"))

sf_claims_tokens, sf_claims_ner = extract_key_lemma(nlp, [d["claim"] for d in sf_claims])

compressed_pkl(sfp.joinpath("claims_lemma.pkl.gz"), {"claims": sf_claims_tokens, "ner": sf_claims_ner})

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/scifact/claims_lemma.pkl.gz')

In [6]:
del sf_corpus_title_tokens, sf_corpus_abstract_tokens, sf_corpus_abstract_ner, sf_claims_tokens, sf_claims_ner

gc.collect()

NameError: name 'sf_claims_tokens' is not defined

## Climate Fever

In [None]:
cfp = procp.joinpath("climatefever")
if not cfp.exists():
    cfp.mkdir()
    
cfall = read_gzip_data(datap.joinpath("climatefever", "climatefever.json.gz"))

### Claims

In [13]:
cf_claims_tokens, cf_claims_ner = extract_key_lemma(nlp, [d["claim"] for d in cfall], 60)
compressed_pkl(cfp.joinpath("claims_lemma.pkl.gz"), {"claims": cf_claims_tokens, "ner": cf_claims_ner})

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/climatefever/claims_lemma.pkl.gz')

### Corpus

In [None]:
cf_title = []
cf_corpus = []
for d in cfall:
    t_tmp = []
    for e in d["evidences"]:
        t_tmp.append(e["article"])
        cf_corpus.append(" ".join(e["evidence"]))
    # evidence title are duplicative within each evidence document
    cf_title.append(" ".join(list(set(t_tmp))))

cf_title_tokens = extract_key_lemma(nlp, cf_title, with_ner=False)
cf_corpus_tokens, cf_corpus_ner = extract_key_lemma(nlp, cf_corpus, 60)
compressed_pkl(cfp.joinpath("corpus_lemma.pkl.gz"), {"title": cf_title_tokens, "evidence": cf_corpus_tokens, "ner": cf_corpus_ner})

In [None]:
del cf_claims_tokens, cf_claims_ner, cf_title_tokens, cf_corpus_tokens, cf_corpus_ner

gc.collect()

## FEVER

In [4]:
feverp = procp.joinpath("fever")
if not feverp.exists():
    feverp.mkdir()

### Claims

In [17]:
fclaimsall = read_gzip_data(datap.joinpath("fever", "fulltrain.json.gz")) + read_gzip_data(datap.joinpath("fever", "truetest.json.gz"))

f_claims_tokens, f_claims_ner = extract_key_lemma(nlp, [d["claim"] for d in fclaimsall], batch_size=1000)
compressed_pkl(feverp.joinpath("claims_lemma.pkl.gz"), {"claims": f_claims_tokens, "ner": f_claims_ner})

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/claims_lemma.pkl.gz')

In [18]:
del f_claims_tokens, f_claims_ner

gc.collect()

18

### Corpus

In [5]:
def comsumer_fever_corpus():
    while True:
        fp, payload = q.get()
        if fp is None:
            break
        print(write_gzip_data(fp, payload))
        q.task_done()
    q.task_done()

In [None]:
corpus_part = feverp.joinpath("part")
if not corpus_part.exists():
    corpus_part.mkdir()
    
t = threading.Thread(target=comsumer_fever_corpus)
q = queue.Queue()

comp_pf = [f.name.split(".")[0] for f in list(corpus_part.iterdir())]
t.start()
for fp in datap.joinpath("fever", "corpus").iterdir():
    if fp.name.split(".")[0] not in comp_pf:
        wiki = read_gzip_data(fp)
        fwiki_tokens, fwiki_ner = extract_key_lemma(nlp, [d["text"] for d in wiki.values()], batch_size=1000, n_jobs=10)
        q.put((corpus_part.joinpath(fp.name.split(".")[0] + ".part.pkl.gz"), {"evidence": fwiki_tokens, "ner": fwiki_ner}))
q.put((None, None))
q.join()
t.join()

Token indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors


/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-013.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-062.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-058.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-068.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-104.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-102.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-056.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-089.part.pkl.gz
/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/data/level/2/fever/part/wiki-053.part.pkl.gz
/