In [1]:
# Reload edited project modules (e.g. the local `gen` / `feverise` packages)
# automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# Init

In [2]:
import sys
sys.path.insert(0, "../src")

import json
import random
import numpy as np
import scipy
from pathlib import Path
from collections import Counter, defaultdict

import constants
from gen.util import read_data, write_jsonl
from feverise import scifact, climatefever

In [3]:
# Every input and output lives under one scratch directory; change it here only.
# NOTE(review): this is still a machine-specific absolute path — consider
# reading it from an environment variable or a config file.
scratchp = Path("/users/k21190024/study/fact-check-transfer-learning/scratch")

# Raw datasets (read from).
feverp = scratchp / "data" / "fever"
climatefp = scratchp / "data" / "climatefever"
scifactp = scratchp / "data" / "scifact"

# Feverised dumps (written to).
climatefdp = scratchp / "dumps" / "feverised-climatefever"
scifactdp = scratchp / "dumps" / "feverised-scifact"

# parents=True: a missing "dumps" directory is created instead of raising
# FileNotFoundError; exist_ok=True keeps the cell safe to re-run.
climatefdp.mkdir(parents=True, exist_ok=True), scifactdp.mkdir(parents=True, exist_ok=True)

(None, None)

# Start from a clean slate so a failed load never leaves stale samples around.
fever_dev_sample = None
wiki_sample = None

# NOTE(review): the name says "dev" but the file loaded is train.jsonl — confirm which is intended.
fever_dev_sample = read_data(feverp / "train.jsonl")

# Walk the wiki dump files and keep the first one whose random draw exceeds 0.7.
# NOTE(review): `random` is not seeded in this cell, and `wiki_sample` stays None
# if no draw passes the threshold.
for wiki_path in feverp.joinpath("wiki-pages").iterdir():
    if random.uniform(0, 1) <= 0.7:
        continue
    wiki_sample = read_data(wiki_path)
    break

# SciFact

 - Every claim has only 1 type of label

In [4]:
# Raw SciFact inputs: the three claim splits followed by the corpus file.
sf_dev, sf_test, sf_train, sf_corpus = map(
    read_data,
    (
        scifactp / "claims_dev.jsonl",
        scifactp / "claims_test.jsonl",
        scifactp / "claims_train.jsonl",
        scifactp / "corpus.jsonl",
    ),
)

In [50]:
# Run the dev and train claim splits through the project's SciFact converter
# (the test split is not converted in this cell), then convert the corpus.
sf_dev_f, sf_train_f = (
    scifact.feverise_claims(claim_split) for claim_split in (sf_dev, sf_train)
)
sf_corpus_f = scifact.feverise_corpus(sf_corpus)

In [6]:
# The corpus is also written under "wiki-pages", so that folder must exist first.
scifactdp.joinpath("wiki-pages").mkdir(exist_ok=True)

# (file name, records) pairs written directly under the SciFact dump directory.
# `sf_test` is written as-is: it is not passed through feverise_claims in this notebook.
scifact_exports = [
    ("scifact_dev.jsonl", sf_dev_f),
    ("scifact_train.jsonl", sf_train_f),
    ("scifact_all.jsonl", sf_train_f + sf_dev_f),
    ("scifact_test.jsonl", sf_test),
    ("scifact_corpus.jsonl", sf_corpus_f),  # copy
]
for export_name, export_rows in scifact_exports:
    write_jsonl(scifactdp / export_name, export_rows)

# default no title
# Kept as the cell's last expression so the written path is displayed.
write_jsonl(scifactdp / "wiki-pages" / "wiki-001.jsonl", sf_corpus_f)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/wiki-pages/wiki-001.jsonl')

In [10]:
# Build a label-free "test" view of every SciFact claim in scifact_all.jsonl,
# keeping only the id and the claim text.
#
# The result gets its own name on purpose: rebinding `sf_test` here would
# replace the raw test claims loaded from claims_test.jsonl, so re-running the
# export cell afterwards would write this list to scifact_test.jsonl instead.
sf_all = read_data(scifactdp / "scifact_all.jsonl")
sf_all_test = [{"id": doc["id"], "claim": doc["claim"]} for doc in sf_all]

write_jsonl(scifactdp / "scifact_all_test.jsonl", sf_all_test)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/scifact_all_test.jsonl')

# Climate-FEVER

- Has multiple labels for each sentence for each claim

In [18]:
# Load the raw Climate-FEVER file and convert it with the project helper.
cf_data = read_data(climatefp.joinpath("climatefever.jsonl"))
cf_feverised = climatefever.feverise_climatefever(cf_data)

# The helper returns three collections; judging by how they are written out
# later, these are two claim sets ("paper" and "assumed") and a corpus —
# NOTE(review): confirm the exact meaning against feverise_climatefever.
cf_paper_f, cf_assumed_f, cf_corpus_f = cf_feverised

In [8]:
# The corpus is also written under "wiki-pages", so that folder must exist first.
climatefdp.joinpath("wiki-pages").mkdir(exist_ok=True)

# (file name, records) pairs written directly under the Climate-FEVER dump directory.
climatefever_exports = [
    ("climatefever_paper_all.jsonl", cf_paper_f),
    ("climatefever_assumed_all.jsonl", cf_assumed_f),
    ("climatefever_corpus.jsonl", cf_corpus_f),  # copy, for consistency
]
for export_name, export_rows in climatefever_exports:
    write_jsonl(climatefdp / export_name, export_rows)

# Kept as the cell's last expression so the written path is displayed.
write_jsonl(climatefdp / "wiki-pages" / "wiki-001.jsonl", cf_corpus_f)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever/wiki-pages/wiki-001.jsonl')

# Count Claim-Evidence

In [4]:
# Claim sets compared in this section.
# FEVER: the train file and the shared-task dev file, concatenated.
fever_claims = read_data(feverp / "train.jsonl") + read_data(feverp / "shared_task_dev.jsonl")
# SciFact and Climate-FEVER: read back from the feverised dump directories
# (scifactdp / climatefdp), not from the raw dataset folders.
scifact_claims = read_data(scifactdp / "scifact_all.jsonl")
cfever_claims = read_data(climatefdp / "climatefever_assumed_all.jsonl")

In [54]:
# Peek at one feverised Climate-FEVER record to see its fields; note that it
# carries both `evidence` and `other_evidence` groups.
cfever_claims[122]

{'id': 272,
 'claim': 'The melting ice has led to global sea level rise of around eight inches since reliable record keeping began in 1880.',
 'label': 'SUPPORTS',
 'elab': ['SUPPORTS', 'SUPPORTS'],
 'is_disputed': False,
 'evidence': [[[None, None, 'Greenland ice sheet', 35],
   [None, None, 'Sea level rise', 4]]],
 'other_elab': ['NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO'],
 'other_evidence': [[[None, None, 'Sea level rise', 26],
   [None, None, 'Sea level rise', 3],
   [None, None, 'Sea level rise', 691]]],
 'verifiable': 'VERIFIABLE'}

In [36]:
def count_evidences(claims):
    """Count evidence entries per claim, grouped by the claim's label.

    Each claim is a mapping with a "label", an "evidence" list of evidence
    groups and, optionally, an "other_evidence" list of the same shape. A
    claim's count is the total number of entries across all of its groups in
    both lists.

    Args:
        claims: iterable of claim mappings as described above.

    Returns:
        A ``defaultdict(list)`` mapping each label to a list holding one
        count per claim, in input order. Being a defaultdict, looking up a
        label that never occurred yields (and stores) an empty list.

    Raises:
        KeyError: if a claim lacks "label" or "evidence".
    """
    # `defaultdict` comes from the notebook's top-level imports; no need to
    # re-import it on every call.
    n_evidences = defaultdict(list)
    for claim in claims:
        total = sum(len(group) for group in claim["evidence"])
        # "other_evidence" is optional; when present it is counted as well.
        total += sum(len(group) for group in claim.get("other_evidence", ()))
        n_evidences[claim["label"]].append(total)

    return n_evidences

In [37]:
# Per-label evidence counts for each dataset, computed in the order
# FEVER, SciFact, Climate-FEVER.
fever_cnt, scifact_cnt, cfever_cnt = map(
    count_evidences,
    (fever_claims, scifact_claims, cfever_claims),
)

In [38]:
# Pool the per-claim evidence counts of SUPPORTS and REFUTES claims
# (NOT ENOUGH INFO claims are left out for FEVER).
fever_grand = np.concatenate(
    [np.array(fever_cnt[label]) for label in ("SUPPORTS", "REFUTES")]
)

# Displayed as (min, mode, mean, max).
# NOTE(review): the recorded output echoes this line, which looks like a
# warning was raised here (possibly from scipy.stats.mode) — confirm.
(
    np.min(fever_grand),
    scipy.stats.mode(fever_grand),
    np.mean(fever_grand),
    np.max(fever_grand),
)

  np.min(fever_grand), scipy.stats.mode(fever_grand), np.mean(fever_grand), np.max(fever_grand)


(1, ModeResult(mode=array([1]), count=array([76273])), 2.3748761592308067, 338)

In [39]:
# Pool the per-claim evidence counts of SUPPORTS and REFUTES claims
# (NOT ENOUGH INFO claims are left out for SciFact).
scifact_grand = np.concatenate(
    [np.array(scifact_cnt[label]) for label in ("SUPPORTS", "REFUTES")]
)

# Displayed as (min, mode, mean, max).
# NOTE(review): the recorded output echoes this line, which looks like a
# warning was raised here (possibly from scipy.stats.mode) — confirm.
(
    np.min(scifact_grand),
    scipy.stats.mode(scifact_grand),
    np.mean(scifact_grand),
    np.max(scifact_grand),
)

  np.min(scifact_grand), scipy.stats.mode(scifact_grand), np.mean(scifact_grand), np.max(scifact_grand)


(1, ModeResult(mode=array([1]), count=array([321])), 2.0072150072150072, 12)

In [40]:
# Pool the per-claim evidence counts over all three Climate-FEVER labels;
# unlike the FEVER and SciFact cells, NOT ENOUGH INFO claims are included.
cfever_grand = np.concatenate(
    [
        np.array(cfever_cnt[label])
        for label in ("SUPPORTS", "REFUTES", "NOT ENOUGH INFO")
    ]
)

# Displayed as (min, mode, mean, max).
# NOTE(review): the recorded output echoes this line, which looks like a
# warning was raised here (possibly from scipy.stats.mode) — confirm.
(
    np.min(cfever_grand),
    scipy.stats.mode(cfever_grand),
    np.mean(cfever_grand),
    np.max(cfever_grand),
)

  np.min(cfever_grand), scipy.stats.mode(cfever_grand), np.mean(cfever_grand), np.max(cfever_grand)


(5, ModeResult(mode=array([5]), count=array([907])), 5.409120521172638, 6)