In [1]:
%load_ext autoreload
%autoreload 2

# Init

In [2]:
import sys
sys.path.insert(0, "../src")

import json
import random
import numpy as np
import scipy
from pathlib import Path
from collections import Counter, defaultdict
from joblib import Parallel, delayed

import constants
from gen.util import read_data, write_jsonl
from feverise import scifact, climatefever, climatefever_sent, build_db_mod
from feverise.util import replace_id_with_titleid

In [3]:
feverp = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/data/fever")
climatefp = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/data/climatefever")
scifactp = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/data/scifact")

# climatefdp = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever")
climatefdp = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_sent")
scifactdp = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact")

climatefdp.mkdir(exist_ok=True), scifactdp.mkdir(exist_ok=True)

(None, None)

fever_dev_sample, wiki_sample = None, None

fever_dev_sample = read_data(feverp / "train.jsonl")
for p in feverp.joinpath("wiki-pages").iterdir():
    if random.uniform(0, 1) > 0.7:
        wiki_sample = read_data(p)
        break

# SciFact

 - Every claim has only 1 type of label

In [4]:
sf_dev = read_data(scifactp / "claims_dev.jsonl")
sf_test = read_data(scifactp / "claims_test.jsonl")
sf_train = read_data(scifactp / "claims_train.jsonl")
sf_corpus = read_data(scifactp / "corpus.jsonl")

In [5]:
sf_dev_f = scifact.feverise_claims(sf_dev)
sf_train_f = scifact.feverise_claims(sf_train)

sf_corpus_f = scifact.feverise_corpus(sf_corpus)
sf_corpus_titleid_f = scifact.feverise_corpus_titleid(sf_corpus_f)

In [6]:
sf_p_ls = []
sf_wikipages = scifactdp.joinpath("wiki-pages")
sf_wikipages_titleid = scifactdp.joinpath("titleid-wiki-pages")

sf_p_ls.append(write_jsonl(scifactdp / "scifact_dev.jsonl", sf_dev_f))
sf_p_ls.append(write_jsonl(scifactdp / "scifact_train.jsonl", sf_train_f))
sf_p_ls.append(write_jsonl(scifactdp / "scifact_all.jsonl", sf_train_f + sf_dev_f))
sf_p_ls.append(write_jsonl(scifactdp / "scifact_test.jsonl", sf_test))
sf_p_ls.append(write_jsonl(scifactdp / "scifact_corpus.jsonl", sf_corpus_f))  # copy

# default no title
sf_wikipages.mkdir(exist_ok=True)
sf_p_ls.append(write_jsonl(sf_wikipages / "wiki-001.jsonl", sf_corpus_f))

# title ID
sf_wikipages_titleid.mkdir(exist_ok=True)
sf_p_ls.append(write_jsonl(sf_wikipages_titleid / "wiki-001.jsonl", sf_corpus_titleid_f))

sf_p_ls

[PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/scifact_dev.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/scifact_train.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/scifact_all.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/scifact_test.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/scifact_corpus.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/wiki-pages/wiki-001.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/titleid-wiki-pages/wiki-001.jsonl')]

In [7]:
sf_db_p = scifactdp / "feverised-scifact-titleid.db"
build_db_mod.run(data_path=sf_wikipages_titleid, save_path=sf_db_p, num_workers=5)

Reading into database...


  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00,  6.85it/s]
1it [00:00,  6.85it/s][A
100%|██████████| 1/1 [00:00<00:00,  6.74it/s]


Read 5183 docs.
Committing...


In [8]:
titleid_sf_claims_train = Parallel(n_jobs=5)(delayed(replace_id_with_titleid)(sf_db_p, doc) for doc in sf_train_f)
titleid_sf_claims_dev = Parallel(n_jobs=5)(delayed(replace_id_with_titleid)(sf_db_p, doc) for doc in sf_dev_f)
titleid_sf_claims = titleid_sf_claims_train + titleid_sf_claims_dev

write_jsonl(scifactdp / "scifact_train_titleid.jsonl", titleid_sf_claims_train)
write_jsonl(scifactdp / "scifact_dev_titleid.jsonl", titleid_sf_claims_dev)
write_jsonl(scifactdp / "scifact_all_titleid.jsonl", titleid_sf_claims)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/scifact_all_titleid.jsonl')

In [9]:
sf_test = [{"id": doc["id"], "claim": doc["claim"]} for doc in titleid_sf_claims]

write_jsonl(scifactdp / "scifact_all_test.jsonl", sf_test)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/scifact_all_test.jsonl')

# Climate-FEVER

- Has multiple labels for each sentence for each claim

## Pure claims

Only evidences that support the claim are included, used for concatenated sentence model training

In [4]:
cf_data = read_data(climatefp / "climatefever.jsonl")
cf_paper_f, cf_assumed_f, cf_corpus_f, cf_lineid_translator = climatefever.feverise_climatefever(cf_data)
cf_corpus_titleid_f = climatefever.feverise_corpus_titleid(cf_corpus_f)

In [5]:
cf_p_ls = []

cf_wikipages = climatefdp.joinpath("wiki-pages")
cf_wikipages_titleid = climatefdp.joinpath("titleid-wiki-pages")

cf_p_ls.append(write_jsonl(climatefdp / "climatefever_paper_all.jsonl", cf_paper_f))
# cf_p_ls.append(write_jsonl(climatefdp / "climatefever_assumed_all.jsonl", cf_assumed_f))
cf_p_ls.append(write_jsonl(climatefdp / "climatefever_corpus.jsonl", cf_corpus_f))  # copy, for consistency

cf_wikipages.mkdir(exist_ok=True)
cf_p_ls.append(write_jsonl(cf_wikipages / "wiki-001.jsonl", cf_corpus_f))
cf_wikipages_titleid.mkdir(exist_ok=True)
cf_p_ls.append(write_jsonl(cf_wikipages_titleid / "wiki-001.jsonl", cf_corpus_titleid_f))

with climatefdp.joinpath("lineid_translator.json").open("w") as fn:
    json.dump(cf_lineid_translator, fn)
cf_p_ls.append(fn.name)

cf_p_ls

[PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/climatefever_paper_all.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/climatefever_corpus.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/wiki-pages/wiki-001.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/titleid-wiki-pages/wiki-001.jsonl'),
 '/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/lineid_translator.json']

In [6]:
cf_db_p = climatefdp / "feverised-climatefever-titleid.db"
build_db_mod.run(data_path=cf_wikipages_titleid, save_path=cf_db_p, num_workers=5)

Reading into database...


  0%|          | 0/1 [00:00<?, ?it/s]
1it [00:00, 26.84it/s]
100%|██████████| 1/1 [00:00<00:00, 25.41it/s]

Read 1344 docs.
Committing...





In [7]:
titleid_cf_claims = Parallel(n_jobs=5)(delayed(replace_id_with_titleid)(cf_db_p, doc) for doc in cf_paper_f)
write_jsonl(climatefdp / "climatefever_paper_all_titleid.jsonl", titleid_cf_claims)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/climatefever_paper_all_titleid.jsonl')

In [8]:
cf_test = [{"id": doc["id"], "claim": doc["claim"]} for doc in titleid_cf_claims]

write_jsonl(climatefdp / "climatefever_paper_all_test.jsonl", cf_test)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/climatefever_paper_all_test.jsonl')

## Original claims

Uses the original Climate-FEVER impure claim-evidence pair (evidence may not entail the claim)

In [4]:
cf_data = read_data(climatefp / "climatefever.jsonl")
cf_paper_f, cf_assumed_f, cf_corpus_f, cf_lineid_translator = climatefever_sent.feverise_climatefever(cf_data)
cf_corpus_titleid_f = climatefever.feverise_corpus_titleid(cf_corpus_f)

In [5]:
cf_p_ls = []

cf_wikipages = climatefdp.joinpath("wiki-pages")
cf_wikipages_titleid = climatefdp.joinpath("titleid-wiki-pages")

cf_p_ls.append(write_jsonl(climatefdp / "climatefever_paper_all.jsonl", cf_paper_f))
# cf_p_ls.append(write_jsonl(climatefdp / "climatefever_assumed_all.jsonl", cf_assumed_f))
cf_p_ls.append(write_jsonl(climatefdp / "climatefever_corpus.jsonl", cf_corpus_f))  # copy, for consistency

cf_wikipages.mkdir(exist_ok=True)
cf_p_ls.append(write_jsonl(cf_wikipages / "wiki-001.jsonl", cf_corpus_f))
cf_wikipages_titleid.mkdir(exist_ok=True)
cf_p_ls.append(write_jsonl(cf_wikipages_titleid / "wiki-001.jsonl", cf_corpus_titleid_f))

with climatefdp.joinpath("lineid_translator.json").open("w") as fn:
    json.dump(cf_lineid_translator, fn)
cf_p_ls.append(fn.name)

cf_p_ls

[PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/climatefever_paper_all.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/climatefever_corpus.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/wiki-pages/wiki-001.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/titleid-wiki-pages/wiki-001.jsonl'),
 '/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/lineid_translator.json']

In [6]:
cf_db_p = climatefdp / "feverised-climatefever-titleid.db"
build_db_mod.run(data_path=cf_wikipages_titleid, save_path=cf_db_p, num_workers=5)

Reading into database...


  0%|          | 0/1 [00:00<?, ?it/s]
1it [00:00, 26.84it/s]
100%|██████████| 1/1 [00:00<00:00, 25.41it/s]

Read 1344 docs.
Committing...





In [7]:
titleid_cf_claims = Parallel(n_jobs=5)(delayed(replace_id_with_titleid)(cf_db_p, doc) for doc in cf_paper_f)
write_jsonl(climatefdp / "climatefever_paper_all_titleid.jsonl", titleid_cf_claims)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/climatefever_paper_all_titleid.jsonl')

In [8]:
cf_test = [{"id": doc["id"], "claim": doc["claim"]} for doc in titleid_cf_claims]

write_jsonl(climatefdp / "climatefever_paper_all_test.jsonl", cf_test)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever_v2/climatefever_paper_all_test.jsonl')