# Init

In [1]:
import sys
sys.path.insert(0, "../src")

import json
import random
import numpy as np
import scipy
from pathlib import Path
from collections import Counter, defaultdict
from joblib import Parallel, delayed

import constants
from gen.util import read_data, write_jsonl
from feverise import scifact, climatefever, climatefever_sent, build_db, build_db_mod
from feverise.util import replace_id_with_titleid

[INFO] 2023-08-09 12:06:41,933 - LogHelper - Log Helper set up


In [2]:
root_data = Path("../data").resolve()

In [3]:
climatefp = root_data / "climatefever"
scifactp = root_data / "scifact"

climatefdp = root_data / "feverised-climatefever"
climate_s_fdp = root_data / "feverised-climatefever_sent"
scifactdp = root_data / "feverised-scifact"

climatefdp.mkdir(exist_ok=True), scifactdp.mkdir(exist_ok=True), climate_s_fdp.mkdir(exist_ok=True)

(None, None, None)

# SciFact

 - Every claim has only 1 type of label

In [4]:
sf_dev = read_data(scifactp / "claims_dev.jsonl")
sf_test = read_data(scifactp / "claims_test.jsonl")
sf_train = read_data(scifactp / "claims_train.jsonl")
sf_corpus = read_data(scifactp / "corpus.jsonl")

In [5]:
sf_dev_f = scifact.feverise_claims(sf_dev)
sf_train_f = scifact.feverise_claims(sf_train)

sf_corpus_f = scifact.feverise_corpus(sf_corpus)

In [6]:
sf_p_ls = []
sf_wikipages = scifactdp.joinpath("wiki-pages")

sf_p_ls.append(write_jsonl(scifactdp / "scifact_dev.jsonl", sf_dev_f))
sf_p_ls.append(write_jsonl(scifactdp / "scifact_train.jsonl", sf_train_f))
sf_p_ls.append(write_jsonl(scifactdp / "scifact_all.jsonl", sf_train_f + sf_dev_f))
sf_p_ls.append(write_jsonl(scifactdp / "scifact_test.jsonl", sf_test))
sf_p_ls.append(write_jsonl(scifactdp / "scifact_corpus.jsonl", sf_corpus_f))  # copy

# default no title
sf_wikipages.mkdir(exist_ok=True)
sf_p_ls.append(write_jsonl(sf_wikipages / "wiki-001.jsonl", sf_corpus_f))

sf_p_ls

[PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-scifact/scifact_dev.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-scifact/scifact_train.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-scifact/scifact_all.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-scifact/scifact_test.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-scifact/scifact_corpus.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-scifact/wiki-pages/wiki-001.jsonl')]

In [7]:
sf_db_p = scifactdp / "feverised-scifact.db"
build_db.run(data_path=sf_wikipages, save_path=sf_db_p, num_workers=5)

[INFO] 2023-08-09 12:06:44,229 - DrQA BuildDB - Reading into database...
  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00,  5.88it/s]
1it [00:00,  5.88it/s][A
100%|██████████| 1/1 [00:00<00:00,  5.73it/s]
[INFO] 2023-08-09 12:06:44,462 - DrQA BuildDB - Read 5183 docs.
[INFO] 2023-08-09 12:06:44,463 - DrQA BuildDB - Committing...


In [16]:
sf_test = [{"id": doc["id"], "claim": doc["claim"]} for doc in sf_train_f + sf_dev_f]

write_jsonl(scifactdp / "scifact_all_test.jsonl", sf_test)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-scifact/scifact_all_test.jsonl')

# Climate-FEVER

## Pure claims

Only evidences that support the claim are included, used for concatenated sentence model training

In [8]:
cf_data = read_data(climatefp / "climatefever.jsonl")
cf_paper_f, cf_assumed_f, cf_corpus_f, cf_lineid_translator = climatefever.feverise_climatefever(cf_data)
cf_corpus_titleid_f = climatefever.feverise_corpus_titleid(cf_corpus_f)

In [9]:
cf_p_ls = []

cf_wikipages_titleid = climatefdp.joinpath("titleid-wiki-pages")

cf_p_ls.append(write_jsonl(climatefdp / "climatefever_paper_all.jsonl", cf_paper_f))
# cf_p_ls.append(write_jsonl(climatefdp / "climatefever_assumed_all.jsonl", cf_assumed_f))
cf_p_ls.append(write_jsonl(climatefdp / "climatefever_corpus.jsonl", cf_corpus_f))  # copy, for consistency

cf_wikipages_titleid.mkdir(exist_ok=True)
cf_p_ls.append(write_jsonl(cf_wikipages_titleid / "wiki-001.jsonl", cf_corpus_titleid_f))

with climatefdp.joinpath("lineid_translator.json").open("w") as fn:
    json.dump(cf_lineid_translator, fn)
cf_p_ls.append(fn.name)

cf_p_ls

[PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever/climatefever_paper_all.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever/climatefever_corpus.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever/titleid-wiki-pages/wiki-001.jsonl'),
 '/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever/lineid_translator.json']

In [10]:
cf_db_p = climatefdp / "feverised-climatefever-titleid.db"
build_db_mod.run(data_path=cf_wikipages_titleid, save_path=cf_db_p, num_workers=5)

Reading into database...


  0%|          | 0/1 [00:00<?, ?it/s]
1it [00:00, 26.29it/s]
100%|██████████| 1/1 [00:00<00:00, 24.49it/s]

Read 1344 docs.
Committing...





In [11]:
titleid_cf_claims = Parallel(n_jobs=5)(delayed(replace_id_with_titleid)(cf_db_p, doc) for doc in cf_paper_f)
write_jsonl(climatefdp / "climatefever_paper_all_titleid.jsonl", titleid_cf_claims)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever/climatefever_paper_all_titleid.jsonl')

## Original claims

Uses the original Climate-FEVER impure claim-evidence pair (evidence may not entail the claim)

In [12]:
cf_data = read_data(climatefp / "climatefever.jsonl")
cf_paper_f, cf_assumed_f, cf_corpus_f, cf_lineid_translator = climatefever_sent.feverise_climatefever(cf_data)
cf_corpus_titleid_f = climatefever.feverise_corpus_titleid(cf_corpus_f)

In [13]:
cf_p_ls = []

cf_wikipages_titleid = climate_s_fdp.joinpath("titleid-wiki-pages")

cf_p_ls.append(write_jsonl(climate_s_fdp / "climatefever_paper_all.jsonl", cf_paper_f))
# cf_p_ls.append(write_jsonl(climate_s_fdp / "climatefever_assumed_all.jsonl", cf_assumed_f))
cf_p_ls.append(write_jsonl(climate_s_fdp / "climatefever_corpus.jsonl", cf_corpus_f))  # copy, for consistency

cf_wikipages_titleid.mkdir(exist_ok=True)
cf_p_ls.append(write_jsonl(cf_wikipages_titleid / "wiki-001.jsonl", cf_corpus_titleid_f))

with climate_s_fdp.joinpath("lineid_translator.json").open("w") as fn:
    json.dump(cf_lineid_translator, fn)
cf_p_ls.append(fn.name)

cf_p_ls

[PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever_sent/climatefever_paper_all.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever_sent/climatefever_corpus.jsonl'),
 PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever_sent/titleid-wiki-pages/wiki-001.jsonl'),
 '/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever_sent/lineid_translator.json']

In [14]:
cf_db_p = climate_s_fdp / "feverised-climatefever-titleid.db"
build_db_mod.run(data_path=cf_wikipages_titleid, save_path=cf_db_p, num_workers=5)

Reading into database...


  0%|          | 0/1 [00:00<?, ?it/s]
1it [00:00, 16.16it/s]
100%|██████████| 1/1 [00:00<00:00, 15.66it/s]

Read 1344 docs.
Committing...





In [15]:
titleid_cf_claims = Parallel(n_jobs=5)(delayed(replace_id_with_titleid)(cf_db_p, doc) for doc in cf_paper_f)
write_jsonl(climate_s_fdp / "climatefever_paper_all_titleid.jsonl", titleid_cf_claims)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/nb/data/feverised-climatefever_sent/climatefever_paper_all_titleid.jsonl')