In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
sys.path.append("../src")
import pickle as pkl

import json
import sqlite3
import numpy as np
import scipy.sparse as sp
from pathlib import Path

import constants
from gen.util import read_data, write_jsonl

# Approaches

## Approach 1
FEVER corpus + SciFact Claims => FEVER trained information retrieval module => FEVER trained RTE => SciFact claims with FEVER evidence

This approach is a truly no-knowledge pipeline, as there is no SciFact corpus either. However, this pipeline does not allow evaluating the quality of the information retrieval module, i.e. whether relevant evidence for each SciFact claim was retrieved correctly. It can only evaluate whether the claim was correctly predicted.

## Approach 2 (Use this approach)
SciFact corpus => FEVER trained information retrieval module => FEVER trained RTE => SciFact claims with SciFact evidence

This approach is the most sensible for zero-shot as there is a SciFact corpus. However, this pipeline does not feel like pure zero-shot (it assumes a SciFact corpus exists). This pipeline allows evaluation of the information retrieval module and the claim-evidence pair. It may require rewriting the entire information retrieval code, as most of it is indexed on the trained corpus.

## Approach 3
FEVER+SciFact corpus => FEVER trained information retrieval module => FEVER trained RTE => SciFact claims with FEVER and/or SciFact evidence

Similar to Approach 2 but with the FEVER corpus mixed in. However, the information retrieval module may be biased towards the FEVER corpus since it is trained on it. Still a sensible pipeline that allows evaluation of the information retrieval module and the claim-evidence pair.

# Baseline (Executed)

TF-IDF (SciFact Corpus Indexed) => Decomposable Attention (FEVER trained)

Use "/users/k21190024/study/fact-check-transfer-learning/repos/fever/baseline-fever2-sample/predict.sh" to make predictions on SciFact
 1. Run command in executing server.
 2. ml purge
 3. source activate fever-baseline
 4. Corpus index: ```/users/k21190024/study/fact-check-transfer-learning/repos/fever/baseline/fever2-sample/data/index``` 
 5. Corpus database: ```/users/k21190024/study/fact-check-transfer-learning/repos/fever/baseline/fever2-sample/data/fever/fever.db```
 6. Prediction: ```bash /users/k21190024/study/fact-check-transfer-learning/repos/fever/baseline/thesis/zeroshot/feveronly/baseline/scripts/01_predict_zeroshot_feveronly.sh /scratch/users/k21190024/fact-check-transfer-learning/dumps/feverised-scifact/scifact_all_test.jsonl /users/k21190024/study/fact-check-transfer-learning/repos/fever/baseline/thesis/zeroshot/feveronly/baseline/scifact_all_test.pred.jsonl```

In [7]:
# Switch the working directory to the baseline repo checkout and echo it back.
# NOTE(review): the recorded output shows a ".../fact-checking-repos/..." path
# rather than the ".../fact-check-transfer-learning/repos/..." path passed here —
# presumably a symlink or a stale output; confirm.
os.chdir("/users/k21190024/study/fact-check-transfer-learning/repos/fever/baseline")
os.getcwd()

'/users/k21190024/study/fact-checking-repos/fever/baseline'

## Peek into DB

In [8]:
# Connect to the baseline's FEVER document database and keep a cursor around.
f_con = sqlite3.connect("/users/k21190024/study/fact-check-transfer-learning/repos/fever/baseline/thesis/zeroshot/feveronly/baseline/fever.db")
f_cur = f_con.cursor()

# Show the first row of the `documents` table to see its column layout.
f_cur.execute("select * from documents").fetchone()

('Year_book',
 'Year book may refer to :  Yearbook , a book to record , highlight , and commemorate the past year of a school  The Year Books , the earliest law reports of England ',
 '0\tYear book may refer to :\n1\t\n2\tYearbook , a book to record , highlight , and commemorate the past year of a school\tYearbook\tYearbook\n3\t\n4\tThe Year Books , the earliest law reports of England\tYear Books\tYear Books\n5\t')

In [5]:
# Open the feverised SciFact document database used by the baseline and show
# the first row of its `documents` table for comparison with the FEVER one.
# (sqlite3 is already imported in the notebook's import cell, so the redundant
# cell-local import was dropped.)
sf_con = sqlite3.connect("/users/k21190024/study/fact-check-transfer-learning/repos/fever/baseline/thesis/zeroshot/feveronly/baseline/feverised_scifact.db")
sf_cur = sf_con.cursor()

sf_cur.execute("""select * from documents""")
sf_cur.fetchone()

('4983',
 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 versus 1.1 microm2/ms). Relative anisotropy was higher the closer birth was to term with

# UNC NLP (Executed but information retrieval fails)

Neural Semantic Matching Networks
  - Seems to use doc_id as search key?

1. Run command in executing server
2. `ml purge`
3. source activate \`which conda\`
4. `conda activate fever-uncnlp36`
5. `ml load openjdk/1.8.0_265-b01-gcc-9.4.0`
6. `export PYTHONPATH=$PYTHONPATH:/users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp/src && cd /users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp`
7. Tokenisation: ```python src/pipeline/prepare_data.py tokenization```
8. Indexing: ```python src/pipeline/prepare_data.py build_database```
9. AFTER all subsection code is run and required files are generated, run `python src/pipeline/auto_pipeline.py`

In [6]:
os.chdir("/users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp")
os.getcwd()

'/users/k21190024/study/fact-checking-repos/fever/uncnlp'

## Peek into DB

In [3]:
# Open the two UNC NLP document databases side by side:
#   f_*  -> the fever.db under the uncnlp data directory
#   sf_* -> the feverised SciFact database keyed by title-derived ids
f_con = sqlite3.connect("/users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp/data/fever.db")
sf_con = sqlite3.connect("/users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp/thesis/zeroshot/feveronly/uncnlp/feverised_scifact_id_title.db")

f_cur, sf_cur = f_con.cursor(), sf_con.cursor()

In [4]:
# Sample three rows from the FEVER `documents` table.
f_cur.execute("select * from documents limit 3").fetchall()

[('Salām-e_Shāh',
  "Salām-e Shāh -LRB- سلام شاه , -LSB- sæˈlɒːme ʃɒːh -RSB- , `` Royal Salute '' -RRB- was the royal and national anthem of Persia -LRB- Iran -RRB- between 1873 and 1909 . Alfred Jean Baptiste Lemaire composed this anthem in 1873 on the orders of Naser al-Din Shah . It had no lyrics .   Salām-e Shāh was played in official ceremonies during the reign of Naser al-Din Shah , Mozaffar ad-Din Shah and Mohammad Ali Shah . It was also played as Persian national anthem during Naser al-Din Shah 's and Mozaffar ad-Din Shah 's European tours .   In 1909 after the fall of Mohammad Ali Shah , the anthem was abolished and after coronation of his son and successor Ahmad Shah in 1914 , Salute of Sublime State of Persia was adopted as Persian national anthem .",
  '[{"line_num": 0, "sentences": "Sala\\u0304m-e Sha\\u0304h -LRB- \\u0633\\u0644\\u0627\\u0645 \\u0634\\u0627\\u0647 , -LSB- s\\u00e6\\u02c8l\\u0252\\u02d0me \\u0283\\u0252\\u02d0h -RSB- , `` Royal Salute \'\' -RRB- was 

In [6]:
# Sample three rows from the feverised SciFact `documents` table.
sf_cur.execute("select * from documents limit 3").fetchall()

[('Microstructural_development_of_human_newborn_cerebral_white_matter_assessed_in_vivo_by_diffusion_tensor_magnetic_resonance_imaging.',
  'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coef

## Look into PageView

Not useful for evaluating SciFact as it is the view count for a particular Wiki page. Do not use pageview!

In [8]:
# Load the pickled page-view table and report its type and size.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
pageviews_path = Path("/users/k21190024/study/fact-checking-repos/fever/uncnlp/thesis/zeroshot/feveronly/uncnlp/chaonan99/pageviews.pkl")
with pageviews_path.open("rb") as pickle_file:
    pageview = pkl.load(pickle_file)
type(pageview), len(pageview)

dict

In [13]:
# Materialise the page-view keys as a list so they can be indexed by position.
pvk = list(pageview)
pvk[1000]

'.theprodukkt'

In [18]:
# Print the first ten page titles together with their view counts.
for page_title in pvk[:10]:
    print(page_title, ":", pageview[page_title])

!!! : 1448
!!!Fuck_You!!! : 80
!Action_Pact! : 67
!Kung_language : 455
!T.O.O.H.! : 100
!Wowow! : 126
!_(The_Dismemberment_Plan_album) : 121
" : 338
"A"_Device : 189
"And"_theory_of_conservatism : 169


## Adapt "title" as "doc_id" for doc_retr_1

The FEVER corpus does not have a title, but its doc_id does contain some keywords which this pipeline attempts to use for document retrieval.

The SciFact corpus title is far more informative than the FEVER corpus id, thus it may produce an optimistic information retrieval result.

In [25]:
from hashlib import md5
from collections import Counter

In [58]:
# check for duplicated titles
sf_corpus = read_data(Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/wiki-pages/wiki-001.jsonl"))
sf_title_hash = {doc["id"]: md5(bytes(doc["title"].strip().replace(" ", "_"), encoding="utf8")).hexdigest() for doc in sf_corpus}
len(sf_corpus), len(set(sf_title_hash.values()))

(5183, 5181)

In [59]:
# Count how often each title hash occurs; a count above 1 marks a duplicated title.
sf_title_counter = Counter(sf_title_hash.values())
# Show the three most frequent hashes with their counts.
sf_title_counter.most_common(3)

[('290612199861c31d1036b185b4e69b75', 3),
 ('93eb90751149ccf38b66e47a192f496b', 1),
 ('553824c52f8f6a9fa9919589e63b585d', 1)]

In [60]:
# Collect the original ids of every document whose normalised-title hash occurs
# more than once, in corpus order. The duplicated hashes are computed once up
# front (instead of calling most_common() for every document), and *all*
# duplicated titles are covered rather than only the single most frequent one.
dup_hashes = {title_hash for title_hash, count in sf_title_counter.items() if count > 1}
dup_ids = [doc_id for doc_id, title_hash in sf_title_hash.items() if title_hash in dup_hashes]

# Display the affected documents; a set gives constant-time membership checks.
dup_id_lookup = set(dup_ids)
[doc for doc in sf_corpus if doc["id"] in dup_id_lookup]

[{'id': '885056',
  'title': 'Summary',
  'structured': False,
  'text': 'Steroid receptor RNA activator (SRA), the only known RNA coactivator, augments transactivation by nuclear receptors (NRs). We identified SLIRP (SRA stem-loop interacting RNA binding protein) binding to a functional substructure of SRA, STR7. SLIRP is expressed in normal and tumor tissues, contains an RNA recognition motif (RRM), represses NR transactivation in a SRA- and RRM-dependent manner, augments the effect of Tamoxifen, and modulates association of SRC-1 with SRA. SHARP, a RRM-containing corepressor, also binds STR7, augmenting repression with SLIRP. SLIRP colocalizes with SKIP (Chr14q24.3), another NR coregulator, and reduces SKIP-potentiated NR signaling. SLIRP is recruited to endogenous promoters (pS2 and metallothionein), the latter in a SRA-dependent manner, while NCoR promoter recruitment is dependent on SLIRP. The majority of the endogenous SLIRP resides in the mitochondria. Our data demonstrate that

In [62]:
# Re-key the SciFact corpus so that each document id is derived from its title,
# keeping the original id alongside it.
sf_corpus_title_id = []

for position, page in enumerate(sf_corpus):
    # FEVER ids contain no spaces, so the title is stripped and spaces become underscores.
    fever_style_id = "_".join(page["title"].strip().split(" "))

    # Duplicated titles carry no information anyway; suffix the corpus position
    # to make the resulting id unique.
    if page["id"] in dup_ids:
        fever_style_id = f"{fever_style_id}_{position}"

    record = {
        "id": fever_style_id,
        "original_id": page["id"],
        "structured": page["structured"],
        "text": page["text"],
        "lines": page["lines"],
    }
    sf_corpus_title_id.append(record)

# Show the re-keyed versions of the documents that had duplicated titles.
[record for record in sf_corpus_title_id if record["original_id"] in dup_ids]

[{'id': 'Summary_163',
  'original_id': '885056',
  'structured': False,
  'text': 'Steroid receptor RNA activator (SRA), the only known RNA coactivator, augments transactivation by nuclear receptors (NRs). We identified SLIRP (SRA stem-loop interacting RNA binding protein) binding to a functional substructure of SRA, STR7. SLIRP is expressed in normal and tumor tissues, contains an RNA recognition motif (RRM), represses NR transactivation in a SRA- and RRM-dependent manner, augments the effect of Tamoxifen, and modulates association of SRC-1 with SRA. SHARP, a RRM-containing corepressor, also binds STR7, augmenting repression with SLIRP. SLIRP colocalizes with SKIP (Chr14q24.3), another NR coregulator, and reduces SKIP-potentiated NR signaling. SLIRP is recruited to endogenous promoters (pS2 and metallothionein), the latter in a SRA-dependent manner, while NCoR promoter recruitment is dependent on SLIRP. The majority of the endogenous SLIRP resides in the mitochondria. Our data demons

In [63]:
# Re-check uniqueness on the new title-derived ids: hash every id and compare
# the number of documents with the number of distinct hashes.
sf_title_id_hash = {
    record["id"]: md5(record["id"].strip().encode("utf8")).hexdigest()
    for record in sf_corpus_title_id
}
len(sf_corpus), len(set(sf_title_id_hash.values()))

(5183, 5183)

In [69]:
# Persist the re-keyed corpus as JSONL under the uncnlp_wiki-pages dump directory.
write_jsonl(Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/uncnlp_wiki-pages/wiki-001.jsonl"), sf_corpus_title_id)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/uncnlp_wiki-pages/wiki-001.jsonl')

### Generate tokenized_doc_id.json

1. Run `python /users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp/src/boey_src/01_generate_tokenized_doc_id.py`

In [70]:
# Load the tokenized doc-id file from the uncnlp data directory and report its size.
# NOTE(review): presumably this is the FEVER-corpus version — confirm.
tokenized_doc_id = read_data(Path("/users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp/data/tokenized_doc_id.json"))
len(tokenized_doc_id)

5396013

In [72]:
# Inspect the tokenised entry for one example document id.
tokenized_doc_id["Snooker_world_rankings_2013/2014"]

{'words': ['Snooker', 'world', 'rankings', '2013/2014'],
 'lemmas': ['Snooker', 'world', 'ranking', '2013/2014']}

In [73]:
# Load the tokenized doc-id file generated for the zero-shot SciFact run and report its size.
sf_tokenized_doc_id = read_data(Path("/users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp/thesis/zeroshot/feveronly/uncnlp/tokenized_doc_id.json"))
len(sf_tokenized_doc_id)

5183

In [74]:
# Inspect the tokenised entry for one title-derived SciFact document id.
sf_tokenized_doc_id["Inducible_nitric_oxide_synthase_in_pulmonary_alveolar_macrophages_from_patients_with_tuberculosis"]

{'words': ['Inducible',
  'nitric',
  'oxide',
  'synthase',
  'in',
  'pulmonary',
  'alveolar',
  'macrophages',
  'from',
  'patients',
  'with',
  'tuberculosis'],
 'lemmas': ['inducible',
  'nitric',
  'oxide',
  'synthase',
  'in',
  'pulmonary',
  'alveolar',
  'macrophage',
  'from',
  'patient',
  'with',
  'tuberculosis']}

## Peek into their results

In [4]:
# Point at one pipeline run's result directory and list the file names it contains.
results_p = Path("/users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp/results/pipeline_r_aaai_doc/2023_03_06_14:38:23_r")
[entry.name for entry in results_p.iterdir()]

['nn_doc_retr_1_shared_task_dev.jsonl',
 'dev_sent_score_1_shared_task_dev_docnum(10)_ensembled.jsonl',
 'nn_doc_list_1_shared_task_dev.jsonl',
 'dev_sent_score_1_shared_task_dev_docnum(10)_e1.jsonl',
 'dev_sent_score_2_shared_task_dev.jsonl',
 'dev_sent_score_1_shared_task_dev_docnum(10).jsonl',
 'doc_retr_1_shared_task_dev.jsonl',
 'dev_sent_score_1_shared_task_dev_scaled_for_doc2.jsonl',
 'auto_pipeline.py',
 'doc_retr_2_shared_task_dev.jsonl',
 'single_sent_nli_r_shared_task_dev_with_doc_scale:0.1_e0.jsonl',
 'dev_sent_score_1_shared_task_dev_docnum(10)_e2.jsonl',
 't_shared_task_dev.jsonl']

In [5]:
# Load the stage-1 document retrieval output (doc_retr_1) of that run and check
# which container type read_data returns for it.
# NOTE(review): the original note calls this "document retrieval with page count
# stage 1" — confirm that page counts are actually used at this stage.

doc_retr1 = read_data(results_p / 'doc_retr_1_shared_task_dev.jsonl')
type(doc_retr1)

list

In [20]:
# Look at the first retrieval record to see which fields the pipeline attaches to a claim.
doc_retr1[0]

{'id': 91198,
 'verifiable': 'NOT VERIFIABLE',
 'label': 'NOT ENOUGH INFO',
 'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League .',
 'evidence': [[[108548, None, None, None]]],
 'prioritized_docids': [['Colin_Kaepernick', 5.0],
  ['Football_League_-LRB-Greece-RRB-', 1.0]],
 'structured_docids': {'Colin Kaepernick': [['Colin_Kaepernick', 5.0]],
  'Football League': [['Football_League_-LRB-Greece-RRB-', 1.0]]},
 'claim_lemmas': ['Colin',
  'Kaepernick',
  'become',
  'a',
  'start',
  'quarterback',
  'during',
  'the',
  '49ers',
  '63rd',
  'season',
  'in',
  'the',
  'National',
  'Football',
  'League',
  '.'],
 'claim_tokens': ['Colin',
  'Kaepernick',
  'became',
  'a',
  'starting',
  'quarterback',
  'during',
  'the',
  '49ers',
  '63rd',
  'season',
  'in',
  'the',
  'National',
  'Football',
  'League',
  '.'],
 'processed_claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in

In [19]:
# Load the same stage-1 retrieval file from a later run (stored under the thesis
# zero-shot directory) and show its first record for comparison with doc_retr1.
tmp = read_data(Path("/users/k21190024/study/fact-check-transfer-learning/repos/fever/uncnlp/thesis/zeroshot/feveronly/uncnlp/pipeline_r_aaai_doc/2023_06_05_22:39:10_r/doc_retr_1_shared_task_dev.jsonl"))
tmp[0]

{'id': 91198,
 'verifiable': 'NOT VERIFIABLE',
 'label': 'NOT ENOUGH INFO',
 'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League .',
 'evidence': [[[108548, None, None, None]]],
 'prioritized_docids': [['Football_League_-LRB-Greece-RRB-', 1.0],
  ['Colin_Kaepernick', 5.0]],
 'structured_docids': {'Colin Kaepernick': [['Colin_Kaepernick', 5.0]],
  'Football League': [['Football_League_-LRB-Greece-RRB-', 1.0]]},
 'claim_lemmas': ['Colin',
  'Kaepernick',
  'become',
  'a',
  'start',
  'quarterback',
  'during',
  'the',
  '49ers',
  '63rd',
  'season',
  'in',
  'the',
  'National',
  'Football',
  'League',
  '.'],
 'claim_tokens': ['Colin',
  'Kaepernick',
  'became',
  'a',
  'starting',
  'quarterback',
  'during',
  'the',
  '49ers',
  '63rd',
  'season',
  'in',
  'the',
  'National',
  'Football',
  'League',
  '.'],
 'processed_claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in

In [23]:
# Compare the predicted document ids of the two runs claim by claim, ignoring order.
# zip() would silently drop trailing records, so make sure both runs cover the
# same number of claims first.
assert len(doc_retr1) == len(tmp), "the two runs contain a different number of claims"

# Use the symmetric difference so ids that appear in only one of the runs are
# counted regardless of which run they come from (a plain `x - y` would miss
# ids present only in `tmp`).
tmp_match = [len(set(x["predicted_docids"]) ^ set(y["predicted_docids"])) for x, y in zip(doc_retr1, tmp)]

# 0 means every claim got exactly the same set of predicted documents in both runs.
sum(tmp_match)

0

# UCL NLP

(1) Logistic Reg => (2) Logistic Reg => (3) ESIM => (4) MLP

(1) Document retrieval
  - Trained using the nltk corpus of names and places, stopwords, ... (see `phrase_features()` in doc_ir.py)
  
(2) Sentence retrieval

In [3]:
# Switch into the UCL NLP repo and put it near the front of sys.path so its
# modules can be imported by name in later cells.
# NOTE(review): the recorded output shows a ".../fact-checking-repos/..." path
# rather than the path passed to chdir — presumably a symlink or a stale output; confirm.
os.chdir("/users/k21190024/study/fact-check-transfer-learning/repos/fever/uclnlp/fever")
sys.path.insert(1, os.getcwd())
os.getcwd()

'/users/k21190024/study/fact-checking-repos/fever/uclnlp/fever'

In [16]:
from line_ir import line_ir
from doc_ir import doc_ir
from doc_ir_model import doc_ir_model, load_selected
from line_ir_model import line_ir_model

In [18]:
# Load the selection produced by the repo's doc_ir_model.load_selected().
# NOTE(review): judging by the cells below this maps claim ids to candidate
# documents — confirm against doc_ir_model.
dirdocs = load_selected()

100000it [00:00, 270299.96it/s]


In [20]:
# Materialise the keys of dirdocs as a list so they can be indexed by position.
ls = list(dirdocs.keys())

In [23]:
# Load the FEVER training claims shipped with the UCL NLP repo.
ftrain = read_data(Path("/users/k21190024/study/fact-check-transfer-learning/repos/fever/uclnlp/fever/fever_data/train.jsonl"))

In [26]:
# Find the training claim(s) whose id equals the first key of dirdocs.
[claim for claim in ftrain if claim["id"] == ls[0]]

[{'id': 150448,
  'verifiable': 'VERIFIABLE',
  'label': 'SUPPORTS',
  'claim': 'Roman Atwood is a content creator.',
  'evidence': [[[174271, 187498, 'Roman_Atwood', 1]],
   [[174271, 187499, 'Roman_Atwood', 3]]]}]

In [22]:
# Show the first key of dirdocs.
ls[0]

150448

In [21]:
# Show the entry stored in dirdocs for that first key.
dirdocs[ls[0]]

{1: ['Roman_Atwood', ' Roman Atwood', 0],
 0: ['Atwood_-LRB-crater-RRB-', ' Atwood', 1]}