In [2]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.insert(0, "../src")
from copy import deepcopy

import numpy as np
from pathlib import Path
from collections import Counter

import constants
from gen.util import read_data, write_jsonl

In [4]:
# Root folders of the three datasets used in this notebook, in the order:
# FEVER data, feverised Climate-FEVER dump, feverised SciFact dump.
feverp, climatefdp, scifactdp = (
    Path(location)
    for location in (
        "/users/k21190024/study/fact-check-transfer-learning/scratch/data/fever",
        "/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-climatefever",
        "/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact",
    )
)

# Generate the Climate-FEVER stratified 70:20:10 fine-tuning dataset

In [5]:
from sklearn.model_selection import train_test_split

# Integer code for each verdict label; the code is stored next to the claim id in `cf_idt`.
masked_label = {"SUPPORTS": 0, "REFUTES": 1, "NOT ENOUGH INFO": 2}

# Output folder for the fine-tuning splits. It is derived from `climatefdp` instead of
# repeating the absolute path, so the two locations cannot drift apart.
climatefd_ft_p = climatefdp / "finetune"
climatefd_ft_p.mkdir(exist_ok=True)

# All feverised Climate-FEVER claims.
cf = read_data(climatefdp / "climatefever_paper_all_titleid.jsonl")

# One row per claim: column 0 is the claim id, column 1 is the integer label code.
cf_idt = np.array([[claim["id"], masked_label[claim["label"]]] for claim in cf])

In [5]:
# Step 1: hold out 30% of the (id, label) rows, stratified on the label column.
train, stage = train_test_split(
    cf_idt,
    stratify=cf_idt[:, 1],
    test_size=0.3,
    random_state=constants.SEED,
    shuffle=True,
)

# Step 2: cut the held-out rows again (33% of them become the test split),
# stratified on the label column, which yields roughly 70:20:10 overall.
dev, test = train_test_split(
    stage,
    stratify=stage[:, 1],
    test_size=0.33,
    random_state=constants.SEED,
    shuffle=True,
)

# Sizes of the three splits.
tuple(len(part) for part in (train, dev, test))

(966, 278, 137)

In [6]:
def ratio(counts):
    """Turn a mapping of key -> count into a mapping of key -> percentage share.

    Each percentage is the count divided by the sum of all counts, multiplied
    by 100 and rounded to two decimal places. Keys keep the order of ``counts``.
    """
    denominator = sum(counts.values())
    percentages = {}
    for key, occurrences in counts.items():
        percentages[key] = round(occurrences / denominator * 100, 2)
    return percentages

# Label distribution of every split: raw counts first ...
count_train = Counter(train[:, 1])
count_dev = Counter(dev[:, 1])
count_test = Counter(test[:, 1])

# ... then the same counts as percentages, to check that stratification held.
ratio_train = ratio(count_train)
ratio_dev = ratio(count_dev)
ratio_test = ratio(count_test)

ratio_train, ratio_dev, ratio_test

({1: 18.32, 0: 47.31, 2: 34.37},
 {0: 47.48, 1: 18.35, 2: 34.17},
 {0: 47.45, 2: 34.31, 1: 18.25})

In [7]:
# Collect the ids of each split into Python sets once, so that the membership test
# below is a hash lookup instead of a scan of the NumPy id column for every claim.
train_ids, dev_ids, test_ids = (set(part[:, 0].tolist()) for part in (train, dev, test))

# Full claim records of every split, keyed by output file name; claims keep the order of `cf`.
splits = {
    "climatefever_train.jsonl": [doc for doc in cf if doc["id"] in train_ids],
    "climatefever_dev.jsonl": [doc for doc in cf if doc["id"] in dev_ids],
    "climatefever_test.jsonl": [doc for doc in cf if doc["id"] in test_ids],
}

## Baseline

The baseline has sampling for NEI labels, but Climate-FEVER already has evidence available for NEI claims. Therefore, we will train the model with the evidence attached to each claim.

In [9]:
# Folder that receives the baseline fine-tuning files; created if it is not there yet.
baseline_finetune_p = climatefd_ft_p.joinpath("baseline")
baseline_finetune_p.mkdir(exist_ok=True)

In [15]:
# For NEI claims, move "other_evidence" into "evidence" and "other_elab" into "elab",
# then blank the "other_*" fields. Claims with any other label are copied unchanged.

# Looked up once instead of on every iteration.
nei_label = constants.LOOKUP["label"]["nei"]

baseline_cfever = []
for doc in cf:
    new_doc = deepcopy(doc)
    if new_doc["label"] == nei_label:
        # Read from the copy, not from `doc`, so the new record shares no nested
        # objects with the records in `cf`.
        new_doc["evidence"] = new_doc["other_evidence"]
        new_doc["elab"] = new_doc["other_elab"]
        new_doc["other_evidence"] = None
        new_doc["other_elab"] = None
    baseline_cfever.append(new_doc)
len(baseline_cfever)

1381

In [19]:
# Ids of each split as Python sets, built once so that every membership test below is a
# hash lookup instead of a scan of the NumPy id column.
baseline_ids = {
    "climatefever_train.jsonl": set(train[:, 0].tolist()),
    "climatefever_dev.jsonl": set(dev[:, 0].tolist()),
    "climatefever_test.jsonl": set(test[:, 0].tolist()),
}

# Baseline claim records of every split, keyed by output file name; claims keep the
# order of `baseline_cfever`.
baseline_splits = {
    filename: [doc for doc in baseline_cfever if doc["id"] in ids]
    for filename, ids in baseline_ids.items()
}

# Write one JSONL file per split into the baseline folder.
for filename, claims in baseline_splits.items():
    write_jsonl(baseline_finetune_p / filename, claims)

## Generate Oracle IR

In [10]:
# Read back the baseline train and dev splits written to the baseline folder.
cf_train, cf_dev = (
    read_data(baseline_finetune_p / split_file)
    for split_file in ("climatefever_train.jsonl", "climatefever_dev.jsonl")
)

In [None]:
# Oracle retrieval: fill "predicted_sentences" straight from each claim's own evidence.
# For every evidence group, only its first entry is used, and items 2 and 3 of that entry
# are kept (presumably page title and sentence index -- TODO confirm against the data).
for claims in (cf_train, cf_dev):
    for doc in claims:
        oracle_sentences = []
        for group in doc["evidence"]:
            first_entry = group[0]
            oracle_sentences.append([first_entry[2], first_entry[3]])
        doc["predicted_sentences"] = oracle_sentences

# Persist the oracle-annotated splits next to the baseline files.
write_jsonl(baseline_finetune_p / "climatefever_oracle_train.jsonl", cf_train)
write_jsonl(baseline_finetune_p / "climatefever_oracle_dev.jsonl", cf_dev)