In [1]:
import sys
sys.path.insert(0, "../src")
from copy import deepcopy
from pathlib import Path
from collections import Counter

import numpy as np
from sklearn.model_selection import train_test_split

import constants
from gen.util import read_data, write_jsonl

In [2]:
# Absolute form of "../data", resolved against the current working directory;
# the dataset directories used below are built from this root.
root_data = Path("../data").resolve()

In [3]:
# Dataset directories under the data root. Judging by the "_sent" suffix the
# second one presumably holds the sentence-level variant -- TODO confirm.
climatefdp = root_data.joinpath("feverised-climatefever")
climate_s_fdp = root_data.joinpath("feverised-climatefever_sent")

In [4]:
# Integer codes used in place of the three textual verdict labels.
masked_label = {"SUPPORTS": 0, "REFUTES": 1, "NOT ENOUGH INFO": 2}
# Inverse lookup (code -> label name), derived so the two maps cannot drift apart.
rev_masked_label = {code: name for name, code in masked_label.items()}

def ratio(counts):
    """Summarise label counts as absolute numbers and percentages.

    Args:
        counts: Mapping from integer label code (a key of ``rev_masked_label``)
            to the number of samples carrying that label, e.g. a ``Counter``.

    Returns:
        Dict mapping each label name to a ``(count, percentage)`` tuple, where
        the percentage is that label's share of the total, rounded to two
        decimals. When the counts sum to zero every percentage is ``0.0``.

    Raises:
        KeyError: If a key of ``counts`` is not a known label code.
    """
    total = sum(counts.values())
    if not total:
        # Nothing to take a share of; report 0% instead of dividing by zero.
        return {rev_masked_label[k]: (v, 0.0) for k, v in counts.items()}
    return {rev_masked_label[k]: (v, round(v/total*100, 2)) for k, v in counts.items()}

def split_data(cfp):
    """Load the full claim file under ``cfp`` and split it into train/dev/test.

    The split is stratified on the label code. 30% of the claims are held out
    first; that held-out part is then split again, 33% of it going to test and
    the rest to dev (roughly 70/20/10 overall). Both steps shuffle and are
    seeded with ``constants.SEED``.

    Args:
        cfp: Directory (supporting the ``/`` operator) that contains
            ``climatefever_paper_all.jsonl``.

    Returns:
        Tuple ``(cf, train, dev, test)``. ``cf`` is whatever ``read_data``
        returned; the other three are two-column arrays whose rows are
        ``[id, label code]``.
    """
    cf = read_data(cfp / "climatefever_paper_all.jsonl")

    # One [id, label code] row per claim.
    rows = []
    for claim in cf:
        rows.append([claim["id"], masked_label[claim["label"]]])
    cf_idt = np.array(rows)

    # Options shared by both splitting steps.
    shared = {"shuffle": True, "random_state": constants.SEED}

    train, stage = train_test_split(cf_idt, test_size=0.3, stratify=cf_idt[:, 1], **shared)
    dev, test = train_test_split(stage, test_size=0.33, stratify=stage[:, 1], **shared)
    return cf, train, dev, test

# Climate-FEVER Doc

In [5]:
# Split both corpus variants with the same procedure: first the directory
# named "feverised-climatefever", then the "_sent" one (presumably the
# sentence-level variant -- TODO confirm).
cf, train, dev, test = split_data(cfp=climatefdp)
s_cf, s_train, s_dev, s_test = split_data(cfp=climate_s_fdp)

In [6]:
# Tally the label codes (column 1) of every split, then turn each tally into
# per-label (count, percentage) summaries.
count_train, count_dev, count_test = (Counter(part[:, 1]) for part in (train, dev, test))
ratio_train, ratio_dev, ratio_test = map(ratio, (count_train, count_dev, count_test))

# Bare tuple as the last expression so the notebook displays all three summaries.
ratio_train, ratio_dev, ratio_test

({'REFUTES': (177, 18.32),
  'SUPPORTS': (457, 47.31),
  'NOT ENOUGH INFO': (332, 34.37)},
 {'SUPPORTS': (132, 47.48),
  'REFUTES': (51, 18.35),
  'NOT ENOUGH INFO': (95, 34.17)},
 {'SUPPORTS': (65, 47.45),
  'NOT ENOUGH INFO': (47, 34.31),
  'REFUTES': (25, 18.25)})

In [7]:
# Build the claims of every split, keyed by output file name. The ids of a
# split (column 0) are turned into a set of plain Python values once, so each
# membership test is a hash lookup instead of a linear scan of the NumPy column
# repeated for every document. Filtering `cf` keeps the claims of each split in
# the order in which they appear in `cf`.
splits = {
    filename: [doc for doc in cf if doc["id"] in ids]
    for filename, ids in (
        ("climatefever_train.jsonl", set(train[:, 0].tolist())),
        ("climatefever_dev.jsonl", set(dev[:, 0].tolist())),
        ("climatefever_test.jsonl", set(test[:, 0].tolist())),
    )
}

# Write one JSONL file per split into the same dataset directory.
for filename, claims in splits.items():
    write_jsonl(climatefdp / filename, claims)

In [8]:
# Same statistics as for the first corpus, now for the `s_`-prefixed splits:
# tally the label codes (column 1), then summarise them per label.
count_train, count_dev, count_test = (Counter(part[:, 1]) for part in (s_train, s_dev, s_test))
ratio_train, ratio_dev, ratio_test = map(ratio, (count_train, count_dev, count_test))

# Bare tuple as the last expression so the notebook displays all three summaries.
ratio_train, ratio_dev, ratio_test

({'REFUTES': (177, 18.32),
  'SUPPORTS': (457, 47.31),
  'NOT ENOUGH INFO': (332, 34.37)},
 {'SUPPORTS': (132, 47.48),
  'REFUTES': (51, 18.35),
  'NOT ENOUGH INFO': (95, 34.17)},
 {'SUPPORTS': (65, 47.45),
  'NOT ENOUGH INFO': (47, 34.31),
  'REFUTES': (25, 18.25)})

In [9]:
# Build the claims of every split, keyed by output file name. The ids of a
# split (column 0) are turned into a set of plain Python values once, so each
# membership test is a hash lookup instead of a linear scan of the NumPy column
# repeated for every document. Filtering `s_cf` keeps the claims of each split
# in the order in which they appear in `s_cf`.
splits = {
    filename: [doc for doc in s_cf if doc["id"] in ids]
    for filename, ids in (
        ("climatefever_train.jsonl", set(s_train[:, 0].tolist())),
        ("climatefever_dev.jsonl", set(s_dev[:, 0].tolist())),
        ("climatefever_test.jsonl", set(s_test[:, 0].tolist())),
    )
}

# Write one JSONL file per split into the same dataset directory.
for filename, claims in splits.items():
    write_jsonl(climate_s_fdp / filename, claims)