In [1]:
%load_ext autoreload
%autoreload 2

# Init

In [2]:
import sys
sys.path.insert(0, "../src")

import json
import random
import numpy as np
import scipy
import spacy
from pathlib import Path
from collections import Counter, defaultdict
from joblib import Parallel, delayed

import constants
import feverise.analysis as fa
from gen.util import read_data, write_jsonl

In [3]:
# Root of the shared scratch area. Every dataset path below is derived from
# it, so the notebook can be pointed at another location by editing one line.
SCRATCH_ROOT = Path("/users/k21190024/study/fact-check-transfer-learning/scratch")

# Dataset directories under data/.
feverp = SCRATCH_ROOT / "data" / "fever"
climatefp = SCRATCH_ROOT / "data" / "climatefever"
scifactp = SCRATCH_ROOT / "data" / "scifact"

# Dump directories under dumps/ holding the "feverised" Climate-FEVER and
# SciFact files.
climatefdp = SCRATCH_ROOT / "dumps" / "feverised-climatefever"
scifactdp = SCRATCH_ROOT / "dumps" / "feverised-scifact"

In [4]:
# FEVER claims: the train file followed by the shared-task dev file.
fever_claims = read_data(feverp / "train.jsonl") + read_data(feverp / "shared_task_dev.jsonl")
# SciFact and Climate-FEVER claims come from the "feverised" dump directories.
scifact_claims = read_data(scifactdp / "scifact_all.jsonl")
cfever_claims = read_data(climatefdp / "climatefever_paper_all.jsonl")

# Label proportions

In [69]:
# Tally how many claims carry each label, one Counter per dataset.
fever_labels = Counter(claim["label"] for claim in fever_claims)
cfever_labels = Counter(claim["label"] for claim in cfever_claims)
scifact_labels = Counter(claim["label"] for claim in scifact_claims)

In [70]:
def label_ratio(cnt):
    """Return each key's share of the summed counts, rounded to two decimals.

    Args:
        cnt: mapping from label to a numeric count (e.g. a ``Counter``).

    Returns:
        A dict with the same keys as ``cnt`` whose values are
        ``round(count / sum_of_all_counts, 2)``.
    """
    grand_total = sum(cnt.values())
    ratios = {}
    for label, n in cnt.items():
        ratios[label] = round(n / grand_total, 2)
    return ratios

In [71]:
# Label shares per dataset, in the order FEVER, Climate-FEVER, SciFact.
tuple(
    label_ratio(label_counts)
    for label_counts in (fever_labels, cfever_labels, scifact_labels)
)

({'SUPPORTS': 0.52, 'REFUTES': 0.22, 'NOT ENOUGH INFO': 0.26},
 {'SUPPORTS': 0.47, 'REFUTES': 0.18, 'NOT ENOUGH INFO': 0.34},
 {'NOT ENOUGH INFO': 0.38, 'REFUTES': 0.21, 'SUPPORTS': 0.41})

# Count evidence per claim

In [72]:
# fa.count_evidences returns a pair for each dataset. Later cells index the
# first item by label and display the second item as per-label statistics.
fever_cnt, fever_stats = fa.count_evidences(fever_claims)
scifact_cnt, scifact_stats = fa.count_evidences(scifact_claims)
cfever_cnt, cfever_stats = fa.count_evidences(cfever_claims)

## FEVER

In [73]:
# FEVER: one Counter of the per-label values in fever_cnt, for the "s", "r"
# and "nei" entries of constants.LOOKUP["label"], in that order.
tuple(
    Counter(fever_cnt[constants.LOOKUP["label"][key]])
    for key in ("s", "r", "nei")
)

(Counter({1: 62251,
          2: 10520,
          4: 2473,
          16: 79,
          5: 2694,
          8: 589,
          13: 160,
          3: 3433,
          6: 1595,
          18: 70,
          7: 842,
          20: 51,
          9: 413,
          14: 128,
          27: 20,
          10: 334,
          138: 1,
          17: 70,
          21: 44,
          62: 2,
          15: 75,
          24: 33,
          11: 257,
          12: 215,
          44: 8,
          32: 11,
          34: 9,
          42: 7,
          50: 3,
          26: 30,
          43: 3,
          37: 6,
          25: 27,
          19: 55,
          22: 31,
          52: 4,
          29: 17,
          38: 7,
          28: 9,
          23: 20,
          36: 8,
          33: 11,
          31: 11,
          40: 4,
          59: 1,
          58: 1,
          51: 2,
          35: 6,
          45: 4,
          30: 16,
          140: 1,
          53: 3,
          48: 3,
          122: 1,
          39: 6,
          63: 1,


In [89]:
# Add the "s" and "r" Counters for FEVER together (the "nei" label is left
# out), then pass the merged counts to fa.max_sentence_effect with 10.
# NOTE(review): `total` is rebound by the SciFact cell further down.
total = Counter(fever_cnt[constants.LOOKUP["label"]["s"]]) + Counter(fever_cnt[constants.LOOKUP["label"]["r"]])
fa.max_sentence_effect(total, 10)

{1: 0.2811550892465609,
 2: 0.15980737684949084,
 3: 0.11906579396144289,
 4: 0.09055399457536828,
 5: 0.06152246999399068,
 6: 0.04394926182780855,
 7: 0.034139448766464735,
 8: 0.027358659108996118,
 9: 0.022494356109207258,
 10: 0.018523330788845396}

In [43]:
# Display the second value returned by fa.count_evidences for FEVER.
fever_stats

{'SUPPORTS': (1, ModeResult(mode=1, count=62251), 2.003979192858214, 251),
 'REFUTES': (1, ModeResult(mode=1, count=26269), 1.9882549875140638, 78),
 'NOT ENOUGH INFO': (0, ModeResult(mode=0, count=42305), 0.0, 0)}

## Climate-FEVER

In [10]:
# Climate-FEVER: one Counter of the per-label values in cfever_cnt, for the
# "s", "r" and "nei" entries of constants.LOOKUP["label"], in that order.
tuple(
    Counter(cfever_cnt[constants.LOOKUP["label"][key]])
    for key in ("s", "r", "nei")
)

(Counter({5: 654}), Counter({5: 253}), Counter({5: 474}))

In [45]:
# Display the second value returned by fa.count_evidences for Climate-FEVER.
cfever_stats

{'SUPPORTS': (5, ModeResult(mode=5, count=654), 5.0, 5),
 'REFUTES': (5, ModeResult(mode=5, count=253), 5.0, 5),
 'NOT ENOUGH INFO': (5, ModeResult(mode=5, count=474), 5.0, 5)}

## SciFact

In [9]:
# SciFact: one Counter of the per-label values in scifact_cnt, for the
# "s", "r" and "nei" entries of constants.LOOKUP["label"], in that order.
tuple(
    Counter(scifact_cnt[constants.LOOKUP["label"][key]])
    for key in ("s", "r", "nei")
)

(Counter({1: 218, 2: 128, 4: 31, 5: 4, 3: 64, 9: 2, 11: 1, 8: 4, 6: 3, 7: 1}),
 Counter({1: 103, 3: 35, 5: 7, 2: 68, 4: 19, 7: 2, 8: 1, 6: 1, 12: 1}),
 Counter({0: 416}))

In [90]:
# Add the "s" and "r" Counters for SciFact together (the "nei" label is left
# out), then pass the merged counts to fa.max_sentence_effect with 10.
# NOTE(review): this rebinds `total`, which the FEVER cell above also assigns.
total = Counter(scifact_cnt[constants.LOOKUP["label"]["s"]]) + Counter(scifact_cnt[constants.LOOKUP["label"]["r"]])
fa.max_sentence_effect(total, 10)

{1: 0.5367965367965368,
 2: 0.25396825396825395,
 3: 0.1111111111111111,
 4: 0.03896103896103896,
 5: 0.023088023088023088,
 6: 0.017316017316017316,
 7: 0.012987012987012988,
 8: 0.005772005772005772,
 9: 0.002886002886002886,
 10: 0.002886002886002886}

In [44]:
# Display the second value returned by fa.count_evidences for SciFact.
scifact_stats

{'NOT ENOUGH INFO': (0, ModeResult(mode=0, count=416), 0.0, 0),
 'REFUTES': (1, ModeResult(mode=1, count=103), 2.088607594936709, 12),
 'SUPPORTS': (1, ModeResult(mode=1, count=218), 1.9649122807017543, 11)}

# Check claim-evidence token lengths

In [4]:
# Load the "en_core_web_sm" spaCy pipeline. count_claim_evidence_length only
# calls its tokenizer (nlp.tokenizer).
nlp = spacy.load("en_core_web_sm")

def count_claim_evidence_length(data):
    """Measure token lengths of claims and evidence, grouped by label.

    Each record's "claim" and "evidence" values are tokenized with the
    module-level ``nlp.tokenizer``. The token counts are filed under the
    label name looked up via ``constants.ID2LABEL[record["labels"]]``.

    NOTE(review): lengths are spaCy tokenizer tokens; if the max-length
    thresholds they are compared against refer to a different tokenizer,
    the numbers are only approximate — confirm.

    Args:
        data: iterable of mappings with "claim", "evidence" and "labels" keys.

    Returns:
        A pair ``(res, counters)``. ``res`` maps every label in
        ``constants.LABEL2ID`` to a dict with three lists: "claim",
        "evidence" and "concatenate" (the element-wise sum of the other two).
        ``counters`` maps each of those labels to a ``Counter`` of its
        "concatenate" values. Labels that occur in ``data`` but not in
        ``constants.LABEL2ID`` do not appear in either result.
    """
    claim_lengths = defaultdict(list)
    evidence_lengths = defaultdict(list)
    for record in data:
        n_claim = len(nlp.tokenizer(record["claim"]))
        n_evidence = len(nlp.tokenizer(record["evidence"]))
        label_name = constants.ID2LABEL[record["labels"]]
        claim_lengths[label_name].append(n_claim)
        evidence_lengths[label_name].append(n_evidence)

    res = defaultdict(dict)
    for label_name in constants.LABEL2ID.keys():
        claims = claim_lengths[label_name]
        evidences = evidence_lengths[label_name]
        res[label_name] = {
            "claim": claims,
            "evidence": evidences,
            "concatenate": [c + e for c, e in zip(claims, evidences)],
        }

    counters = {}
    for label_name, lengths in res.items():
        counters[label_name] = Counter(lengths["concatenate"])
    return res, counters
        
def pct_over_max_seq_length(cnt, max_length):
    """Report how many examples are longer than ``max_length``, per label and overall.

    Despite the name, the values are fractions in [0, 1], not percentages.

    Args:
        cnt: mapping from label to a mapping ``{length: number_of_examples}``
            (for example the Counters from ``count_claim_evidence_length``).
        max_length: threshold; only lengths strictly greater than it count
            as "over".

    Returns:
        A dict with, for every label, ``label`` (fraction of that label's
        examples over the threshold) and ``label + "_count"`` (how many),
        plus "overall" and "overall_count" across all labels. A label with
        no examples, or an empty ``cnt``, yields a fraction of 0.0 instead
        of raising ``ZeroDivisionError``.
    """
    d = {}
    over_all = 0
    total_all = 0
    for label, count in cnt.items():
        total = sum(count.values())
        over = sum(n for length, n in count.items() if length > max_length)
        # A label with no examples has an empty Counter; report 0.0 for it.
        d[label] = over / total if total else 0.0
        d[label + "_count"] = over
        over_all += over
        total_all += total
    d["overall"] = over_all / total_all if total_all else 0.0
    d["overall_count"] = over_all

    return d

## Document level evidence

In [5]:
# Files from the "bert-data-doc-evidence" dump directory (document-level
# evidence, per the section heading).
# NOTE(review): bert_data_p, fever_all, cfeverpure_all and scifact_all are
# rebound in the sentence-level section further down, so later cells see
# whichever load cell ran last.
bert_data_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-doc-evidence")
fever_all = read_data(bert_data_p / "fever.all.jsonl")
cfeverpure_all = read_data(bert_data_p / "climatefeverpure.all.jsonl")
scifact_all = read_data(bert_data_p / "scifact.all.test.n5.jsonl")

In [8]:
# For each dataset: `*_raw` holds the per-label length lists and `*_cnt` the
# per-label Counters of claim+evidence lengths.
# NOTE(review): the same names are reassigned in the sentence-level section.
fever_all_raw, fever_all_cnt = count_claim_evidence_length(fever_all)
cfeverpure_all_raw, cfeverpure_all_cnt = count_claim_evidence_length(cfeverpure_all)
scifact_all_raw, scifact_all_cnt = count_claim_evidence_length(scifact_all)

In [9]:
# FEVER, per label: mean claim length, mean evidence length and longest
# evidence, followed by the fraction of claim+evidence pairs longer than
# 512 and 1024 tokens.
(
    {label: sum(lengths["claim"]) / len(lengths["claim"]) for label, lengths in fever_all_raw.items()},
    {label: sum(lengths["evidence"]) / len(lengths["evidence"]) for label, lengths in fever_all_raw.items()},
    {label: max(lengths["evidence"]) for label, lengths in fever_all_raw.items()},
    pct_over_max_seq_length(fever_all_cnt, 512),
    pct_over_max_seq_length(fever_all_cnt, 1024),
)

({'SUPPORTS': 9.519936332914268,
  'NOT ENOUGH INFO': 9.461600283654414,
  'REFUTES': 9.52600093301501},
 {'SUPPORTS': 76.53544941811514,
  'NOT ENOUGH INFO': 119.47152818815742,
  'REFUTES': 70.61172854751516},
 {'SUPPORTS': 13903, 'NOT ENOUGH INFO': 429, 'REFUTES': 4625},
 {'SUPPORTS': 0.013575391287297725,
  'SUPPORTS_count': 1177,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.01185477895776735,
  'REFUTES_count': 432,
  'overall': 0.009725168785169873,
  'overall_count': 1609},
 {'SUPPORTS': 0.0026181935617812942,
  'SUPPORTS_count': 227,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.002085562964792404,
  'REFUTES_count': 76,
  'overall': 0.0018314022013091807,
  'overall_count': 303})

In [10]:
# climatefeverpure, per label: mean claim length and mean evidence length,
# followed by the fraction of claim+evidence pairs longer than 128, 256 and
# 512 tokens.
(
    {label: sum(lengths["claim"]) / len(lengths["claim"]) for label, lengths in cfeverpure_all_raw.items()},
    {label: sum(lengths["evidence"]) / len(lengths["evidence"]) for label, lengths in cfeverpure_all_raw.items()},
    pct_over_max_seq_length(cfeverpure_all_cnt, 128),
    pct_over_max_seq_length(cfeverpure_all_cnt, 256),
    pct_over_max_seq_length(cfeverpure_all_cnt, 512),
)

({'SUPPORTS': 24.007645259938837,
  'NOT ENOUGH INFO': 23.69831223628692,
  'REFUTES': 20.992094861660078},
 {'SUPPORTS': 83.17737003058105,
  'NOT ENOUGH INFO': 163.63291139240508,
  'REFUTES': 72.93675889328063},
 {'SUPPORTS': 0.3058103975535168,
  'SUPPORTS_count': 200,
  'NOT ENOUGH INFO': 0.9008438818565401,
  'NOT ENOUGH INFO_count': 427,
  'REFUTES': 0.2015810276679842,
  'REFUTES_count': 51,
  'overall': 0.49094858797972485,
  'overall_count': 678},
 {'SUPPORTS': 0.010703363914373088,
  'SUPPORTS_count': 7,
  'NOT ENOUGH INFO': 0.0949367088607595,
  'NOT ENOUGH INFO_count': 45,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.03765387400434468,
  'overall_count': 52},
 {'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.002109704641350211,
  'NOT ENOUGH INFO_count': 1,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.000724112961622013,
  'overall_count': 1})

In [11]:
# SciFact, per label: mean claim length and mean evidence length, followed
# by the fraction of claim+evidence pairs longer than 128, 256 and 512 tokens.
(
    {label: sum(lengths["claim"]) / len(lengths["claim"]) for label, lengths in scifact_all_raw.items()},
    {label: sum(lengths["evidence"]) / len(lengths["evidence"]) for label, lengths in scifact_all_raw.items()},
    pct_over_max_seq_length(scifact_all_cnt, 128),
    pct_over_max_seq_length(scifact_all_cnt, 256),
    pct_over_max_seq_length(scifact_all_cnt, 512),
)

({'SUPPORTS': 14.322368421052632,
  'NOT ENOUGH INFO': 15.19951923076923,
  'REFUTES': 14.350210970464135},
 {'SUPPORTS': 71.18859649122807,
  'NOT ENOUGH INFO': 144.84134615384616,
  'REFUTES': 82.07594936708861},
 {'SUPPORTS': 0.15570175438596492,
  'SUPPORTS_count': 71,
  'NOT ENOUGH INFO': 0.7980769230769231,
  'NOT ENOUGH INFO_count': 332,
  'REFUTES': 0.21940928270042195,
  'REFUTES_count': 52,
  'overall': 0.4102795311091073,
  'overall_count': 455},
 {'SUPPORTS': 0.02412280701754386,
  'SUPPORTS_count': 11,
  'NOT ENOUGH INFO': 0.03125,
  'NOT ENOUGH INFO_count': 13,
  'REFUTES': 0.029535864978902954,
  'REFUTES_count': 7,
  'overall': 0.027953110910730387,
  'overall_count': 31},
 {'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.0,
  'overall_count': 0})

## Sentence level evidence

In [6]:
# Files from the "bert-data-sent-evidence" dump directory (sentence-level
# evidence, per the section heading).
# NOTE(review): this rebinds bert_data_p, fever_all, cfeverpure_all and
# scifact_all, which the document-level section above also assigns.
bert_data_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-sent-evidence")
fever_all = read_data(bert_data_p / "fever.all.jsonl")
cfeverpure_all = read_data(bert_data_p / "climatefeverpure.all.jsonl")
cfever_all = read_data(bert_data_p / "climatefever.all.jsonl")
scifact_all = read_data(bert_data_p / "scifact.all.test.n5.jsonl")

In [7]:
# For each dataset: `*_raw` holds the per-label length lists and `*_cnt` the
# per-label Counters of claim+evidence lengths.
# NOTE(review): these names are also assigned in the document-level section.
fever_all_raw, fever_all_cnt = count_claim_evidence_length(fever_all)
cfeverpure_all_raw, cfeverpure_all_cnt = count_claim_evidence_length(cfeverpure_all)
cfever_all_raw, cfever_all_cnt = count_claim_evidence_length(cfever_all)
scifact_all_raw, scifact_all_cnt = count_claim_evidence_length(scifact_all)

In [8]:
# FEVER (sentence-level section), per label: mean claim length, mean evidence
# length and longest evidence, followed by the fraction of claim+evidence
# pairs longer than 512 and 1024 tokens.
(
    {label: sum(lengths["claim"]) / len(lengths["claim"]) for label, lengths in fever_all_raw.items()},
    {label: sum(lengths["evidence"]) / len(lengths["evidence"]) for label, lengths in fever_all_raw.items()},
    {label: max(lengths["evidence"]) for label, lengths in fever_all_raw.items()},
    pct_over_max_seq_length(fever_all_cnt, 512),
    pct_over_max_seq_length(fever_all_cnt, 1024),
)

({'SUPPORTS': 9.03313951488044,
  'NOT ENOUGH INFO': 9.461600283654414,
  'REFUTES': 9.19823055074924},
 {'SUPPORTS': 38.27282986409771,
  'NOT ENOUGH INFO': 24.74264271362723,
  'REFUTES': 35.538327184878064},
 {'SUPPORTS': 1333, 'NOT ENOUGH INFO': 263, 'REFUTES': 1102},
 {'SUPPORTS': 8.945467056597281e-05,
  'SUPPORTS_count': 13,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 9.793999542946688e-05,
  'REFUTES_count': 6,
  'overall': 6.524792494428171e-05,
  'overall_count': 19},
 {'SUPPORTS': 1.3762257010149665e-05,
  'SUPPORTS_count': 2,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 1.6323332571577813e-05,
  'REFUTES_count': 1,
  'overall': 1.0302303938570796e-05,
  'overall_count': 3})

In [9]:
# climatefeverpure (sentence-level section), per label: mean claim length,
# mean evidence length and longest evidence, followed by the fraction of
# claim+evidence pairs longer than 128, 256 and 512 tokens.
(
    {label: sum(lengths["claim"]) / len(lengths["claim"]) for label, lengths in cfeverpure_all_raw.items()},
    {label: sum(lengths["evidence"]) / len(lengths["evidence"]) for label, lengths in cfeverpure_all_raw.items()},
    {label: max(lengths["evidence"]) for label, lengths in cfeverpure_all_raw.items()},
    pct_over_max_seq_length(cfeverpure_all_cnt, 128),
    pct_over_max_seq_length(cfeverpure_all_cnt, 256),
    pct_over_max_seq_length(cfeverpure_all_cnt, 512),
)

({'SUPPORTS': 23.86741973840666,
  'NOT ENOUGH INFO': 23.69831223628692,
  'REFUTES': 19.870689655172413},
 {'SUPPORTS': 32.34126040428062,
  'NOT ENOUGH INFO': 32.72658227848101,
  'REFUTES': 31.81551724137931},
 {'SUPPORTS': 135, 'NOT ENOUGH INFO': 301, 'REFUTES': 101},
 {'SUPPORTS': 0.004161712247324614,
  'SUPPORTS_count': 7,
  'NOT ENOUGH INFO': 0.008860759493670886,
  'NOT ENOUGH INFO_count': 21,
  'REFUTES': 0.0034482758620689655,
  'REFUTES_count': 2,
  'overall': 0.006476683937823834,
  'overall_count': 30},
 {'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.0008438818565400844,
  'NOT ENOUGH INFO_count': 2,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.0004317789291882556,
  'overall_count': 2},
 {'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.0,
  'overall_count': 0})

In [10]:
# climatefever (sentence-level section), per label: mean claim length, mean
# evidence length and longest evidence, followed by the fraction of
# claim+evidence pairs longer than 128, 256 and 512 tokens.
(
    {label: sum(lengths["claim"]) / len(lengths["claim"]) for label, lengths in cfever_all_raw.items()},
    {label: sum(lengths["evidence"]) / len(lengths["evidence"]) for label, lengths in cfever_all_raw.items()},
    {label: max(lengths["evidence"]) for label, lengths in cfever_all_raw.items()},
    pct_over_max_seq_length(cfever_all_cnt, 128),
    pct_over_max_seq_length(cfever_all_cnt, 256),
    pct_over_max_seq_length(cfever_all_cnt, 512),
)

({'SUPPORTS': 23.86741973840666,
  'NOT ENOUGH INFO': 23.59573551583028,
  'REFUTES': 19.870689655172413},
 {'SUPPORTS': 32.34126040428062,
  'NOT ENOUGH INFO': 32.03790652595305,
  'REFUTES': 31.81551724137931},
 {'SUPPORTS': 135, 'NOT ENOUGH INFO': 389, 'REFUTES': 101},
 {'SUPPORTS': 0.004161712247324614,
  'SUPPORTS_count': 7,
  'NOT ENOUGH INFO': 0.007538229592935602,
  'NOT ENOUGH INFO_count': 35,
  'REFUTES': 0.0034482758620689655,
  'REFUTES_count': 2,
  'overall': 0.006372194062273715,
  'overall_count': 44},
 {'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.0006461339651087659,
  'NOT ENOUGH INFO_count': 3,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.00043446777697320784,
  'overall_count': 3},
 {'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.0,
  'overall_count': 0})

In [11]:
# SciFact (sentence-level section), per label: mean claim length, mean
# evidence length and longest evidence, followed by the fraction of
# claim+evidence pairs longer than 128, 256 and 512 tokens.
(
    {label: sum(lengths["claim"]) / len(lengths["claim"]) for label, lengths in scifact_all_raw.items()},
    {label: sum(lengths["evidence"]) / len(lengths["evidence"]) for label, lengths in scifact_all_raw.items()},
    {label: max(lengths["evidence"]) for label, lengths in scifact_all_raw.items()},
    pct_over_max_seq_length(scifact_all_cnt, 128),
    pct_over_max_seq_length(scifact_all_cnt, 256),
    pct_over_max_seq_length(scifact_all_cnt, 512),
)

({'SUPPORTS': 14.3671875,
  'NOT ENOUGH INFO': 15.19951923076923,
  'REFUTES': 14.482828282828283},
 {'SUPPORTS': 36.229910714285715,
  'NOT ENOUGH INFO': 30.381009615384617,
  'REFUTES': 39.2969696969697},
 {'SUPPORTS': 166, 'NOT ENOUGH INFO': 114, 'REFUTES': 166},
 {'SUPPORTS': 0.011160714285714286,
  'SUPPORTS_count': 10,
  'NOT ENOUGH INFO': 0.002403846153846154,
  'NOT ENOUGH INFO_count': 2,
  'REFUTES': 0.014141414141414142,
  'REFUTES_count': 7,
  'overall': 0.008547008547008548,
  'overall_count': 19},
 {'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.0,
  'overall_count': 0},
 {'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.0,
  'overall_count': 0})

### Label proportions

In [48]:
# Claim-level files: FEVER train, the feverised SciFact dump and the two
# Climate-FEVER baseline train files.
# NOTE(review): this rebinds scifact_all, which the evidence-length sections
# above bind to the "scifact.all.test.n5.jsonl" data.
fever_train = read_data(feverp / "train.jsonl")
scifact_all = read_data(scifactdp / "scifact_all.jsonl")
cfeverpure_train = read_data(climatefdp / "finetune" / "baseline" / "climatefever_train.jsonl")
cfever_train = read_data(climatefdp.parent / "feverised-climatefever_sent" / "finetune" / "baseline" / "climatefever_train.jsonl")

# Files read from bert_data_p.
# NOTE(review): bert_data_p is whatever the most recently run load cell set
# it to (document-level or sentence-level directory) — confirm which is meant.
fever_strain = read_data(bert_data_p / "fever.train.n5.jsonl")
cfeverpure_strain = read_data(bert_data_p / "climatefeverpure.train.n5.jsonl")
cfever_strain = read_data(bert_data_p / "climatefever.train.n5.jsonl")
scifact_sall = read_data(bert_data_p / "scifact.all.test.n5.jsonl")

In [49]:
def convert_to_sentence(data, nei_val: int):
    """Collect one number per evidence entry, grouped by the record's label.

    For every element of a record's "evidence" list, the element's length is
    appended under the record's "label". The exception is records whose label
    equals ``constants.LOOKUP["label"]["nei"]``: they get ``nei_val`` appended
    instead.

    Args:
        data: iterable of mappings with "label" and "evidence" keys, where
            "evidence" is an iterable of sized items.
        nei_val: value recorded for each evidence entry of a record carrying
            the "nei" label.

    Returns:
        A ``defaultdict(list)`` mapping label to the list of recorded values.
    """
    per_label = defaultdict(list)
    for record in data:
        for evidence_item in record["evidence"]:
            item_size = len(evidence_item)
            label = record["label"]
            is_nei = label == constants.LOOKUP["label"]["nei"]
            per_label[label].append(nei_val if is_nei else item_size)
    return per_label

In [51]:
# First: per-label sums from convert_to_sentence on the FEVER train claims,
# with 5 recorded per evidence entry of "nei" records.
# Second: label counts of the rows read from "fever.train.n5.jsonl".
(
    {label: sum(sizes) for label, sizes in convert_to_sentence(fever_train, 5).items()},
    Counter(constants.ID2LABEL[row["labels"]] for row in fever_strain),
)

({'SUPPORTS': 193756, 'REFUTES': 70066, 'NOT ENOUGH INFO': 178195},
 Counter({'SUPPORTS': 132827, 'REFUTES': 49035, 'NOT ENOUGH INFO': 71278}))

In [61]:
# Label counts of the claims in cfever_train.
Counter(claim["label"] for claim in cfever_train)

Counter({'SUPPORTS': 457, 'REFUTES': 177, 'NOT ENOUGH INFO': 332})

In [55]:
# Label counts of the rows read from "climatefever.train.n5.jsonl".
Counter(constants.ID2LABEL[row["labels"]] for row in cfever_strain)

Counter({'NOT ENOUGH INFO': 3244, 'SUPPORTS': 1186, 'REFUTES': 400})

In [59]:
# Label counts of the rows read from "climatefeverpure.train.n5.jsonl".
Counter(constants.ID2LABEL[row["labels"]] for row in cfeverpure_strain)

Counter({'SUPPORTS': 1186, 'REFUTES': 400, 'NOT ENOUGH INFO': 1660})

In [56]:
# First: per-label sums from convert_to_sentence on scifact_all, with 5
# recorded per evidence entry of "nei" records.
# Second: label counts of the rows read from "scifact.all.test.n5.jsonl".
(
    {label: sum(sizes) for label, sizes in convert_to_sentence(scifact_all, 5).items()},
    Counter(constants.ID2LABEL[row["labels"]] for row in scifact_sall),
)

({'NOT ENOUGH INFO': 2080, 'REFUTES': 495, 'SUPPORTS': 896},
 Counter({'NOT ENOUGH INFO': 832, 'REFUTES': 495, 'SUPPORTS': 896}))