In [2]:
%load_ext autoreload
%autoreload 2

# Init

In [3]:
import sys
sys.path.insert(0, "../src")

import json
import random
import numpy as np
import scipy
from pathlib import Path
from collections import Counter, defaultdict
from joblib import Parallel, delayed

import constants
import feverise.analysis as fa
from gen.util import read_data, write_jsonl

In [67]:
# Single root for every scratch artefact referenced below; edit it here when
# running the notebook on another machine instead of touching five literals.
SCRATCH_ROOT = Path("/users/k21190024/study/fact-check-transfer-learning/scratch")

# Dataset directories under <scratch>/data.
feverp = SCRATCH_ROOT / "data" / "fever"
climatefp = SCRATCH_ROOT / "data" / "climatefever"
scifactp = SCRATCH_ROOT / "data" / "scifact"

# "Feverised" dump directories under <scratch>/dumps.
climatefdp = SCRATCH_ROOT / "dumps" / "feverised-climatefever"
scifactdp = SCRATCH_ROOT / "dumps" / "feverised-scifact"

In [68]:
# FEVER: the train and shared-task dev files are combined into one collection
# (the `+` assumes read_data returns a list — TODO confirm in gen.util).
fever_claims = read_data(feverp / "train.jsonl") + read_data(feverp / "shared_task_dev.jsonl")
# SciFact and Climate-FEVER are read from the "feverised" dump directories
# (scifactdp / climatefdp), not from the data directories.
scifact_claims = read_data(scifactdp / "scifact_all.jsonl")
cfever_claims = read_data(climatefdp / "climatefever_paper_all.jsonl")

# Label proportions

In [69]:
# Tally the "label" field of every claim, one Counter per dataset.
fever_labels, cfever_labels, scifact_labels = (
    Counter(doc["label"] for doc in claims)
    for claims in (fever_claims, cfever_claims, scifact_claims)
)

In [70]:
def label_ratio(cnt):
    """Return each key's share of the total count, rounded to two decimals.

    Args:
        cnt: mapping of key -> count (for example a Counter of labels).

    Returns:
        dict with the same keys, in the same order, whose values are
        ``count / sum(all counts)`` rounded to 2 decimal places.
    """
    grand_total = sum(cnt.values())
    shares = {}
    for label, n in cnt.items():
        shares[label] = round(n / grand_total, 2)
    return shares

In [71]:
label_ratio(fever_labels), label_ratio(cfever_labels), label_ratio(scifact_labels)

({'SUPPORTS': 0.52, 'REFUTES': 0.22, 'NOT ENOUGH INFO': 0.26},
 {'SUPPORTS': 0.47, 'REFUTES': 0.18, 'NOT ENOUGH INFO': 0.34},
 {'NOT ENOUGH INFO': 0.38, 'REFUTES': 0.21, 'SUPPORTS': 0.41})

# Count Claim-Evidence

In [72]:
# Evidence counts per dataset from feverise.analysis.count_evidences.
# Each *_cnt is indexed by a constants.LOOKUP["label"][...] value in the cells
# below; each *_stats maps a label name to a 4-tuple.
# NOTE(review): judging by the displayed values the tuple looks like
# (median, mode, mean, max) — confirm against fa.count_evidences.
fever_cnt, fever_stats = fa.count_evidences(fever_claims)
scifact_cnt, scifact_stats = fa.count_evidences(scifact_claims)
cfever_cnt, cfever_stats = fa.count_evidences(cfever_claims)

## FEVER

In [73]:
# One histogram per label entry of fever_cnt, in the order "s", "r", "nei"
# of constants.LOOKUP["label"].
tuple(
    Counter(fever_cnt[constants.LOOKUP["label"][short]])
    for short in ("s", "r", "nei")
)

(Counter({1: 62251,
          2: 10520,
          4: 2473,
          16: 79,
          5: 2694,
          8: 589,
          13: 160,
          3: 3433,
          6: 1595,
          18: 70,
          7: 842,
          20: 51,
          9: 413,
          14: 128,
          27: 20,
          10: 334,
          138: 1,
          17: 70,
          21: 44,
          62: 2,
          15: 75,
          24: 33,
          11: 257,
          12: 215,
          44: 8,
          32: 11,
          34: 9,
          42: 7,
          50: 3,
          26: 30,
          43: 3,
          37: 6,
          25: 27,
          19: 55,
          22: 31,
          52: 4,
          29: 17,
          38: 7,
          28: 9,
          23: 20,
          36: 8,
          33: 11,
          31: 11,
          40: 4,
          59: 1,
          58: 1,
          51: 2,
          35: 6,
          45: 4,
          30: 16,
          140: 1,
          53: 3,
          48: 3,
          122: 1,
          39: 6,
          63: 1,


In [89]:
# Merge the histograms of the "s" and "r" label entries (presumably SUPPORTS and
# REFUTES — confirm in constants.LOOKUP); Counter `+` adds counts key by key.
total = Counter(fever_cnt[constants.LOOKUP["label"]["s"]]) + Counter(fever_cnt[constants.LOOKUP["label"]["r"]])
# NOTE(review): the displayed result maps 1..10 to a fraction; exact meaning is
# defined in fa.max_sentence_effect — confirm there.
fa.max_sentence_effect(total, 10)

{1: 0.2811550892465609,
 2: 0.15980737684949084,
 3: 0.11906579396144289,
 4: 0.09055399457536828,
 5: 0.06152246999399068,
 6: 0.04394926182780855,
 7: 0.034139448766464735,
 8: 0.027358659108996118,
 9: 0.022494356109207258,
 10: 0.018523330788845396}

In [43]:
fever_stats

{'SUPPORTS': (1, ModeResult(mode=1, count=62251), 2.003979192858214, 251),
 'REFUTES': (1, ModeResult(mode=1, count=26269), 1.9882549875140638, 78),
 'NOT ENOUGH INFO': (0, ModeResult(mode=0, count=42305), 0.0, 0)}

## Climate-FEVER

In [10]:
# climate-fever: one histogram per label entry of cfever_cnt, in the order
# "s", "r", "nei" of constants.LOOKUP["label"].
tuple(
    Counter(cfever_cnt[constants.LOOKUP["label"][short]])
    for short in ("s", "r", "nei")
)

(Counter({5: 654}), Counter({5: 253}), Counter({5: 474}))

In [45]:
cfever_stats

{'SUPPORTS': (5, ModeResult(mode=5, count=654), 5.0, 5),
 'REFUTES': (5, ModeResult(mode=5, count=253), 5.0, 5),
 'NOT ENOUGH INFO': (5, ModeResult(mode=5, count=474), 5.0, 5)}

## SciFact

In [9]:
# scifact: one histogram per label entry of scifact_cnt, in the order
# "s", "r", "nei" of constants.LOOKUP["label"].
tuple(
    Counter(scifact_cnt[constants.LOOKUP["label"][short]])
    for short in ("s", "r", "nei")
)

(Counter({1: 218, 2: 128, 4: 31, 5: 4, 3: 64, 9: 2, 11: 1, 8: 4, 6: 3, 7: 1}),
 Counter({1: 103, 3: 35, 5: 7, 2: 68, 4: 19, 7: 2, 8: 1, 6: 1, 12: 1}),
 Counter({0: 416}))

In [90]:
# Merge the SciFact histograms of the "s" and "r" label entries (presumably
# SUPPORTS and REFUTES — confirm in constants.LOOKUP); Counter `+` adds counts
# key by key. This rebinds the notebook-level name `total`.
total = Counter(scifact_cnt[constants.LOOKUP["label"]["s"]]) + Counter(scifact_cnt[constants.LOOKUP["label"]["r"]])
# NOTE(review): the displayed result maps 1..10 to a fraction; exact meaning is
# defined in fa.max_sentence_effect — confirm there.
fa.max_sentence_effect(total, 10)

{1: 0.5367965367965368,
 2: 0.25396825396825395,
 3: 0.1111111111111111,
 4: 0.03896103896103896,
 5: 0.023088023088023088,
 6: 0.017316017316017316,
 7: 0.012987012987012988,
 8: 0.005772005772005772,
 9: 0.002886002886002886,
 10: 0.002886002886002886}

In [44]:
scifact_stats

{'NOT ENOUGH INFO': (0, ModeResult(mode=0, count=416), 0.0, 0),
 'REFUTES': (1, ModeResult(mode=1, count=103), 2.088607594936709, 12),
 'SUPPORTS': (1, ModeResult(mode=1, count=218), 1.9649122807017543, 11)}

# Check claim-evidence tokenizer length

In [39]:
def count_claim_evidence_length(data):
    """Measure the tokenized length of every (evidence, claim) pair, per label.

    Relies on the notebook-level ``tokenizer`` being defined before the call.

    Args:
        data: iterable of records with "evidence", "claim" and "labels" keys;
            "labels" is looked up in ``constants.ID2LABEL``.

    Returns:
        A pair ``(lengths, histograms)`` where ``lengths`` is a defaultdict
        mapping label name -> list of lengths (one per record) and
        ``histograms`` maps label name -> Counter of those lengths.
    """
    lengths_by_label = defaultdict(list)
    for record in data:
        encoded = tokenizer(record["evidence"], record["claim"], return_length=True)
        # 3 is subtracted from the reported length — presumably the [CLS] and
        # two [SEP] tokens of a BERT sentence-pair encoding; TODO confirm.
        n_tokens = encoded["length"][0] - 3
        label_name = constants.ID2LABEL[record["labels"]]
        lengths_by_label[label_name].append(n_tokens)

    histograms = {}
    for label_name, lengths in lengths_by_label.items():
        histograms[label_name] = Counter(lengths)
    return lengths_by_label, histograms

def pct_over_max_seq_length(cnt, max_length):
    """Summarise how many examples are longer than ``max_length``, per label and overall.

    Despite the name, the ratios are fractions in [0, 1], not percentages.

    Args:
        cnt: mapping of label (str) -> histogram mapping length -> number of
            examples with that length (e.g. a Counter).
        max_length: threshold; only lengths strictly greater than it count as
            "over".

    Returns:
        dict containing, for every label, ``label`` -> fraction over the limit
        and ``label + "_count"`` -> number over the limit, followed by
        ``"overall"`` and ``"overall_count"`` aggregated across labels.
        A label with an empty histogram, or an empty ``cnt``, yields a ratio of
        0.0 instead of raising ZeroDivisionError.
    """
    summary = {}
    over_all = 0
    total_all = 0
    for label, histogram in cnt.items():
        total = sum(histogram.values())
        over = sum(n for length, n in histogram.items() if length > max_length)
        # Guard the division: an empty histogram has nothing over the limit.
        summary[label] = over / total if total else 0.0
        summary[label + "_count"] = over
        over_all += over
        total_all += total

    # Same guard for the aggregate, so an empty `cnt` is summarised rather than crashing.
    summary["overall"] = over_all / total_all if total_all else 0.0
    summary["overall_count"] = over_all

    return summary

## Document level evidence

In [40]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at
# the top, and is repeated in the sentence-level section.
from transformers import AutoTokenizer

# Claim/evidence records for the document-level evidence setting
# (directory "bert-data-doc-evidence").
bert_data_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-doc-evidence")
fever_all = read_data(bert_data_p / "fever.all.jsonl")
cfever_all = read_data(bert_data_p / "climatefever.all.jsonl")
scifact_all = read_data(bert_data_p / "scifact.all.test.n5.jsonl")

In [41]:
# Notebook-level tokenizer; count_claim_evidence_length reads this name as a global.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [42]:
# Token lengths for the document-level data: *_raw holds the per-label length
# lists, *_cnt the per-label histograms.
# No truncation argument is passed to the tokenizer, so over-long pairs are
# measured at full length; that is what triggers the "longer than the specified
# maximum sequence length" warning printed below.
fever_all_raw, fever_all_cnt = count_claim_evidence_length(fever_all)
cfever_all_raw, cfever_all_cnt = count_claim_evidence_length(cfever_all)
scifact_all_raw, scifact_all_cnt = count_claim_evidence_length(scifact_all)

Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


In [43]:
# FEVER: mean token length per label, then the over-limit summary at 128, 256
# and 512 tokens.
(
    {label: sum(lengths) / len(lengths) for label, lengths in fever_all_raw.items()},
    *(pct_over_max_seq_length(fever_all_cnt, limit) for limit in (128, 256, 512)),
)

({'NOT ENOUGH INFO': 65.41730291927668,
  'SUPPORTS': 63.87875572369408,
  'REFUTES': 65.19085645289647},
 {'NOT ENOUGH INFO': 0.011818933932159319,
  'NOT ENOUGH INFO_count': 500,
  'SUPPORTS': 0.06672356720222374,
  'SUPPORTS_count': 5785,
  'REFUTES': 0.0759035152712604,
  'REFUTES_count': 2766,
  'overall': 0.054706341003463346,
  'overall_count': 9051},
 {'NOT ENOUGH INFO': 4.727573572863728e-05,
  'NOT ENOUGH INFO_count': 2,
  'SUPPORTS': 0.019803693152328116,
  'SUPPORTS_count': 1717,
  'REFUTES': 0.024505364836310748,
  'REFUTES_count': 893,
  'overall': 0.015787533167721385,
  'overall_count': 2612},
 {'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'SUPPORTS': 0.002272176791501828,
  'SUPPORTS_count': 197,
  'REFUTES': 0.0031283444471886064,
  'REFUTES_count': 114,
  'overall': 0.001879756054809093,
  'overall_count': 311})

In [44]:
# Climate-FEVER: mean token length per label, then the over-limit summary at
# 128, 256 and 512 tokens.
(
    {label: sum(lengths) / len(lengths) for label, lengths in cfever_all_raw.items()},
    *(pct_over_max_seq_length(cfever_all_cnt, limit) for limit in (128, 256, 512)),
)

({'REFUTES': 101.18181818181819,
  'SUPPORTS': 115.69113149847095,
  'NOT ENOUGH INFO': 204.97679324894514},
 {'REFUTES': 0.2648221343873518,
  'REFUTES_count': 67,
  'SUPPORTS': 0.38073394495412843,
  'SUPPORTS_count': 249,
  'NOT ENOUGH INFO': 0.9451476793248945,
  'NOT ENOUGH INFO_count': 448,
  'overall': 0.5532223026792179,
  'overall_count': 764},
 {'REFUTES': 0.003952569169960474,
  'REFUTES_count': 1,
  'SUPPORTS': 0.016819571865443424,
  'SUPPORTS_count': 11,
  'NOT ENOUGH INFO': 0.14135021097046413,
  'NOT ENOUGH INFO_count': 67,
  'overall': 0.05720492396813903,
  'overall_count': 79},
 {'REFUTES': 0.0,
  'REFUTES_count': 0,
  'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.004219409282700422,
  'NOT ENOUGH INFO_count': 2,
  'overall': 0.001448225923244026,
  'overall_count': 2})

In [45]:
# SciFact: mean token length per label, then the over-limit summary at 128, 256
# and 512 tokens.
(
    {label: sum(lengths) / len(lengths) for label, lengths in scifact_all_raw.items()},
    *(pct_over_max_seq_length(scifact_all_cnt, limit) for limit in (128, 256, 512)),
)

({'NOT ENOUGH INFO': 97.0360576923077,
  'REFUTES': 129.75527426160338,
  'SUPPORTS': 116.41885964912281},
 {'NOT ENOUGH INFO': 0.12259615384615384,
  'NOT ENOUGH INFO_count': 51,
  'REFUTES': 0.37130801687763715,
  'REFUTES_count': 88,
  'SUPPORTS': 0.3048245614035088,
  'SUPPORTS_count': 139,
  'overall': 0.25067628494138866,
  'overall_count': 278},
 {'NOT ENOUGH INFO': 0.004807692307692308,
  'NOT ENOUGH INFO_count': 2,
  'REFUTES': 0.08860759493670886,
  'REFUTES_count': 21,
  'SUPPORTS': 0.05482456140350877,
  'SUPPORTS_count': 25,
  'overall': 0.04328223624887286,
  'overall_count': 48},
 {'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.004219409282700422,
  'REFUTES_count': 1,
  'SUPPORTS': 0.0043859649122807015,
  'SUPPORTS_count': 2,
  'overall': 0.002705139765554554,
  'overall_count': 3})

## Sentence level evidence

In [46]:
# NOTE(review): repeated import — AutoTokenizer is already imported in the
# document-level section above.
from transformers import AutoTokenizer

# Claim/evidence records for the sentence-level evidence setting
# (directory "bert-data-sent-evidence"). This rebinds bert_data_p, fever_all,
# cfever_all and scifact_all, replacing the document-level data loaded earlier.
bert_data_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-sent-evidence")
fever_all = read_data(bert_data_p / "fever.all.jsonl")
cfever_all = read_data(bert_data_p / "climatefever.all.jsonl")
scifact_all = read_data(bert_data_p / "scifact.all.test.n5.jsonl")

In [47]:
# Re-loads the same "bert-base-uncased" tokenizer that the document-level
# section already bound to `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [48]:
# Token lengths for the sentence-level data. These assignments overwrite the
# *_raw / *_cnt results computed for the document-level data earlier.
fever_all_raw, fever_all_cnt = count_claim_evidence_length(fever_all)
cfever_all_raw, cfever_all_cnt = count_claim_evidence_length(cfever_all)
scifact_all_raw, scifact_all_cnt = count_claim_evidence_length(scifact_all)

In [49]:
# FEVER: mean token length per label, then the over-limit summary at 128, 256
# and 512 tokens.
(
    {label: sum(lengths) / len(lengths) for label, lengths in fever_all_raw.items()},
    *(pct_over_max_seq_length(fever_all_cnt, limit) for limit in (128, 256, 512)),
)

({'NOT ENOUGH INFO': 38.00204467557026,
  'SUPPORTS': 45.85537872785205,
  'REFUTES': 44.549349364584124},
 {'NOT ENOUGH INFO': 0.0006736792341330812,
  'NOT ENOUGH INFO_count': 57,
  'SUPPORTS': 0.003523441846350962,
  'SUPPORTS_count': 418,
  'REFUTES': 0.0021117114374857317,
  'REFUTES_count': 111,
  'overall': 0.0022907805854390792,
  'overall_count': 586},
 {'NOT ENOUGH INFO': 1.181893393215932e-05,
  'NOT ENOUGH INFO_count': 1,
  'SUPPORTS': 8.429286713758282e-06,
  'SUPPORTS_count': 1,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 7.818363772829622e-06,
  'overall_count': 2},
 {'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'overall': 0.0,
  'overall_count': 0})

In [50]:
# Climate-FEVER: mean token length per label, then the over-limit summary at
# 128, 256 and 512 tokens.
(
    {label: sum(lengths) / len(lengths) for label, lengths in cfever_all_raw.items()},
    *(pct_over_max_seq_length(cfever_all_cnt, limit) for limit in (128, 256, 512)),
)

({'REFUTES': 55.758620689655174,
  'SUPPORTS': 60.608799048751486,
  'NOT ENOUGH INFO': 61.55907172995781},
 {'REFUTES': 0.005172413793103448,
  'REFUTES_count': 3,
  'SUPPORTS': 0.0089179548156956,
  'SUPPORTS_count': 15,
  'NOT ENOUGH INFO': 0.018565400843881856,
  'NOT ENOUGH INFO_count': 44,
  'overall': 0.013385146804835924,
  'overall_count': 62},
 {'REFUTES': 0.0,
  'REFUTES_count': 0,
  'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.002953586497890295,
  'NOT ENOUGH INFO_count': 7,
  'overall': 0.0015112262521588947,
  'overall_count': 7},
 {'REFUTES': 0.0,
  'REFUTES_count': 0,
  'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'overall': 0.0,
  'overall_count': 0})

In [51]:
# SciFact: mean token length per label, then the over-limit summary at 128, 256
# and 512 tokens.
(
    {label: sum(lengths) / len(lengths) for label, lengths in scifact_all_raw.items()},
    *(pct_over_max_seq_length(scifact_all_cnt, limit) for limit in (128, 256, 512)),
)

({'NOT ENOUGH INFO': 59.03725961538461,
  'REFUTES': 72.32727272727273,
  'SUPPORTS': 68.83816964285714},
 {'NOT ENOUGH INFO': 0.01201923076923077,
  'NOT ENOUGH INFO_count': 10,
  'REFUTES': 0.06868686868686869,
  'REFUTES_count': 34,
  'SUPPORTS': 0.04799107142857143,
  'SUPPORTS_count': 43,
  'overall': 0.03913630229419703,
  'overall_count': 87},
 {'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'overall': 0.0,
  'overall_count': 0},
 {'NOT ENOUGH INFO': 0.0,
  'NOT ENOUGH INFO_count': 0,
  'REFUTES': 0.0,
  'REFUTES_count': 0,
  'SUPPORTS': 0.0,
  'SUPPORTS_count': 0,
  'overall': 0.0,
  'overall_count': 0})

### Label proportions

In [11]:
# Tally the "labels" field of the BERT-ready records, one Counter per dataset.
# This rebinds fever_labels / cfever_labels / scifact_labels, and uses whatever
# fever_all / cfever_all / scifact_all currently hold (the most recent load
# above is the sentence-level dump).
fever_labels, cfever_labels, scifact_labels = (
    Counter(doc["labels"] for doc in records)
    for records in (fever_all, cfever_all, scifact_all)
)

In [12]:
def label_ratio(cnt):
    """Return each label's share of the total count, keyed by label name.

    NOTE: this redefinition replaces the earlier ``label_ratio`` in this
    notebook. Unlike that version it translates every key through
    ``constants.ID2LABEL``, so the keys of ``cnt`` must be valid lookups there.

    Args:
        cnt: mapping of label id -> count.

    Returns:
        dict mapping ``constants.ID2LABEL[label id]`` to
        ``count / sum(all counts)`` rounded to 2 decimal places.
    """
    grand_total = sum(cnt.values())
    named_shares = {}
    for label_id, n in cnt.items():
        named_shares[constants.ID2LABEL[label_id]] = round(n / grand_total, 2)
    return named_shares

In [13]:
label_ratio(fever_labels), label_ratio(cfever_labels), label_ratio(scifact_labels)

({'NOT ENOUGH INFO': 0.33, 'SUPPORTS': 0.46, 'REFUTES': 0.21},
 {'REFUTES': 0.13, 'SUPPORTS': 0.36, 'NOT ENOUGH INFO': 0.51},
 {'NOT ENOUGH INFO': 0.37, 'REFUTES': 0.22, 'SUPPORTS': 0.4})

In [71]:
# NOTE(review): stale cell. Its execution count (71) and saved output are
# identical to the raw-claims label-proportion cell near the top of the
# notebook, so the output does not reflect the label_ratio / *_labels
# definitions directly above. Re-run it or remove it.
label_ratio(fever_labels), label_ratio(cfever_labels), label_ratio(scifact_labels)

({'SUPPORTS': 0.52, 'REFUTES': 0.22, 'NOT ENOUGH INFO': 0.26},
 {'SUPPORTS': 0.47, 'REFUTES': 0.18, 'NOT ENOUGH INFO': 0.34},
 {'NOT ENOUGH INFO': 0.38, 'REFUTES': 0.21, 'SUPPORTS': 0.41})