In [1]:
%load_ext autoreload
%autoreload 2

# Init

In [2]:
import sys
sys.path.insert(0, "../src")

import json
import pickle as pkl
import random
import numpy as np
import pandas as pd
import scipy
import spacy
from pathlib import Path
from collections import Counter, defaultdict
from joblib import Parallel, delayed

import constants
import feverise.analysis as fa
from gen.util import read_data, write_jsonl

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Single root for every dataset / dump location, so the notebook can be pointed
# at another checkout or machine by editing one line.
scratch_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch")

# Dataset directories.
feverp = scratch_p / "data" / "fever"
climatefp = scratch_p / "data" / "climatefever"
scifactp = scratch_p / "data" / "scifact"

# "Feverised" dumps of Climate-FEVER and SciFact.
climatefdp = scratch_p / "dumps" / "feverised-climatefever"
scifactdp = scratch_p / "dumps" / "feverised-scifact"

# Target directory of the (currently commented-out) parquet caches below.
stat_p = scratch_p / "thesis" / "stats"

In [4]:
# FEVER claims: the training split followed by the shared-task dev split.
fever_claims = read_data(feverp / "train.jsonl") + read_data(feverp / "shared_task_dev.jsonl")
# SciFact and Climate-FEVER are read from the "feverised" dump directories,
# not from the dataset directories (scifactp / climatefp).
scifact_claims = read_data(scifactdp / "scifact_all.jsonl")
cfever_claims = read_data(climatefdp / "climatefever_paper_all.jsonl")

# Label proportions

In [69]:
# Tally how many claims carry each label, one Counter per dataset.
fever_labels = Counter(claim["label"] for claim in fever_claims)
cfever_labels = Counter(claim["label"] for claim in cfever_claims)
scifact_labels = Counter(claim["label"] for claim in scifact_claims)

In [70]:
def label_ratio(cnt, ndigits=2):
    """Convert absolute label counts into rounded proportions.

    Args:
        cnt: Mapping from label to count (e.g. a ``collections.Counter``).
        ndigits: Number of decimal places kept for each proportion.
            Defaults to 2, the precision that used to be hard-coded.

    Returns:
        dict mapping every label to ``count / total`` rounded to ``ndigits``.
        Because each value is rounded independently, the proportions need not
        add up to exactly 1. An empty mapping yields an empty dict.

    Raises:
        ZeroDivisionError: If ``cnt`` is non-empty but its counts sum to zero.
    """
    total = sum(cnt.values())
    return {label: round(count / total, ndigits) for label, count in cnt.items()}

In [71]:
# Label proportions in the order FEVER, Climate-FEVER, SciFact.
tuple(label_ratio(counts) for counts in (fever_labels, cfever_labels, scifact_labels))

({'SUPPORTS': 0.52, 'REFUTES': 0.22, 'NOT ENOUGH INFO': 0.26},
 {'SUPPORTS': 0.47, 'REFUTES': 0.18, 'NOT ENOUGH INFO': 0.34},
 {'NOT ENOUGH INFO': 0.38, 'REFUTES': 0.21, 'SUPPORTS': 0.41})

# Count evidence per claim

In [72]:
# fa.count_evidences returns a pair per dataset: `*_cnt` is indexed by label
# name in the cells below, `*_stats` holds one summary tuple per label.
fever_cnt, fever_stats = fa.count_evidences(fever_claims)
scifact_cnt, scifact_stats = fa.count_evidences(scifact_claims)
cfever_cnt, cfever_stats = fa.count_evidences(cfever_claims)

## FEVER

In [73]:
# One Counter per label, in the order supports, refutes, not-enough-info.
tuple(Counter(fever_cnt[constants.LOOKUP["label"][key]]) for key in ("s", "r", "nei"))

(Counter({1: 62251,
          2: 10520,
          4: 2473,
          16: 79,
          5: 2694,
          8: 589,
          13: 160,
          3: 3433,
          6: 1595,
          18: 70,
          7: 842,
          20: 51,
          9: 413,
          14: 128,
          27: 20,
          10: 334,
          138: 1,
          17: 70,
          21: 44,
          62: 2,
          15: 75,
          24: 33,
          11: 257,
          12: 215,
          44: 8,
          32: 11,
          34: 9,
          42: 7,
          50: 3,
          26: 30,
          43: 3,
          37: 6,
          25: 27,
          19: 55,
          22: 31,
          52: 4,
          29: 17,
          38: 7,
          28: 9,
          23: 20,
          36: 8,
          33: 11,
          31: 11,
          40: 4,
          59: 1,
          58: 1,
          51: 2,
          35: 6,
          45: 4,
          30: 16,
          140: 1,
          53: 3,
          48: 3,
          122: 1,
          39: 6,
          63: 1,


In [89]:
# Pool the SUPPORTS and REFUTES tallies (NEI is left out) into one Counter.
total = Counter(fever_cnt[constants.LOOKUP["label"]["s"]])
total.update(fever_cnt[constants.LOOKUP["label"]["r"]])
fa.max_sentence_effect(total, 10)

{1: 0.2811550892465609,
 2: 0.15980737684949084,
 3: 0.11906579396144289,
 4: 0.09055399457536828,
 5: 0.06152246999399068,
 6: 0.04394926182780855,
 7: 0.034139448766464735,
 8: 0.027358659108996118,
 9: 0.022494356109207258,
 10: 0.018523330788845396}

In [43]:
# Per-label summary tuples returned by fa.count_evidences for FEVER.
# NOTE(review): judging by the output they look like (median, mode, mean, max)
# — confirm against feverise.analysis.
fever_stats

{'SUPPORTS': (1, ModeResult(mode=1, count=62251), 2.003979192858214, 251),
 'REFUTES': (1, ModeResult(mode=1, count=26269), 1.9882549875140638, 78),
 'NOT ENOUGH INFO': (0, ModeResult(mode=0, count=42305), 0.0, 0)}

## Climate-FEVER

In [10]:
# climate-fever: one Counter per label, in the order supports, refutes, not-enough-info
tuple(Counter(cfever_cnt[constants.LOOKUP["label"][key]]) for key in ("s", "r", "nei"))

(Counter({5: 654}), Counter({5: 253}), Counter({5: 474}))

In [45]:
# Per-label summary tuples returned by fa.count_evidences for Climate-FEVER.
# NOTE(review): judging by the output they look like (median, mode, mean, max)
# — confirm against feverise.analysis.
cfever_stats

{'SUPPORTS': (5, ModeResult(mode=5, count=654), 5.0, 5),
 'REFUTES': (5, ModeResult(mode=5, count=253), 5.0, 5),
 'NOT ENOUGH INFO': (5, ModeResult(mode=5, count=474), 5.0, 5)}

## SciFact

In [9]:
# scifact: one Counter per label, in the order supports, refutes, not-enough-info
tuple(Counter(scifact_cnt[constants.LOOKUP["label"][key]]) for key in ("s", "r", "nei"))

(Counter({1: 218, 2: 128, 4: 31, 5: 4, 3: 64, 9: 2, 11: 1, 8: 4, 6: 3, 7: 1}),
 Counter({1: 103, 3: 35, 5: 7, 2: 68, 4: 19, 7: 2, 8: 1, 6: 1, 12: 1}),
 Counter({0: 416}))

In [90]:
# Pool the SUPPORTS and REFUTES tallies (NEI is left out) into one Counter.
total = Counter(scifact_cnt[constants.LOOKUP["label"]["s"]])
total.update(scifact_cnt[constants.LOOKUP["label"]["r"]])
fa.max_sentence_effect(total, 10)

{1: 0.5367965367965368,
 2: 0.25396825396825395,
 3: 0.1111111111111111,
 4: 0.03896103896103896,
 5: 0.023088023088023088,
 6: 0.017316017316017316,
 7: 0.012987012987012988,
 8: 0.005772005772005772,
 9: 0.002886002886002886,
 10: 0.002886002886002886}

In [44]:
# Per-label summary tuples returned by fa.count_evidences for SciFact.
# NOTE(review): judging by the output they look like (median, mode, mean, max)
# — confirm against feverise.analysis.
scifact_stats

{'NOT ENOUGH INFO': (0, ModeResult(mode=0, count=416), 0.0, 0),
 'REFUTES': (1, ModeResult(mode=1, count=103), 2.088607594936709, 12),
 'SUPPORTS': (1, ModeResult(mode=1, count=218), 1.9649122807017543, 11)}

# Check claim and evidence token lengths

In [4]:
def multi_tok_lem(data, n_process=20, batch_size=1000):
    """Measure the spaCy token count of every claim and evidence text in ``data``.

    Args:
        data: Sequence of records, each providing ``"claim"`` and ``"evidence"``
            texts plus a ``"labels"`` id that ``constants.ID2LABEL`` can resolve.
        n_process: Number of worker processes passed to ``nlp.pipe``. Defaults
            to 20, the value that used to be hard-coded.
        batch_size: Number of texts per batch passed to ``nlp.pipe``. Defaults
            to 1000, the value that used to be hard-coded.

    Returns:
        dict of equal-length lists keyed ``claims_token``, ``claims_lemma``,
        ``evidence_token``, ``evidence_lemma`` and ``label``; one entry per
        input record, in input order.
    """
    nlp = spacy.load("en_core_web_sm", exclude=["parser", "ner"])

    def doc_lengths(texts):
        """Return the number of tokens spaCy produces for each text, in order."""
        return [
            len(doc)
            for doc in nlp.pipe(texts, n_process=n_process, batch_size=batch_size)
        ]

    claims = [doc["claim"] for doc in data]
    evidence = [doc["evidence"] for doc in data]
    # Resolved before the expensive pipeline runs so a bad label id fails fast.
    labels = [constants.ID2LABEL[doc["labels"]] for doc in data]

    cl_tok_len = doc_lengths(claims)
    ev_tok_len = doc_lengths(evidence)

    # NOTE(review): the "lemma" lengths were computed as
    # len([t.lemma_ for t in doc]), i.e. one lemma per token, so they always
    # equal the token lengths. The keys are kept for compatibility; if a
    # different measure was intended (e.g. distinct lemmas), it needs its own
    # computation.
    return {
        "claims_token": cl_tok_len,
        "claims_lemma": list(cl_tok_len),
        "evidence_token": ev_tok_len,
        "evidence_lemma": list(ev_tok_len),
        "label": labels
    }

## Document level evidence

In [22]:
# Document-level evidence inputs. `bert_data_p` is rebound in the sentence-level
# section, so later cells see whichever assignment ran last.
bert_data_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-doc-evidence")
# fever_all = read_data(bert_data_p / "fever.all.jsonl")
# cfeverpure_all = read_data(bert_data_p / "climatefeverpure.all.jsonl")
cfever_all = read_data(bert_data_p / "climatefever.all.jsonl")

# NOTE: despite the `_all` suffix, these two are read from the `train.n5` files
# (the `.all` variants are the commented-out lines above).
fever_all = read_data(bert_data_p / "fever.train.n5.jsonl")
cfeverpure_all = read_data(bert_data_p / "climatefeverpure.train.n5.jsonl")

scifact_all = read_data(bert_data_p / "scifact.all.test.n5.jsonl")

In [23]:
# Token-length measurement for each document-level dataset. Every call loads the
# spaCy model and runs nlp.pipe over all claims and all evidence texts.
doc_fever_all_toklem = multi_tok_lem(fever_all)
doc_cfeverpure_all_toklem = multi_tok_lem(cfeverpure_all)
doc_cfever_all_toklem = multi_tok_lem(cfever_all)
doc_scifact_all_toklem = multi_tok_lem(scifact_all)

In [29]:
def _tag_dataset(toklem, name):
    """Turn one multi_tok_lem result into a DataFrame with a `data` column set to `name`."""
    return pd.DataFrame(toklem).assign(data=name)


doc_fever_all_toklem = _tag_dataset(doc_fever_all_toklem, "fever")
doc_cfeverpure_all_toklem = _tag_dataset(doc_cfeverpure_all_toklem, "climatefeverpure")
doc_cfever_all_toklem = _tag_dataset(doc_cfever_all_toklem, "climatefever")
doc_scifact_all_toklem = _tag_dataset(doc_scifact_all_toklem, "scifact")

# ignore_index=True: every per-dataset frame is indexed from 0, so a plain concat
# would leave duplicate index labels in the combined frame, which makes any
# label-based selection or alignment on it ambiguous.
doc_stats = pd.concat(
    [doc_fever_all_toklem, doc_cfeverpure_all_toklem, doc_cfever_all_toklem, doc_scifact_all_toklem],
    ignore_index=True,
)
# doc_stats.to_parquet(stat_p / "doc_stats.parquet")

In [30]:
# doc_stats = pd.read_parquet(stat_p / "doc_stats.parquet")
# Claim + evidence token count per row, then flags for rows reaching 512 / 1024
# tokens. assign() evaluates its keyword arguments in order, so the flags can
# read the freshly added `concat_tokens` column.
doc_stats = doc_stats.assign(
    concat_tokens=lambda frame: frame["claims_token"] + frame["evidence_token"],
    ge_512=lambda frame: frame["concat_tokens"] >= 512,
    ge_1024=lambda frame: frame["concat_tokens"] >= 1024,
)

In [31]:
# Mean and maximum evidence token count per dataset (rows) and label (columns).
doc_stats.pivot_table(values=["evidence_token"], index="data", columns="label", aggfunc=["mean", "max"])

Unnamed: 0_level_0,mean,mean,mean,max,max,max
Unnamed: 0_level_1,evidence_token,evidence_token,evidence_token,evidence_token,evidence_token,evidence_token
label,NOT ENOUGH INFO,REFUTES,SUPPORTS,NOT ENOUGH INFO,REFUTES,SUPPORTS
data,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
climatefever,163.632911,157.889328,159.166667,755,336,639
climatefeverpure,160.759036,72.751412,83.540481,416,216,296
fever,119.814305,72.853199,77.49081,429,4625,13903
scifact,144.841346,82.075949,71.188596,343,423,443


In [32]:
# Summing the boolean flags gives, per dataset, the number of rows whose
# claim + evidence token count reaches 512 / 1024.
doc_stats.groupby("data").agg({"ge_512": "sum", "ge_1024": "sum"})

Unnamed: 0_level_0,ge_512,ge_1024
data,Unnamed: 1_level_1,Unnamed: 2_level_1
climatefever,2,0
climatefeverpure,0,0
fever,1522,289
scifact,0,0


In [33]:
# "count" tallies non-null entries, so both columns show the total number of rows
# per dataset (the denominator for the flag sums).
doc_stats.groupby("data").agg({"ge_512": "count", "ge_1024": "count"})

Unnamed: 0_level_0,ge_512,ge_1024
data,Unnamed: 1_level_1,Unnamed: 2_level_1
climatefever,1381,1381
climatefeverpure,966,966
fever,145449,145449
scifact,1109,1109


## Sentence level evidence

In [35]:
# Sentence-level evidence inputs. This rebinds `bert_data_p`, which pointed at
# the document-level directory in the previous section.
bert_data_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-sent-evidence")
# sent_fever_all = read_data(bert_data_p / "fever.all.nosentlim.jsonl")
# NOTE: despite the `_all` suffix, FEVER is read from the `train.n5` file.
sent_fever_all = read_data(bert_data_p / "fever.train.n5.jsonl")


sent_cfeverpure_all = read_data(bert_data_p / "climatefeverpure.all.jsonl")
sent_cfever_all = read_data(bert_data_p / "climatefever.all.jsonl")
sent_scifact_all = read_data(bert_data_p / "scifact.all.test.jsonl")

In [36]:
# Token-length measurement for each sentence-level dataset. Every call loads the
# spaCy model and runs nlp.pipe over all claims and all evidence texts.
sent_fever_all_toklem = multi_tok_lem(sent_fever_all)
sent_cfeverpure_all_toklem = multi_tok_lem(sent_cfeverpure_all)
sent_cfever_all_toklem = multi_tok_lem(sent_cfever_all)
sent_scifact_all_toklem = multi_tok_lem(sent_scifact_all)

In [37]:
def _tag_dataset(toklem, name):
    """Turn one multi_tok_lem result into a DataFrame with a `data` column set to `name`."""
    return pd.DataFrame(toklem).assign(data=name)


sent_fever_all_toklem = _tag_dataset(sent_fever_all_toklem, "fever")
sent_cfeverpure_all_toklem = _tag_dataset(sent_cfeverpure_all_toklem, "climatefeverpure")
sent_scifact_all_toklem = _tag_dataset(sent_scifact_all_toklem, "scifact")
sent_cfever_all_toklem = _tag_dataset(sent_cfever_all_toklem, "climatefever")

# ignore_index=True: every per-dataset frame is indexed from 0, so a plain concat
# would leave duplicate index labels in the combined frame, which makes any
# label-based selection or alignment on it ambiguous.
sent_stats = pd.concat(
    [sent_fever_all_toklem, sent_cfeverpure_all_toklem, sent_cfever_all_toklem, sent_scifact_all_toklem],
    ignore_index=True,
)
# sent_stats.to_parquet(stat_p / "sent_stats.parquet")

In [38]:
# sent_stats = pd.read_parquet(stat_p / "sent_stats.parquet")
# Claim + evidence token count per row, then flags for rows reaching 512 / 1024
# tokens. assign() evaluates its keyword arguments in order, so the flags can
# read the freshly added `concat_tokens` column.
sent_stats = sent_stats.assign(
    concat_tokens=lambda frame: frame["claims_token"] + frame["evidence_token"],
    ge_512=lambda frame: frame["concat_tokens"] >= 512,
    ge_1024=lambda frame: frame["concat_tokens"] >= 1024,
)

In [39]:
# Mean and maximum evidence token count per dataset (rows) and label (columns).
sent_stats.pivot_table(values=["evidence_token"], index="data", columns="label", aggfunc=["mean", "max"])

Unnamed: 0_level_0,mean,mean,mean,max,max,max
Unnamed: 0_level_1,evidence_token,evidence_token,evidence_token,evidence_token,evidence_token,evidence_token
label,NOT ENOUGH INFO,REFUTES,SUPPORTS,NOT ENOUGH INFO,REFUTES,SUPPORTS
data,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
climatefever,32.037907,31.815517,32.34126,389,101,135
climatefeverpure,32.726582,31.815517,32.34126,301,101,135
fever,24.816142,36.161742,38.608686,263,1102,1333
scifact,34.169471,42.012959,39.016827,228,179,166


In [40]:
# Summing the boolean flags gives, per dataset, the number of rows whose
# claim + evidence token count reaches 512 / 1024.
sent_stats.groupby("data").agg({"ge_512": "sum", "ge_1024": "sum"})

Unnamed: 0_level_0,ge_512,ge_1024
data,Unnamed: 1_level_1,Unnamed: 2_level_1
climatefever,0,0
climatefeverpure,0,0
fever,17,3
scifact,0,0


In [41]:
# "count" tallies non-null entries, so both columns show the total number of rows
# per dataset (the denominator for the flag sums).
sent_stats.groupby("data").agg({"ge_512": "count", "ge_1024": "count"})

Unnamed: 0_level_0,ge_512,ge_1024
data,Unnamed: 1_level_1,Unnamed: 2_level_1
climatefever,6905,6905
climatefeverpure,4632,4632
fever,253140,253140
scifact,2127,2127


### Evidence per claim

In [102]:
def _evidence_rows(name, docs, keep_nei):
    """Build one row per record in `docs`, each counting as a single piece of evidence.

    Records labelled not-enough-info are dropped unless `keep_nei` is true.
    """
    nei_id = constants.LABEL2ID[constants.LOOKUP["label"]["nei"]]
    rows = [
        {"data": name, "id": doc["claim_id"], "label": constants.ID2LABEL[doc["labels"]], "evidence": 1}
        for doc in docs
        if keep_nei or doc["labels"] != nei_id
    ]
    return pd.DataFrame(rows)


# Climate-FEVER keeps its not-enough-info rows; FEVER and SciFact drop them.
sent_fever_all_ec = _evidence_rows("fever", sent_fever_all, keep_nei=False)
sent_cfever_all_ec = _evidence_rows("climatefever", sent_cfever_all, keep_nei=True)
sent_scifact_all_ec = _evidence_rows("scifact", sent_scifact_all, keep_nei=False)

sent_all_ec = pd.concat([sent_fever_all_ec, sent_cfever_all_ec, sent_scifact_all_ec])

In [103]:
# Add up the per-row evidence flags for each (dataset, claim id), then report the
# mean and maximum number of evidence rows per claim for each dataset.
sent_all_ec.groupby(["data", "id"], as_index=False)["evidence"].sum().groupby("data").agg({"evidence": ["mean", "max"]})

Unnamed: 0_level_0,evidence,evidence
Unnamed: 0_level_1,mean,max
data,Unnamed: 1_level_2,Unnamed: 2_level_2
climatefever,5.0,5
fever,1.999326,251
scifact,1.868687,11


In [104]:
# Same summary split by label. Rows are grouped by (dataset, claim id, label), so
# a claim whose rows carry different labels contributes one group per label.
# FEVER and SciFact have no NEI rows here because they were filtered out when
# the frames were built.
sent_all_ec.groupby(["data", "id", "label"], as_index=False)["evidence"].sum().pivot_table(values="evidence", index="data", columns="label", aggfunc=["mean", "max"])

Unnamed: 0_level_0,mean,mean,mean,max,max,max
label,NOT ENOUGH INFO,REFUTES,SUPPORTS,NOT ENOUGH INFO,REFUTES,SUPPORTS
data,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
climatefever,3.602017,2.29249,2.571865,5.0,5.0,5.0
fever,,1.988255,2.003979,,78.0,251.0
scifact,,1.953586,1.824561,,11.0,11.0


### Label proportions: claim level vs. sentence level

In [48]:
# Claim-level inputs.
fever_train = read_data(feverp / "train.jsonl")
# NOTE: rebinds `scifact_all`, which the document-level section bound to the
# bert-data SciFact file.
scifact_all = read_data(scifactdp / "scifact_all.jsonl")
cfeverpure_train = read_data(climatefdp / "finetune" / "baseline" / "climatefever_train.jsonl")
cfever_train = read_data(climatefdp.parent / "feverised-climatefever_sent" / "finetune" / "baseline" / "climatefever_train.jsonl")

# Inputs read through `bert_data_p`, i.e. from whichever directory it was last
# bound to (the sentence-level one when the cells run top to bottom).
fever_strain = read_data(bert_data_p / "fever.train.n5.jsonl")
cfeverpure_strain = read_data(bert_data_p / "climatefeverpure.train.n5.jsonl")
cfever_strain = read_data(bert_data_p / "climatefever.train.n5.jsonl")
scifact_sall = read_data(bert_data_p / "scifact.all.test.n5.jsonl")

In [49]:
def convert_to_sentence(data, nei_val: int):
    """Collect, per label, one number for every evidence group of every record.

    For records labelled not-enough-info the number is the fixed ``nei_val``;
    for all other records it is the length of the evidence group.

    Args:
        data: Iterable of records with a ``"label"`` and an ``"evidence"``
            entry, the latter being an iterable of sized evidence groups.
        nei_val: Value recorded for each evidence group of a NEI record.

    Returns:
        defaultdict(list) mapping label -> list of numbers, in input order.
        Labels that never occur with at least one evidence group are absent.
    """
    nei_label = constants.LOOKUP["label"]["nei"]
    grouped = defaultdict(list)
    sized_groups = (
        (record["label"], len(group))
        for record in data
        for group in record["evidence"]
    )
    for label, group_size in sized_groups:
        grouped[label].append(nei_val if label == nei_label else group_size)
    return grouped

In [51]:
# Left: per-label totals from convert_to_sentence over the claim-level FEVER
# training data, with each NEI evidence group counted as 5.
# Right: label counts of the rows in `fever_strain`.
(
    {k: sum(v) for k, v in convert_to_sentence(fever_train, 5).items()}, 
    Counter([constants.ID2LABEL[doc["labels"]] for doc in fever_strain])
)

({'SUPPORTS': 193756, 'REFUTES': 70066, 'NOT ENOUGH INFO': 178195},
 Counter({'SUPPORTS': 132827, 'REFUTES': 49035, 'NOT ENOUGH INFO': 71278}))

In [61]:
# Claim-level label counts of the Climate-FEVER training split.
Counter(claim["label"] for claim in cfever_train)

Counter({'SUPPORTS': 457, 'REFUTES': 177, 'NOT ENOUGH INFO': 332})

In [55]:
# Label counts of the rows in `cfever_strain`, with label ids mapped to names.
Counter(constants.ID2LABEL[row["labels"]] for row in cfever_strain)

Counter({'NOT ENOUGH INFO': 3244, 'SUPPORTS': 1186, 'REFUTES': 400})

In [59]:
# Label counts of the rows in `cfeverpure_strain`, with label ids mapped to names.
Counter(constants.ID2LABEL[row["labels"]] for row in cfeverpure_strain)

Counter({'SUPPORTS': 1186, 'REFUTES': 400, 'NOT ENOUGH INFO': 1660})

In [56]:
# Left: per-label totals from convert_to_sentence over the claim-level SciFact
# data, with each NEI evidence group counted as 5.
# Right: label counts of the rows in `scifact_sall`.
(
    {k: sum(v) for k, v in convert_to_sentence(scifact_all, 5).items()}, 
    Counter([constants.ID2LABEL[doc["labels"]] for doc in scifact_sall])
)

({'NOT ENOUGH INFO': 2080, 'REFUTES': 495, 'SUPPORTS': 896},
 Counter({'NOT ENOUGH INFO': 832, 'REFUTES': 495, 'SUPPORTS': 896}))