In [1]:
import sys
sys.path.insert(0, "../src")
import pickle as pkl
from pathlib import Path
from collections import Counter, defaultdict
from functools import partial

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from scipy.stats import ks_2samp
from sklearn.metrics import confusion_matrix

import constants
from gen.util import read_data, write_jsonl
from rte.aggregate import agg_predict, agg_predict_proba

# Init

In [2]:
sf_actual = pd.DataFrame(read_data(Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/feverised-scifact/scifact_all.jsonl")))
sf_actual["claim_id"] = "scifact|" + sf_actual["id"].astype(str)
sf_actual = sf_actual.rename(columns={"label": "predicted_label"})

sf_doc = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/predictions/doc/scifact")
sf_pipe_doc = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/predictions/doc/scifactpipeline")

sf_sent = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/predictions/sent/scifact")
sf_pipe_sent = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/predictions/sent/scifactpipeline")

In [3]:
def extract_model_dataset(fname):
    tok = fname.split("-")
    dataset = "-".join([tok[0], tok[1] if "climatefever" in tok[1] else ""]).strip("-")
    
    model = None
    if "xlnet" in fname:
        model = "xlnet"
    elif "bert" in fname:
        model = "bert"
    else:
        model = "da"
    
    return model, dataset

def get_stats(df1, df2):
    res = []
    for (oirk, oirv), (pk, pv) in zip(
        df1.sort_values(["dataset", "model", "predicted_label"]).groupby(["dataset", "model", "predicted_label"])["label_proba"], 
        df2.sort_values(["dataset", "model", "predicted_label"]).groupby(["dataset", "model", "predicted_label"])["label_proba"]
    ):
        assert all([i == j for i, j in zip(oirk, pk)]), f"data: {oirk} != {pk}"
        
        if len(oirv) <= 10 or len(pv) <= 10:
            res.append((pk[0], pk[1], pk[2], None, None, None))
            
        mean_shift = np.mean(pv) - np.mean(oirv)
        ks_res = ks_2samp(oirv, pv, alternative="two-sided", method="exact")
        
        res.append((pk[0], pk[1], pk[2], mean_shift, ks_res.statistic, ks_res.pvalue))
    
    res = pd.DataFrame(res, columns=["dataset", "model", "predicted_label", "mean_shift", "ks2samp_stats", "ks2samp_pvalue"])
    res["mean_shift"] = res["mean_shift"].round(4)
    res["ks2samp_stats"] = res["ks2samp_stats"].round(4)
    res["ks2samp_pvalue"] = res["ks2samp_pvalue"].round(4)

    return res

def get_confusion(df1, df2):
    res = defaultdict(dict)
    for (oirk, oirv), (pk, pv) in zip(
        df1.sort_values(["dataset", "model"]).groupby(["dataset", "model"])["predicted_label"], 
        df2.sort_values(["dataset", "model"]).groupby(["dataset", "model"])["predicted_label"]
    ):
        assert all([i == j for i, j in zip(oirk, pk)]), f"data: {oirk} != {pk}"
        
        res[pk[0]][pk[1]] = confusion_matrix(y_true=oirv, y_pred=pv, labels=[0,1,2], normalize="true")
    return res

## Archive

For all correctly labelled instances, measure the model confidence

```doc_ls.pivot_table(index=["dataset", "model"], columns=["predicted_label"], values="label_proba", aggfunc=["count", "min", "mean", "std"]).reset_index().to_csv("tmp.csv")```

# Document

In [4]:
def doc_proba(fn, sf, correct_only=False):
    pred = pd.DataFrame(read_data(fn))
    pred["model"], pred["dataset"] = extract_model_dataset(fn.stem)
    pred["label_proba"] = pred["predicted_proba"].apply(lambda x: max(x))
    if "predicted" in pred:
        pred = pred.rename(columns={"predicted": "predicted_label"})
    if correct_only:
        pred = pred.merge(sf[["claim_id", "predicted_label"]], on=["claim_id", "predicted_label"], how="inner")
    pred["predicted_label"] = pred["predicted_label"].map(constants.LABEL2ID)
    
    return pred

In [5]:
doc_ls = pd.concat(Parallel()(delayed(doc_proba)(p, sf_actual) for p in sf_doc.glob("*.all*")))
doc_pipe_ls = pd.concat(Parallel()(delayed(doc_proba)(p, sf_actual) for p in sf_pipe_doc.glob("*.all*")))

In [6]:
get_stats(doc_ls, doc_pipe_ls).pivot_table(index=["dataset", "model"], columns=["predicted_label"], values=["mean_shift", "ks2samp_pvalue"], aggfunc="mean")#.to_csv("tmp.csv")

Unnamed: 0_level_0,Unnamed: 1_level_0,ks2samp_pvalue,ks2samp_pvalue,ks2samp_pvalue,mean_shift,mean_shift,mean_shift
Unnamed: 0_level_1,predicted_label,0,1,2,0,1,2
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
climatefeverpure,bert,0.0,0.0,0.0079,-0.0699,-0.1331,-0.1037
climatefeverpure,da,0.0,0.0016,0.0,-0.0907,0.0305,-0.0721
climatefeverpure,xlnet,0.0,0.0,0.0711,-0.0967,-0.0663,-0.0908
fever,bert,0.0004,0.0,0.0104,0.0129,-0.0645,0.0333
fever,da,0.0,0.0,0.0048,0.217,0.0097,0.234
fever,xlnet,0.3982,0.0,0.0176,0.0001,-0.0589,-0.0043
fever-climatefeverpure,bert,0.0,0.0,0.0031,-0.0003,-0.0575,0.0063
fever-climatefeverpure,da,0.0,0.0071,0.3167,0.0749,0.0176,0.0192
fever-climatefeverpure,xlnet,0.0006,0.0,0.1028,-0.004,-0.0256,0.0064


In [7]:
doc_conf = get_confusion(doc_ls, doc_pipe_ls)

In [21]:
(doc_conf["fever-climatefeverpure"]["bert"] * 100 ).round(2)

array([[84.74,  8.02,  7.24],
       [51.9 , 30.43, 17.67],
       [22.52, 17.22, 60.26]])

In [22]:
(doc_conf["fever-climatefeverpure"]["xlnet"] * 100).round(2)

array([[70.6 , 24.02,  5.38],
       [29.7 , 65.81,  4.49],
       [27.85, 36.08, 36.08]])

In [10]:
doc_c_ls = pd.concat(Parallel()(delayed(doc_proba)(p, sf_actual, True) for p in sf_doc.glob("*.all*")))
doc_c_pipe_ls = pd.concat(Parallel()(delayed(doc_proba)(p, sf_actual, True) for p in sf_pipe_doc.glob("*.all*")))

# Sentence

## Majority

In [11]:
def majority_proba(fn, sf, correct_only=False):
    pred = pd.DataFrame(read_data(fn))
    pred["predicted_label"] = pred["predicted_label"].map(constants.LABEL2ID)
    pred["predicted_proba"] = pred["predicted_proba"].apply(np.array)
    pred = pred.groupby("claim_id", as_index=False, sort=False).agg({"predicted_proba": np.stack, "predicted_label": agg_predict})
    
    pred["_labls"] = pred["predicted_proba"].apply(lambda x: np.argmax(x, axis=1))
    pred["_probls"] = pred["predicted_proba"].apply(lambda x: np.max(x, axis=1))
    pred["_filter"] = pred[["predicted_label", "_labls"]].apply(lambda x: np.array([i == x["predicted_label"] for i in x["_labls"]]), axis=1)
    
    pred["label_proba"] = pred[["_probls", "_filter"]].apply(lambda x: np.mean(x["_probls"][x["_filter"]]), axis=1)
    pred["predicted_label"] = pred["predicted_label"].map(constants.ID2LABEL)
    if correct_only:
        pred = pred.merge(sf[["claim_id", "predicted_label"]], on=["claim_id", "predicted_label"], how="inner")
    pred["predicted_label"] = pred["predicted_label"].map(constants.LABEL2ID)
    
    pred = pred.drop(columns=[c for c in pred.columns if c.startswith("_")])
    pred["model"], pred["dataset"] = extract_model_dataset(fn.stem)
    # impute equal S/R counts as predicted NEI with mean of NEI
    pred.loc[pred["label_proba"].isnull(), "label_proba"] = pred.loc[pred["predicted_label"] == constants.LABEL2ID[constants.LOOKUP["label"]["nei"]], "label_proba"].mean()
    
    return pred

In [12]:
maj_ls = pd.concat(Parallel()(delayed(majority_proba)(p, sf_actual) for p in sf_sent.glob("*.all*")))
maj_pipe_ls = pd.concat(Parallel()(delayed(majority_proba)(p, sf_actual) for p in sf_pipe_sent.glob("*.all*")))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [13]:
get_stats(maj_ls, maj_pipe_ls).pivot_table(index=["dataset", "model"], columns=["predicted_label"], values=["mean_shift", "ks2samp_pvalue"], aggfunc="mean")#.to_csv("tmp.csv")

Unnamed: 0_level_0,Unnamed: 1_level_0,ks2samp_pvalue,ks2samp_pvalue,ks2samp_pvalue,mean_shift,mean_shift,mean_shift
Unnamed: 0_level_1,predicted_label,0,1,2,0,1,2
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
climatefever,bert,0.0,0.0012,0.1299,-0.0213,0.0441,-0.0202
climatefever,xlnet,0.0,0.0,0.1687,-0.0332,0.0431,0.0258
climatefeverpure,bert,0.0956,0.0144,0.9915,-0.0109,0.0238,0.0049
climatefeverpure,xlnet,0.0,0.0286,0.7472,-0.0299,0.0337,-0.0092
fever,bert,0.0,0.0015,0.4401,-0.0121,-0.0114,0.0049
fever,xlnet,0.0,0.0,0.0223,-0.0073,-0.0075,0.0
fever-climatefever,bert,0.0,0.0301,0.5396,-0.0169,0.0092,0.0019
fever-climatefever,xlnet,0.0,0.07,0.0,-0.014,0.0025,-0.0193
fever-climatefeverpure,bert,0.0,0.003,0.1743,-0.0028,-0.0071,0.0019
fever-climatefeverpure,xlnet,0.0,0.0,0.0062,-0.0079,0.0041,-0.005


In [23]:
maj_conf = get_confusion(maj_ls, maj_pipe_ls)
(maj_conf["fever-climatefever"]["xlnet"] * 100).round(2)

array([[92.61,  4.03,  3.36],
       [39.  , 43.67, 17.33],
       [ 9.81,  8.88, 81.31]])

In [15]:
maj_c_ls = pd.concat(Parallel()(delayed(majority_proba)(p, sf_actual, True) for p in sf_sent.glob("*.all*")))
maj_c_pipe_ls = pd.concat(Parallel()(delayed(majority_proba)(p, sf_actual, True) for p in sf_pipe_sent.glob("*.all*")))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


## Mean proba

In [16]:
def meanproba_proba(fn, sf, correct_only=False):
    pred = pd.DataFrame(read_data(fn))
    ret_proba = partial(agg_predict_proba, return_proba=True)
    pred = pred.groupby("claim_id", as_index=False, sort=False).agg({"predicted_proba": ret_proba})
    pred["label_proba"] = pred["predicted_proba"].apply(np.max)
    pred["predicted_label"] = pred["predicted_proba"].apply(np.argmax).map(constants.ID2LABEL)
    
    pred["model"], pred["dataset"] = extract_model_dataset(fn.stem)
    if correct_only:
        pred = pred.merge(sf[["claim_id", "predicted_label"]], on=["claim_id", "predicted_label"], how="inner")
    pred["predicted_label"] = pred["predicted_label"].map(constants.LABEL2ID)
    
    return pred

In [17]:
mp_ls = pd.concat(Parallel()(delayed(meanproba_proba)(p, sf_actual) for p in sf_sent.glob("*.all*")))
mp_pipe_ls = pd.concat(Parallel()(delayed(meanproba_proba)(p, sf_actual) for p in sf_pipe_sent.glob("*.all*")))

In [18]:
get_stats(mp_ls, mp_pipe_ls).pivot_table(index=["dataset", "model"], columns=["predicted_label"], values=["mean_shift", "ks2samp_pvalue"], aggfunc="mean")#.to_csv("tmp.csv")

Unnamed: 0_level_0,Unnamed: 1_level_0,ks2samp_pvalue,ks2samp_pvalue,ks2samp_pvalue,mean_shift,mean_shift,mean_shift
Unnamed: 0_level_1,predicted_label,0,1,2,0,1,2
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
climatefever,bert,0.0,0.0142,0.2853,-0.0431,-0.0349,-0.0307
climatefever,xlnet,0.0,0.0353,0.4856,-0.047,-0.0139,0.0226
climatefeverpure,bert,0.0725,0.1384,0.8755,-0.0143,-0.0424,-0.0107
climatefeverpure,xlnet,0.0,0.4301,0.9026,-0.0311,-0.0019,-0.0138
fever,bert,0.0,0.3597,0.0,-0.0702,-0.0455,-0.0587
fever,xlnet,0.0,0.0,0.0,-0.0763,-0.0893,-0.0818
fever-climatefever,bert,0.0,0.0,0.0,-0.0872,-0.0589,-0.1046
fever-climatefever,xlnet,0.0,0.0,0.0,-0.1,-0.0858,-0.1358
fever-climatefeverpure,bert,0.0,0.0,0.0,-0.086,-0.046,-0.0712
fever-climatefeverpure,xlnet,0.0,0.0,0.0,-0.0633,-0.0761,-0.1179


In [24]:
mp_conf = get_confusion(mp_ls, mp_pipe_ls)
(mp_conf["fever-climatefever"]["xlnet"] * 100).round(2)

array([[70.18, 24.64,  5.18],
       [18.79, 73.7 ,  7.51],
       [12.32, 24.14, 63.55]])

In [20]:
mp_c_ls = pd.concat(Parallel()(delayed(meanproba_proba)(p, sf_actual, True) for p in sf_sent.glob("*.all*")))
mp_c_pipe_ls = pd.concat(Parallel()(delayed(meanproba_proba)(p, sf_actual, True) for p in sf_pipe_sent.glob("*.all*")))