In [1]:
import sys
sys.path.insert(0, "../../src")
import pickle as pkl
from pathlib import Path
from functools import partial
from collections import Counter

import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer, util

import constants
from gen.util import read_data, write_jsonl
from gen.special import entropy3

  from .autonotebook import tqdm as notebook_tqdm


# Init

In [2]:
# Project directories, addressed relative to the notebook's working directory.
root_error, root_data, root_metrics = (
    Path(location) for location in ("../../errors", "../../data", "../../metrics")
)

In [3]:
# Sentence-BERT encoder used below to score claim/evidence similarity.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Output tree for the error dumps: <errors>/<dataset>/<granularity>/.
error_p = root_error
for _dataset in ("scifact", "scifactpipeline"):
    for _granularity in ("document", "sentence"):
        # parents=True creates the whole chain in one call and also works when
        # the errors root itself does not exist yet.
        (error_p / _dataset / _granularity).mkdir(parents=True, exist_ok=True)


# Document-level and sentence-level test records for the "scifact" and
# "scifactpipeline" datasets, as returned by read_data.
_doc_dir = root_data / "doc-dataset"
_sent_dir = root_data / "sent-dataset"

sf_actual_doc = read_data(_doc_dir / "scifact.all.test.n5.jsonl")
sf_actual_sent = read_data(_sent_dir / "scifact.all.test.n5.jsonl")

sf_pipe_doc = read_data(_doc_dir / "scifactpipeline.all.test.n6.s5.jsonl")
sf_pipe_sent = read_data(_sent_dir / "scifactpipeline.all.test.n6.s5.jsonl")

In [4]:
def update_with_cosine_similarity(documents):
    """Attach a claim/evidence cosine similarity to every record, in place.

    The ``"claim"`` and ``"evidence"`` values of all records are embedded with
    the notebook-level ``sbert_model``; the cosine similarity of each record's
    own claim and evidence, rounded to 4 decimals, is stored on that record
    under ``"cosine_score"`` as a Python float.

    Args:
        documents: sequence of dict records with ``"claim"`` and ``"evidence"``
            keys. The records are mutated; nothing is returned.
    """
    # Nothing to embed; avoids handing an empty batch to the encoder.
    if len(documents) == 0:
        return

    claims = [doc["claim"] for doc in documents]
    evidences = [doc["evidence"] for doc in documents]
    claim_embedding = sbert_model.encode(claims, convert_to_tensor=True)
    evidence_embedding = sbert_model.encode(evidences, convert_to_tensor=True)

    # Only the matching (evidence_i, claim_i) pairs are needed, so score them
    # row-wise instead of building the full N x N matrix and reading its
    # diagonal, and move the result to the host once rather than per element.
    pair_scores = util.pairwise_cos_sim(evidence_embedding, claim_embedding).cpu().numpy()

    for doc, score in zip(documents, np.round(pair_scores, 4)):
        doc["cosine_score"] = float(score)

# Score every loaded record list in place (same order as the original calls).
for _records in (sf_actual_doc, sf_actual_sent, sf_pipe_doc, sf_pipe_sent):
    update_with_cosine_similarity(_records)

In [5]:
df_sf_actual_doc = pd.DataFrame(sf_actual_doc).set_index("claim_id")
df_sf_actual_sent = pd.DataFrame(sf_actual_sent).set_index("claim_id")


def _with_scifact_doc_labels(records):
    """Index records by claim_id and replace their labels with df_sf_actual_doc's."""
    unlabeled = pd.DataFrame(records).set_index("claim_id").drop(columns=["labels"])
    # Left join on claim_id: every pipeline row is kept.
    return unlabeled.join(df_sf_actual_doc[["labels"]], how="left")


df_sf_pipe_doc = _with_scifact_doc_labels(sf_pipe_doc)
df_sf_pipe_sent = _with_scifact_doc_labels(sf_pipe_sent)

# Keep the record lists in sync with the relabelled frames.
sf_pipe_doc = df_sf_pipe_doc.reset_index().to_dict("records")
sf_pipe_sent = df_sf_pipe_sent.reset_index().to_dict("records")

In [6]:
# Every "<name>.pkl" under root_metrics becomes a notebook-level variable
# called <name> (its name is printed as it is loaded).
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only point root_metrics at dumps produced by this project.
# iterdir() yields entries in arbitrary order, so sort for a reproducible run.
for res in sorted(root_metrics.iterdir()):
    if res.suffix == ".pkl":
        print(res.stem)
        with res.open("rb") as pkl_file:
            globals()[res.stem] = pkl.load(pkl_file)

concatenate_evidences_metrics
sent_macro_verdict_meanproba_metrics
sent_micro_verdict_metrics
sent_macro_verdict_majority_metrics


In [7]:
def compile_pred_actual(actual, pred):
    """Line up gold labels with every model's predicted label id, one row per claim.

    Args:
        actual: records exposing ``"claim_id"`` and ``"labels"``.
        pred: iterable of result objects; each needs ``predictions`` (records
            with a ``"predicted_label"`` key, as many as ``actual``) and
            ``_score_name``.

    Returns:
        DataFrame indexed by ``id`` (the claim id) with an ``actual`` column
        followed by one column per entry of ``pred``. Each prediction column is
        named after the part of ``_score_name`` before its first ``.`` and holds
        the ``constants.LABEL2ID`` id of the predicted label.

    Rows are paired purely by position, so ``actual`` and every ``predictions``
    list must share the same order; only their lengths are asserted.
    """
    def _prediction_column(result):
        # Same-length check guards the positional pairing done by concat.
        assert len(actual) == len(result.predictions)
        return pd.DataFrame([
            {f"{result._score_name.split('.')[0]}": constants.LABEL2ID[row["predicted_label"]]}
            for row in result.predictions
        ])

    gold = pd.DataFrame([{"id": row["claim_id"], "actual": row["labels"]} for row in actual])
    predicted = pd.concat([_prediction_column(result) for result in pred], axis=1)
    return pd.concat([gold, predicted], axis=1).set_index("id")

def assign_disagree_derive_cols(df, entropy_base: int = 3):
    """Return ``df`` with vote, entropy and error-flag columns added.

    Prediction columns are the ones whose name contains ``"-"``. Added columns:

    * ``model_agreement``: per row, up to three ``(label, count)`` pairs over
      the prediction columns, most frequent first.
    * ``entropy``: ``entropy3`` applied to each row of predictions with
      ``base=entropy_base``.
    * ``combined_error``: True when the votes are split over more than one
      label or the top-voted label differs from ``actual``.
    * ``total_error``: True when ``actual`` is not among the labels listed in
      ``model_agreement``.
    """
    pred_cols = df.filter(like="-").columns
    row_entropy = partial(entropy3, base=entropy_base)

    votes = (
        df[pred_cols]
        .stack()
        .groupby(level=0, sort=False)
        .apply(lambda labels: Counter(labels.tolist()).most_common(3))
    )
    df = df.assign(
        model_agreement=votes,
        entropy=df.loc[:, pred_cols].apply(row_entropy, axis=1),
    )

    # .values: combine the two masks positionally rather than by index label.
    split_vote = (df["model_agreement"].apply(len) > 1).values
    top_vote_wrong = (df["actual"] != df["model_agreement"].apply(lambda pairs: pairs[0][0])).values
    actual_never_predicted = df[["actual", "model_agreement"]].apply(
        lambda row: row["actual"] not in [label for label, _ in row["model_agreement"]],
        axis=1,
    )
    return df.assign(
        combined_error=split_vote | top_vote_wrong,
        total_error=actual_never_predicted,
    )

# Document Model

## SciFact Oracle

In [8]:
# Gold label plus one predicted-label column per document model.
df_doc_compile = compile_pred_actual(
    sf_actual_doc,
    concatenate_evidences_metrics["scifact"]["all"],
)
# Per claim: (label, count) pairs over the prediction columns (all but "actual").
_doc_votes = df_doc_compile[df_doc_compile.columns[1:]].stack().groupby(level=0, sort=False)
df_doc_compile["model_agreement"] = _doc_votes.apply(
    lambda labels: Counter(labels.tolist()).most_common(3)
)

Get model total agreement and disagreement regardless of training data

In [9]:
# Number of distinct labels the models voted for, per claim.
_n_distinct = df_doc_compile["model_agreement"].apply(len)
doc_model_agree = df_doc_compile[_n_distinct == 1]
doc_model_disagree = df_doc_compile[_n_distinct > 1]
(doc_model_agree.shape, doc_model_disagree.shape)

((174, 11), (935, 11))

### Model complete agreement analysis

In [10]:
# The single label all models agreed on, compared against the gold label.
_consensus = doc_model_agree["model_agreement"].apply(lambda pairs: pairs[0][0])
doc_model_agree_error = doc_model_agree[doc_model_agree["actual"] != _consensus]
doc_model_agree_error["actual"].value_counts()

2    9
0    2
Name: actual, dtype: int64

In [11]:
# Source records of the claims where all models agreed on a wrong label.
_agree_error_records = (
    df_sf_actual_doc
    .join(doc_model_agree_error[["model_agreement"]], how="inner")
    .reset_index(names="id")
    .to_dict("records")
)
write_jsonl(error_p / "scifact" / "document" / "doc_model_agree_total_error.jsonl", _agree_error_records)

PosixPath('../../errors/scifact/document/doc_model_agree_total_error.jsonl')

### Model Disagreement analysis

In [12]:
doc_model_disagree = assign_disagree_derive_cols(doc_model_disagree)
# Counts of each error flag over the disagreement subset.
tuple(doc_model_disagree[_flag].value_counts() for _flag in ("combined_error", "total_error"))

(True    935
 Name: combined_error, dtype: int64,
 False    894
 True      41
 Name: total_error, dtype: int64)

In [13]:
# Gold-label distribution of the rows flagged as total_error.
_all_wrong = doc_model_disagree["total_error"]
doc_model_disagree.loc[_all_wrong, "actual"].value_counts()

2    41
Name: actual, dtype: int64

In [14]:
# Source records of the total_error claims, with the models' vote counts attached.
_total_error_votes = doc_model_disagree.loc[doc_model_disagree["total_error"], ["model_agreement"]]
_total_error_records = (
    df_sf_actual_doc
    .join(_total_error_votes, how="inner")
    .reset_index(names="id")
    .to_dict("records")
)
write_jsonl(error_p / "scifact" / "document" / "doc_model_disagree_alltrain_total_error.jsonl", _total_error_records)

PosixPath('../../errors/scifact/document/doc_model_disagree_alltrain_total_error.jsonl')

In [15]:
# Re-derive the agreement/error columns per training corpus, keeping only that
# corpus' model columns next to the gold label (entropy base 2, as before).
# Patterns use a plain character class ("[b|x|d]" would also accept a literal
# "|"), and the fever pattern is anchored because DataFrame.filter(regex=...)
# searches anywhere in the name and would otherwise also pick up a column
# that merely contains "fever-b/x/d" (e.g. a "climatefever-..." model).
_gold = doc_model_disagree[["actual"]]
doc_model_disagree_f = assign_disagree_derive_cols(pd.concat([_gold, doc_model_disagree.filter(regex=r"^fever-[bxd]")], axis=1), 2)
doc_model_disagree_cfp = assign_disagree_derive_cols(pd.concat([_gold, doc_model_disagree.filter(regex=r"^climatefeverpure-[bxd]")], axis=1), 2)
doc_model_disagree_c_fp = assign_disagree_derive_cols(pd.concat([_gold, doc_model_disagree.filter(regex=r"fever-climatefeverpure-[bxd]")], axis=1), 2)

In [16]:
# combined_error counts for the fever, climatefeverpure and fever-climatefeverpure subsets.
tuple(
    _subset["combined_error"].value_counts()
    for _subset in (doc_model_disagree_f, doc_model_disagree_cfp, doc_model_disagree_c_fp)
)

(True     595
 False    340
 Name: combined_error, dtype: int64,
 True     486
 False    449
 Name: combined_error, dtype: int64,
 True     647
 False    288
 Name: combined_error, dtype: int64)

In [17]:
# total_error counts for the fever, climatefeverpure and fever-climatefeverpure subsets.
tuple(
    _subset["total_error"].value_counts()
    for _subset in (doc_model_disagree_f, doc_model_disagree_cfp, doc_model_disagree_c_fp)
)

(False    824
 True     111
 Name: total_error, dtype: int64,
 False    791
 True     144
 Name: total_error, dtype: int64,
 False    847
 True      88
 Name: total_error, dtype: int64)

In [18]:
# Gold-label distribution of the total_error rows in each subset.
tuple(
    _subset.loc[_subset["total_error"], "actual"].value_counts()
    for _subset in (doc_model_disagree_f, doc_model_disagree_cfp, doc_model_disagree_c_fp)
)

(2    79
 0    32
 Name: actual, dtype: int64,
 2    118
 0     23
 1      3
 Name: actual, dtype: int64,
 2    77
 0    11
 Name: actual, dtype: int64)

In [19]:
# One dump per training-data subset: source records of its total_error claims
# together with that subset's vote counts.
_dumps = (
    ("doc_model_disagree_fever_total_error.jsonl", doc_model_disagree_f),
    ("doc_model_disagree_climatefeverpure_total_error.jsonl", doc_model_disagree_cfp),
    ("doc_model_disagree_fever-climatefeverpure_total_error.jsonl", doc_model_disagree_c_fp),
)
for _filename, _subset in _dumps:
    _written = write_jsonl(
        error_p / "scifact" / "document" / _filename,
        df_sf_actual_doc.join(_subset.loc[_subset["total_error"], ["model_agreement"]], how="inner").reset_index(names="id").to_dict("records")
    )
# Show the result of the last write, as the original cell did.
_written

PosixPath('../../errors/scifact/document/doc_model_disagree_fever-climatefeverpure_total_error.jsonl')

## SciFact Pipeline

In [20]:
# Gold label plus one predicted-label column per document model (pipeline data).
df_doc_compile = compile_pred_actual(
    sf_pipe_doc,
    concatenate_evidences_metrics["scifactpipeline"]["all"],
)
# Per claim: (label, count) pairs over the prediction columns (all but "actual").
_doc_votes = df_doc_compile[df_doc_compile.columns[1:]].stack().groupby(level=0, sort=False)
df_doc_compile["model_agreement"] = _doc_votes.apply(
    lambda labels: Counter(labels.tolist()).most_common(3)
)

Get model total agreement and disagreement regardless of training data

In [21]:
# Number of distinct labels the models voted for, per claim.
_n_distinct = df_doc_compile["model_agreement"].apply(len)
doc_model_agree = df_doc_compile[_n_distinct == 1]
doc_model_disagree = df_doc_compile[_n_distinct > 1]
(doc_model_agree.shape, doc_model_disagree.shape)

((20, 11), (1089, 11))

### Model complete agreement analysis

In [22]:
# The single label all models agreed on, compared against the gold label.
_consensus = doc_model_agree["model_agreement"].apply(lambda pairs: pairs[0][0])
doc_model_agree_error = doc_model_agree[doc_model_agree["actual"] != _consensus]
doc_model_agree_error["actual"].value_counts()

2    4
0    3
Name: actual, dtype: int64

In [23]:
# Source records of the claims where all models agreed on a wrong label.
_agree_error_records = (
    df_sf_pipe_doc
    .join(doc_model_agree_error[["model_agreement"]], how="inner")
    .reset_index(names="id")
    .to_dict("records")
)
write_jsonl(error_p / "scifactpipeline" / "document" / "doc_model_agree_total_error.jsonl", _agree_error_records)

PosixPath('../../errors/scifactpipeline/document/doc_model_agree_total_error.jsonl')

### Model Disagreement analysis

In [24]:
doc_model_disagree = assign_disagree_derive_cols(doc_model_disagree)
# Counts of each error flag over the disagreement subset.
tuple(doc_model_disagree[_flag].value_counts() for _flag in ("combined_error", "total_error"))

(True    1089
 Name: combined_error, dtype: int64,
 False    992
 True      97
 Name: total_error, dtype: int64)

In [25]:
# Gold-label distribution of the rows flagged as total_error.
_all_wrong = doc_model_disagree["total_error"]
doc_model_disagree.loc[_all_wrong, "actual"].value_counts()

2    85
0    12
Name: actual, dtype: int64

In [26]:
# Source records of the total_error claims, with the models' vote counts attached.
_total_error_votes = doc_model_disagree.loc[doc_model_disagree["total_error"], ["model_agreement"]]
_total_error_records = (
    df_sf_pipe_doc
    .join(_total_error_votes, how="inner")
    .reset_index(names="id")
    .to_dict("records")
)
write_jsonl(error_p / "scifactpipeline" / "document" / "doc_model_disagree_alltrain_total_error.jsonl", _total_error_records)

PosixPath('../../errors/scifactpipeline/document/doc_model_disagree_alltrain_total_error.jsonl')

In [27]:
# Re-derive the agreement/error columns per training corpus, keeping only that
# corpus' model columns next to the gold label (entropy base 2, as before).
# Patterns use a plain character class ("[b|x|d]" would also accept a literal
# "|"), and the fever pattern is anchored because DataFrame.filter(regex=...)
# searches anywhere in the name and would otherwise also pick up a column
# that merely contains "fever-b/x/d" (e.g. a "climatefever-..." model).
_gold = doc_model_disagree[["actual"]]
doc_model_disagree_f = assign_disagree_derive_cols(pd.concat([_gold, doc_model_disagree.filter(regex=r"^fever-[bxd]")], axis=1), 2)
doc_model_disagree_cfp = assign_disagree_derive_cols(pd.concat([_gold, doc_model_disagree.filter(regex=r"^climatefeverpure-[bxd]")], axis=1), 2)
doc_model_disagree_c_fp = assign_disagree_derive_cols(pd.concat([_gold, doc_model_disagree.filter(regex=r"fever-climatefeverpure-[bxd]")], axis=1), 2)

In [28]:
# combined_error counts for the fever, climatefeverpure and fever-climatefeverpure subsets.
tuple(
    _subset["combined_error"].value_counts()
    for _subset in (doc_model_disagree_f, doc_model_disagree_cfp, doc_model_disagree_c_fp)
)

(True     1027
 False      62
 Name: combined_error, dtype: int64,
 True     873
 False    216
 Name: combined_error, dtype: int64,
 True     801
 False    288
 Name: combined_error, dtype: int64)

In [29]:
# total_error counts for the fever, climatefeverpure and fever-climatefeverpure subsets.
tuple(
    _subset["total_error"].value_counts()
    for _subset in (doc_model_disagree_f, doc_model_disagree_cfp, doc_model_disagree_c_fp)
)

(False    921
 True     168
 Name: total_error, dtype: int64,
 False    755
 True     334
 Name: total_error, dtype: int64,
 False    836
 True     253
 Name: total_error, dtype: int64)

In [30]:
# Gold-label distribution of the total_error rows in each subset.
tuple(
    _subset.loc[_subset["total_error"], "actual"].value_counts()
    for _subset in (doc_model_disagree_f, doc_model_disagree_cfp, doc_model_disagree_c_fp)
)

(2    113
 0     52
 1      3
 Name: actual, dtype: int64,
 2    208
 0    103
 1     23
 Name: actual, dtype: int64,
 2    118
 1    113
 0     22
 Name: actual, dtype: int64)

In [31]:
# One dump per training-data subset: source records of its total_error claims
# together with that subset's vote counts.
_dumps = (
    ("doc_model_disagree_fever_total_error.jsonl", doc_model_disagree_f),
    ("doc_model_disagree_climatefeverpure_total_error.jsonl", doc_model_disagree_cfp),
    ("doc_model_disagree_fever-climatefeverpure_total_error.jsonl", doc_model_disagree_c_fp),
)
for _filename, _subset in _dumps:
    _written = write_jsonl(
        error_p / "scifactpipeline" / "document" / _filename,
        df_sf_pipe_doc.join(_subset.loc[_subset["total_error"], ["model_agreement"]], how="inner").reset_index(names="id").to_dict("records")
    )
# Show the result of the last write, as the original cell did.
_written

PosixPath('../../errors/scifactpipeline/document/doc_model_disagree_fever-climatefeverpure_total_error.jsonl')

# Sentence Model

## SciFact Oracle

In [32]:
# Collapse sentence rows to one row per claim: evidence sentences and their
# cosine scores become lists, claim and labels take the per-claim max.
_sent_aggregations = {"claim": "max", "evidence": list, "cosine_score": list, "labels": "max"}
_sent_by_claim = (
    pd.DataFrame(sf_actual_sent)
    .groupby("claim_id", as_index=False, sort=False)
    .agg(_sent_aggregations)
)
# Reorder rows to follow the document-level frame's claim order.
df_sf_sent_grp = _sent_by_claim.set_index("claim_id").loc[df_sf_actual_doc.index]

#### Majority Aggregation

In [33]:
# Gold label plus one predicted-label column per sentence model (majority aggregation).
df_maj_compile = compile_pred_actual(
    df_sf_sent_grp.reset_index().to_dict("records"),
    sent_macro_verdict_majority_metrics["scifact"]["all"],
)
# Per claim: the distinct labels found in the prediction columns (all but "actual").
_sent_votes = df_maj_compile[df_maj_compile.columns[1:]].stack().groupby(level=0, sort=False)
df_maj_compile["model_agreement"] = _sent_votes.apply(lambda labels: labels.unique().tolist())

Get model total agreement and disagreement regardless of training data

In [34]:
# Number of distinct labels the models voted for, per claim.
_n_distinct = df_maj_compile["model_agreement"].apply(len)
sent_model_agree = df_maj_compile[_n_distinct == 1]
sent_model_disagree = df_maj_compile[_n_distinct > 1]
(sent_model_agree.shape, sent_model_disagree.shape)

((295, 12), (814, 12))

##### Model complete agreement analysis

In [35]:
# The single label all models agreed on, compared against the gold label.
_consensus = sent_model_agree["model_agreement"].apply(lambda labels: labels[0])
sent_model_agree_error = sent_model_agree[sent_model_agree["actual"] != _consensus]
sent_model_agree_error["actual"].value_counts()

1    66
2    19
0     4
Name: actual, dtype: int64

In [36]:
# Grouped sentence records of the claims where all models agreed on a wrong label.
_agree_error_records = (
    df_sf_sent_grp
    .join(sent_model_agree_error[["model_agreement"]], how="inner")
    .reset_index()
    .to_dict("records")
)
write_jsonl(error_p / "scifact" / "sentence" / "majority_model_agree_total_error.jsonl", _agree_error_records)

PosixPath('../../errors/scifact/sentence/majority_model_agree_total_error.jsonl')

##### Model Disagreement analysis

In [37]:
sent_model_disagree = assign_disagree_derive_cols(sent_model_disagree)
# Counts of each error flag over the disagreement subset.
tuple(sent_model_disagree[_flag].value_counts() for _flag in ("combined_error", "total_error"))

(True    814
 Name: combined_error, dtype: int64,
 False    766
 True      48
 Name: total_error, dtype: int64)

In [38]:
# Gold-label distribution of the rows flagged as total_error.
_all_wrong = sent_model_disagree["total_error"]
sent_model_disagree.loc[_all_wrong, "actual"].value_counts()

2    30
1    14
0     4
Name: actual, dtype: int64

In [39]:
# Grouped sentence records of the claims flagged as total_error.
_total_error_ids = sent_model_disagree[sent_model_disagree["total_error"]].index
_total_error_records = df_sf_sent_grp.loc[_total_error_ids].reset_index().to_dict("records")
write_jsonl(error_p / "scifact" / "sentence" / "majority_model_disagree_alltrain_total_error.jsonl", _total_error_records)

PosixPath('../../errors/scifact/sentence/majority_model_disagree_alltrain_total_error.jsonl')

In [40]:
# Re-derive the agreement/error columns per training corpus, keeping only that
# corpus' model columns next to the gold label.
# The fever pattern uses a plain character class: "[b|x]" would also accept a
# literal "|". Trailing ".*" is dropped because filter(regex=...) already
# searches rather than full-matches.
_gold = sent_model_disagree[["actual"]]
sent_model_disagree_f = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(regex=r"^fever-[bx]")], axis=1))
sent_model_disagree_f_cf = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(like="fever-climatefever-")], axis=1))
sent_model_disagree_f_cfp = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(like="fever-climatefeverpure-")], axis=1))
sent_model_disagree_cf = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(regex=r"^climatefever-")], axis=1))
sent_model_disagree_cfp = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(regex=r"^climatefeverpure-")], axis=1))

In [41]:
# combined_error counts per training-data subset (same order as the original tuple).
tuple(
    _subset["combined_error"].value_counts()
    for _subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(True     591
 False    223
 Name: combined_error, dtype: int64,
 True     607
 False    207
 Name: combined_error, dtype: int64,
 True     590
 False    224
 Name: combined_error, dtype: int64,
 True     593
 False    221
 Name: combined_error, dtype: int64,
 True     582
 False    232
 Name: combined_error, dtype: int64)

In [42]:
# total_error counts per training-data subset (same order as the original tuple).
tuple(
    _subset["total_error"].value_counts()
    for _subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(False    532
 True     282
 Name: total_error, dtype: int64,
 False    408
 True     406
 Name: total_error, dtype: int64,
 False    502
 True     312
 Name: total_error, dtype: int64,
 False    452
 True     362
 Name: total_error, dtype: int64,
 False    526
 True     288
 Name: total_error, dtype: int64)

In [43]:
# Gold-label distribution of the total_error rows in each subset.
tuple(
    _subset.loc[_subset["total_error"], "actual"].value_counts()
    for _subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(1    186
 2     60
 0     36
 Name: actual, dtype: int64,
 1    215
 2    155
 0     36
 Name: actual, dtype: int64,
 1    178
 2     82
 0     52
 Name: actual, dtype: int64,
 2    180
 1    129
 0     53
 Name: actual, dtype: int64,
 1    123
 2    102
 0     63
 Name: actual, dtype: int64)

In [44]:
# One dump per training-data subset: grouped sentence records of its total_error claims.
_dumps = (
    ("majority_model_disagree_fever_total_error.jsonl", sent_model_disagree_f),
    ("majority_model_disagree_climatefeverpure_total_error.jsonl", sent_model_disagree_cfp),
    ("majority_model_disagree_fever-climatefeverpure_total_error.jsonl", sent_model_disagree_f_cfp),
    ("majority_model_disagree_climatefever_total_error.jsonl", sent_model_disagree_cf),
    ("majority_model_disagree_fever-climatefever_total_error.jsonl", sent_model_disagree_f_cf),
)
for _filename, _subset in _dumps:
    _written = write_jsonl(
        error_p / "scifact" / "sentence" / _filename,
        df_sf_sent_grp.loc[_subset[_subset["total_error"]].index].reset_index().to_dict("records")
    )
# Show the result of the last write, as the original cell did.
_written

PosixPath('../../errors/scifact/sentence/majority_model_disagree_fever-climatefever_total_error.jsonl')

#### Mean Probability Aggregation

In [45]:
# Gold label plus one predicted-label column per sentence model (mean-probability aggregation).
df_mpr_compile = compile_pred_actual(
    df_sf_sent_grp.reset_index().to_dict("records"),
    sent_macro_verdict_meanproba_metrics["scifact"]["all"],
)
# Per claim: the distinct labels found in the prediction columns (all but "actual").
_sent_votes = df_mpr_compile[df_mpr_compile.columns[1:]].stack().groupby(level=0, sort=False)
df_mpr_compile["model_agreement"] = _sent_votes.apply(lambda labels: labels.unique().tolist())

Get model total agreement and disagreement regardless of training data

In [46]:
# Number of distinct labels the models voted for, per claim.
_n_distinct = df_mpr_compile["model_agreement"].apply(len)
sent_model_agree = df_mpr_compile[_n_distinct == 1]
sent_model_disagree = df_mpr_compile[_n_distinct > 1]
(sent_model_agree.shape, sent_model_disagree.shape)

((275, 12), (834, 12))

##### Model complete agreement analysis

In [47]:
# The single label all models agreed on, compared against the gold label.
_consensus = sent_model_agree["model_agreement"].apply(lambda labels: labels[0])
sent_model_agree_error = sent_model_agree[sent_model_agree["actual"] != _consensus]
sent_model_agree_error["actual"].value_counts()

1    65
2    18
0     3
Name: actual, dtype: int64

In [48]:
# Grouped sentence records of the claims where all models agreed on a wrong label.
_agree_error_records = (
    df_sf_sent_grp
    .join(sent_model_agree_error[["model_agreement"]], how="inner")
    .reset_index()
    .to_dict("records")
)
write_jsonl(error_p / "scifact" / "sentence" / "meanproba_model_agree_total_error.jsonl", _agree_error_records)

PosixPath('../../errors/scifact/sentence/meanproba_model_agree_total_error.jsonl')

##### Model Disagreement analysis

In [49]:
sent_model_disagree = assign_disagree_derive_cols(sent_model_disagree)
# Counts of each error flag over the disagreement subset.
tuple(sent_model_disagree[_flag].value_counts() for _flag in ("combined_error", "total_error"))

(True    834
 Name: combined_error, dtype: int64,
 False    764
 True      70
 Name: total_error, dtype: int64)

In [50]:
# Gold-label distribution of the rows flagged as total_error.
_all_wrong = sent_model_disagree["total_error"]
sent_model_disagree.loc[_all_wrong, "actual"].value_counts()

2    36
1    27
0     7
Name: actual, dtype: int64

In [51]:
# Grouped sentence records of the claims flagged as total_error.
_total_error_ids = sent_model_disagree[sent_model_disagree["total_error"]].index
_total_error_records = df_sf_sent_grp.loc[_total_error_ids].reset_index().to_dict("records")
write_jsonl(error_p / "scifact" / "sentence" / "meanproba_model_disagree_alltrain_total_error.jsonl", _total_error_records)

PosixPath('../../errors/scifact/sentence/meanproba_model_disagree_alltrain_total_error.jsonl')

In [52]:
# Re-derive the agreement/error columns per training corpus, keeping only that
# corpus' model columns next to the gold label.
# The fever pattern uses a plain character class: "[b|x]" would also accept a
# literal "|". Trailing ".*" is dropped because filter(regex=...) already
# searches rather than full-matches.
_gold = sent_model_disagree[["actual"]]
sent_model_disagree_f = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(regex=r"^fever-[bx]")], axis=1))
sent_model_disagree_f_cf = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(like="fever-climatefever-")], axis=1))
sent_model_disagree_f_cfp = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(like="fever-climatefeverpure-")], axis=1))
sent_model_disagree_cf = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(regex=r"^climatefever-")], axis=1))
sent_model_disagree_cfp = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(regex=r"^climatefeverpure-")], axis=1))

In [53]:
# combined_error counts per training-data subset (same order as the original tuple).
tuple(
    _subset["combined_error"].value_counts()
    for _subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(True     606
 False    228
 Name: combined_error, dtype: int64,
 True     609
 False    225
 Name: combined_error, dtype: int64,
 True     607
 False    227
 Name: combined_error, dtype: int64,
 True     594
 False    240
 Name: combined_error, dtype: int64,
 True     588
 False    246
 Name: combined_error, dtype: int64)

In [54]:
# total_error counts per training-data subset (same order as the original tuple).
tuple(
    _subset["total_error"].value_counts()
    for _subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(True     417
 False    417
 Name: total_error, dtype: int64,
 False    452
 True     382
 Name: total_error, dtype: int64,
 False    498
 True     336
 Name: total_error, dtype: int64,
 False    484
 True     350
 Name: total_error, dtype: int64,
 False    521
 True     313
 Name: total_error, dtype: int64)

In [55]:
# Gold-label distribution of the total_error rows in each subset.
tuple(
    _subset.loc[_subset["total_error"], "actual"].value_counts()
    for _subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(1    302
 2     67
 0     48
 Name: actual, dtype: int64,
 1    184
 2    160
 0     38
 Name: actual, dtype: int64,
 1    181
 2     86
 0     69
 Name: actual, dtype: int64,
 2    187
 0     83
 1     80
 Name: actual, dtype: int64,
 2    114
 1    109
 0     90
 Name: actual, dtype: int64)

In [56]:
# One dump per training-data subset: grouped sentence records of its total_error claims.
_dumps = (
    ("meanproba_model_disagree_fever_total_error.jsonl", sent_model_disagree_f),
    ("meanproba_model_disagree_climatefeverpure_total_error.jsonl", sent_model_disagree_cfp),
    ("meanproba_model_disagree_fever-climatefeverpure_total_error.jsonl", sent_model_disagree_f_cfp),
    ("meanproba_model_disagree_climatefever_total_error.jsonl", sent_model_disagree_cf),
    ("meanproba_model_disagree_fever-climatefever_total_error.jsonl", sent_model_disagree_f_cf),
)
for _filename, _subset in _dumps:
    _written = write_jsonl(
        error_p / "scifact" / "sentence" / _filename,
        df_sf_sent_grp.loc[_subset[_subset["total_error"]].index].reset_index().to_dict("records")
    )
# Show the result of the last write, as the original cell did.
_written

PosixPath('../../errors/scifact/sentence/meanproba_model_disagree_fever-climatefever_total_error.jsonl')

## SciFact Pipeline

In [57]:
# Collapse pipeline sentence rows to one row per claim: evidence sentences and
# their cosine scores become lists, claim and labels take the per-claim max.
_sent_aggregations = {"claim": "max", "evidence": list, "cosine_score": list, "labels": "max"}
_sent_by_claim = (
    df_sf_pipe_sent.reset_index()
    .groupby("claim_id", as_index=False, sort=False)
    .agg(_sent_aggregations)
)
# Reorder rows to follow the pipeline document-level frame's claim order.
df_sf_sent_grp = _sent_by_claim.set_index("claim_id").loc[df_sf_pipe_doc.index]

#### Majority Aggregation

In [58]:
# Gold label plus one predicted-label column per sentence model (majority aggregation, pipeline data).
df_maj_compile = compile_pred_actual(
    df_sf_sent_grp.reset_index().to_dict("records"),
    sent_macro_verdict_majority_metrics["scifactpipeline"]["all"],
)
# Per claim: the distinct labels found in the prediction columns (all but "actual").
_sent_votes = df_maj_compile[df_maj_compile.columns[1:]].stack().groupby(level=0, sort=False)
df_maj_compile["model_agreement"] = _sent_votes.apply(lambda labels: labels.unique().tolist())

Get model total agreement and disagreement regardless of training data

In [59]:
# Number of distinct labels the models voted for, per claim.
_n_distinct = df_maj_compile["model_agreement"].apply(len)
sent_model_agree = df_maj_compile[_n_distinct == 1]
sent_model_disagree = df_maj_compile[_n_distinct > 1]
(sent_model_agree.shape, sent_model_disagree.shape)

((389, 12), (720, 12))

##### Model complete agreement analysis

In [60]:
# The single label all models agreed on, compared against the gold label.
_consensus = sent_model_agree["model_agreement"].apply(lambda labels: labels[0])
sent_model_agree_error = sent_model_agree[sent_model_agree["actual"] != _consensus]
sent_model_agree_error["actual"].value_counts()

1    100
2     37
0      3
Name: actual, dtype: int64

In [61]:
# Grouped sentence records of the claims where all models agreed on a wrong label.
_agree_error_records = (
    df_sf_sent_grp
    .join(sent_model_agree_error[["model_agreement"]], how="inner")
    .reset_index()
    .to_dict("records")
)
write_jsonl(error_p / "scifactpipeline" / "sentence" / "majority_model_agree_total_error.jsonl", _agree_error_records)

PosixPath('../../errors/scifactpipeline/sentence/majority_model_agree_total_error.jsonl')

##### Model Disagreement analysis

In [62]:
sent_model_disagree = assign_disagree_derive_cols(sent_model_disagree)
# Counts of each error flag over the disagreement subset.
tuple(sent_model_disagree[_flag].value_counts() for _flag in ("combined_error", "total_error"))

(True    720
 Name: combined_error, dtype: int64,
 False    638
 True      82
 Name: total_error, dtype: int64)

In [63]:
# Gold-label distribution of the rows flagged as total_error.
_all_wrong = sent_model_disagree["total_error"]
sent_model_disagree.loc[_all_wrong, "actual"].value_counts()

1    52
2    22
0     8
Name: actual, dtype: int64

In [64]:
# Grouped sentence records of the claims flagged as total_error.
_total_error_ids = sent_model_disagree[sent_model_disagree["total_error"]].index
_total_error_records = df_sf_sent_grp.loc[_total_error_ids].reset_index().to_dict("records")
write_jsonl(error_p / "scifactpipeline" / "sentence" / "majority_model_disagree_alltrain_total_error.jsonl", _total_error_records)

PosixPath('../../errors/scifactpipeline/sentence/majority_model_disagree_alltrain_total_error.jsonl')

In [65]:
# Re-derive the agreement/error columns per training corpus, keeping only that
# corpus' model columns next to the gold label.
# The fever pattern uses a plain character class: "[b|x]" would also accept a
# literal "|". Trailing ".*" is dropped because filter(regex=...) already
# searches rather than full-matches.
_gold = sent_model_disagree[["actual"]]
sent_model_disagree_f = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(regex=r"^fever-[bx]")], axis=1))
sent_model_disagree_f_cf = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(like="fever-climatefever-")], axis=1))
sent_model_disagree_f_cfp = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(like="fever-climatefeverpure-")], axis=1))
sent_model_disagree_cf = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(regex=r"^climatefever-")], axis=1))
sent_model_disagree_cfp = assign_disagree_derive_cols(pd.concat([_gold, sent_model_disagree.filter(regex=r"^climatefeverpure-")], axis=1))

In [66]:
# combined_error counts per training-data subset (same order as the original tuple).
tuple(
    _subset["combined_error"].value_counts()
    for _subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(True     561
 False    159
 Name: combined_error, dtype: int64,
 True     529
 False    191
 Name: combined_error, dtype: int64,
 True     550
 False    170
 Name: combined_error, dtype: int64,
 True     504
 False    216
 Name: combined_error, dtype: int64,
 True     527
 False    193
 Name: combined_error, dtype: int64)

In [67]:
# total_error counts per training-data subset (same order as the original tuple).
tuple(
    _subset["total_error"].value_counts()
    for _subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(True     363
 False    357
 Name: total_error, dtype: int64,
 True     373
 False    347
 Name: total_error, dtype: int64,
 False    414
 True     306
 Name: total_error, dtype: int64,
 False    394
 True     326
 Name: total_error, dtype: int64,
 False    453
 True     267
 Name: total_error, dtype: int64)

In [68]:
# Gold-label distribution of the total_error rows in each subset.
tuple(
    _subset.loc[_subset["total_error"], "actual"].value_counts()
    for _subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(1    275
 2     44
 0     44
 Name: actual, dtype: int64,
 1    209
 2    137
 0     27
 Name: actual, dtype: int64,
 1    203
 2     60
 0     43
 Name: actual, dtype: int64,
 2    156
 1    137
 0     33
 Name: actual, dtype: int64,
 1    147
 2     79
 0     41
 Name: actual, dtype: int64)

In [69]:
# Export, per training-data group, the rows of `df_sf_sent_grp` whose index is
# flagged `total_error` in that group's disagreement frame.
# Each entry pairs the output file name with the frame that selects its rows.
majority_error_exports = (
    ("majority_model_disagree_fever_total_error.jsonl", sent_model_disagree_f),
    ("majority_model_disagree_climatefeverpure_total_error.jsonl", sent_model_disagree_cfp),
    ("majority_model_disagree_fever-climatefeverpure_total_error.jsonl", sent_model_disagree_f_cfp),
    ("majority_model_disagree_climatefever_total_error.jsonl", sent_model_disagree_cf),
    ("majority_model_disagree_fever-climatefever_total_error.jsonl", sent_model_disagree_f_cf),
)
for export_name, disagree_frame in majority_error_exports:
    flagged_index = disagree_frame[disagree_frame["total_error"]].index
    last_written = write_jsonl(
        error_p / "scifactpipeline" / "sentence" / export_name,
        df_sf_sent_grp.loc[flagged_index].reset_index().to_dict("records"),
    )
# Keep the final call's return value as the cell output.
last_written

PosixPath('../../errors/scifactpipeline/sentence/majority_model_disagree_fever-climatefever_total_error.jsonl')

#### Mean Probability Aggregation

In [70]:
# Build the actual-vs-predicted table for the mean-probability aggregation.
# `compile_pred_actual` is defined elsewhere in this notebook.
mpr_records = df_sf_sent_grp.reset_index().to_dict("records")
mpr_metrics = sent_macro_verdict_meanproba_metrics["scifactpipeline"]["all"]
df_mpr_compile = compile_pred_actual(mpr_records, mpr_metrics)

# For every row, list the distinct values found across all columns after the
# first (presumably column 0 is `actual` — TODO confirm against compile_pred_actual).
mpr_pred_cols = df_mpr_compile.columns[1:]
df_mpr_compile["model_agreement"] = (
    df_mpr_compile[mpr_pred_cols]
    .stack()
    .groupby(level=0, sort=False)
    .apply(lambda row_preds: row_preds.unique().tolist())
)

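Split the rows into those where every model predicts the same label (total agreement) and those where at least two models differ (disagreement), regardless of training data.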

In [71]:
# Number of distinct predicted labels per row: 1 means every model agrees.
n_distinct_preds = df_mpr_compile["model_agreement"].apply(len)
sent_model_agree = df_mpr_compile[n_distinct_preds == 1]
sent_model_disagree = df_mpr_compile[n_distinct_preds > 1]
sent_model_agree.shape, sent_model_disagree.shape

((251, 12), (858, 12))

##### Model complete agreement analysis

In [72]:
# When all models agree, `model_agreement` holds a single label; rows where
# that label differs from `actual` are unanimous mistakes.
unanimous_pred = sent_model_agree["model_agreement"].apply(lambda labels: labels[0])
is_unanimous_error = sent_model_agree["actual"] != unanimous_pred
sent_model_agree_error = sent_model_agree[is_unanimous_error]
sent_model_agree_error["actual"].value_counts()

1    48
2    18
0     2
Name: actual, dtype: int64

In [73]:
# Export the grouped sentence records for the unanimous-but-wrong rows,
# with the agreed label attached via an inner join on the index.
agree_error_path = error_p / "scifactpipeline" / "sentence" / "meanproba_model_agree_total_error.jsonl"
agree_error_records = (
    df_sf_sent_grp
    .join(sent_model_agree_error[["model_agreement"]], how="inner")
    .reset_index()
    .to_dict("records")
)
write_jsonl(agree_error_path, agree_error_records)

PosixPath('../../errors/scifactpipeline/sentence/meanproba_model_agree_total_error.jsonl')

##### Model Disagreement analysis

In [74]:
# Add the derived error columns to the disagreement rows, then summarise both.
# NOTE(review): this rebinds `sent_model_disagree` to the helper's output, so
# re-running the cell passes already-derived columns back in — confirm that is harmless.
sent_model_disagree = assign_disagree_derive_cols(sent_model_disagree)
tuple(
    sent_model_disagree[error_col].value_counts()
    for error_col in ("combined_error", "total_error")
)

(True    858
 Name: combined_error, dtype: int64,
 False    784
 True      74
 Name: total_error, dtype: int64)

In [75]:
# Distribution of `actual` labels among the rows flagged `total_error`.
is_total_error = sent_model_disagree["total_error"]
sent_model_disagree.loc[is_total_error, "actual"].value_counts()

2    50
1    14
0    10
Name: actual, dtype: int64

In [76]:
# Export the grouped sentence records for rows flagged `total_error`
# across all models, regardless of training data.
alltrain_error_index = sent_model_disagree[sent_model_disagree["total_error"]].index
write_jsonl(
    error_p / "scifactpipeline" / "sentence" / "meanproba_model_disagree_alltrain_total_error.jsonl",
    df_sf_sent_grp.loc[alltrain_error_index].reset_index().to_dict("records"),
)

PosixPath('../../errors/scifactpipeline/sentence/meanproba_model_disagree_alltrain_total_error.jsonl')

In [77]:
# Re-derive the error columns separately for each training-data group.
# Every group frame is the `actual` column plus the columns whose names match
# that group's pattern, passed through `assign_disagree_derive_cols`
# (defined elsewhere in this notebook).
#
# The fever-only pattern uses the character class `[bx]`. The previous
# `[b|x]` also accepted a literal "|", because "|" is not alternation inside
# square brackets. The "^" anchor keeps this pattern from matching
# "fever-climatefever..." columns partway through a "climatefever..." name.
actual_col = sent_model_disagree[["actual"]]
sent_model_disagree_f = assign_disagree_derive_cols(pd.concat([actual_col, sent_model_disagree.filter(regex="^fever-[bx].*")], axis=1))
sent_model_disagree_f_cf = assign_disagree_derive_cols(pd.concat([actual_col, sent_model_disagree.filter(like="fever-climatefever-")], axis=1))
sent_model_disagree_f_cfp = assign_disagree_derive_cols(pd.concat([actual_col, sent_model_disagree.filter(like="fever-climatefeverpure-")], axis=1))
sent_model_disagree_cf = assign_disagree_derive_cols(pd.concat([actual_col, sent_model_disagree.filter(regex="^climatefever-.*")], axis=1))
sent_model_disagree_cfp = assign_disagree_derive_cols(pd.concat([actual_col, sent_model_disagree.filter(regex="^climatefeverpure-.*")], axis=1))

In [78]:
# Value counts of the boolean `combined_error` column, one Series per
# training-data group, in the order: fever, climatefeverpure,
# fever-climatefeverpure, climatefever, fever-climatefever.
tuple(
    group_frame["combined_error"].value_counts()
    for group_frame in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(True     618
 False    240
 Name: combined_error, dtype: int64,
 True     625
 False    233
 Name: combined_error, dtype: int64,
 True     605
 False    253
 Name: combined_error, dtype: int64,
 True     594
 False    264
 Name: combined_error, dtype: int64,
 True     582
 False    276
 Name: combined_error, dtype: int64)

In [79]:
# Value counts of the boolean `total_error` column, one Series per
# training-data group, in the order: fever, climatefeverpure,
# fever-climatefeverpure, climatefever, fever-climatefever.
tuple(
    group_frame["total_error"].value_counts()
    for group_frame in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(False    440
 True     418
 Name: total_error, dtype: int64,
 False    468
 True     390
 Name: total_error, dtype: int64,
 False    525
 True     333
 Name: total_error, dtype: int64,
 False    515
 True     343
 Name: total_error, dtype: int64,
 False    552
 True     306
 Name: total_error, dtype: int64)

In [80]:
# For each training-data group, keep only the rows flagged `total_error` and
# count their `actual` labels. Group order: fever, climatefeverpure,
# fever-climatefeverpure, climatefever, fever-climatefever.
tuple(
    group_frame.loc[group_frame["total_error"], "actual"].value_counts()
    for group_frame in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(1    307
 2     66
 0     45
 Name: actual, dtype: int64,
 1    190
 2    165
 0     35
 Name: actual, dtype: int64,
 1    169
 2     93
 0     71
 Name: actual, dtype: int64,
 2    198
 0     77
 1     68
 Name: actual, dtype: int64,
 2    124
 0    104
 1     78
 Name: actual, dtype: int64)

In [81]:
# Export, per training-data group, the rows of `df_sf_sent_grp` whose index is
# flagged `total_error` in that group's disagreement frame.
# Each entry pairs the output file name with the frame that selects its rows.
meanproba_error_exports = (
    ("meanproba_model_disagree_fever_total_error.jsonl", sent_model_disagree_f),
    ("meanproba_model_disagree_climatefeverpure_total_error.jsonl", sent_model_disagree_cfp),
    ("meanproba_model_disagree_fever-climatefeverpure_total_error.jsonl", sent_model_disagree_f_cfp),
    ("meanproba_model_disagree_climatefever_total_error.jsonl", sent_model_disagree_cf),
    ("meanproba_model_disagree_fever-climatefever_total_error.jsonl", sent_model_disagree_f_cf),
)
for export_name, disagree_frame in meanproba_error_exports:
    flagged_index = disagree_frame[disagree_frame["total_error"]].index
    last_written = write_jsonl(
        error_p / "scifactpipeline" / "sentence" / export_name,
        df_sf_sent_grp.loc[flagged_index].reset_index().to_dict("records"),
    )
# Keep the final call's return value as the cell output.
last_written

PosixPath('../../errors/scifactpipeline/sentence/meanproba_model_disagree_fever-climatefever_total_error.jsonl')