In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(0, "../src")
import pickle as pkl
from pathlib import Path
from functools import partial
from collections import Counter

import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer, util

import constants
from gen.util import read_data, write_jsonl
from gen.special import entropy3

# Init

In [3]:
# Sentence encoder used to embed claims and evidence.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# NOTE(review): absolute, user-specific root -- adjust when running elsewhere.
scratch_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch")

# Output directories for the error dumps (document-level and sentence-level).
error_p = scratch_p / "thesis" / "errors" / "scifact"
for out_dir in (error_p / "document", error_p / "sentence"):
    # parents=True also creates error_p and any missing ancestors, so the
    # cell no longer fails when the directory tree does not exist yet.
    out_dir.mkdir(parents=True, exist_ok=True)

# Gold SciFact test records with document-level and sentence-level evidence.
sf_actual_doc = read_data(scratch_p / "dumps" / "bert-data-doc-evidence" / "scifact.all.test.n5.jsonl")
sf_actual_sent = read_data(scratch_p / "dumps" / "bert-data-sent-evidence" / "scifact.all.test.n5.jsonl")

In [4]:
# Attach the claim/evidence cosine similarity to every record as
# "cosine_score" (document-level records first, then sentence-level records).
for records in (sf_actual_doc, sf_actual_sent):
    claim_embedding = sbert_model.encode([doc["claim"] for doc in records], convert_to_tensor=True)
    evidence_embedding = sbert_model.encode([doc["evidence"] for doc in records], convert_to_tensor=True)
    # Only the matching (i, i) pairs are needed, so score row-wise instead of
    # building the full N x N similarity matrix and reading its diagonal.
    cosine_score = util.pairwise_cos_sim(evidence_embedding, claim_embedding)

    # One device->host transfer for the whole vector rather than one per record.
    for doc, score in zip(records, cosine_score.cpu().tolist()):
        doc["cosine_score"] = score

In [5]:
# Claim-id-indexed frames of the document-level and sentence-level records.
df_sf_actual_doc, df_sf_actual_sent = (
    pd.DataFrame(records).set_index("claim_id")
    for records in (sf_actual_doc, sf_actual_sent)
)

In [6]:
result_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/results/metrics")

# Every pickle in result_p is bound to a notebook-level variable named after
# its file stem; the printed stems are the names that become available.
# NOTE: pickle.load can execute arbitrary code -- only point result_p at
# trusted files.
for res in sorted(result_p.iterdir()):
    # Skip directories and hidden entries (e.g. .ipynb_checkpoints): they are
    # not metric pickles and would either crash the load or add junk globals.
    if not res.is_file() or res.name.startswith("."):
        continue
    print(res.stem)
    with res.open("rb") as fn:
        globals()[res.stem] = pkl.load(fn)

concatenate_evidences_metrics
sent_macro_verdict_meanproba_metrics
sent_micro_verdict_metrics
sent_macro_verdict_majority_metrics


In [7]:
def compile_pred_actual(actual, pred):
    """Build one table of gold labels and per-model predicted label ids.

    Parameters
    ----------
    actual : sequence of dict
        Gold records; each must provide "claim_id" and "labels".
    pred : iterable
        Prediction objects exposing ``_score_name`` (the text before the first
        "." becomes the column name) and ``predictions``, a sequence of dicts
        with a "predicted_label" key that is mapped through
        ``constants.LABEL2ID``.

    Returns
    -------
    pandas.DataFrame
        Indexed by "id" (the claim id), with an "actual" column followed by one
        column per prediction object, in the order given.

    Raises
    ------
    AssertionError
        If a prediction object does not hold exactly one prediction per record.

    Notes
    -----
    Predictions are aligned with ``actual`` purely by position.
    NOTE(review): only the lengths are checked -- confirm the prediction order
    matches the order of ``actual`` at the call sites.
    """
    df_actual = pd.DataFrame(
        [{"id": doc["claim_id"], "actual": doc["labels"]} for doc in actual],
        columns=["id", "actual"],
    )

    pred_frames = []
    for p in pred:
        assert len(actual) == len(p.predictions), (
            f"{p._score_name}: expected {len(actual)} predictions, got {len(p.predictions)}"
        )
        # Resolve the column name once per model instead of once per row.
        column = p._score_name.split(".")[0]
        pred_frames.append(pd.DataFrame(
            {column: [constants.LABEL2ID[doc["predicted_label"]] for doc in p.predictions]}
        ))

    # A single flat concat also works when `pred` is empty (only gold labels are returned).
    return pd.concat([df_actual, *pred_frames], axis=1).set_index("id")

def assign_disagree_derive_cols(df, entropy_base: int = 3):
    """Add agreement and error columns derived from the per-model predictions.

    Model prediction columns are the columns whose name contains "-".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "actual" column plus the model prediction columns.
    entropy_base : int, default 3
        Forwarded to ``entropy3`` as ``base``.

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns added (or overwritten):

        * ``model_agreement`` -- list of ``(label, count)`` pairs over the
          model predictions of the row, most frequent first.
        * ``entropy`` -- ``entropy3`` applied to the row of model predictions.
        * ``combined_error`` -- True if at least one model's predicted label
          differs from the actual label.
        * ``total_error`` -- True if no model predicted the actual label.
    """
    model_preds = df.filter(like="-")
    row_entropy = partial(entropy3, base=entropy_base)

    # Tally every distinct label. An untruncated most_common() keeps
    # total_error correct even with more than three distinct labels in a row.
    agreement = (
        model_preds.stack()
        .groupby(level=0, sort=False)
        .apply(lambda labels: Counter(labels.tolist()).most_common())
    )
    df = df.assign(
        model_agreement=agreement,
        entropy=model_preds.apply(row_entropy, axis=1),
    )

    n_distinct = df["model_agreement"].apply(len)
    top_label = df["model_agreement"].apply(lambda counts: counts[0][0])
    df = df.assign(
        # Either the models disagree among themselves (so at least one is
        # wrong) or they all agree on a label that is not the actual one.
        combined_error=(n_distinct > 1).values | (df["actual"] != top_label).values,
        total_error=[
            actual not in [label for label, _ in counts]
            for actual, counts in zip(df["actual"], df["model_agreement"])
        ],
    )
    return df

# Document Model

## SciFact Oracle

In [8]:
# Gold label plus one predicted-label column per document-level model.
doc_predictions = concatenate_evidences_metrics["scifact"]["all"]
df_doc_compile = compile_pred_actual(sf_actual_doc, doc_predictions)

# Per claim: (label, count) pairs over all model columns, i.e. everything after "actual".
doc_pred_cols = df_doc_compile.columns[1:]
df_doc_compile["model_agreement"] = (
    df_doc_compile[doc_pred_cols]
    .stack()
    .groupby(level=0, sort=False)
    .apply(lambda labels: Counter(labels.tolist()).most_common(3))
)

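Split the claims into those where every model predicts the same label (total agreement) and those where at least two models differ (disagreement), regardless of training data.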

In [9]:
# Number of distinct labels predicted per claim: 1 means every model agrees.
doc_n_distinct = df_doc_compile["model_agreement"].map(len)
doc_model_agree = df_doc_compile.loc[doc_n_distinct == 1]
doc_model_disagree = df_doc_compile.loc[doc_n_distinct > 1]
doc_model_agree.shape, doc_model_disagree.shape

((174, 11), (935, 11))

### Model complete agreement analysis

In [10]:
# Claims where all models agree on a label that is not the actual label.
doc_unanimous_label = doc_model_agree["model_agreement"].map(lambda counts: counts[0][0])
doc_model_agree_error = doc_model_agree.loc[doc_model_agree["actual"].ne(doc_unanimous_label)]
doc_model_agree_error

Unnamed: 0_level_0,actual,climatefeverpure-bert-base-uncased,fever-climatefeverpure-da,fever-bert-base-uncased,climatefeverpure-da,fever-climatefeverpure-bert-base-uncased,fever-da,fever-xlnet-base-cased,fever-climatefeverpure-xlnet-base-cased,climatefeverpure-xlnet-base-cased,model_agreement
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
scifact|105,2,0,0,0,0,0,0,0,0,0,"[(0, 9)]"
scifact|184,0,2,2,2,2,2,2,2,2,2,"[(2, 9)]"
scifact|299,2,0,0,0,0,0,0,0,0,0,"[(0, 9)]"
scifact|317,2,0,0,0,0,0,0,0,0,0,"[(0, 9)]"
scifact|500,2,0,0,0,0,0,0,0,0,0,"[(0, 9)]"
scifact|1101,2,0,0,0,0,0,0,0,0,0,"[(0, 9)]"
scifact|1372,0,1,1,1,1,1,1,1,1,1,"[(1, 9)]"
scifact|1373,2,1,1,1,1,1,1,1,1,1,"[(1, 9)]"
scifact|1386,2,0,0,0,0,0,0,0,0,0,"[(0, 9)]"
scifact|718,2,0,0,0,0,0,0,0,0,0,"[(0, 9)]"


In [11]:
# Dump the unanimous-but-wrong claims together with their source record.
doc_agree_error_records = (
    df_sf_actual_doc
    .join(doc_model_agree_error[["model_agreement"]], how="inner")
    .reset_index(names="id")
    .to_dict("records")
)
write_jsonl(error_p / "document" / "doc_model_agree_total_error.jsonl", doc_agree_error_records)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/errors/scifact/document/doc_model_agree_total_error.jsonl')

### Model Disagreement analysis

In [12]:
# Add the agreement / entropy / error columns, then count both error flags.
doc_model_disagree = assign_disagree_derive_cols(doc_model_disagree)
tuple(
    doc_model_disagree[flag].value_counts()
    for flag in ("combined_error", "total_error")
)

(True    935
 Name: combined_error, dtype: int64,
 False    894
 True      41
 Name: total_error, dtype: int64)

In [13]:
# Claims on which the models disagree and none of them predicts the actual label.
doc_alltrain_failures = doc_model_disagree.loc[doc_model_disagree["total_error"], ["model_agreement"]]
doc_alltrain_records = (
    df_sf_actual_doc
    .join(doc_alltrain_failures, how="inner")
    .reset_index(names="id")
    .to_dict("records")
)
write_jsonl(error_p / "document" / "doc_model_disagree_alltrain_total_error.jsonl", doc_alltrain_records)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/errors/scifact/document/doc_model_disagree_alltrain_total_error.jsonl')

In [14]:
# Re-derive the agreement/error columns per training-data group (entropy base 2).
# The prefixes are anchored with ^ so that e.g. "fever-" can never match inside
# a longer "...fever-" column name, and [bxd] is a plain character class
# (the former [b|x|d] also matched a literal "|").
doc_model_disagree_f = assign_disagree_derive_cols(pd.concat([doc_model_disagree[["actual"]], doc_model_disagree.filter(regex=r"^fever-[bxd]")], axis=1), 2)
doc_model_disagree_cfp = assign_disagree_derive_cols(pd.concat([doc_model_disagree[["actual"]], doc_model_disagree.filter(regex=r"^climatefeverpure-[bxd]")], axis=1), 2)
doc_model_disagree_c_fp = assign_disagree_derive_cols(pd.concat([doc_model_disagree[["actual"]], doc_model_disagree.filter(regex=r"^fever-climatefeverpure-[bxd]")], axis=1), 2)

In [15]:
# combined_error counts per group: fever, climatefeverpure, fever-climatefeverpure.
tuple(
    subset["combined_error"].value_counts()
    for subset in (doc_model_disagree_f, doc_model_disagree_cfp, doc_model_disagree_c_fp)
)

(True     595
 False    340
 Name: combined_error, dtype: int64,
 True     486
 False    449
 Name: combined_error, dtype: int64,
 True     647
 False    288
 Name: combined_error, dtype: int64)

In [16]:
# total_error counts per group: fever, climatefeverpure, fever-climatefeverpure.
tuple(
    subset["total_error"].value_counts()
    for subset in (doc_model_disagree_f, doc_model_disagree_cfp, doc_model_disagree_c_fp)
)

(False    824
 True     111
 Name: total_error, dtype: int64,
 False    791
 True     144
 Name: total_error, dtype: int64,
 False    847
 True      88
 Name: total_error, dtype: int64)

In [17]:
# One dump per training-data group: the claims that no model of the group
# predicted correctly, joined with their source record.
doc_group_error_files = {
    "doc_model_disagree_fever_total_error.jsonl": doc_model_disagree_f,
    "doc_model_disagree_climatefeverpure_total_error.jsonl": doc_model_disagree_cfp,
    "doc_model_disagree_fever-climatefeverpure_total_error.jsonl": doc_model_disagree_c_fp,
}
for file_name, subset in doc_group_error_files.items():
    group_failures = subset.loc[subset["total_error"], ["model_agreement"]]
    written_path = write_jsonl(
        error_p / "document" / file_name,
        df_sf_actual_doc.join(group_failures, how="inner").reset_index(names="id").to_dict("records"),
    )
# Show the result of the last write, as the cell did before.
written_path

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/errors/scifact/document/doc_model_disagree_fever-climatefeverpure_total_error.jsonl')

# Sentence Model

## SciFact Oracle

In [18]:
# Collapse the sentence-level rows to one row per claim: evidence sentences and
# their cosine scores are gathered into lists, and the claim label is the
# maximum of its sentence labels.
sent_rows = pd.DataFrame(sf_actual_sent)
sent_rows_by_claim = sent_rows.groupby(["claim_id", "claim"], as_index=False, sort=False)
df_sf_sent_grp = sent_rows_by_claim[["evidence", "cosine_score", "labels"]].agg(
    {"evidence": list, "cosine_score": list, "labels": "max"}
)
df_sf_sent_grp = df_sf_sent_grp.set_index("claim_id")

#### Majority Aggregation

In [19]:
# Gold label plus one predicted-label column per model (majority aggregation).
maj_claim_records = df_sf_sent_grp.reset_index().to_dict("records")
maj_predictions = sent_macro_verdict_majority_metrics["scifact"]["all"]
df_maj_compile = compile_pred_actual(maj_claim_records, maj_predictions)

# Per claim: the distinct labels predicted across all model columns (everything after "actual").
maj_pred_cols = df_maj_compile.columns[1:]
df_maj_compile["model_agreement"] = (
    df_maj_compile[maj_pred_cols]
    .stack()
    .groupby(level=0, sort=False)
    .apply(lambda labels: labels.unique().tolist())
)

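Split the claims into those where every model predicts the same label (total agreement) and those where at least two models differ (disagreement), regardless of training data.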

In [20]:
# Number of distinct labels predicted per claim: 1 means every model agrees.
maj_n_distinct = df_maj_compile["model_agreement"].map(len)
sent_model_agree = df_maj_compile.loc[maj_n_distinct == 1]
sent_model_disagree = df_maj_compile.loc[maj_n_distinct > 1]
sent_model_agree.shape, sent_model_disagree.shape

((295, 12), (814, 12))

##### Model complete agreement analysis

In [21]:
# Claims where all models agree on a label that is not the actual label.
maj_unanimous_label = sent_model_agree["model_agreement"].map(lambda labels: labels[0])
sent_model_agree_error = sent_model_agree.loc[sent_model_agree["actual"].ne(maj_unanimous_label)]
sent_model_agree_error

Unnamed: 0_level_0,actual,fever-climatefever-xlnet-base-cased,climatefeverpure-bert-base-uncased,climatefever-xlnet-base-cased,fever-bert-base-uncased,fever-climatefeverpure-bert-base-uncased,fever-climatefever-bert-base-uncased,climatefever-bert-base-uncased,fever-xlnet-base-cased,fever-climatefeverpure-xlnet-base-cased,climatefeverpure-xlnet-base-cased,model_agreement
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
scifact|63,1,0,0,0,0,0,0,0,0,0,0,[0]
scifact|81,1,0,0,0,0,0,0,0,0,0,0,[0]
scifact|82,1,0,0,0,0,0,0,0,0,0,0,[0]
scifact|119,2,0,0,0,0,0,0,0,0,0,0,[0]
scifact|149,2,0,0,0,0,0,0,0,0,0,0,[0]
...,...,...,...,...,...,...,...,...,...,...,...,...
scifact|1232,2,0,0,0,0,0,0,0,0,0,0,[0]
scifact|1279,1,0,0,0,0,0,0,0,0,0,0,[0]
scifact|1292,1,2,2,2,2,2,2,2,2,2,2,[2]
scifact|1316,1,0,0,0,0,0,0,0,0,0,0,[0]


In [22]:
# Dump the unanimous-but-wrong claims together with their grouped evidence.
maj_agree_error_records = (
    df_sf_sent_grp
    .join(sent_model_agree_error[["model_agreement"]], how="inner")
    .reset_index()
    .to_dict("records")
)
write_jsonl(error_p / "sentence" / "majority_model_agree_total_error.jsonl", maj_agree_error_records)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/errors/scifact/sentence/majority_model_agree_total_error.jsonl')

##### Model Disagreement analysis

In [23]:
# Add the agreement / entropy / error columns, then count both error flags.
sent_model_disagree = assign_disagree_derive_cols(sent_model_disagree)
tuple(
    sent_model_disagree[flag].value_counts()
    for flag in ("combined_error", "total_error")
)

(True    814
 Name: combined_error, dtype: int64,
 False    766
 True      48
 Name: total_error, dtype: int64)

In [24]:
# Claims on which the models disagree and none of them predicts the actual label.
maj_alltrain_failed_ids = sent_model_disagree.loc[sent_model_disagree["total_error"]].index
maj_alltrain_records = df_sf_sent_grp.loc[maj_alltrain_failed_ids].reset_index().to_dict("records")
write_jsonl(error_p / "sentence" / "majority_model_disagree_alltrain_total_error.jsonl", maj_alltrain_records)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/errors/scifact/sentence/majority_model_disagree_alltrain_total_error.jsonl')

In [25]:
# Re-derive the agreement/error columns per training-data group.
# Every group is selected by an anchored column-name prefix; [bx] is a plain
# character class (the former [b|x] also matched a literal "|").
sent_model_disagree_f = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^fever-[bx]")], axis=1))
sent_model_disagree_f_cf = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^fever-climatefever-")], axis=1))
sent_model_disagree_f_cfp = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^fever-climatefeverpure-")], axis=1))
sent_model_disagree_cf = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^climatefever-")], axis=1))
sent_model_disagree_cfp = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^climatefeverpure-")], axis=1))

In [26]:
# combined_error counts per group, in the order:
# fever, climatefeverpure, fever-climatefeverpure, climatefever, fever-climatefever.
tuple(
    subset["combined_error"].value_counts()
    for subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(True     591
 False    223
 Name: combined_error, dtype: int64,
 True     607
 False    207
 Name: combined_error, dtype: int64,
 True     590
 False    224
 Name: combined_error, dtype: int64,
 True     593
 False    221
 Name: combined_error, dtype: int64,
 True     582
 False    232
 Name: combined_error, dtype: int64)

In [27]:
# total_error counts per group, in the order:
# fever, climatefeverpure, fever-climatefeverpure, climatefever, fever-climatefever.
tuple(
    subset["total_error"].value_counts()
    for subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(False    532
 True     282
 Name: total_error, dtype: int64,
 False    408
 True     406
 Name: total_error, dtype: int64,
 False    502
 True     312
 Name: total_error, dtype: int64,
 False    452
 True     362
 Name: total_error, dtype: int64,
 False    526
 True     288
 Name: total_error, dtype: int64)

In [28]:
# One dump per training-data group: the grouped evidence of every claim that
# no model of the group predicted correctly.
majority_group_error_files = {
    "majority_model_disagree_fever_total_error.jsonl": sent_model_disagree_f,
    "majority_model_disagree_climatefeverpure_total_error.jsonl": sent_model_disagree_cfp,
    "majority_model_disagree_fever-climatefeverpure_total_error.jsonl": sent_model_disagree_f_cfp,
    "majority_model_disagree_climatefever_total_error.jsonl": sent_model_disagree_cf,
    "majority_model_disagree_fever-climatefever_total_error.jsonl": sent_model_disagree_f_cf,
}
for file_name, subset in majority_group_error_files.items():
    failed_ids = subset.loc[subset["total_error"]].index
    written_path = write_jsonl(
        error_p / "sentence" / file_name,
        df_sf_sent_grp.loc[failed_ids].reset_index().to_dict("records"),
    )
# Show the result of the last write, as the cell did before.
written_path

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/errors/scifact/sentence/majority_model_disagree_fever-climatefever_total_error.jsonl')

#### Mean Probability Aggregation

In [29]:
# Gold label plus one predicted-label column per model (mean-probability aggregation).
mpr_claim_records = df_sf_sent_grp.reset_index().to_dict("records")
mpr_predictions = sent_macro_verdict_meanproba_metrics["scifact"]["all"]
df_mpr_compile = compile_pred_actual(mpr_claim_records, mpr_predictions)

# Per claim: the distinct labels predicted across all model columns (everything after "actual").
mpr_pred_cols = df_mpr_compile.columns[1:]
df_mpr_compile["model_agreement"] = (
    df_mpr_compile[mpr_pred_cols]
    .stack()
    .groupby(level=0, sort=False)
    .apply(lambda labels: labels.unique().tolist())
)

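Split the claims into those where every model predicts the same label (total agreement) and those where at least two models differ (disagreement), regardless of training data.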

In [30]:
# NOTE: sent_model_agree / sent_model_disagree are rebound here; from this cell
# on they hold the mean-probability split, so run the cells in order.
mpr_n_distinct = df_mpr_compile["model_agreement"].map(len)
sent_model_agree = df_mpr_compile.loc[mpr_n_distinct == 1]
sent_model_disagree = df_mpr_compile.loc[mpr_n_distinct > 1]
sent_model_agree.shape, sent_model_disagree.shape

((275, 12), (834, 12))

##### Model complete agreement analysis

In [31]:
# Claims where all models agree on a label that is not the actual label.
mpr_unanimous_label = sent_model_agree["model_agreement"].map(lambda labels: labels[0])
sent_model_agree_error = sent_model_agree.loc[sent_model_agree["actual"].ne(mpr_unanimous_label)]
sent_model_agree_error

Unnamed: 0_level_0,actual,fever-climatefever-xlnet-base-cased,climatefeverpure-bert-base-uncased,climatefever-xlnet-base-cased,fever-bert-base-uncased,fever-climatefeverpure-bert-base-uncased,fever-climatefever-bert-base-uncased,climatefever-bert-base-uncased,fever-xlnet-base-cased,fever-climatefeverpure-xlnet-base-cased,climatefeverpure-xlnet-base-cased,model_agreement
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
scifact|63,1,0,0,0,0,0,0,0,0,0,0,[0]
scifact|81,1,0,0,0,0,0,0,0,0,0,0,[0]
scifact|82,1,0,0,0,0,0,0,0,0,0,0,[0]
scifact|119,2,0,0,0,0,0,0,0,0,0,0,[0]
scifact|149,2,0,0,0,0,0,0,0,0,0,0,[0]
...,...,...,...,...,...,...,...,...,...,...,...,...
scifact|1226,1,0,0,0,0,0,0,0,0,0,0,[0]
scifact|1232,2,0,0,0,0,0,0,0,0,0,0,[0]
scifact|1279,1,0,0,0,0,0,0,0,0,0,0,[0]
scifact|1339,2,0,0,0,0,0,0,0,0,0,0,[0]


In [32]:
# Dump the unanimous-but-wrong claims together with their grouped evidence.
mpr_agree_error_records = (
    df_sf_sent_grp
    .join(sent_model_agree_error[["model_agreement"]], how="inner")
    .reset_index()
    .to_dict("records")
)
write_jsonl(error_p / "sentence" / "meanproba_model_agree_total_error.jsonl", mpr_agree_error_records)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/errors/scifact/sentence/meanproba_model_agree_total_error.jsonl')

##### Model Disagreement analysis

In [33]:
# Add the agreement / entropy / error columns, then count both error flags.
sent_model_disagree = assign_disagree_derive_cols(sent_model_disagree)
tuple(
    sent_model_disagree[flag].value_counts()
    for flag in ("combined_error", "total_error")
)

(True    834
 Name: combined_error, dtype: int64,
 False    764
 True      70
 Name: total_error, dtype: int64)

In [34]:
# Claims on which the models disagree and none of them predicts the actual label.
mpr_alltrain_failed_ids = sent_model_disagree.loc[sent_model_disagree["total_error"]].index
mpr_alltrain_records = df_sf_sent_grp.loc[mpr_alltrain_failed_ids].reset_index().to_dict("records")
write_jsonl(error_p / "sentence" / "meanproba_model_disagree_alltrain_total_error.jsonl", mpr_alltrain_records)

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/errors/scifact/sentence/meanproba_model_disagree_alltrain_total_error.jsonl')

In [35]:
# Re-derive the agreement/error columns per training-data group.
# Every group is selected by an anchored column-name prefix; [bx] is a plain
# character class (the former [b|x] also matched a literal "|").
sent_model_disagree_f = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^fever-[bx]")], axis=1))
sent_model_disagree_f_cf = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^fever-climatefever-")], axis=1))
sent_model_disagree_f_cfp = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^fever-climatefeverpure-")], axis=1))
sent_model_disagree_cf = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^climatefever-")], axis=1))
sent_model_disagree_cfp = assign_disagree_derive_cols(pd.concat([sent_model_disagree[["actual"]], sent_model_disagree.filter(regex=r"^climatefeverpure-")], axis=1))

In [36]:
# combined_error counts per group, in the order:
# fever, climatefeverpure, fever-climatefeverpure, climatefever, fever-climatefever.
tuple(
    subset["combined_error"].value_counts()
    for subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(True     606
 False    228
 Name: combined_error, dtype: int64,
 True     609
 False    225
 Name: combined_error, dtype: int64,
 True     607
 False    227
 Name: combined_error, dtype: int64,
 True     594
 False    240
 Name: combined_error, dtype: int64,
 True     588
 False    246
 Name: combined_error, dtype: int64)

In [37]:
# total_error counts per group, in the order:
# fever, climatefeverpure, fever-climatefeverpure, climatefever, fever-climatefever.
tuple(
    subset["total_error"].value_counts()
    for subset in (
        sent_model_disagree_f,
        sent_model_disagree_cfp,
        sent_model_disagree_f_cfp,
        sent_model_disagree_cf,
        sent_model_disagree_f_cf,
    )
)

(True     417
 False    417
 Name: total_error, dtype: int64,
 False    452
 True     382
 Name: total_error, dtype: int64,
 False    498
 True     336
 Name: total_error, dtype: int64,
 False    484
 True     350
 Name: total_error, dtype: int64,
 False    521
 True     313
 Name: total_error, dtype: int64)

In [38]:
# One dump per training-data group: the grouped evidence of every claim that
# no model of the group predicted correctly.
meanproba_group_error_files = {
    "meanproba_model_disagree_fever_total_error.jsonl": sent_model_disagree_f,
    "meanproba_model_disagree_climatefeverpure_total_error.jsonl": sent_model_disagree_cfp,
    "meanproba_model_disagree_fever-climatefeverpure_total_error.jsonl": sent_model_disagree_f_cfp,
    "meanproba_model_disagree_climatefever_total_error.jsonl": sent_model_disagree_cf,
    "meanproba_model_disagree_fever-climatefever_total_error.jsonl": sent_model_disagree_f_cf,
}
for file_name, subset in meanproba_group_error_files.items():
    failed_ids = subset.loc[subset["total_error"]].index
    written_path = write_jsonl(
        error_p / "sentence" / file_name,
        df_sf_sent_grp.loc[failed_ids].reset_index().to_dict("records"),
    )
# Show the result of the last write, as the cell did before.
written_path

PosixPath('/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/errors/scifact/sentence/meanproba_model_disagree_fever-climatefever_total_error.jsonl')