In [1]:
# Load IPython's autoreload extension and use mode 2, which reloads imported modules
# before each cell executes, so edits to the project code added to sys.path below
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Init

In [2]:
import json
import sys
import re
import pickle as pkl
from pathlib import Path
from functools import partial
from collections import defaultdict, namedtuple
sys.path.append("../src")

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score

import constants
from scoring import scorer
from gen.util import read_data, write_jsonl
from rte import aggregate

In [3]:
# Root of the scratch area. Every input/output location below is derived from it,
# so relocating the project only requires editing this one line.
scratch_p = Path("/users/k21190024/study/fact-check-transfer-learning/scratch")
dumps_p = scratch_p / "dumps"
thesis_p = scratch_p / "thesis"

# Sentence-level (claim, evidence) data files.
data_sent_micro_p = dumps_p / "bert-data-sent-evidence"

# Model predictions (input) and pickled metrics (output).
pred_p = thesis_p / "predictions"
result_p = thesis_p / "results" / "metrics"

# Claim-level gold data: one directory per dataset, plus a single file for SciFact.
fever_actual_p = dumps_p / "fever-nei-sampled"
cfever_actual_p = dumps_p / "climatefever-neg-sampled"
cfeverpure_actual_p = dumps_p / "climatefeverpure-neg-sampled"
sf_actual_p = dumps_p / "scifact-nei-sampled" / "all.n5.nei.jsonl"

# Collect every "*.n5.nei.jsonl" gold file in directory order, then the SciFact file.
actual_pls = [
    p
    for actual_dir in (fever_actual_p, cfever_actual_p, cfeverpure_actual_p)
    for p in actual_dir.glob("*.n5.nei.jsonl")
]
actual_pls.append(sf_actual_p)
# Files whose stem contains "train" are excluded from evaluation.
actual_pls = [p for p in actual_pls if "train" not in p.stem]

# Concatenate Evidences

In [4]:
# Document-level scores, keyed by dataset name and then by split; each leaf is a
# list with one scorer object per prediction file.
concat_evi_res = {"fever": defaultdict(list), "climatefeverpure": defaultdict(list), "scifact": defaultdict(list)}

for actual_dp in tqdm(actual_pls):
    # Dataset comes from the directory name ("<dataset>-..."), split from the file name ("<split>.n5...").
    dataset = actual_dp.parent.stem.split("-")[0]
    split = actual_dp.stem.split(".")[0]
    if dataset == "climatefever":
        # This dataset is not scored at document level; skip before loading so the
        # gold file is not read only to be thrown away.
        continue
    actual_data = read_data(actual_dp)
    score_obj = scorer.ClimateFEVERScorer if dataset == "climatefeverpure" else scorer.FEVERScorer

    # One scorer per prediction file for this dataset/split; the file stem names the score.
    for pp in pred_p.joinpath("doc", dataset).glob(f"*.{split}.jsonl"):
        preds = read_data(pp)
        concat_evi_res[dataset][split].append(
            score_obj(
                actual_data=actual_data,
                prediction_data=preds,
                score_name=pp.stem,
                oracle_rte=False,
                oracle_ir=True,
                max_evidence=None,
            )
        )

100%|██████████| 7/7 [00:09<00:00,  1.38s/it]


In [5]:
# Serialize the document-level scores to a pickle file under result_p.
concat_evi_pkl_p = result_p / "concatenate_evidences_metrics.pkl"
with concat_evi_pkl_p.open("wb") as fn:
    pkl.dump(concat_evi_res, fn)

In [6]:
# Document-level results on the FEVER dev split: score name, then its report, then a separator.
for score in concat_evi_res["fever"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.9967    0.9967    0.9967      3333
        REFUTES     0.9565    0.8983    0.9265      3333
       SUPPORTS     0.9038    0.9589    0.9306      3333

       accuracy                         0.9513      9999
      macro avg     0.9524    0.9513    0.9513      9999
   weighted avg     0.9524    0.9513    0.9513      9999

===
fever-da.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.7925    0.8446    0.8177      3333
        REFUTES     0.9042    0.3909    0.5459      3333
       SUPPORTS     0.5733    0.8611    0.6883      3333

       accuracy                         0.6989      9999
      macro avg     0.7567    0.6989    0.6840      9999
   weighted avg     0.7567    0.6989    0.6840      9999

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.1028    0.00

In [7]:
# Document-level results on the climatefeverpure dev split: score name, report, separator.
for score in concat_evi_res["climatefeverpure"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.8391    0.7684    0.8022        95
        REFUTES     0.4776    0.6275    0.5424        51
       SUPPORTS     0.7742    0.7273    0.7500       132

       accuracy                         0.7230       278
      macro avg     0.6970    0.7077    0.6982       278
   weighted avg     0.7420    0.7230    0.7297       278

===
fever-da.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.4082    0.2105    0.2778        95
        REFUTES     0.4000    0.3137    0.3516        51
       SUPPORTS     0.5132    0.7348    0.6044       132

       accuracy                         0.4784       278
      macro avg     0.4405    0.4197    0.4113       278
   weighted avg     0.4566    0.4784    0.4464       278

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.8750    0.88

In [8]:
# Document-level results on the SciFact "all" split: score name, report, separator.
for score in concat_evi_res["scifact"]["all"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

climatefeverpure-bert-base-uncased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.5476    0.1659    0.2546       416
        REFUTES     0.3898    0.0970    0.1554       237
       SUPPORTS     0.4394    0.8904    0.5884       456

       accuracy                         0.4491      1109
      macro avg     0.4589    0.3844    0.3328      1109
   weighted avg     0.4694    0.4491    0.3707      1109

===
fever-climatefeverpure-da.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.4762    0.0481    0.0873       416
        REFUTES     0.3723    0.1477    0.2115       237
       SUPPORTS     0.4337    0.9254    0.5906       456

       accuracy                         0.4301      1109
      macro avg     0.4274    0.3737    0.2965      1109
   weighted avg     0.4365    0.4301    0.3208      1109

===
fever-bert-base-uncased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.8985    1.0

# Sentence

In [9]:
# Sentence-level dev files first, then test files; any file whose stem contains
# "fever-climatefever" is left out.
sent_pls = [
    p
    for pattern in ("*.dev*", "*.test*")
    for p in data_sent_micro_p.glob(pattern)
    if "fever-climatefever" not in p.stem
]

## Micro verdict

In [10]:
# Sentence-level ("micro verdict") scores, keyed by dataset name and then by split.
sent_micro_res = {
    name: defaultdict(list)
    for name in ("fever", "climatefeverpure", "scifact", "climatefever")
}

for actual_dp in tqdm(sent_pls):
    # File stems look like "<dataset>.<split>...."; the first two fields select the predictions.
    stem_fields = actual_dp.stem.split(".")
    dataset = stem_fields[0]
    split = stem_fields[1]
    actual_data = read_data(actual_dp)

    for pp in pred_p.joinpath("sent", dataset).glob(f"*.{split}.jsonl"):
        preds = read_data(pp)
        # Labels are compared position by position, so both files must have the same number of rows.
        assert len(actual_data) == len(preds), f"{actual_dp} ({len(actual_data)}) != {pp} ({len(preds)})"
        actual_labels = [constants.ID2LABEL[row["labels"]] for row in actual_data]
        predicted_labels = [row["predicted_label"] for row in preds]
        label_kw = {"y_true": actual_labels, "y_pred": predicted_labels}

        mi_p, mi_r, mi_f, _ = precision_recall_fscore_support(average="micro", beta=1.0, **label_kw)
        ma_p, ma_r, ma_f, _ = precision_recall_fscore_support(average="macro", beta=1.0, **label_kw)
        rte_metrics = {
            "accuracy": accuracy_score(**label_kw),
            "micro_precision": mi_p,
            "micro_recall": mi_r,
            "micro_f1": mi_f,
            "macro_precision": ma_p,
            "macro_recall": ma_r,
            "macro_f1": ma_f,
        }

        # The report is kept twice: as 4-digit text for display and as a dict for later use.
        report_text = classification_report(digits=4, **label_kw)
        report_dict = classification_report(output_dict=True, **label_kw)

        sent_micro_res[dataset][split].append(
            scorer.SentenceMicroScorer(pp.stem, report_text, report_dict, rte_metrics)
        )

100%|██████████| 7/7 [00:13<00:00,  1.88s/it]


In [11]:
# Serialize the sentence-level (micro verdict) scores to a pickle file under result_p.
sent_micro_pkl_p = result_p / "sent_micro_verdict_metrics.pkl"
with sent_micro_pkl_p.open("wb") as fn:
    pkl.dump(sent_micro_res, fn)

In [12]:
# Sentence-level results on the FEVER dev split: score name, report, separator.
for score in sent_micro_res["fever"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.9743    0.9853    0.9798      6666
        REFUTES     0.9519    0.9036    0.9272      4888
       SUPPORTS     0.9156    0.9501    0.9325      4588

       accuracy                         0.9506     16142
      macro avg     0.9473    0.9463    0.9465     16142
   weighted avg     0.9508    0.9506    0.9504     16142

===
fever-climatefever-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.9778    0.9790    0.9784      6666
        REFUTES     0.9482    0.9059    0.9266      4888
       SUPPORTS     0.9123    0.9540    0.9327      4588

       accuracy                         0.9498     16142
      macro avg     0.9461    0.9463    0.9459     16142
   weighted avg     0.9502    0.9498    0.9497     16142

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT EN

In [13]:
# Sentence-level results on the climatefeverpure dev split: score name, report, separator.
for score in sent_micro_res["climatefeverpure"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.7430    0.6695    0.7043       475
        REFUTES     0.4076    0.4848    0.4429       132
       SUPPORTS     0.6404    0.6844    0.6616       320

       accuracy                         0.6483       927
      macro avg     0.5970    0.6129    0.6030       927
   weighted avg     0.6598    0.6483    0.6524       927

===
fever-climatefever-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.6453    0.9116    0.7557       475
        REFUTES     0.5301    0.3333    0.4093       132
       SUPPORTS     0.8150    0.4406    0.5720       320

       accuracy                         0.6667       927
      macro avg     0.6635    0.5618    0.5790       927
   weighted avg     0.6875    0.6667    0.6430       927

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT EN

In [14]:
# Sentence-level results on the climatefever dev split: score name, report, separator.
for score in sent_micro_res["climatefever"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.8246    0.5512    0.6607       938
        REFUTES     0.2832    0.4848    0.3575       132
       SUPPORTS     0.4078    0.6844    0.5111       320

       accuracy                         0.5755      1390
      macro avg     0.5052    0.5735    0.5098      1390
   weighted avg     0.6772    0.5755    0.5975      1390

===
fever-climatefever-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.7674    0.8369    0.8006       938
        REFUTES     0.4151    0.3333    0.3697       132
       SUPPORTS     0.5402    0.4406    0.4854       320

       accuracy                         0.6978      1390
      macro avg     0.5742    0.5369    0.5519      1390
   weighted avg     0.6816    0.6978    0.6871      1390

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT EN

In [15]:
# Sentence-level results on the SciFact "all" split: score name, report, separator.
for score in sent_micro_res["scifact"]["all"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefever-xlnet-base-cased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.6370    0.9976    0.7775       832
        REFUTES     0.7644    0.3475    0.4778       495
       SUPPORTS     0.8058    0.6250    0.7040       896

       accuracy                         0.7027      2223
      macro avg     0.7357    0.6567    0.6531      2223
   weighted avg     0.7334    0.7027    0.6811      2223

===
climatefeverpure-bert-base-uncased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.8639    0.6334    0.7309       832
        REFUTES     0.5395    0.1657    0.2535       495
       SUPPORTS     0.5414    0.8828    0.6712       896

       accuracy                         0.6298      2223
      macro avg     0.6483    0.5606    0.5519      2223
   weighted avg     0.6617    0.6298    0.6005      2223

===
climatefever-xlnet-base-cased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO  

## Macro verdict

### Majority

In [16]:
# Claim-level scores obtained by aggregating sentence-level predicted labels per claim,
# keyed by dataset name and then by split.
majority_agg_res = {
    "fever": defaultdict(list),
    "climatefeverpure": defaultdict(list),
    "scifact": defaultdict(list),
    "climatefever": defaultdict(list)
}

for actual_dp in tqdm(actual_pls):
    dataset = actual_dp.parent.stem.split("-")[0]
    split = actual_dp.stem.split(".")[0]
    actual_data = read_data(actual_dp)
    # Both climatefever variants use the ClimateFEVER scorer; everything else uses the FEVER scorer.
    score_obj = scorer.ClimateFEVERScorer if "climatefever" in dataset else scorer.FEVERScorer

    # The sentence-level file depends only on dataset/split, not on the prediction file,
    # so it is loaded at most once per gold file (and not at all when no predictions exist).
    sent_actual_p = data_sent_micro_p.joinpath("scifact.all.test.n5.jsonl" if dataset == "scifact" else f"{dataset}.{split}.n5.jsonl")
    sent_actual_df = None

    for pp in pred_p.joinpath("sent", dataset).glob(f"*.{split}.jsonl"):
        preds = read_data(pp)
        if sent_actual_df is None:
            sent_actual_data = read_data(sent_actual_p)
            sent_actual_df = pd.DataFrame(sent_actual_data)

        # Aggregate: attach predictions to the sentence rows by claim, convert labels to ids,
        # collapse each claim with aggregate.agg_predict, then convert the ids back to labels.
        # groupby(sort=False) keeps claims in order of first appearance.
        # NOTE(review): if both frames hold one row per (claim, sentence), joining on
        # claim_id alone is many-to-many — confirm this is intended.
        df_agg = (
            sent_actual_df
            .merge(pd.DataFrame(preds), on="claim_id", how="left")
            .assign(predicted_label=lambda d: d["predicted_label"].map(constants.LABEL2ID))
            .groupby("claim_id", sort=False)
            .agg({"predicted_label": aggregate.agg_predict})
            .assign(predicted_label=lambda d: d["predicted_label"].map(constants.ID2LABEL))
            .reset_index()
        )
        # Kept under a separate name so df_agg is not rebound from a DataFrame to a list.
        agg_records = df_agg.to_dict("records")

        majority_agg_res[dataset][split].append(
            score_obj(
                actual_data=actual_data,
                prediction_data=agg_records,
                oracle_rte=False,
                oracle_ir=True,
                max_evidence=None,
                score_name=pp.stem,
            )
        )

100%|██████████| 7/7 [00:19<00:00,  2.81s/it]


In [17]:
# Serialize the majority-aggregated claim-level scores to a pickle file under result_p.
majority_pkl_p = result_p / "sent_macro_verdict_majority_metrics.pkl"
with majority_pkl_p.open("wb") as fn:
    pkl.dump(majority_agg_res, fn)

In [18]:
# Majority-aggregated results on the FEVER dev split: score name, report, separator.
for score in majority_agg_res["fever"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.9828    0.9775    0.9801      3333
        REFUTES     0.9438    0.9064    0.9247      3333
       SUPPORTS     0.9076    0.9484    0.9275      3333

       accuracy                         0.9441      9999
      macro avg     0.9447    0.9441    0.9441      9999
   weighted avg     0.9447    0.9441    0.9441      9999

===
fever-climatefever-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.9814    0.9661    0.9737      3333
        REFUTES     0.9387    0.9052    0.9216      3333
       SUPPORTS     0.9064    0.9529    0.9291      3333

       accuracy                         0.9414      9999
      macro avg     0.9422    0.9414    0.9415      9999
   weighted avg     0.9422    0.9414    0.9415      9999

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT EN

In [19]:
# Majority-aggregated results on the climatefeverpure dev split: score name, report, separator.
for score in majority_agg_res["climatefeverpure"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.4824    0.4316    0.4556        95
        REFUTES     0.3833    0.4510    0.4144        51
       SUPPORTS     0.6842    0.6894    0.6868       132

       accuracy                         0.5576       278
      macro avg     0.5166    0.5240    0.5189       278
   weighted avg     0.5600    0.5576    0.5578       278

===
fever-climatefever-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.4933    0.7789    0.6041        95
        REFUTES     0.5135    0.3725    0.4318        51
       SUPPORTS     0.8022    0.5530    0.6547       132

       accuracy                         0.5971       278
      macro avg     0.6030    0.5682    0.5635       278
   weighted avg     0.6437    0.5971    0.5965       278

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT EN

In [20]:
# Majority-aggregated results on the climatefever dev split: score name, report, separator.
for score in majority_agg_res["climatefever"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.5256    0.4316    0.4740        95
        REFUTES     0.3750    0.4706    0.4174        51
       SUPPORTS     0.6618    0.6818    0.6716       132

       accuracy                         0.5576       278
      macro avg     0.5208    0.5280    0.5210       278
   weighted avg     0.5626    0.5576    0.5575       278

===
fever-climatefever-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.5522    0.7789    0.6463        95
        REFUTES     0.5385    0.4118    0.4667        51
       SUPPORTS     0.8095    0.6439    0.7173       132

       accuracy                         0.6475       278
      macro avg     0.6334    0.6116    0.6101       278
   weighted avg     0.6719    0.6475    0.6471       278

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT EN

In [21]:
# Majority-aggregated results on the SciFact "all" split: score name, report, separator.
for score in majority_agg_res["scifact"]["all"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefever-xlnet-base-cased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.7437    0.9976    0.8522       416
        REFUTES     0.7661    0.4008    0.5263       237
       SUPPORTS     0.8126    0.7610    0.7860       456

       accuracy                         0.7728      1109
      macro avg     0.7742    0.7198    0.7215      1109
   weighted avg     0.7769    0.7728    0.7553      1109

===
climatefeverpure-bert-base-uncased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.8833    0.5457    0.6746       416
        REFUTES     0.4615    0.1519    0.2286       237
       SUPPORTS     0.5271    0.8947    0.6634       456

       accuracy                         0.6050      1109
      macro avg     0.6240    0.5308    0.5222      1109
   weighted avg     0.6467    0.6050    0.5747      1109

===
climatefever-xlnet-base-cased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO  

### Mean probability

In [22]:
# Claim-level scores obtained by aggregating sentence-level predicted probabilities per claim,
# keyed by dataset name and then by split.
meanproba_agg_res = {
    "fever": defaultdict(list),
    "climatefeverpure": defaultdict(list),
    "scifact": defaultdict(list),
    "climatefever": defaultdict(list)
}

for actual_dp in tqdm(actual_pls):
    dataset = actual_dp.parent.stem.split("-")[0]
    split = actual_dp.stem.split(".")[0]
    actual_data = read_data(actual_dp)
    # Both climatefever variants use the ClimateFEVER scorer; everything else uses the FEVER scorer.
    score_obj = scorer.ClimateFEVERScorer if "climatefever" in dataset else scorer.FEVERScorer

    # The sentence-level file depends only on dataset/split, not on the prediction file,
    # so it is loaded at most once per gold file (and not at all when no predictions exist).
    sent_actual_p = data_sent_micro_p.joinpath("scifact.all.test.n5.jsonl" if dataset == "scifact" else f"{dataset}.{split}.n5.jsonl")
    sent_actual_df = None

    for pp in pred_p.joinpath("sent", dataset).glob(f"*.{split}.jsonl"):
        preds = read_data(pp)
        if sent_actual_df is None:
            sent_actual_data = read_data(sent_actual_p)
            sent_actual_df = pd.DataFrame(sent_actual_data)

        # Aggregate: attach predictions to the sentence rows by claim, collapse each claim's
        # "predicted_proba" values with aggregate.agg_predict_proba, then map the result
        # through ID2LABEL (so the aggregate is presumably a label id — TODO confirm).
        # groupby(sort=False) keeps claims in order of first appearance.
        # NOTE(review): if both frames hold one row per (claim, sentence), joining on
        # claim_id alone is many-to-many — confirm this is intended.
        df_agg = (
            sent_actual_df
            .merge(pd.DataFrame(preds), on="claim_id", how="left")
            .groupby("claim_id", sort=False)
            .agg({"predicted_proba": aggregate.agg_predict_proba})
            .rename(columns={"predicted_proba": "predicted_label"})
            .assign(predicted_label=lambda d: d["predicted_label"].map(constants.ID2LABEL))
            .reset_index()
        )
        # Kept under a separate name so df_agg is not rebound from a DataFrame to a list.
        agg_records = df_agg.to_dict("records")

        meanproba_agg_res[dataset][split].append(
            score_obj(
                actual_data=actual_data,
                prediction_data=agg_records,
                oracle_rte=False,
                oracle_ir=True,
                max_evidence=None,
                score_name=pp.stem,
            )
        )

100%|██████████| 7/7 [00:22<00:00,  3.16s/it]


In [23]:
# Serialize the mean-probability-aggregated claim-level scores to a pickle file under result_p.
meanproba_pkl_p = result_p / "sent_macro_verdict_meanproba_metrics.pkl"
with meanproba_pkl_p.open("wb") as fn:
    pkl.dump(meanproba_agg_res, fn)

In [24]:
# Mean-probability results on the FEVER dev split: score name, report, separator.
for score in meanproba_agg_res["fever"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.9866    0.9928    0.9897      3333
        REFUTES     0.9494    0.9061    0.9272      3333
       SUPPORTS     0.9148    0.9508    0.9325      3333

       accuracy                         0.9499      9999
      macro avg     0.9503    0.9499    0.9498      9999
   weighted avg     0.9503    0.9499    0.9498      9999

===
fever-climatefever-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.9871    0.9877    0.9874      3333
        REFUTES     0.9500    0.9058    0.9274      3333
       SUPPORTS     0.9125    0.9544    0.9330      3333

       accuracy                         0.9493      9999
      macro avg     0.9499    0.9493    0.9492      9999
   weighted avg     0.9499    0.9493    0.9492      9999

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT EN

In [25]:
# Mean-probability results on the climatefeverpure dev split: score name, report, separator.
for score in meanproba_agg_res["climatefeverpure"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.5546    0.6947    0.6168        95
        REFUTES     0.4773    0.4118    0.4421        51
       SUPPORTS     0.7478    0.6515    0.6964       132

       accuracy                         0.6223       278
      macro avg     0.5932    0.5860    0.5851       278
   weighted avg     0.6322    0.6223    0.6225       278

===
fever-climatefever-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.4564    0.9368    0.6138        95
        REFUTES     0.5417    0.2549    0.3467        51
       SUPPORTS     0.9153    0.4091    0.5654       132

       accuracy                         0.5612       278
      macro avg     0.6378    0.5336    0.5086       278
   weighted avg     0.6899    0.5612    0.5418       278

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT EN

In [26]:
# Mean-probability results on the climatefever dev split: score name, report, separator.
for score in meanproba_agg_res["climatefever"]["dev"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefeverpure-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.5280    0.6947    0.6000        95
        REFUTES     0.4667    0.4118    0.4375        51
       SUPPORTS     0.7407    0.6061    0.6667       132

       accuracy                         0.6007       278
      macro avg     0.5785    0.5709    0.5681       278
   weighted avg     0.6178    0.6007    0.6018       278

===
fever-climatefever-bert-base-uncased.dev
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.4101    0.9368    0.5705        95
        REFUTES     0.5000    0.1569    0.2388        51
       SUPPORTS     0.9111    0.3106    0.4633       132

       accuracy                         0.4964       278
      macro avg     0.6071    0.4681    0.4242       278
   weighted avg     0.6645    0.4964    0.4587       278

===
climatefeverpure-xlnet-base-cased.dev
                 precision    recall  f1-score   support

NOT EN

In [27]:
# Mean-probability results on the SciFact "all" split: score name, report, separator.
for score in meanproba_agg_res["scifact"]["all"]:
    print(score._score_name, score.classification_report, "===", sep="\n")

fever-climatefever-xlnet-base-cased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.6770    0.9976    0.8066       416
        REFUTES     0.7615    0.3502    0.4798       237
       SUPPORTS     0.8088    0.6864    0.7426       456

       accuracy                         0.7313      1109
      macro avg     0.7491    0.6781    0.6763      1109
   weighted avg     0.7492    0.7313    0.7104      1109

===
climatefeverpure-bert-base-uncased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO     0.8889    0.6538    0.7535       416
        REFUTES     0.4857    0.1435    0.2215       237
       SUPPORTS     0.5539    0.8904    0.6829       456

       accuracy                         0.6420      1109
      macro avg     0.6428    0.5626    0.5526      1109
   weighted avg     0.6650    0.6420    0.6108      1109

===
climatefever-xlnet-base-cased.all
                 precision    recall  f1-score   support

NOT ENOUGH INFO  