In [1]:
import sys
sys.path.insert(0, "../../src")
from pathlib import Path
from collections import Counter

import numpy as np
from scipy.special import softmax

from gen.util import read_data, write_jsonl
from rte.aggregate import generate_micro_macro_df

In [2]:
root_data = Path("../../data").resolve()
root_model = Path("../../models").resolve()

In [3]:
# constants
LOOKUP = {
    "verifiable": {"no": "NOT VERIFIABLE", "yes": "VERIFIABLE"},
    "label": {"nei": "NOT ENOUGH INFO", "r": "REFUTES", "s": "SUPPORTS"}
}

SEED = 123456789

LABEL2ID = {"SUPPORTS": 0, "NOT ENOUGH INFO": 1, "REFUTES": 2}
ID2LABEL = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

# Init

In [4]:
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    pipeline
)
import torch
torch.backends.cuda.matmul.allow_tf32 = True

  from .autonotebook import tqdm as notebook_tqdm


# Huggingface Init

## Model

In [5]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")

In [6]:
model_checkpoint = "bert-base-uncased"
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=3, 
        id2label=ID2LABEL, 
        label2id=LABEL2ID
    )

model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess(examples):
    return tokenizer(examples["evidence"], examples["claim"], max_length=512, truncation="only_first")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    
    return results

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
dataset = ["fever", "climatefeverpure", "fever-climatefeverpure"]
doc_sent = ["doc", "sent"]

di = 2
ds = 0

model_store_path = root_model.joinpath("sentence-models" if ds == 1 else "document-models")
model_store_path.mkdir(exist_ok=True)
model_store_path = model_store_path / f"{dataset[di]}-{model_checkpoint}-{doc_sent[ds]}"

## Dataset

In [8]:
datap = root_data / f"{doc_sent[ds]}-dataset"

data = DatasetDict({
    "train": Dataset.from_list(read_data(datap / f"{dataset[di]}.train.n5.jsonl")),
    "validation": Dataset.from_list(read_data(datap / f"{dataset[di]}.dev.n5.jsonl")),
    "fever_test": Dataset.from_list(read_data(datap / f"{dataset[0]}.test.n5.jsonl")),
    "climatefever_test": Dataset.from_list(read_data(datap / f"{dataset[1]}.test.n5.jsonl"))
}).map(preprocess, batched=True)

                                                                      

## Trainer

In [11]:
batch_size = 32
learning_rate = 2e-5
epoch = 4
metric_name = "f1"
warmup_ratio=0.1
save_steps=200
eval_steps=200

# Hyperparameter tuning

In [9]:
# shard the data if the dataset is large for hyperparameter tuning
shard = data["train"].num_rows > 50000
hp_tune_train = data["train"].shuffle(seed=SEED).shard(num_shards=5, index=1)

In [10]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard",
    tf32=True
)

_ = model.train()
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_tune_train if shard else data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
    }

def compute_objective(metrics):
    return metrics["f1"]

In [12]:
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=10, 
    hp_space=optuna_hp_space
)

[I 2023-07-04 13:44:38,036] A new study created in memory with name: no-name-22429e36-ada3-4df8-bb9e-33d8cc219dda
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClas

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.590838,0.653693,0.64969,0.824519,0.544652
400,No log,0.315366,0.892284,0.891552,0.90027,0.891459
600,0.434000,0.278977,0.89929,0.898646,0.905239,0.898105
800,0.434000,0.281363,0.903766,0.902989,0.914432,0.90307
1000,0.212600,0.318529,0.90396,0.903366,0.909012,0.903025
1200,0.212600,0.24968,0.916513,0.915994,0.921238,0.916243
1400,0.212600,0.271009,0.911647,0.910966,0.919599,0.911099
1600,0.153300,0.246249,0.922156,0.921712,0.925279,0.921676
1800,0.153300,0.22019,0.924394,0.924152,0.925721,0.924473
2000,0.125400,0.314358,0.921378,0.920906,0.925193,0.921115


[I 2023-07-04 13:57:21,457] Trial 0 finished with value: 3.7116962953643977 and parameters: {'learning_rate': 5e-05, 'num_train_epochs': 5}. Best is trial 0 with value: 3.7116962953643977.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (in

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.667036,0.651163,0.647167,0.486575,0.539921
400,No log,0.494824,0.794493,0.79238,0.838563,0.779226
600,0.528100,0.267156,0.8989,0.898392,0.901689,0.898118
800,0.528100,0.27507,0.905517,0.904841,0.91371,0.905153
1000,0.207900,0.302457,0.90649,0.90584,0.911936,0.905389
1200,0.207900,0.253134,0.918264,0.917737,0.923638,0.918249
1400,0.207900,0.241218,0.918167,0.917681,0.922086,0.917947
1600,0.155900,0.245108,0.917778,0.917259,0.92257,0.917654
1800,0.155900,0.21113,0.928968,0.928583,0.932085,0.928981
2000,0.130800,0.255262,0.928189,0.927842,0.93078,0.928281


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-04 14:10:01,960] Trial 1 finished with value: 3.7166557780025076 and parameters: {'learning_rate': 2e-05, 'num_train_epochs': 5}. Best is trial 1 with value: 3.7166557780025076.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.452018,0.817651,0.816626,0.824603,0.814178
400,No log,0.316104,0.891797,0.891027,0.899883,0.890459
600,0.414400,0.294901,0.89929,0.898623,0.90529,0.897676
800,0.414400,0.247189,0.914469,0.913968,0.919854,0.914515
1000,0.207400,0.258151,0.921183,0.920801,0.922802,0.920617
1200,0.207400,0.274122,0.914469,0.913979,0.919247,0.914412
1400,0.207400,0.298828,0.905128,0.904423,0.915049,0.905041
1600,0.151700,0.24345,0.917194,0.916691,0.922098,0.917094
1800,0.151700,0.234448,0.923616,0.923175,0.927431,0.923464
2000,0.115600,0.280603,0.92887,0.928662,0.930313,0.929099


[I 2023-07-04 14:20:13,640] Trial 2 finished with value: 3.731241053209169 and parameters: {'learning_rate': 5e-05, 'num_train_epochs': 4}. Best is trial 2 with value: 3.731241053209169.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (init

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.644587,0.651844,0.647833,0.489438,0.541158
400,No log,0.371948,0.870682,0.869553,0.888426,0.86809
600,0.463100,0.285426,0.899971,0.899369,0.905537,0.899269
800,0.463100,0.238083,0.914858,0.914318,0.920825,0.914805
1000,0.202900,0.267234,0.918653,0.918295,0.920514,0.918297
1200,0.202900,0.235929,0.921572,0.921115,0.925935,0.921536
1400,0.202900,0.258967,0.917778,0.917269,0.923302,0.917894
1600,0.150500,0.229437,0.927508,0.927156,0.929766,0.927388
1800,0.150500,0.206593,0.92926,0.928959,0.931092,0.929273
2000,0.122500,0.26319,0.92926,0.928924,0.931748,0.929309


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-04 14:30:26,122] Trial 3 finished with value: 3.7068982070106444 and parameters: {'learning_rate': 3e-05, 'num_train_epochs': 4}. Best is trial 2 with value: 3.731241053209169.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint 

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.652238,0.652622,0.648621,0.487727,0.541195
400,No log,0.372396,0.857254,0.856307,0.861457,0.852995
600,0.500000,0.257217,0.904447,0.903982,0.9067,0.903662
800,0.500000,0.256499,0.907463,0.906826,0.913046,0.906548
1000,0.200900,0.293596,0.912329,0.911712,0.917422,0.911355
1200,0.200900,0.218867,0.925173,0.924915,0.926922,0.925356
1400,0.200900,0.238204,0.919626,0.919095,0.924223,0.919273
1600,0.154000,0.214328,0.928384,0.928083,0.929848,0.928232
1800,0.154000,0.208498,0.929162,0.928876,0.930782,0.929167
2000,0.127700,0.260393,0.927897,0.927513,0.930972,0.927967


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-04 14:40:39,071] Trial 4 finished with value: 3.7176406297132525 and parameters: {'learning_rate': 2e-05, 'num_train_epochs': 4}. Best is trial 2 with value: 3.731241053209169.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint 

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.593463,0.65379,0.649777,0.824438,0.543648
400,No log,0.314252,0.884889,0.884076,0.893657,0.883675
600,0.435200,0.272513,0.903863,0.903241,0.909605,0.903307
800,0.435200,0.254756,0.909215,0.908598,0.915702,0.908891
1000,0.193700,0.270581,0.914567,0.914007,0.919142,0.914063
1200,0.193700,0.243753,0.923129,0.922721,0.926442,0.923166
1400,0.193700,0.248324,0.92021,0.919693,0.924848,0.919974
1600,0.146600,0.216215,0.927411,0.927121,0.929032,0.927401
1800,0.146600,0.22847,0.926048,0.925661,0.928865,0.925984


[I 2023-07-04 14:45:41,956] Trial 5 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.34419,0.863773,0.863017,0.870363,0.862805
400,No log,0.289511,0.893354,0.892508,0.904893,0.892242
600,0.361700,0.273547,0.900165,0.899377,0.91018,0.899214
800,0.361700,0.230252,0.916999,0.916515,0.921631,0.91692


[I 2023-07-04 14:48:10,859] Trial 6 finished with value: 3.67206596618286 and parameters: {'learning_rate': 3e-05, 'num_train_epochs': 1}. Best is trial 2 with value: 3.731241053209169.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initi

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.593463,0.65379,0.649777,0.824438,0.543648
400,No log,0.314252,0.884889,0.884076,0.893657,0.883675
600,0.435200,0.272513,0.903863,0.903241,0.909605,0.903307
800,0.435200,0.254756,0.909215,0.908598,0.915702,0.908891


[I 2023-07-04 14:50:24,871] Trial 7 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.648853,0.652039,0.648028,0.489253,0.541232


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-04 14:50:58,875] Trial 8 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.442202,0.799455,0.797848,0.821781,0.790572
400,No log,0.327025,0.886445,0.885522,0.898133,0.884845
600,0.396700,0.311826,0.885667,0.884742,0.899731,0.884841


[I 2023-07-04 14:52:39,375] Trial 9 pruned. 


In [13]:
best_run

BestRun(run_id='2', objective=3.731241053209169, hyperparameters={'learning_rate': 5e-05, 'num_train_epochs': 4}, run_summary=None)

## Train with best hyperparameters

In [None]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=1000,
    save_strategy = "steps",
    save_steps=1000,
    save_total_limit=5,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard",
    tf32=True
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1000,0.2187,0.236582,0.918751,0.918158,0.925179,0.918469
2000,0.1785,0.212528,0.93286,0.932448,0.935645,0.932553
3000,0.1731,0.182076,0.93753,0.937162,0.940182,0.937454
4000,0.1673,0.194541,0.941909,0.941669,0.943992,0.94217
5000,0.1222,0.197473,0.942006,0.941741,0.943291,0.941942
6000,0.1228,0.178773,0.944244,0.943941,0.946652,0.94437
7000,0.1098,0.197495,0.940158,0.93983,0.942155,0.940072
8000,0.1177,0.18374,0.944244,0.943902,0.946076,0.943899
9000,0.1125,0.17065,0.949693,0.949527,0.950453,0.949768
10000,0.0698,0.234957,0.943077,0.942854,0.944713,0.943282


In [29]:
trainer.save_model(model_store_path.parent / (model_store_path.stem + ".out"))

# Evaluate

In [22]:
import pandas as pd
from sklearn.metrics import classification_report

## Test on validation data

In [23]:
preds = trainer.predict(data["validation"])
val = generate_doc_df(data["validation"], preds)

In [24]:
print(classification_report(y_true=val["actual"], y_pred=val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       1.00      0.99      0.99      3428
        REFUTES       0.95      0.91      0.93      3384
       SUPPORTS       0.91      0.96      0.94      3465

       accuracy                           0.95     10277
      macro avg       0.95      0.95      0.95     10277
   weighted avg       0.95      0.95      0.95     10277



## Test on test data

### FEVER

In [25]:
preds = trainer.predict(data["fever_test"])
fval = generate_doc_df(data["fever_test"], preds)

In [26]:
print(classification_report(y_true=fval["actual"], y_pred=fval["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       1.00      0.99      1.00      3333
        REFUTES       0.95      0.88      0.91      3333
       SUPPORTS       0.89      0.96      0.92      3333

       accuracy                           0.94      9999
      macro avg       0.95      0.94      0.94      9999
   weighted avg       0.95      0.94      0.94      9999



### Climate-FEVER

In [27]:
preds = trainer.predict(data["climatefever_test"])
cfval = generate_doc_df(data["climatefever_test"], preds)

In [28]:
print(classification_report(y_true=cfval["actual"], y_pred=cfval["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.91      0.85      0.88        47
        REFUTES       0.46      0.44      0.45        25
       SUPPORTS       0.74      0.78      0.76        65

       accuracy                           0.74       137
      macro avg       0.70      0.69      0.70       137
   weighted avg       0.75      0.74      0.74       137

