In [1]:
import sys
sys.path.insert(0, "../../src")
from pathlib import Path
from collections import Counter

import numpy as np
from scipy.special import softmax

from gen.util import read_data, write_jsonl
from rte.aggregate import generate_micro_macro_df

In [2]:
root_data = Path("../../data").resolve()
root_model = Path("../../models").resolve()

In [3]:
# constants
LOOKUP = {
    "verifiable": {"no": "NOT VERIFIABLE", "yes": "VERIFIABLE"},
    "label": {"nei": "NOT ENOUGH INFO", "r": "REFUTES", "s": "SUPPORTS"}
}

SEED = 123456789

LABEL2ID = {"SUPPORTS": 0, "NOT ENOUGH INFO": 1, "REFUTES": 2}
ID2LABEL = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

# Init

In [4]:
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    pipeline
)

  from .autonotebook import tqdm as notebook_tqdm


# Huggingface Init

## Model

In [5]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")

In [6]:
model_checkpoint = "bert-base-uncased"
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=3, 
        id2label=ID2LABEL, 
        label2id=LABEL2ID
    )

model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess(examples):
    return tokenizer(examples["evidence"], examples["claim"], max_length=512, truncation="only_first")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    
    return results

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
dataset = ["fever", "climatefeverpure", "fever-climatefeverpure", "climatefever", "fever-climatefever"]
doc_sent = ["doc", "sent"]

di = 4
ds = 1

model_store_path = root_model.joinpath(model_checkpoint)
model_store_path.mkdir(exist_ok=True)
model_store_path = model_store_path / f"{dataset[di]}-{model_checkpoint}-{doc_sent[ds]}"

## Dataset

In [9]:
datap = root_data / f"{doc_sent[ds]}-dataset"

data = DatasetDict({
    "train": Dataset.from_list(read_data(datap / f"{dataset[di]}.train.n5.jsonl")),
    "validation": Dataset.from_list(read_data(datap / f"{dataset[di]}.dev.n5.jsonl")),
    "fever_test": Dataset.from_list(read_data(datap / f"{dataset[0]}.test.n5.jsonl")),
    "climatefever_test": Dataset.from_list(read_data(datap / f"{dataset[1]}.test.n5.jsonl"))
}).map(preprocess, batched=True)

                                                                      

## Trainer

In [8]:
batch_size = 16
learning_rate = 2e-5
epoch = 4
metric_name = "f1"
warmup_ratio=0.1
save_steps=200
eval_steps=200

# Hyperparameter tuning

In [9]:
# shard the data if the dataset is large for hyperparameter tuning
shard = data["train"].num_rows > 50000
hp_tune_train = data["train"].shuffle(seed=SEED).shard(num_shards=5, index=1)

In [10]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

_ = model.train()
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_tune_train if shard else data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
    }

def compute_objective(metrics):
    return metrics["f1"]

In [12]:
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=20, 
    hp_space=optuna_hp_space
)

[I 2023-07-02 22:24:20,485] A new study created in memory with name: no-name-8cdc174e-8dd2-45b5-99f7-65e9bbd09896
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClas

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.709205,0.664948,0.630959,0.465465,0.520997
400,No log,0.407057,0.833909,0.82371,0.831642,0.817921
600,0.521500,0.339742,0.873279,0.86625,0.877925,0.862448
800,0.521500,0.309051,0.890386,0.883398,0.890336,0.881781
1000,0.229500,0.258453,0.911184,0.903836,0.908845,0.904794
1200,0.229500,0.281829,0.904447,0.901484,0.901296,0.898728
1400,0.229500,0.247656,0.914055,0.906301,0.912077,0.908129
1600,0.194000,0.295872,0.911008,0.905235,0.907248,0.904746
1800,0.194000,0.268496,0.914582,0.909214,0.910846,0.908957
2000,0.133700,0.295163,0.913352,0.90944,0.90921,0.907789


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:35:48,568] Trial 0 finished with value: 3.6577068518348232 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 2}. Best is trial 0 with value: 3.6577068518348232.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenc

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.815616,0.61515,0.590411,0.446991,0.486206
400,No log,0.736004,0.65575,0.624416,0.464368,0.515318
600,0.676600,0.610219,0.673736,0.641676,0.744473,0.543583
800,0.676600,0.405915,0.843576,0.83354,0.831153,0.831706
1000,0.388600,0.374655,0.865839,0.858709,0.859833,0.855843
1200,0.388600,0.360239,0.883824,0.879217,0.880916,0.87609
1400,0.388600,0.290826,0.900873,0.895359,0.893537,0.89438
1600,0.250500,0.297646,0.898061,0.89076,0.896296,0.890459
1800,0.250500,0.305376,0.9014,0.895757,0.899722,0.894439
2000,0.231300,0.317847,0.900521,0.890517,0.900745,0.89219


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 23:09:34,812] Trial 1 finished with value: 3.6645075218228365 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4}. Best is trial 1 with value: 3.6645075218228365.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.683713,0.671627,0.637717,0.69477,0.537583
400,No log,0.452704,0.852774,0.837278,0.861842,0.836443
600,0.483500,0.562799,0.822309,0.82145,0.824317,0.812734
800,0.483500,0.309705,0.889449,0.885076,0.882851,0.882855
1000,0.295200,0.3482,0.893081,0.888372,0.886566,0.886387
1200,0.295200,0.455493,0.861269,0.856195,0.875219,0.851888
1400,0.295200,0.354575,0.895132,0.889138,0.893245,0.88755
1600,0.240700,0.284758,0.900814,0.897475,0.897678,0.894895
1800,0.240700,0.322178,0.901986,0.895329,0.901366,0.894573
2000,0.223900,0.300177,0.902923,0.894486,0.902857,0.894935


[I 2023-07-02 23:18:01,616] Trial 2 finished with value: 3.646729813298749 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1}. Best is trial 1 with value: 3.6645075218228365.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you 

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.723697,0.663484,0.629558,0.463476,0.519598
400,No log,0.402827,0.837952,0.825512,0.832836,0.821273
600,0.544200,0.308936,0.88775,0.880629,0.883056,0.879457
800,0.544200,0.264701,0.905501,0.899083,0.900985,0.899128
1000,0.235000,0.260209,0.909426,0.902045,0.905931,0.903234
1200,0.235000,0.282013,0.904095,0.901582,0.898971,0.898878
1400,0.235000,0.248275,0.910657,0.902709,0.908403,0.904348
1600,0.194300,0.321022,0.909309,0.903279,0.90472,0.903529
1800,0.194300,0.303957,0.907493,0.904371,0.902773,0.902327
2000,0.137100,0.343673,0.898822,0.893803,0.898185,0.891581


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 23:40:45,653] Trial 3 finished with value: 3.6594067661703074 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 1 with value: 3.6645075218228365.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenc

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.836464,0.604546,0.581877,0.443671,0.478743
400,No log,0.745774,0.651415,0.621068,0.463926,0.512608
600,0.692300,0.666423,0.662312,0.630149,0.804531,0.521339
800,0.692300,0.496443,0.784815,0.76555,0.813738,0.747528
1000,0.444500,0.396364,0.858047,0.84898,0.850459,0.846319
1200,0.444500,0.354929,0.881481,0.874966,0.877748,0.872377
1400,0.444500,0.321176,0.892319,0.886009,0.884165,0.884992
1600,0.271100,0.315483,0.887281,0.880167,0.884497,0.878795
1800,0.271100,0.312387,0.897006,0.889903,0.896263,0.889006
2000,0.241600,0.329408,0.896479,0.885678,0.897249,0.887762


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:22:49,296] Trial 4 finished with value: 3.6563836493369033 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 5}. Best is trial 1 with value: 3.6645075218228365.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.756087,0.636827,0.608787,0.457892,0.502235


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:23:26,131] Trial 5 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.757413,0.647841,0.616564,0.456821,0.50859


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:24:15,289] Trial 6 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.633942,0.672037,0.637516,0.701281,0.530283
400,No log,0.36641,0.861328,0.852011,0.857701,0.848544
600,0.467400,0.346905,0.874158,0.866454,0.879494,0.863147
800,0.467400,0.290487,0.896069,0.889695,0.893903,0.888251
1000,0.222300,0.260377,0.909309,0.904372,0.906201,0.903228
1200,0.222300,0.279347,0.900228,0.897425,0.898547,0.894289
1400,0.222300,0.2582,0.908899,0.903337,0.906714,0.902379


[I 2023-07-03 00:29:56,500] Trial 7 finished with value: 3.621329622441646 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 1}. Best is trial 1 with value: 3.6645075218228365.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you 

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.633942,0.672037,0.637516,0.701281,0.530283
400,No log,0.36641,0.861328,0.852011,0.857701,0.848544
600,0.467400,0.346905,0.874158,0.866454,0.879494,0.863147
800,0.467400,0.290487,0.896069,0.889695,0.893903,0.888251
1000,0.222300,0.260377,0.909309,0.904372,0.906201,0.903228
1200,0.222300,0.279347,0.900228,0.897425,0.898547,0.894289
1400,0.222300,0.2582,0.908899,0.903337,0.906714,0.902379


[I 2023-07-03 00:35:37,926] Trial 8 finished with value: 3.621329622441646 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 1}. Best is trial 1 with value: 3.6645075218228365.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you 

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.815616,0.61515,0.590411,0.446991,0.486206


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:36:14,940] Trial 9 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.75626,0.636886,0.608833,0.45788,0.502267


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:36:51,652] Trial 10 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.723697,0.663484,0.629558,0.463476,0.519598


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:37:40,611] Trial 11 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.709205,0.664948,0.630959,0.465465,0.520997
400,No log,0.397458,0.84135,0.834331,0.83049,0.831048
600,0.521200,0.34551,0.876501,0.870952,0.878675,0.867261
800,0.521200,0.328682,0.880368,0.87769,0.881969,0.873647


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:40:54,350] Trial 12 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.784135,0.627336,0.600736,0.453199,0.495255


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:41:31,044] Trial 13 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.784135,0.627336,0.600736,0.453199,0.495255


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:42:07,877] Trial 14 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.655432,0.668873,0.635156,0.805765,0.526179
400,No log,0.362243,0.859336,0.851693,0.855282,0.847287
600,0.475300,0.317437,0.884235,0.877474,0.886348,0.874951
800,0.475300,0.309116,0.88857,0.884793,0.886818,0.881791


[I 2023-07-03 00:45:21,819] Trial 15 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.720584,0.649364,0.619179,0.463095,0.511156


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:45:58,724] Trial 16 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.723697,0.663484,0.629558,0.463476,0.519598


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:46:47,793] Trial 17 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.709205,0.664948,0.630959,0.465465,0.520997
400,No log,0.407057,0.833909,0.82371,0.831642,0.817921


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:48:25,015] Trial 18 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.756087,0.636827,0.608787,0.457892,0.502235


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:49:01,721] Trial 19 pruned. 


In [13]:
best_run

BestRun(run_id='1', objective=3.6645075218228365, hyperparameters={'learning_rate': 2e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4}, run_summary=None)

## Train with best hyperparameters

In [9]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=1000,
    save_strategy = "steps",
    save_steps=1000,
    save_total_limit=5,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)
    
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1000,0.5953,0.687341,0.659482,0.624442,0.472108,0.520794
2000,0.2716,0.360863,0.87168,0.862188,0.873963,0.861208
3000,0.2056,0.27266,0.915642,0.908608,0.916013,0.910425
4000,0.2019,0.277599,0.918463,0.912303,0.917352,0.913781
5000,0.1878,0.247009,0.920911,0.918838,0.918299,0.917991
6000,0.1894,0.273646,0.913087,0.90587,0.916032,0.907253
7000,0.1723,0.333343,0.909468,0.901395,0.912013,0.903051
8000,0.1654,0.272476,0.92304,0.920822,0.92124,0.920044
9000,0.1748,0.236978,0.927191,0.922201,0.926476,0.923335
10000,0.1575,0.253983,0.928682,0.926469,0.926157,0.92593


  _warn_prf(average, modifier, msg_start, len(result))
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



TrainOutput(global_step=64496, training_loss=0.10213710430152835, metrics={'train_runtime': 7334.785, 'train_samples_per_second': 140.683, 'train_steps_per_second': 8.793, 'total_flos': 5.904750620294971e+16, 'train_loss': 0.10213710430152835, 'epoch': 4.0})

In [10]:
trainer.save_model(model_store_path.parent / (model_store_path.stem + ".out"))

# Evaluate

In [11]:
import pandas as pd
from sklearn.metrics import classification_report

## Test on validation data

In [12]:
preds = trainer.predict(data["validation"])
micro_val, macro_val = generate_micro_macro_df(data["validation"], preds)

In [13]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.95      0.98      0.96      7604
        REFUTES       0.96      0.90      0.93      5479
       SUPPORTS       0.93      0.93      0.93      5706

       accuracy                           0.94     18789
      macro avg       0.94      0.94      0.94     18789
   weighted avg       0.94      0.94      0.94     18789



## Test on test data

### FEVER

In [14]:
preds = trainer.predict(data["fever_test"])
micro_val, macro_val = generate_micro_macro_df(data["fever_test"], preds)

In [15]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.98      0.99      0.99      6666
        REFUTES       0.95      0.89      0.92      5272
       SUPPORTS       0.91      0.96      0.93      5389

       accuracy                           0.95     17327
      macro avg       0.95      0.94      0.94     17327
   weighted avg       0.95      0.95      0.95     17327



### Climate-FEVER

In [16]:
preds = trainer.predict(data["climatefever_test"])
micro_val, macro_val = generate_micro_macro_df(data["climatefever_test"], preds)

In [18]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.62      0.95      0.75       235
        REFUTES       0.76      0.27      0.40        48
       SUPPORTS       0.86      0.39      0.54       176

       accuracy                           0.67       459
      macro avg       0.75      0.54      0.56       459
   weighted avg       0.73      0.67      0.63       459

