In [1]:
import sys
sys.path.insert(0, "../src/gen")
sys.path.insert(1, "../src/rte")
from pathlib import Path
from collections import Counter

import numpy as np
from scipy.special import softmax

from util import read_data, write_jsonl
from aggregate import generate_micro_macro_df

In [2]:
# constants
LOOKUP = {
    "verifiable": {"no": "NOT VERIFIABLE", "yes": "VERIFIABLE"},
    "label": {"nei": "NOT ENOUGH INFO", "r": "REFUTES", "s": "SUPPORTS"}
}

SEED = 123456789

LABEL2ID = {"SUPPORTS": 0, "NOT ENOUGH INFO": 1, "REFUTES": 2}
ID2LABEL = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

# Init

In [3]:
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    pipeline
)

# Huggingface Init

## Model

In [4]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")

In [5]:
model_checkpoint = "bert-base-uncased"
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=3, 
        id2label=ID2LABEL, 
        label2id=LABEL2ID
    )

model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess(examples):
    return tokenizer(examples["evidence"], examples["claim"], max_length=512, truncation="only_first")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    
    return results

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
dataset = ["fever", "climatefeverpure", "fever-climatefeverpure", "climatefever", "fever-climatefever"]
task = ["hp_tune", "output"]
doc_sent = ["doc", "sent"]

di = 1
ti = 0
ds = 1

model_store_path = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/models").joinpath(model_checkpoint)
model_store_path.mkdir(exist_ok=True)
model_store_path = model_store_path / f"{dataset[di]}-{model_checkpoint}-{doc_sent[ds]}-{task[ti]}"

## Dataset

In [7]:
datap = Path(f"/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-{doc_sent[ds]}-evidence")

data = DatasetDict({
    "train": Dataset.from_list(read_data(datap / f"{dataset[di]}.train.n5.jsonl")),
    "validation": Dataset.from_list(read_data(datap / f"{dataset[di]}.dev.n5.jsonl")),
    "test": Dataset.from_list(read_data(datap / f"{dataset[di]}.test.n5.jsonl"))
}).map(preprocess, batched=True)

Map:   0%|          | 0/3246 [00:00<?, ? examples/s]

Map:   0%|          | 0/927 [00:00<?, ? examples/s]

Map:   0%|          | 0/459 [00:00<?, ? examples/s]

## Trainer

In [8]:
batch_size = 16
learning_rate = 2e-5
epoch = 4
metric_name = "f1"
warmup_ratio=0.1
save_steps=200
eval_steps=200

# Hyperparameter tuning

In [9]:
# shard the data if the dataset is large for hyperparameter tuning
shard = data["train"].num_rows > 50000
hp_tune_train = data["train"].shuffle(seed=SEED).shard(num_shards=5, index=1)

In [10]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

_ = model.train()
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_tune_train if shard else data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
    }

def compute_objective(metrics):
    return metrics["f1"]

In [12]:
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=20, 
    hp_space=optuna_hp_space
)

[I 2023-07-02 22:28:20,171] A new study created in memory with name: no-name-17fef9c7-094f-4bf4-9b60-5a11b61f0bb8
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClas

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.921518,0.63754,0.514799,0.631454,0.501793
400,No log,1.360084,0.606257,0.593936,0.587575,0.58048


[I 2023-07-02 22:30:07,256] Trial 0 finished with value: 2.3682484199567595 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 5}. Best is trial 0 with value: 2.3682484199567595.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.928846,0.604099,0.473882,0.405993,0.43118
400,No log,1.076467,0.612729,0.544926,0.594452,0.549344


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:31:21,870] Trial 1 finished with value: 2.301451273904334 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 0 with value: 2.3682484199567595.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequence

Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:31:42,007] Trial 2 finished with value: 1.9118813595322481 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 1}. Best is trial 0 with value: 2.3682484199567595.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenc

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.87159,0.618123,0.483004,0.413686,0.440486


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:32:03,910] Trial 3 finished with value: 1.9552988680240306 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1}. Best is trial 0 with value: 2.3682484199567595.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenc

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.951339,0.60302,0.477383,0.741092,0.435768


[I 2023-07-02 22:32:58,593] Trial 4 finished with value: 2.2572634120007176 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 0 with value: 2.3682484199567595.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.928846,0.604099,0.473882,0.405993,0.43118


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:33:35,519] Trial 5 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.944621,0.648328,0.566476,0.662307,0.576952


[I 2023-07-02 22:34:12,983] Trial 6 finished with value: 2.4540627862546955 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 2}. Best is trial 6 with value: 2.4540627862546955.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.930996,0.590076,0.464419,0.396672,0.421646


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:34:49,882] Trial 7 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.891536,0.648328,0.515294,0.760837,0.487808
400,No log,1.115915,0.638619,0.614773,0.621635,0.612853


[I 2023-07-02 22:36:03,890] Trial 8 finished with value: 2.4878796814817714 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 8 with value: 2.4878796814817714.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.944621,0.648328,0.566476,0.662307,0.576952


[I 2023-07-02 22:36:41,466] Trial 9 finished with value: 2.4540627862546955 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 2}. Best is trial 8 with value: 2.4878796814817714.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.895271,0.62891,0.475746,0.409954,0.440159


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:37:01,640] Trial 10 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.928575,0.641855,0.55182,0.664005,0.566535
400,No log,1.201953,0.638619,0.647256,0.610233,0.619533


[I 2023-07-02 22:37:41,248] Trial 11 finished with value: 2.5156413600393424 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 11 with value: 2.5156413600393424.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that y

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.928575,0.641855,0.55182,0.664005,0.566535
400,No log,1.201953,0.638619,0.647256,0.610233,0.619533


[I 2023-07-02 22:38:20,894] Trial 12 finished with value: 2.5156413600393424 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 11 with value: 2.5156413600393424.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that y

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.928575,0.641855,0.55182,0.664005,0.566535
400,No log,1.201953,0.638619,0.647256,0.610233,0.619533


[I 2023-07-02 22:39:00,772] Trial 13 finished with value: 2.5156413600393424 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 11 with value: 2.5156413600393424.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that y

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.928575,0.641855,0.55182,0.664005,0.566535
400,No log,1.201953,0.638619,0.647256,0.610233,0.619533


[I 2023-07-02 22:39:40,301] Trial 14 finished with value: 2.5156413600393424 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 11 with value: 2.5156413600393424.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that y

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.8727,0.597627,0.47239,0.408121,0.427874


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:40:00,539] Trial 15 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.889845,0.641855,0.489605,0.419424,0.451786


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:40:20,681] Trial 16 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.928575,0.641855,0.55182,0.664005,0.566535
400,No log,1.201953,0.638619,0.647256,0.610233,0.619533


[I 2023-07-02 22:41:00,393] Trial 17 finished with value: 2.5156413600393424 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 11 with value: 2.5156413600393424.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that y

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.8727,0.597627,0.47239,0.408121,0.427874


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 22:41:20,542] Trial 18 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.928575,0.641855,0.55182,0.664005,0.566535
400,No log,1.201953,0.638619,0.647256,0.610233,0.619533


[I 2023-07-02 22:42:00,058] Trial 19 finished with value: 2.5156413600393424 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 11 with value: 2.5156413600393424.


In [13]:
best_run

BestRun(run_id='11', objective=2.5156413600393424, hyperparameters={'learning_rate': 5e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}, run_summary=None)

## Train with best hyperparameters

In [9]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=5,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)
    
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.964099,0.622438,0.495237,0.752013,0.454259
400,No log,1.221255,0.645092,0.614377,0.619184,0.607757


TrainOutput(global_step=406, training_loss=0.5194713799237031, metrics={'train_runtime': 42.0091, 'train_samples_per_second': 154.538, 'train_steps_per_second': 9.665, 'total_flos': 380892644698860.0, 'train_loss': 0.5194713799237031, 'epoch': 2.0})

In [10]:
trainer.save_model(model_store_path.parent / (model_store_path.stem + ".out"))

# Evaluate

In [11]:
import pandas as pd
from sklearn.metrics import classification_report

## Test on validation data

In [12]:
preds = trainer.predict(data["validation"])
micro_val, macro_val = generate_micro_macro_df(data["validation"], preds)

In [13]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.76      0.60      0.67       475
        REFUTES       0.52      0.45      0.48       132
       SUPPORTS       0.58      0.78      0.66       320

       accuracy                           0.65       927
      macro avg       0.62      0.61      0.61       927
   weighted avg       0.66      0.65      0.64       927



In [14]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.49      0.36      0.41        95
        REFUTES       0.46      0.37      0.41        51
       SUPPORTS       0.63      0.80      0.71       132

       accuracy                           0.57       278
      macro avg       0.53      0.51      0.51       278
   weighted avg       0.55      0.57      0.55       278



In [15]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["proba"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.59      0.63      0.61        95
        REFUTES       0.56      0.35      0.43        51
       SUPPORTS       0.70      0.77      0.74       132

       accuracy                           0.65       278
      macro avg       0.62      0.59      0.59       278
   weighted avg       0.64      0.65      0.64       278



## Test on test data

In [16]:
preds = trainer.predict(data["test"])
micro_val, macro_val = generate_micro_macro_df(data["test"], preds)

In [17]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.76      0.61      0.68       235
        REFUTES       0.38      0.44      0.41        48
       SUPPORTS       0.58      0.71      0.64       176

       accuracy                           0.63       459
      macro avg       0.57      0.59      0.57       459
   weighted avg       0.65      0.63      0.63       459



In [18]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.57      0.43      0.49        47
        REFUTES       0.41      0.36      0.38        25
       SUPPORTS       0.60      0.74      0.66        65

       accuracy                           0.56       137
      macro avg       0.53      0.51      0.51       137
   weighted avg       0.56      0.56      0.55       137



In [19]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["proba"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.57      0.60      0.58        47
        REFUTES       0.50      0.36      0.42        25
       SUPPORTS       0.61      0.66      0.64        65

       accuracy                           0.58       137
      macro avg       0.56      0.54      0.55       137
   weighted avg       0.58      0.58      0.58       137

