In [1]:
import sys
sys.path.insert(0, "../../src")
from pathlib import Path
from collections import Counter

import numpy as np
from scipy.special import softmax

from gen.util import read_data, write_jsonl
from rte.aggregate import generate_micro_macro_df

In [2]:
root_data = Path("../../data").resolve()
root_model = Path("../../models").resolve()

In [3]:
# constants
LOOKUP = {
    "verifiable": {"no": "NOT VERIFIABLE", "yes": "VERIFIABLE"},
    "label": {"nei": "NOT ENOUGH INFO", "r": "REFUTES", "s": "SUPPORTS"}
}

SEED = 123456789

LABEL2ID = {"SUPPORTS": 0, "NOT ENOUGH INFO": 1, "REFUTES": 2}
ID2LABEL = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

# Init

In [4]:
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    pipeline
)

  from .autonotebook import tqdm as notebook_tqdm


# Huggingface Init

## Model

In [5]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")

In [6]:
model_checkpoint = "bert-base-uncased"
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=3, 
        id2label=ID2LABEL, 
        label2id=LABEL2ID
    )

model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess(examples):
    return tokenizer(examples["evidence"], examples["claim"], max_length=512, truncation="only_first")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    
    return results

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
dataset = ["fever", "climatefeverpure", "fever-climatefeverpure", "climatefever", "fever-climatefever"]
doc_sent = ["doc", "sent"]

di = 0
ds = 1

model_store_path = root_model.joinpath(model_checkpoint)
model_store_path.mkdir(exist_ok=True)
model_store_path = model_store_path / f"{dataset[di]}-{model_checkpoint}-{doc_sent[ds]}"

## Dataset

In [8]:
datap = root_data / f"{doc_sent[ds]}-dataset"

data = DatasetDict({
    "train": Dataset.from_list(read_data(datap / f"{dataset[di]}.train.n5.jsonl")),
    "validation": Dataset.from_list(read_data(datap / f"{dataset[di]}.dev.n5.jsonl")),
    "test": Dataset.from_list(read_data(datap / f"{dataset[di]}.test.n5.jsonl"))
}).map(preprocess, batched=True)

                                                                      

## Trainer

In [8]:
batch_size = 16
learning_rate = 2e-5
epoch = 4
metric_name = "f1"
warmup_ratio=0.1
save_steps=200
eval_steps=200

# Hyperparameter tuning

In [9]:
# shard the data if the dataset is large for hyperparameter tuning
shard = data["train"].num_rows > 50000
hp_tune_train = data["train"].shuffle(seed=SEED).shard(num_shards=5, index=1)

In [10]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

_ = model.train()
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_tune_train if shard else data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
    }

def compute_objective(metrics):
    return metrics["f1"]

In [12]:
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=20, 
    hp_space=optuna_hp_space
)

[I 2023-07-02 22:45:06,097] A new study created in memory with name: no-name-37d4434d-1dee-4279-87ed-5a4253f73215
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClas

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.66124,0.67377,0.643054,0.460897,0.52728
400,No log,0.539173,0.770103,0.755476,0.808334,0.726822
600,0.562800,0.268429,0.904721,0.898588,0.89873,0.896791
800,0.562800,0.254028,0.913518,0.907115,0.907062,0.906717
1000,0.249700,0.233321,0.920022,0.913982,0.916712,0.913144
1200,0.249700,0.277065,0.909491,0.902535,0.911353,0.900359
1400,0.249700,0.231667,0.924854,0.918203,0.922928,0.917754
1600,0.195200,0.267987,0.923182,0.922281,0.917324,0.919027
1800,0.195200,0.249193,0.927642,0.923455,0.923442,0.921896
2000,0.148300,0.275984,0.920022,0.914826,0.918875,0.912893


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 23:11:37,614] Trial 0 finished with value: 3.7275370030572534 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 5}. Best is trial 0 with value: 3.7275370030572534.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenc

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.713938,0.656424,0.630072,0.459434,0.517025
400,No log,0.572352,0.710569,0.682192,0.724736,0.613365
600,0.608900,0.369273,0.856895,0.843793,0.868721,0.836504
800,0.608900,0.463622,0.85101,0.842995,0.868047,0.835117
1000,0.289100,0.266537,0.909491,0.90653,0.902463,0.90401
1200,0.289100,0.25405,0.9155,0.910151,0.909995,0.908843
1400,0.289100,0.299564,0.909057,0.902846,0.909395,0.900416
1600,0.227300,0.281428,0.918102,0.915227,0.913254,0.912469
1800,0.227300,0.311625,0.908376,0.90712,0.906291,0.902925
2000,0.227900,0.275113,0.916677,0.910674,0.916798,0.90879


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 23:26:59,684] Trial 1 finished with value: 3.7408988111774795 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 1 with value: 3.7408988111774795.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenc

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.698922,0.664602,0.636219,0.45987,0.521839
400,No log,0.566699,0.688638,0.660647,0.738482,0.558522
600,0.608300,0.336267,0.874799,0.867402,0.866054,0.865417
800,0.608300,0.266552,0.903853,0.897167,0.899918,0.895026
1000,0.269100,0.23226,0.921323,0.915652,0.916754,0.914432
1200,0.269100,0.257161,0.91296,0.90694,0.913312,0.904523
1400,0.269100,0.225788,0.925226,0.919257,0.922695,0.918457
1600,0.195600,0.242691,0.92758,0.925277,0.921837,0.923007
1800,0.195600,0.214552,0.934952,0.93016,0.930326,0.929989
2000,0.149500,0.231514,0.931483,0.92591,0.928592,0.925475


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-02 23:53:31,273] Trial 2 finished with value: 3.7379546212305836 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 5}. Best is trial 1 with value: 3.7408988111774795.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenc

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.655917,0.675505,0.644363,0.461181,0.528326
400,No log,0.544872,0.776732,0.763614,0.820566,0.738623
600,0.556600,0.27421,0.900322,0.893469,0.894359,0.891699
800,0.556600,0.254721,0.910544,0.905301,0.90549,0.903071
1000,0.244600,0.221874,0.923182,0.917806,0.91859,0.916516
1200,0.244600,0.276691,0.90249,0.894537,0.906878,0.891863
1400,0.244600,0.217758,0.926155,0.920352,0.923985,0.91953
1600,0.188600,0.23779,0.929129,0.926383,0.924057,0.924209
1800,0.188600,0.218685,0.934581,0.931085,0.929502,0.929655
2000,0.138100,0.238658,0.931855,0.926413,0.929039,0.925684


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:09:19,991] Trial 3 finished with value: 3.750187554771879 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 3 with value: 3.750187554771879.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceC

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.673817,0.668195,0.639731,0.463461,0.52497
400,No log,0.455397,0.796122,0.777375,0.817276,0.755482
600,0.571900,0.305834,0.887437,0.877734,0.886879,0.875873
800,0.571900,0.570284,0.84593,0.840046,0.863845,0.832247
1000,0.282500,0.276413,0.905588,0.901585,0.901045,0.898659
1200,0.282500,0.249472,0.915128,0.907797,0.911679,0.907887
1400,0.282500,0.298556,0.913456,0.907045,0.912697,0.905462
1600,0.233600,0.286449,0.91996,0.916439,0.916371,0.914175
1800,0.233600,0.276297,0.918226,0.915824,0.914702,0.912837
2000,0.230400,0.271623,0.919775,0.914782,0.917963,0.912968


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:24:42,239] Trial 4 finished with value: 3.7405969469052858 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 3 with value: 3.750187554771879.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequence

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.663612,0.673089,0.644814,0.468672,0.52936
400,No log,0.434438,0.827283,0.816443,0.842668,0.805802
600,0.527600,0.294412,0.902614,0.896931,0.895339,0.895885
800,0.527600,0.681339,0.822699,0.815111,0.854527,0.803995


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:26:58,222] Trial 5 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.614898,0.684735,0.654134,0.701922,0.544411
400,No log,0.449842,0.845372,0.836244,0.866024,0.827624
600,0.477500,0.248436,0.909862,0.902213,0.906367,0.901514
800,0.477500,0.248845,0.912898,0.909705,0.907681,0.906767
1000,0.231200,0.255283,0.912774,0.905156,0.914047,0.903913


[I 2023-07-03 00:30:50,123] Trial 6 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.621213,0.678974,0.648541,0.799326,0.531972
400,No log,0.551349,0.794635,0.780704,0.835425,0.759192
600,0.511300,0.25691,0.908747,0.901739,0.902338,0.90096
800,0.511300,0.271723,0.908066,0.904481,0.901273,0.902007
1000,0.244600,0.231932,0.921385,0.914403,0.916562,0.914781
1200,0.244600,0.298552,0.908562,0.902318,0.908898,0.8997
1400,0.244600,0.230403,0.924483,0.92015,0.920356,0.918387


[I 2023-07-03 00:36:13,822] Trial 7 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.622583,0.679779,0.649051,0.798456,0.532824
400,No log,0.452273,0.818858,0.805439,0.841408,0.789827
600,0.512700,0.27546,0.902366,0.897693,0.896143,0.894814
800,0.512700,0.225697,0.92027,0.91489,0.913836,0.914034
1000,0.232700,0.228013,0.922996,0.91639,0.920638,0.915939
1200,0.232700,0.239188,0.919465,0.913124,0.919309,0.911469
1400,0.232700,0.210516,0.930368,0.925536,0.92696,0.924423
1600,0.180400,0.237313,0.931297,0.928155,0.926392,0.92624
1800,0.180400,0.21678,0.934147,0.929685,0.929635,0.92873
2000,0.133300,0.221484,0.933156,0.928733,0.929291,0.927653


[I 2023-07-03 00:45:27,449] Trial 8 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.804362,0.620555,0.60053,0.446236,0.492092


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:46:02,169] Trial 9 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.596001,0.686222,0.655545,0.692106,0.549174
400,No log,0.407653,0.840726,0.83023,0.856344,0.819889
600,0.475000,0.268968,0.903048,0.898465,0.897403,0.895534
800,0.475000,0.230939,0.92027,0.915061,0.915427,0.913449
1000,0.227900,0.232635,0.921633,0.915719,0.919357,0.914355
1200,0.227900,0.229238,0.922376,0.916482,0.920252,0.915026


[I 2023-07-03 00:51:06,126] Trial 10 finished with value: 3.674136889484728 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 1}. Best is trial 3 with value: 3.750187554771879.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you 

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.755566,0.642238,0.618349,0.453993,0.507188


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:51:40,941] Trial 11 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.784692,0.629352,0.607767,0.449446,0.498267


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:52:15,604] Trial 12 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.661518,0.672407,0.644151,0.467679,0.528682
400,No log,0.398717,0.831867,0.816388,0.840346,0.806251
600,0.542900,0.338967,0.872197,0.861355,0.87807,0.856416


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:53:57,876] Trial 13 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.676214,0.670611,0.640775,0.460771,0.52549


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:54:45,118] Trial 14 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.714016,0.656424,0.630072,0.459434,0.517025


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 00:55:19,695] Trial 15 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.622583,0.679779,0.649051,0.798456,0.532824
400,No log,0.452273,0.818858,0.805439,0.841408,0.789827
600,0.512700,0.27546,0.902366,0.897693,0.896143,0.894814
800,0.512700,0.225697,0.92027,0.91489,0.913836,0.914034
1000,0.232700,0.228013,0.922996,0.91639,0.920638,0.915939
1200,0.232700,0.239188,0.919465,0.913124,0.919309,0.911469
1400,0.232700,0.210516,0.930368,0.925536,0.92696,0.924423
1600,0.180400,0.237313,0.931297,0.928155,0.926392,0.92624
1800,0.180400,0.21678,0.934147,0.929685,0.929635,0.92873
2000,0.133300,0.221484,0.933156,0.928733,0.929291,0.927653


[I 2023-07-03 01:04:33,893] Trial 16 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.784692,0.629352,0.607767,0.449446,0.498267


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 01:05:09,440] Trial 17 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.714016,0.656424,0.630072,0.459434,0.517025


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 01:05:44,019] Trial 18 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.399952,0.832363,0.819415,0.841437,0.809131
400,No log,0.430494,0.86086,0.851505,0.876884,0.844144
600,0.395300,0.235065,0.916057,0.911863,0.910538,0.909805
800,0.395300,0.235221,0.922562,0.917968,0.91729,0.916428
1000,0.206700,0.223917,0.927952,0.922378,0.926033,0.921359
1200,0.206700,0.225649,0.924669,0.919261,0.923845,0.917701


[I 2023-07-03 01:10:47,932] Trial 19 finished with value: 3.68547573227396 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 1}. Best is trial 3 with value: 3.750187554771879.


In [13]:
best_run

BestRun(run_id='3', objective=3.750187554771879, hyperparameters={'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}, run_summary=None)

## Train with best hyperparameters

In [12]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=5,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)
    
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1000,0.3779,0.282555,0.896201,0.890241,0.898962,0.888655
2000,0.1696,0.19939,0.934364,0.932148,0.932955,0.931223
3000,0.1536,0.169217,0.94971,0.946888,0.948141,0.946748
4000,0.1308,0.148025,0.955227,0.953154,0.953243,0.952832
5000,0.1344,0.143667,0.957124,0.955334,0.954964,0.954972
6000,0.1177,0.166775,0.952583,0.950127,0.951747,0.949852
7000,0.1168,0.138376,0.958848,0.95662,0.957303,0.956576
8000,0.1011,0.160141,0.959653,0.957636,0.958052,0.957443
9000,0.0814,0.155362,0.961205,0.959438,0.959136,0.959189
10000,0.0774,0.146129,0.961492,0.959656,0.959629,0.959471


TrainOutput(global_step=23733, training_loss=0.10302246744561495, metrics={'train_runtime': 4843.5148, 'train_samples_per_second': 156.791, 'train_steps_per_second': 4.9, 'total_flos': 5.144692423815806e+16, 'train_loss': 0.10302246744561495, 'epoch': 3.0})

In [13]:
trainer.save_model(model_store_path.parent / (model_store_path.stem + ".out"))

# Evaluate

In [14]:
import pandas as pd
from sklearn.metrics import classification_report

## Test on validation data

In [15]:
preds = trainer.predict(data["validation"])
micro_val, macro_val = generate_micro_macro_df(data["validation"], preds)

In [16]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.99      0.99      0.99      6666
        REFUTES       0.96      0.92      0.94      5347
       SUPPORTS       0.94      0.97      0.95      5386

       accuracy                           0.96     17399
      macro avg       0.96      0.96      0.96     17399
   weighted avg       0.96      0.96      0.96     17399



## Test on test data

In [19]:
preds = trainer.predict(data["test"])
micro_val, macro_val = generate_micro_macro_df(data["test"], preds)

In [20]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.98      0.99      0.99      6666
        REFUTES       0.95      0.90      0.92      5272
       SUPPORTS       0.92      0.96      0.94      5389

       accuracy                           0.95     17327
      macro avg       0.95      0.95      0.95     17327
   weighted avg       0.95      0.95      0.95     17327

