In [1]:
import sys
sys.path.insert(0, "../../src")
from pathlib import Path
from collections import Counter

import numpy as np
from scipy.special import softmax

from gen.util import read_data, write_jsonl
from rte.aggregate import generate_micro_macro_df

In [2]:
root_data = Path("../../data").resolve()
root_model = Path("../../models").resolve()

In [3]:
# constants
LOOKUP = {
    "verifiable": {"no": "NOT VERIFIABLE", "yes": "VERIFIABLE"},
    "label": {"nei": "NOT ENOUGH INFO", "r": "REFUTES", "s": "SUPPORTS"}
}

SEED = 123456789

LABEL2ID = {"SUPPORTS": 0, "NOT ENOUGH INFO": 1, "REFUTES": 2}
ID2LABEL = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

# Init

In [4]:
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    pipeline
)
import torch
torch.backends.cuda.matmul.allow_tf32 = True

  from .autonotebook import tqdm as notebook_tqdm


# Huggingface Init

## Model

In [5]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")

In [6]:
model_checkpoint = "bert-base-uncased"
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=3, 
        id2label=ID2LABEL, 
        label2id=LABEL2ID
    )

model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess(examples):
    return tokenizer(examples["evidence"], examples["claim"], max_length=512, truncation="only_first")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    
    return results

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
dataset = ["fever", "climatefeverpure", "fever-climatefeverpure"]
doc_sent = ["doc", "sent"]

di = 0
ds = 0

model_store_path = root_model.joinpath(model_checkpoint)
model_store_path.mkdir(exist_ok=True)
model_store_path = model_store_path / f"{dataset[di]}-{model_checkpoint}-{doc_sent[ds]}"

## Dataset

In [8]:
datap = root_data / f"{doc_sent[ds]}-dataset"

data = DatasetDict({
    "train": Dataset.from_list(read_data(datap / f"{dataset[di]}.train.n5.jsonl")),
    "validation": Dataset.from_list(read_data(datap / f"{dataset[di]}.dev.n5.jsonl")),
    "test": Dataset.from_list(read_data(datap / f"{dataset[di]}.test.n5.jsonl"))
}).map(preprocess, batched=True)

                                                                      

## Trainer

In [8]:
batch_size = 32
learning_rate = 2e-5
epoch = 4
metric_name = "f1"
warmup_ratio=0.1
save_steps=200
eval_steps=200

# Hyperparameter tuning

In [9]:
# shard the data if the dataset is large for hyperparameter tuning
shard = data["train"].num_rows > 50000
hp_tune_train = data["train"].shuffle(seed=SEED).shard(num_shards=5, index=1)

In [10]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard",
    tf32=True
)

_ = model.train()
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_tune_train if shard else data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
    }

def compute_objective(metrics):
    return metrics["f1"]

In [12]:
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=10, 
    hp_space=optuna_hp_space
)

[I 2023-07-04 13:43:19,788] A new study created in memory with name: no-name-d00b05c7-1510-47de-b5f6-a7e264da984e
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClas

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.66162,0.658066,0.658066,0.6592,0.547713
400,No log,0.239154,0.912491,0.912491,0.916174,0.912087
600,0.438900,0.20963,0.925993,0.925993,0.928032,0.925833
800,0.438900,0.208217,0.931293,0.931293,0.933916,0.931267
1000,0.190900,0.211251,0.932693,0.932693,0.936083,0.932621
1200,0.190900,0.194829,0.939994,0.939994,0.941238,0.939965
1400,0.190900,0.242561,0.925193,0.925193,0.930704,0.924873
1600,0.134500,0.234694,0.933893,0.933893,0.937704,0.933789
1800,0.134500,0.201446,0.936894,0.936894,0.939571,0.936696
2000,0.107000,0.252698,0.936494,0.936494,0.939619,0.936383


[I 2023-07-04 13:50:43,919] Trial 0 finished with value: 3.7657467255346386 and parameters: {'learning_rate': 3e-05, 'num_train_epochs': 3}. Best is trial 0 with value: 3.7657467255346386.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (in

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.64642,0.655766,0.655766,0.4892,0.545109
400,No log,0.306013,0.878288,0.878288,0.891253,0.876036
600,0.500800,0.216311,0.921292,0.921292,0.922859,0.920955
800,0.500800,0.213145,0.925993,0.925993,0.92998,0.925847
1000,0.202700,0.203453,0.934693,0.934693,0.936586,0.934636
1200,0.202700,0.19088,0.939794,0.939794,0.940391,0.939739
1400,0.202700,0.211524,0.926593,0.926593,0.931494,0.926282
1600,0.147800,0.230081,0.930693,0.930693,0.934777,0.930575
1800,0.147800,0.220852,0.930793,0.930793,0.935333,0.930619
2000,0.118400,0.250252,0.933593,0.933593,0.936882,0.933674


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-04 14:00:37,725] Trial 1 finished with value: 3.7550912675037296 and parameters: {'learning_rate': 2e-05, 'num_train_epochs': 4}. Best is trial 0 with value: 3.7657467255346386.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.421591,0.811181,0.811181,0.849094,0.799006
400,No log,0.236395,0.916292,0.916292,0.919214,0.916084
600,0.398800,0.201488,0.929793,0.929793,0.930568,0.92964
800,0.398800,0.195903,0.923592,0.923592,0.928485,0.923307
1000,0.196000,0.210948,0.934493,0.934493,0.935555,0.934464
1200,0.196000,0.214844,0.936294,0.936294,0.93728,0.936243
1400,0.196000,0.253301,0.929393,0.929393,0.932987,0.929063
1600,0.134700,0.26101,0.931193,0.931193,0.935101,0.93085
1800,0.134700,0.227615,0.931393,0.931393,0.935594,0.931247
2000,0.103200,0.283238,0.935194,0.935194,0.938051,0.93493


[I 2023-07-04 14:07:58,179] Trial 2 finished with value: 3.7526556601423042 and parameters: {'learning_rate': 5e-05, 'num_train_epochs': 3}. Best is trial 0 with value: 3.7657467255346386.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (in

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.638685,0.657166,0.657166,0.490467,0.546401
400,No log,0.297662,0.888589,0.888589,0.896408,0.888128
600,0.482400,0.201401,0.925193,0.925193,0.92572,0.925024
800,0.482400,0.22161,0.922592,0.922592,0.92779,0.92226
1000,0.202000,0.202357,0.934493,0.934493,0.936732,0.934358
1200,0.202000,0.207302,0.936294,0.936294,0.937527,0.936291
1400,0.202000,0.254437,0.918192,0.918192,0.926514,0.917614
1600,0.146300,0.22439,0.933593,0.933593,0.937324,0.933337
1800,0.146300,0.200387,0.932293,0.932293,0.936124,0.932034
2000,0.119500,0.248337,0.932393,0.932393,0.936529,0.932306


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-04 14:20:16,192] Trial 3 finished with value: 3.7486683219841455 and parameters: {'learning_rate': 3e-05, 'num_train_epochs': 5}. Best is trial 0 with value: 3.7657467255346386.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.66041,0.657766,0.657766,0.65929,0.547541
400,No log,0.265282,0.90139,0.90139,0.910063,0.900281
600,0.443400,0.212183,0.925593,0.925593,0.928161,0.925487
800,0.443400,0.207044,0.930793,0.930793,0.933915,0.930638
1000,0.193400,0.192595,0.937394,0.937394,0.938419,0.937468
1200,0.193400,0.188796,0.941194,0.941194,0.94199,0.941255
1400,0.193400,0.21406,0.932293,0.932293,0.936176,0.932091
1600,0.135900,0.225375,0.933393,0.933393,0.936936,0.933297
1800,0.135900,0.210765,0.937094,0.937094,0.939722,0.937012


[I 2023-07-04 14:25:13,468] Trial 4 finished with value: 3.75092165167713 and parameters: {'learning_rate': 2e-05, 'num_train_epochs': 2}. Best is trial 0 with value: 3.7657467255346386.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (init

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.421591,0.811181,0.811181,0.849094,0.799006
400,No log,0.236395,0.916292,0.916292,0.919214,0.916084
600,0.398800,0.201488,0.929793,0.929793,0.930568,0.92964
800,0.398800,0.195903,0.923592,0.923592,0.928485,0.923307
1000,0.196000,0.210948,0.934493,0.934493,0.935555,0.934464


[I 2023-07-04 14:27:58,545] Trial 5 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.421591,0.811181,0.811181,0.849094,0.799006
400,No log,0.236395,0.916292,0.916292,0.919214,0.916084
600,0.398800,0.201488,0.929793,0.929793,0.930568,0.92964
800,0.398800,0.195903,0.923592,0.923592,0.928485,0.923307
1000,0.196000,0.210948,0.934493,0.934493,0.935555,0.934464


[I 2023-07-04 14:30:43,660] Trial 6 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.66162,0.658066,0.658066,0.6592,0.547713
400,No log,0.247155,0.910791,0.910791,0.911932,0.911265
600,0.443700,0.203672,0.926693,0.926693,0.928743,0.926445
800,0.443700,0.211886,0.924892,0.924892,0.928839,0.924667
1000,0.206400,0.211908,0.931793,0.931793,0.934177,0.931588


[I 2023-07-04 14:33:29,292] Trial 7 pruned. 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.469518,0.732673,0.732673,0.831569,0.684342
400,No log,0.230421,0.921692,0.921692,0.922456,0.921592
600,0.400900,0.195518,0.927893,0.927893,0.930097,0.927742
800,0.400900,0.198819,0.935594,0.935594,0.938158,0.93549
1000,0.186600,0.208504,0.935594,0.935594,0.938557,0.935567
1200,0.186600,0.199093,0.940294,0.940294,0.941006,0.940311
1400,0.186600,0.198199,0.940694,0.940694,0.942118,0.940634
1600,0.126000,0.234338,0.934093,0.934093,0.937923,0.93398
1800,0.126000,0.207779,0.939494,0.939494,0.942063,0.939446


[I 2023-07-04 14:38:27,071] Trial 8 finished with value: 3.7604967047778217 and parameters: {'learning_rate': 3e-05, 'num_train_epochs': 2}. Best is trial 0 with value: 3.7657467255346386.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (in

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.469518,0.732673,0.732673,0.831569,0.684342
400,No log,0.230421,0.921692,0.921692,0.922456,0.921592
600,0.400900,0.195518,0.927893,0.927893,0.930097,0.927742
800,0.400900,0.198819,0.935594,0.935594,0.938158,0.93549
1000,0.186600,0.208504,0.935594,0.935594,0.938557,0.935567
1200,0.186600,0.199093,0.940294,0.940294,0.941006,0.940311
1400,0.186600,0.198199,0.940694,0.940694,0.942118,0.940634
1600,0.126000,0.234338,0.934093,0.934093,0.937923,0.93398
1800,0.126000,0.207779,0.939494,0.939494,0.942063,0.939446


[I 2023-07-04 14:43:25,487] Trial 9 finished with value: 3.7604967047778217 and parameters: {'learning_rate': 3e-05, 'num_train_epochs': 2}. Best is trial 0 with value: 3.7657467255346386.


In [13]:
best_run

BestRun(run_id='0', objective=3.7657467255346386, hyperparameters={'learning_rate': 3e-05, 'num_train_epochs': 3}, run_summary=None)

## Train with best hyperparameters

In [9]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=5,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard",
    tf32=True
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)
    
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1000,0.2242,0.200795,0.933293,0.933293,0.934953,0.9332
2000,0.1637,0.168868,0.943494,0.943494,0.944997,0.943356
3000,0.1505,0.167306,0.947995,0.947995,0.950445,0.947985
4000,0.1408,0.161222,0.949395,0.949395,0.951864,0.949295
5000,0.1069,0.14205,0.958596,0.958596,0.958938,0.958573
6000,0.1003,0.17635,0.952995,0.952995,0.954508,0.952972
7000,0.1003,0.126838,0.959496,0.959496,0.959883,0.959483
8000,0.1015,0.151014,0.956596,0.956596,0.957435,0.956654
9000,0.0943,0.140714,0.959296,0.959296,0.959891,0.959293
10000,0.0569,0.184445,0.960196,0.960196,0.960646,0.960209


TrainOutput(global_step=13638, training_loss=0.12346009352214526, metrics={'train_runtime': 2666.9916, 'train_samples_per_second': 163.61, 'train_steps_per_second': 5.114, 'total_flos': 8.67066711249443e+16, 'train_loss': 0.12346009352214526, 'epoch': 3.0})

In [10]:
trainer.save_model(model_store_path.parent / (model_store_path.stem + ".out"))

# Evaluate

In [11]:
import pandas as pd
from sklearn.metrics import classification_report

## Test on validation data

In [12]:
preds = trainer.predict(data["validation"])
val = generate_doc_df(data["validation"], preds)

In [13]:
print(classification_report(y_true=val["actual"], y_pred=val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       1.00      0.99      1.00      3333
        REFUTES       0.96      0.92      0.94      3333
       SUPPORTS       0.93      0.96      0.94      3333

       accuracy                           0.96      9999
      macro avg       0.96      0.96      0.96      9999
   weighted avg       0.96      0.96      0.96      9999



## Test on test data

In [None]:
preds = trainer.predict(data["test"])
tes = generate_doc_df(data["test"], preds)

In [None]:
print(classification_report(y_true=tes["actual"], y_pred=tes["predicted"]))