In [1]:
import sys
sys.path.insert(0, "../src/gen")
sys.path.insert(1, "../src/rte")
from pathlib import Path
from collections import Counter

import numpy as np
from scipy.special import softmax

from util import read_data, write_jsonl
from aggregate import generate_micro_macro_df

In [2]:
# constants
LOOKUP = {
    "verifiable": {"no": "NOT VERIFIABLE", "yes": "VERIFIABLE"},
    "label": {"nei": "NOT ENOUGH INFO", "r": "REFUTES", "s": "SUPPORTS"}
}

SEED = 123456789

LABEL2ID = {"SUPPORTS": 0, "NOT ENOUGH INFO": 1, "REFUTES": 2}
ID2LABEL = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

# Init

In [3]:
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    pipeline
)

# Huggingface Init

## Model

In [4]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")

In [5]:
model_checkpoint = "xlnet-base-cased"
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=3, 
        id2label=ID2LABEL, 
        label2id=LABEL2ID
    )

model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess(examples):
    return tokenizer(examples["evidence"], examples["claim"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    
    return results

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [6]:
dataset = ["fever", "climatefeverpure", "fever-climatefeverpure", "climatefever", "fever-climatefever"]
task = ["hp_tune", "output"]
doc_sent = ["doc", "sent"]

di = 1
ti = 0
ds = 1

model_store_path = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/models").joinpath(model_checkpoint)
model_store_path.mkdir(exist_ok=True)
model_store_path = model_store_path / f"{dataset[di]}-{model_checkpoint}-{doc_sent[ds]}-{task[ti]}"

## Dataset

In [7]:
datap = Path(f"/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-{doc_sent[ds]}-evidence")

data = DatasetDict({
    "train": Dataset.from_list(read_data(datap / f"{dataset[di]}.train.n5.jsonl")),
    "validation": Dataset.from_list(read_data(datap / f"{dataset[di]}.dev.n5.jsonl")),
    "test": Dataset.from_list(read_data(datap / f"{dataset[di]}.test.n5.jsonl"))
}).map(preprocess, batched=True)

Map:   0%|          | 0/3246 [00:00<?, ? examples/s]

Map:   0%|          | 0/927 [00:00<?, ? examples/s]

Map:   0%|          | 0/459 [00:00<?, ? examples/s]

## Trainer

In [8]:
batch_size = 16
learning_rate = 4e-4
epoch = 4
metric_name = "f1"
warmup_ratio=0.1
save_steps=200
eval_steps=200

# Hyperparameter tuning

In [10]:
# shard the data if the dataset is large for hyperparameter tuning
shard = data["train"].num_rows > 50000
hp_tune_train = data["train"].shuffle(seed=SEED).shard(num_shards=5, index=1)

In [11]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

_ = model.train()
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_tune_train if shard else data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [12]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [1e-5, 3e-5, 2e-5]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
    }

def compute_objective(metrics):
    return metrics["f1"]

In [13]:
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=20, 
    hp_space=optuna_hp_space
)

[I 2023-07-03 12:50:23,213] A new study created in memory with name: no-name-c27257e7-8d86-41a3-8bb2-518143be6258
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.910866,0.61165,0.6031,0.642863,0.6011
400,No log,1.458986,0.601942,0.59768,0.669954,0.598754
600,0.533000,2.404453,0.557713,0.609952,0.594632,0.562048


[I 2023-07-03 12:51:55,037] Trial 0 finished with value: 2.324345351737909 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3}. Best is trial 0 with value: 2.324345351737909.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.904213,0.604099,0.579552,0.604901,0.582255


[I 2023-07-03 12:53:18,664] Trial 1 finished with value: 2.3708070238809817 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 1 with value: 2.3708070238809817.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.910866,0.61165,0.6031,0.642863,0.6011
400,No log,1.458986,0.601942,0.59768,0.669954,0.598754
600,0.533000,2.404453,0.557713,0.609952,0.594632,0.562048


[I 2023-07-03 12:54:50,534] Trial 2 finished with value: 2.324345351737909 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3}. Best is trial 1 with value: 2.3708070238809817.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newl

Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 12:55:20,677] Trial 3 finished with value: 1.6481139464211956 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 1}. Best is trial 1 with value: 2.3708070238809817.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized fr

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.904213,0.604099,0.579552,0.604901,0.582255


[I 2023-07-03 12:56:44,612] Trial 4 finished with value: 2.3708070238809817 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 1 with value: 2.3708070238809817.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.817094,0.626753,0.575035,0.652082,0.592144


[I 2023-07-03 12:57:15,923] Trial 5 finished with value: 2.4460142464815338 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1}. Best is trial 5 with value: 2.4460142464815338.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.827972,0.613808,0.575051,0.622654,0.585963


[I 2023-07-03 12:57:46,582] Trial 6 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.817094,0.626753,0.575035,0.652082,0.592144


[I 2023-07-03 12:58:18,285] Trial 7 finished with value: 2.4460142464815338 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1}. Best is trial 5 with value: 2.4460142464815338.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.829296,0.623517,0.557693,0.626151,0.574081


[I 2023-07-03 12:59:14,588] Trial 8 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.877388,0.564186,0.458334,0.56286,0.44321


[I 2023-07-03 13:00:10,952] Trial 9 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.902807,0.538296,0.448598,0.611658,0.4163


[I 2023-07-03 13:00:41,791] Trial 10 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.817094,0.626753,0.575035,0.652082,0.592144


[I 2023-07-03 13:01:13,207] Trial 11 finished with value: 2.4460142464815338 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1}. Best is trial 5 with value: 2.4460142464815338.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are ne

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.817094,0.626753,0.575035,0.652082,0.592144


[I 2023-07-03 13:01:44,159] Trial 12 finished with value: 2.4460142464815338 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1}. Best is trial 5 with value: 2.4460142464815338.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are ne

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.831162,0.627832,0.595951,0.620763,0.604968
400,No log,0.996525,0.629989,0.606224,0.623935,0.611823


[I 2023-07-03 13:02:45,642] Trial 13 finished with value: 2.471971059861637 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 13 with value: 2.471971059861637.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.831162,0.627832,0.595951,0.620763,0.604968
400,No log,0.996525,0.629989,0.606224,0.623935,0.611823


[I 2023-07-03 13:03:47,010] Trial 14 finished with value: 2.471971059861637 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 13 with value: 2.471971059861637.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.831162,0.627832,0.595951,0.620763,0.604968
400,No log,0.996525,0.629989,0.606224,0.623935,0.611823


[I 2023-07-03 13:04:48,412] Trial 15 finished with value: 2.471971059861637 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 13 with value: 2.471971059861637.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.831162,0.627832,0.595951,0.620763,0.604968
400,No log,0.996525,0.629989,0.606224,0.623935,0.611823


[I 2023-07-03 13:05:49,792] Trial 16 finished with value: 2.471971059861637 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 13 with value: 2.471971059861637.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.831162,0.627832,0.595951,0.620763,0.604968
400,No log,0.996525,0.629989,0.606224,0.623935,0.611823


[I 2023-07-03 13:06:51,156] Trial 17 finished with value: 2.471971059861637 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 13 with value: 2.471971059861637.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.998068,0.512406,0.333333,0.170802,0.225868


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 13:07:22,251] Trial 18 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weigh

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.957794,0.529666,0.42682,0.374828,0.380867


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 13:07:53,307] Trial 19 pruned. 


In [14]:
best_run

BestRun(run_id='13', objective=2.471971059861637, hyperparameters={'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}, run_summary=None)

## Train with best hyperparameters

In [9]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=5,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)
    
trainer.train()

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.820426,0.677454,0.648566,0.681366,0.657658
400,No log,1.089946,0.638619,0.638663,0.64637,0.630488


TrainOutput(global_step=406, training_loss=0.6346851969000153, metrics={'train_runtime': 110.8316, 'train_samples_per_second': 58.575, 'train_steps_per_second': 3.663, 'total_flos': 431480356112700.0, 'train_loss': 0.6346851969000153, 'epoch': 2.0})

In [10]:
trainer.save_model(model_store_path.parent / (model_store_path.stem + ".out"))

# Evaluate

In [11]:
import pandas as pd
from sklearn.metrics import classification_report

## Test on validation data

In [12]:
preds = trainer.predict(data["validation"])
micro_val, macro_val = generate_micro_macro_df(data["validation"], preds)

In [13]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.75      0.68      0.71       475
        REFUTES       0.70      0.53      0.60       132
       SUPPORTS       0.59      0.74      0.66       320

       accuracy                           0.68       927
      macro avg       0.68      0.65      0.66       927
   weighted avg       0.69      0.68      0.68       927



In [14]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.52      0.56      0.54        95
        REFUTES       0.69      0.43      0.53        51
       SUPPORTS       0.67      0.73      0.70       132

       accuracy                           0.62       278
      macro avg       0.63      0.57      0.59       278
   weighted avg       0.62      0.62      0.61       278



In [15]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["proba"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.54      0.67      0.60        95
        REFUTES       0.73      0.43      0.54        51
       SUPPORTS       0.71      0.69      0.70       132

       accuracy                           0.64       278
      macro avg       0.66      0.60      0.61       278
   weighted avg       0.65      0.64      0.64       278



## Test on test data

In [16]:
preds = trainer.predict(data["test"])
micro_val, macro_val = generate_micro_macro_df(data["test"], preds)

In [17]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.76      0.63      0.69       235
        REFUTES       0.47      0.42      0.44        48
       SUPPORTS       0.59      0.74      0.65       176

       accuracy                           0.65       459
      macro avg       0.60      0.59      0.59       459
   weighted avg       0.66      0.65      0.65       459



In [18]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.49      0.49      0.49        47
        REFUTES       0.50      0.32      0.39        25
       SUPPORTS       0.58      0.66      0.62        65

       accuracy                           0.54       137
      macro avg       0.52      0.49      0.50       137
   weighted avg       0.53      0.54      0.53       137



In [19]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["proba"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.56      0.66      0.61        47
        REFUTES       0.53      0.32      0.40        25
       SUPPORTS       0.66      0.68      0.67        65

       accuracy                           0.61       137
      macro avg       0.58      0.55      0.56       137
   weighted avg       0.60      0.61      0.60       137

