In [1]:
import sys
sys.path.insert(0, "../src/gen")
sys.path.insert(1, "../src/rte")
from pathlib import Path
from collections import Counter

import numpy as np
from scipy.special import softmax

from util import read_data, write_jsonl
from aggregate import generate_micro_macro_df

In [2]:
# constants
LOOKUP = {
    "verifiable": {"no": "NOT VERIFIABLE", "yes": "VERIFIABLE"},
    "label": {"nei": "NOT ENOUGH INFO", "r": "REFUTES", "s": "SUPPORTS"}
}

SEED = 123456789

LABEL2ID = {"SUPPORTS": 0, "NOT ENOUGH INFO": 1, "REFUTES": 2}
ID2LABEL = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

# Init

In [3]:
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    pipeline
)

# Huggingface Init

## Model

In [4]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")

In [5]:
model_checkpoint = "xlnet-base-cased"
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=3, 
        id2label=ID2LABEL, 
        label2id=LABEL2ID
    )

model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess(examples):
    return tokenizer(examples["evidence"], examples["claim"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    
    return results

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [6]:
dataset = ["fever", "climatefever", "fever-climatefever"]
task = ["hp_tune", "output"]

di = 0
ti = 0

model_store_path = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/models").joinpath(model_checkpoint)
model_store_path.mkdir(exist_ok=True)
model_store_path = model_store_path / f"{dataset[di]}-{model_checkpoint}-{task[ti]}"

## Dataset

In [7]:
datap = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-sent-evidence")

data = DatasetDict({
    "train": Dataset.from_list(read_data(datap / f"{dataset[di]}.train.n5.jsonl")),
    "validation": Dataset.from_list(read_data(datap / f"{dataset[di]}.dev.n5.jsonl")),
    "test": Dataset.from_list(read_data(datap / f"{dataset[di]}.test.n5.jsonl"))
}).map(preprocess, batched=True)

Map:   0%|          | 0/223460 [00:00<?, ? examples/s]

Map:   0%|          | 0/16142 [00:00<?, ? examples/s]

Map:   0%|          | 0/16206 [00:00<?, ? examples/s]

## Trainer

In [8]:
batch_size = 16
learning_rate = 4e-4
epoch = 4
metric_name = "f1"
warmup_ratio=0.1
save_steps=200
eval_steps=200

# Hyperparameter tuning

In [9]:
# shard the data if the dataset is large for hyperparameter tuning
shard = data["train"].num_rows > 50000
hp_tune_train = data["train"].shuffle(seed=SEED).shard(num_shards=5, index=1)

In [10]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

_ = model.train()
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_tune_train if shard else data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [11]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [1e-5, 3e-5, 2e-5]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
    }

def compute_objective(metrics):
    return metrics["f1"]

In [12]:
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=20, 
    hp_space=optuna_hp_space
)

[I 2023-07-03 12:57:58,567] A new study created in memory with name: no-name-2bef115b-c1dc-4b25-97b1-f3755592b842
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summar

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,1.104038,0.425536,0.439966,0.367711,0.336744
400,No log,0.873155,0.573349,0.563443,0.604527,0.463396
600,0.927000,0.573731,0.729216,0.708241,0.79535,0.659866
800,0.927000,0.626781,0.793644,0.785391,0.834284,0.770983
1000,0.435300,0.340676,0.878082,0.868632,0.87765,0.866576
1200,0.435300,0.298316,0.898464,0.890141,0.89309,0.890772
1400,0.435300,0.472975,0.860055,0.852273,0.873206,0.846291
1600,0.307100,0.300246,0.903544,0.899288,0.897161,0.897288
1800,0.307100,0.318785,0.901004,0.892937,0.901905,0.891543
2000,0.281900,0.320226,0.895986,0.897514,0.891704,0.892


[I 2023-07-03 14:00:36,014] Trial 0 finished with value: 3.7200750259754667 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 5}. Best is trial 0 with value: 3.7200750259754667.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.628372,0.700161,0.682135,0.730822,0.62849
400,No log,0.412479,0.851134,0.850169,0.858349,0.843399
600,0.586500,0.291437,0.896419,0.892769,0.892628,0.889661
800,0.586500,0.358873,0.886136,0.886796,0.884524,0.880672
1000,0.257000,0.29788,0.901561,0.901587,0.898697,0.896829
1200,0.257000,0.256079,0.918597,0.91651,0.912997,0.913521
1400,0.257000,0.21203,0.929377,0.924447,0.925061,0.923925
1600,0.208700,0.275353,0.928695,0.925179,0.923107,0.923539
1800,0.208700,0.261209,0.924545,0.922519,0.919569,0.919763
2000,0.161000,0.265897,0.928819,0.926188,0.924124,0.924029


[I 2023-07-03 14:27:03,281] Trial 1 finished with value: 3.741136595545795 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.936659,0.578615,0.568511,0.443802,0.46442
400,No log,0.505874,0.833664,0.822162,0.847289,0.813687
600,0.670300,0.320832,0.888118,0.880775,0.882481,0.87969
800,0.670300,0.429181,0.867365,0.86359,0.871978,0.858462
1000,0.306900,0.279572,0.904906,0.898293,0.900055,0.897243
1200,0.306900,0.304974,0.908623,0.900092,0.904457,0.901564
1400,0.306900,0.308111,0.90026,0.894686,0.899326,0.892204
1600,0.256500,0.30631,0.906641,0.903065,0.901997,0.900367
1800,0.256500,0.33434,0.894313,0.892072,0.894141,0.887468
2000,0.255300,0.280942,0.912526,0.911056,0.907341,0.907681


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 14:39:24,170] Trial 2 finished with value: 3.6684040371613156 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized fro

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.849029,0.616157,0.593408,0.684299,0.488341
400,No log,0.455377,0.81638,0.805563,0.834531,0.79473
600,0.704900,0.279086,0.898773,0.890799,0.893209,0.891043
800,0.704900,0.343158,0.881799,0.883181,0.876809,0.877259
1000,0.285700,0.298228,0.900942,0.901366,0.895607,0.896612
1200,0.285700,0.283095,0.906022,0.903629,0.901827,0.9
1400,0.285700,0.220451,0.92597,0.920422,0.921875,0.920262
1600,0.223800,0.282476,0.917111,0.914276,0.912513,0.91147
1800,0.223800,0.244075,0.921633,0.919223,0.916617,0.916486
2000,0.190400,0.26131,0.921509,0.918294,0.917931,0.915967


[I 2023-07-03 15:15:06,179] Trial 3 finished with value: 3.7250316930630176 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newl

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.8119,0.619626,0.600362,0.673474,0.495369
400,No log,0.41255,0.837567,0.828779,0.850457,0.821045
600,0.666400,0.282843,0.897658,0.891108,0.892656,0.889826
800,0.666400,0.379457,0.873807,0.875853,0.870841,0.868766
1000,0.275900,0.299563,0.899455,0.900187,0.894671,0.895033
1200,0.275900,0.262573,0.910234,0.907003,0.906133,0.904078
1400,0.275900,0.222868,0.92597,0.920313,0.921718,0.920135
1600,0.221700,0.255978,0.923368,0.919964,0.917704,0.918036
1800,0.221700,0.241767,0.923987,0.920996,0.919046,0.918695
2000,0.188300,0.254324,0.924792,0.921481,0.920622,0.91938


[I 2023-07-03 15:41:46,366] Trial 4 finished with value: 3.728442949962969 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.762799,0.631954,0.610797,0.664168,0.503767
400,No log,0.368564,0.860302,0.849357,0.862799,0.845825
600,0.642200,0.289155,0.898588,0.890934,0.896129,0.889741
800,0.642200,0.463325,0.854975,0.855867,0.864104,0.847951
1000,0.278400,0.42482,0.872878,0.877307,0.874743,0.869345


[I 2023-07-03 15:48:18,962] Trial 5 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.909115,0.593421,0.573954,0.619689,0.470166


[I 2023-07-03 15:49:37,652] Trial 6 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.463287,0.812725,0.795276,0.811373,0.789437
400,No log,0.441193,0.850452,0.839572,0.863735,0.831883
600,0.592500,0.325288,0.899393,0.893939,0.893789,0.893016
800,0.592500,0.551261,0.823628,0.819681,0.853737,0.810328
1000,0.309200,0.258313,0.906827,0.897895,0.903244,0.898802
1200,0.309200,0.282593,0.91104,0.90511,0.905627,0.904318
1400,0.309200,0.39252,0.891215,0.886265,0.895372,0.882451


[I 2023-07-03 15:56:05,798] Trial 7 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,1.0826,0.458369,0.466106,0.377943,0.366113


[I 2023-07-03 15:57:01,746] Trial 8 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.906309,0.598563,0.581714,0.43499,0.475703


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 15:57:57,651] Trial 9 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.628372,0.700161,0.682135,0.730822,0.62849
400,No log,0.412479,0.851134,0.850169,0.858349,0.843399
600,0.586500,0.291437,0.896419,0.892769,0.892628,0.889661
800,0.586500,0.358873,0.886136,0.886796,0.884524,0.880672
1000,0.257000,0.29788,0.901561,0.901587,0.898697,0.896829
1200,0.257000,0.256079,0.918597,0.91651,0.912997,0.913521
1400,0.257000,0.21203,0.929377,0.924447,0.925061,0.923925
1600,0.208700,0.275353,0.928695,0.925179,0.923107,0.923539
1800,0.208700,0.261209,0.924545,0.922519,0.919569,0.919763
2000,0.161000,0.265897,0.928819,0.926188,0.924124,0.924029


[I 2023-07-03 16:24:38,140] Trial 10 finished with value: 3.741136595545795 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newl

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.628372,0.700161,0.682135,0.730822,0.62849
400,No log,0.412479,0.851134,0.850169,0.858349,0.843399
600,0.586500,0.291437,0.896419,0.892769,0.892628,0.889661
800,0.586500,0.358873,0.886136,0.886796,0.884524,0.880672
1000,0.257000,0.29788,0.901561,0.901587,0.898697,0.896829
1200,0.257000,0.256079,0.918597,0.91651,0.912997,0.913521
1400,0.257000,0.21203,0.929377,0.924447,0.925061,0.923925
1600,0.208700,0.275353,0.928695,0.925179,0.923107,0.923539
1800,0.208700,0.261209,0.924545,0.922519,0.919569,0.919763
2000,0.161000,0.265897,0.928819,0.926188,0.924124,0.924029


[I 2023-07-03 16:51:18,172] Trial 11 finished with value: 3.741136595545795 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newl

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.495049,0.804919,0.798151,0.814046,0.786176
400,No log,0.325774,0.886321,0.87881,0.886626,0.876078
600,0.536300,0.248774,0.910978,0.905882,0.906365,0.904405
800,0.536300,0.336135,0.895366,0.896615,0.891996,0.890978
1000,0.244700,0.285281,0.906517,0.904857,0.904625,0.900715
1200,0.244700,0.243058,0.922376,0.917983,0.918,0.916314
1400,0.244700,0.215706,0.926713,0.922766,0.922603,0.92132
1600,0.194900,0.257392,0.929563,0.926314,0.924202,0.924685
1800,0.194900,0.242142,0.928448,0.925871,0.923413,0.923681
2000,0.155600,0.240745,0.933775,0.930925,0.92869,0.929324


[I 2023-07-03 17:08:54,755] Trial 12 finished with value: 3.7260306427665495 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 2}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.495049,0.804919,0.798151,0.814046,0.786176
400,No log,0.325774,0.886321,0.87881,0.886626,0.876078
600,0.536300,0.248774,0.910978,0.905882,0.906365,0.904405
800,0.536300,0.336135,0.895366,0.896615,0.891996,0.890978
1000,0.244700,0.285281,0.906517,0.904857,0.904625,0.900715
1200,0.244700,0.243058,0.922376,0.917983,0.918,0.916314
1400,0.244700,0.215706,0.926713,0.922766,0.922603,0.92132
1600,0.194900,0.257392,0.929563,0.926314,0.924202,0.924685
1800,0.194900,0.242142,0.928448,0.925871,0.923413,0.923681
2000,0.155600,0.240745,0.933775,0.930925,0.92869,0.929324


[I 2023-07-03 17:26:32,412] Trial 13 finished with value: 3.7260306427665495 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 2}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.524372,0.761925,0.745953,0.787374,0.718256
400,No log,0.372496,0.874303,0.86911,0.876131,0.864705
600,0.563800,0.309601,0.891092,0.88816,0.8904,0.88432


[I 2023-07-03 17:30:29,239] Trial 14 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.628372,0.700161,0.682135,0.730822,0.62849
400,No log,0.412479,0.851134,0.850169,0.858349,0.843399
600,0.586500,0.291437,0.896419,0.892769,0.892628,0.889661
800,0.586500,0.358873,0.886136,0.886796,0.884524,0.880672
1000,0.257000,0.29788,0.901561,0.901587,0.898697,0.896829
1200,0.257000,0.256079,0.918597,0.91651,0.912997,0.913521
1400,0.257000,0.21203,0.929377,0.924447,0.925061,0.923925
1600,0.208700,0.275353,0.928695,0.925179,0.923107,0.923539
1800,0.208700,0.261209,0.924545,0.922519,0.919569,0.919763
2000,0.161000,0.265897,0.928819,0.926188,0.924124,0.924029


[I 2023-07-03 17:57:09,461] Trial 15 finished with value: 3.741136595545795 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newl

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.495049,0.804919,0.798151,0.814046,0.786176
400,No log,0.325774,0.886321,0.87881,0.886626,0.876078
600,0.536300,0.248774,0.910978,0.905882,0.906365,0.904405
800,0.536300,0.336135,0.895366,0.896615,0.891996,0.890978
1000,0.244700,0.285281,0.906517,0.904857,0.904625,0.900715
1200,0.244700,0.243058,0.922376,0.917983,0.918,0.916314
1400,0.244700,0.215706,0.926713,0.922766,0.922603,0.92132
1600,0.194900,0.257392,0.929563,0.926314,0.924202,0.924685
1800,0.194900,0.242142,0.928448,0.925871,0.923413,0.923681
2000,0.155600,0.240745,0.933775,0.930925,0.92869,0.929324


[I 2023-07-03 18:14:46,875] Trial 16 finished with value: 3.7260306427665495 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 2}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are new

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.524372,0.761925,0.745953,0.787374,0.718256
400,No log,0.372496,0.874303,0.86911,0.876131,0.864705
600,0.563800,0.309601,0.891092,0.88816,0.8904,0.88432


[I 2023-07-03 18:18:43,816] Trial 17 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.628372,0.700161,0.682135,0.730822,0.62849
400,No log,0.412479,0.851134,0.850169,0.858349,0.843399
600,0.586500,0.291437,0.896419,0.892769,0.892628,0.889661
800,0.586500,0.358873,0.886136,0.886796,0.884524,0.880672
1000,0.257000,0.29788,0.901561,0.901587,0.898697,0.896829
1200,0.257000,0.256079,0.918597,0.91651,0.912997,0.913521
1400,0.257000,0.21203,0.929377,0.924447,0.925061,0.923925
1600,0.208700,0.275353,0.928695,0.925179,0.923107,0.923539
1800,0.208700,0.261209,0.924545,0.922519,0.919569,0.919763
2000,0.161000,0.265897,0.928819,0.926188,0.924124,0.924029


[I 2023-07-03 18:45:24,063] Trial 18 finished with value: 3.741136595545795 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 1 with value: 3.741136595545795.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newl

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.628372,0.700161,0.682135,0.730822,0.62849
400,No log,0.412479,0.851134,0.850169,0.858349,0.843399
600,0.586500,0.291437,0.896419,0.892769,0.892628,0.889661
800,0.586500,0.358873,0.886136,0.886796,0.884524,0.880672
1000,0.257000,0.29788,0.901561,0.901587,0.898697,0.896829
1200,0.257000,0.256079,0.918597,0.91651,0.912997,0.913521
1400,0.257000,0.21203,0.929377,0.924447,0.925061,0.923925
1600,0.208700,0.275353,0.928695,0.925179,0.923107,0.923539
1800,0.208700,0.261209,0.924545,0.922519,0.919569,0.919763
2000,0.161000,0.265897,0.928819,0.926188,0.924124,0.924029


[I 2023-07-03 19:12:04,617] Trial 19 finished with value: 3.741136595545795 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 1 with value: 3.741136595545795.


In [13]:
best_run

BestRun(run_id='1', objective=3.741136595545795, hyperparameters={'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}, run_summary=None)

## Train with best hyperparameters

In [None]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=5,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)
    
trainer.train()

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.97981,0.538099,0.527977,0.65479,0.42972
400,No log,0.583995,0.728163,0.703433,0.79959,0.649084
600,0.792800,0.364707,0.868789,0.866238,0.867951,0.860992
800,0.792800,0.27283,0.902181,0.894584,0.898441,0.894338
1000,0.306500,0.259474,0.905154,0.900147,0.901022,0.898284
1200,0.306500,0.242053,0.916429,0.910043,0.912242,0.909798
1400,0.306500,0.243242,0.91804,0.913659,0.912796,0.912668
1600,0.235800,0.265941,0.914695,0.913514,0.908828,0.91069
1800,0.235800,0.312867,0.902243,0.901494,0.899914,0.896866
2000,0.218700,0.407848,0.873622,0.869789,0.883767,0.863788


In [None]:
trainer.save_model(model_store_path.parent / (model_store_path.stem + ".out"))

# Evaluate

In [20]:
import pandas as pd
from sklearn.metrics import classification_report

## Test on validation data

In [21]:
preds = trainer.predict(data["validation"])
micro_val, macro_val = generate_micro_macro_df(data["validation"], preds)

In [22]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.98      0.97      0.98      6666
        REFUTES       0.94      0.92      0.93      4888
       SUPPORTS       0.92      0.95      0.94      4588

       accuracy                           0.95     16142
      macro avg       0.95      0.95      0.95     16142
   weighted avg       0.95      0.95      0.95     16142



In [23]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.98      0.96      0.97      3238
        REFUTES       0.93      0.92      0.92      3260
       SUPPORTS       0.91      0.95      0.93      3245

       accuracy                           0.94      9743
      macro avg       0.94      0.94      0.94      9743
   weighted avg       0.94      0.94      0.94      9743



In [24]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["proba"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.98      0.99      0.99      3238
        REFUTES       0.94      0.91      0.93      3260
       SUPPORTS       0.92      0.95      0.94      3245

       accuracy                           0.95      9743
      macro avg       0.95      0.95      0.95      9743
   weighted avg       0.95      0.95      0.95      9743



## Test on test data

In [25]:
preds = trainer.predict(data["test"])
micro_val, macro_val = generate_micro_macro_df(data["test"], preds)

In [26]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.98      0.98      0.98      6666
        REFUTES       0.93      0.89      0.91      4909
       SUPPORTS       0.89      0.94      0.92      4631

       accuracy                           0.94     16206
      macro avg       0.94      0.94      0.94     16206
   weighted avg       0.94      0.94      0.94     16206



In [27]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.98      0.96      0.97      3284
        REFUTES       0.93      0.88      0.90      3280
       SUPPORTS       0.88      0.95      0.91      3290

       accuracy                           0.93      9854
      macro avg       0.93      0.93      0.93      9854
   weighted avg       0.93      0.93      0.93      9854



In [28]:
print(classification_report(y_true=macro_val["actual"], y_pred=macro_val["proba"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.98      0.99      0.99      3284
        REFUTES       0.95      0.88      0.91      3280
       SUPPORTS       0.89      0.95      0.92      3290

       accuracy                           0.94      9854
      macro avg       0.94      0.94      0.94      9854
   weighted avg       0.94      0.94      0.94      9854

