In [1]:
import sys
sys.path.insert(0, "../src/gen")
sys.path.insert(1, "../src/rte")
from pathlib import Path
from collections import Counter

import numpy as np
from scipy.special import softmax

from util import read_data, write_jsonl
from aggregate import generate_micro_macro_df

In [2]:
# constants
LOOKUP = {
    "verifiable": {"no": "NOT VERIFIABLE", "yes": "VERIFIABLE"},
    "label": {"nei": "NOT ENOUGH INFO", "r": "REFUTES", "s": "SUPPORTS"}
}

SEED = 123456789

LABEL2ID = {"SUPPORTS": 0, "NOT ENOUGH INFO": 1, "REFUTES": 2}
ID2LABEL = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

# Init

In [3]:
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    pipeline
)

# Huggingface Init

## Model

In [4]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")

In [5]:
model_checkpoint = "xlnet-base-cased"
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=3, 
        id2label=ID2LABEL, 
        label2id=LABEL2ID
    )

model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess(examples):
    return tokenizer(examples["evidence"], examples["claim"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    
    return results

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [6]:
dataset = ["fever", "climatefeverpure", "fever-climatefeverpure", "climatefever", "fever-climatefever"]
task = ["hp_tune", "output"]
doc_sent = ["doc", "sent"]

di = 4
ti = 0
ds = 1

model_store_path = Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/models").joinpath(model_checkpoint)
model_store_path.mkdir(exist_ok=True)
model_store_path = model_store_path / f"{dataset[di]}-{model_checkpoint}-{doc_sent[ds]}-{task[ti]}"

## Dataset

In [7]:
datap = Path(f"/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-{doc_sent[ds]}-evidence")

data = DatasetDict({
    "train": Dataset.from_list(read_data(datap / f"{dataset[di]}.train.n5.jsonl")),
    "validation": Dataset.from_list(read_data(datap / f"{dataset[di]}.dev.n5.jsonl")),
    "fever_test": Dataset.from_list(read_data(datap / f"{dataset[0]}.test.n5.jsonl")),
    "climatefever_test": Dataset.from_list(read_data(datap / f"{dataset[1]}.test.n5.jsonl"))
}).map(preprocess, batched=True)

Map:   0%|          | 0/228290 [00:00<?, ? examples/s]

Map:   0%|          | 0/17532 [00:00<?, ? examples/s]

Map:   0%|          | 0/16206 [00:00<?, ? examples/s]

Map:   0%|          | 0/459 [00:00<?, ? examples/s]

## Trainer

In [9]:
batch_size = 16
learning_rate = 4e-4
epoch = 4
metric_name = "f1"
warmup_ratio=0.1
save_steps=200
eval_steps=200

# Hyperparameter tuning

In [9]:
# shard the data if the dataset is large for hyperparameter tuning
shard = data["train"].num_rows > 50000
hp_tune_train = data["train"].shuffle(seed=SEED).shard(num_shards=5, index=1)

In [10]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

_ = model.train()
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_tune_train if shard else data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [11]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [1e-5, 3e-5, 2e-5]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
    }

def compute_objective(metrics):
    return metrics["f1"]

In [12]:
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=20, 
    hp_space=optuna_hp_space
)

[I 2023-07-03 13:02:16,683] A new study created in memory with name: no-name-0ca0b7ce-7d62-488a-b0b3-121379fdd2e1
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'lo

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.716896,0.687972,0.667642,0.787427,0.616972
400,No log,0.377311,0.861386,0.858889,0.856307,0.854226
600,0.552400,0.335807,0.878376,0.872998,0.878234,0.87016
800,0.552400,0.339101,0.880427,0.877289,0.878673,0.874102
1000,0.261400,0.283244,0.897885,0.892081,0.893379,0.891712
1200,0.261400,0.325783,0.889976,0.890187,0.885488,0.885494
1400,0.261400,0.295013,0.896596,0.894413,0.891339,0.891438


[I 2023-07-03 13:11:32,540] Trial 0 finished with value: 3.573787006087937 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 1}. Best is trial 0 with value: 3.573787006087937.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.991626,0.542914,0.527758,0.573669,0.431203
400,No log,0.560934,0.761146,0.745409,0.792976,0.727212
600,0.760900,0.395433,0.850021,0.839189,0.858914,0.835378
800,0.760900,0.367791,0.870233,0.863724,0.873036,0.861422
1000,0.307600,0.302653,0.894428,0.887027,0.891225,0.88734
1200,0.307600,0.318581,0.89273,0.892704,0.887653,0.88829
1400,0.307600,0.284869,0.90058,0.894967,0.898088,0.894205
1600,0.239300,0.314854,0.892905,0.893231,0.887952,0.888542
1800,0.239300,0.27952,0.904915,0.90082,0.90027,0.899611
2000,0.193400,0.342002,0.89519,0.894341,0.893667,0.890263


[I 2023-07-03 13:58:01,520] Trial 1 finished with value: 3.658646782667355 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 5}. Best is trial 1 with value: 3.658646782667355.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.808979,0.615912,0.595405,0.727615,0.504961
400,No log,0.344738,0.869998,0.864352,0.863474,0.86229
600,0.586700,0.335789,0.878845,0.871105,0.880043,0.869344
800,0.586700,0.418749,0.851016,0.846806,0.863699,0.84168
1000,0.265700,0.356141,0.8724,0.864598,0.878567,0.861954
1200,0.265700,0.305681,0.902513,0.899428,0.897116,0.897409
1400,0.265700,0.26244,0.912473,0.906475,0.908583,0.907086
1600,0.216800,0.263554,0.910657,0.906165,0.905734,0.905548
1800,0.216800,0.388181,0.877673,0.880185,0.875383,0.873752
2000,0.163600,0.368201,0.897944,0.896731,0.895872,0.893011


[I 2023-07-03 14:35:11,646] Trial 2 finished with value: 3.646126066394134 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 1 with value: 3.658646782667355.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.882749,0.722479,0.735403,0.789105,0.721355
400,No log,0.390005,0.864608,0.868687,0.861326,0.8617
600,0.463600,0.333252,0.881833,0.878264,0.881228,0.874615
800,0.463600,0.307227,0.898002,0.893006,0.894276,0.891952
1000,0.231600,0.26207,0.910833,0.905072,0.906392,0.905245
1200,0.231600,0.30515,0.897944,0.899082,0.894488,0.894169
1400,0.231600,0.260332,0.910715,0.908934,0.905414,0.906469


[I 2023-07-03 14:44:29,860] Trial 3 finished with value: 3.631532024816712 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 1}. Best is trial 1 with value: 3.658646782667355.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.945668,0.580643,0.557776,0.753864,0.458678
400,No log,0.524797,0.780011,0.764878,0.814697,0.750506
600,0.716400,0.553814,0.796825,0.784358,0.831344,0.77232
800,0.716400,0.333038,0.877732,0.866222,0.87245,0.868616
1000,0.366900,0.448578,0.852833,0.85186,0.853679,0.846213
1200,0.366900,0.395922,0.863788,0.86359,0.859414,0.858036
1400,0.366900,0.35509,0.882418,0.880305,0.875614,0.877327
1600,0.305500,0.391661,0.87656,0.877893,0.870532,0.872342
1800,0.305500,0.3038,0.903041,0.896567,0.899136,0.896867
2000,0.279200,0.372483,0.888101,0.884943,0.8835,0.882365


[I 2023-07-03 15:37:54,682] Trial 4 finished with value: 3.6497724435794066 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4}. Best is trial 1 with value: 3.658646782667355.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newl

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,1.072086,0.448943,0.457304,0.403392,0.356058


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 15:38:52,129] Trial 5 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.919595,0.606011,0.578329,0.429888,0.476465


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 15:39:49,727] Trial 6 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.813023,0.7059,0.70535,0.795207,0.686981
400,No log,0.375171,0.870584,0.870604,0.863855,0.865782
600,0.517400,0.35152,0.875212,0.869923,0.876546,0.866983


[I 2023-07-03 15:43:47,176] Trial 7 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.75496,0.634952,0.603868,0.705535,0.501305
400,No log,0.510898,0.803914,0.793435,0.829767,0.783594


[I 2023-07-03 15:45:42,439] Trial 8 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.716896,0.687972,0.667642,0.787427,0.616972
400,No log,0.377311,0.861386,0.858889,0.856307,0.854226
600,0.552400,0.335807,0.878376,0.872998,0.878234,0.87016
800,0.552400,0.339101,0.880427,0.877289,0.878673,0.874102
1000,0.261400,0.283244,0.897885,0.892081,0.893379,0.891712
1200,0.261400,0.325783,0.889976,0.890187,0.885488,0.885494
1400,0.261400,0.295013,0.896596,0.894413,0.891339,0.891438


[I 2023-07-03 15:54:57,958] Trial 9 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.793861,0.626926,0.59605,0.773903,0.492172
400,No log,0.398127,0.848614,0.841237,0.845938,0.837561


[I 2023-07-03 15:57:35,886] Trial 10 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,1.133977,0.382096,0.405636,0.393715,0.286959


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 15:58:33,713] Trial 11 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bia

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,1.107083,0.387545,0.407514,0.36773,0.295036


[I 2023-07-03 15:59:31,492] Trial 12 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.945668,0.580643,0.557776,0.753864,0.458678


[I 2023-07-03 16:00:28,758] Trial 13 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.793861,0.626926,0.59605,0.773903,0.492172
400,No log,0.398127,0.848614,0.841237,0.845938,0.837561


[I 2023-07-03 16:03:06,357] Trial 14 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.93564,0.592536,0.568188,0.428723,0.467923


[I 2023-07-03 16:04:03,812] Trial 15 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,1.107083,0.387545,0.407514,0.36773,0.295036


[I 2023-07-03 16:05:01,540] Trial 16 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.826915,0.613568,0.58512,0.716916,0.483484


[I 2023-07-03 16:06:20,907] Trial 17 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.733386,0.638877,0.610234,0.735968,0.508581
400,No log,0.352569,0.862148,0.853291,0.858876,0.851588


[I 2023-07-03 16:08:58,443] Trial 18 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stre

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,1.072086,0.448943,0.457304,0.403392,0.356058


  _warn_prf(average, modifier, msg_start, len(result))
[I 2023-07-03 16:09:56,131] Trial 19 pruned. 


In [13]:
best_run

BestRun(run_id='1', objective=3.658646782667355, hyperparameters={'learning_rate': 1e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 5}, run_summary=None)

## Train with best hyperparameters

In [10]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=5,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard"
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)
    
trainer.train()

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,1.100191,0.39271,0.409501,0.496755,0.304353
400,No log,0.980168,0.552989,0.528826,0.564965,0.433166
600,0.979300,0.70761,0.656799,0.611384,0.722983,0.520126
800,0.979300,0.504342,0.792836,0.76448,0.818734,0.754901
1000,0.542200,0.407479,0.843714,0.825957,0.847512,0.826394
1200,0.542200,0.366818,0.866302,0.859251,0.861783,0.857247
1400,0.542200,0.391505,0.861853,0.851124,0.864349,0.849856
1600,0.316000,0.324167,0.886037,0.878445,0.880949,0.878179
1800,0.316000,0.309643,0.890771,0.877427,0.890538,0.880998
2000,0.267200,0.329724,0.888832,0.876832,0.89084,0.878796


TrainOutput(global_step=35675, training_loss=0.13544854105980814, metrics={'train_runtime': 14443.645, 'train_samples_per_second': 79.028, 'train_steps_per_second': 2.47, 'total_flos': 7.670724643977622e+16, 'train_loss': 0.13544854105980814, 'epoch': 5.0})

In [11]:
trainer.save_model(model_store_path.parent / (model_store_path.stem + ".out"))

# Evaluate

In [12]:
import pandas as pd
from sklearn.metrics import classification_report

## Test on validation data

In [13]:
preds = trainer.predict(data["validation"])
micro_val, macro_val = generate_micro_macro_df(data["validation"], preds)

In [14]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.94      0.97      0.96      7604
        REFUTES       0.94      0.90      0.92      5020
       SUPPORTS       0.91      0.91      0.91      4908

       accuracy                           0.93     17532
      macro avg       0.93      0.93      0.93     17532
   weighted avg       0.93      0.93      0.93     17532



## Test on test data

### FEVER

In [15]:
preds = trainer.predict(data["fever_test"])
micro_val, macro_val = generate_micro_macro_df(data["fever_test"], preds)

In [16]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.97      0.98      0.98      6666
        REFUTES       0.94      0.89      0.91      4909
       SUPPORTS       0.90      0.94      0.92      4631

       accuracy                           0.94     16206
      macro avg       0.94      0.94      0.94     16206
   weighted avg       0.94      0.94      0.94     16206



### Climate-FEVER

In [17]:
preds = trainer.predict(data["climatefever_test"])
micro_val, macro_val = generate_micro_macro_df(data["climatefever_test"], preds)

In [18]:
print(classification_report(y_true=micro_val["actual"], y_pred=micro_val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.60      0.92      0.73       235
        REFUTES       0.77      0.35      0.49        48
       SUPPORTS       0.81      0.36      0.50       176

       accuracy                           0.65       459
      macro avg       0.73      0.55      0.57       459
   weighted avg       0.70      0.65      0.62       459

