In [1]:
import sys
sys.path.insert(0, "../../src")
from pathlib import Path
from collections import Counter

import numpy as np
from scipy.special import softmax

from gen.util import read_data, write_jsonl
from rte.aggregate import generate_micro_macro_df

In [2]:
root_data = Path("../../data").resolve()
root_model = Path("../../models").resolve()

In [3]:
# constants
LOOKUP = {
    "verifiable": {"no": "NOT VERIFIABLE", "yes": "VERIFIABLE"},
    "label": {"nei": "NOT ENOUGH INFO", "r": "REFUTES", "s": "SUPPORTS"}
}

SEED = 123456789

LABEL2ID = {"SUPPORTS": 0, "NOT ENOUGH INFO": 1, "REFUTES": 2}
ID2LABEL = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

# Init

In [4]:
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    pipeline
)

import torch
torch.backends.cuda.matmul.allow_tf32 = True

  from .autonotebook import tqdm as notebook_tqdm


# Huggingface Init

## Model

In [5]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")

In [6]:
model_checkpoint = "xlnet-base-cased"
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, 
        num_labels=3, 
        id2label=ID2LABEL, 
        label2id=LABEL2ID
    )

model = model_init()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess(examples):
    return tokenizer(examples["evidence"], examples["claim"], max_length=1024, truncation="only_first")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    
    return results

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [7]:
dataset = ["fever", "climatefeverpure", "fever-climatefeverpure"]
doc_sent = ["doc", "sent"]

di = 2
ds = 0

model_store_path = root_model.joinpath("sentence-models" if ds == 1 else "document-models")
model_store_path.mkdir(exist_ok=True)
model_store_path = model_store_path / f"{dataset[di]}-{model_checkpoint}-{doc_sent[ds]}"

## Dataset

In [8]:
datap = root_data / f"{doc_sent[ds]}-dataset"

data = DatasetDict({
    "train": Dataset.from_list(read_data(datap / f"{dataset[di]}.train.n5.jsonl")),
    "validation": Dataset.from_list(read_data(datap / f"{dataset[di]}.dev.n5.jsonl")),
    "fever_test": Dataset.from_list(read_data(datap / f"{dataset[0]}.test.n5.jsonl")),
    "climatefever_test": Dataset.from_list(read_data(datap / f"{dataset[1]}.test.n5.jsonl"))
}).map(preprocess, batched=True)

                                                                     

## Trainer

In [8]:
# effective batch size of 32
per_device_train_batch_size = 4
gradient_accumulation_steps = 8
per_device_eval_batch_size = 32


learning_rate = 4e-4
epoch = 4
metric_name = "f1"
warmup_ratio=0.1
save_steps=200
eval_steps=200

# Hyperparameter tuning

In [9]:
# shard the data if the dataset is large for hyperparameter tuning
shard = data["train"].num_rows > 50000
hp_tune_train = data["train"].shuffle(seed=SEED).shard(num_shards=5, index=1)

In [10]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=eval_steps,
    save_strategy = "no",
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard",
    tf32=True
)

_ = model.train()
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_tune_train if shard else data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [11]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_categorical("learning_rate", [1e-5, 3e-5, 2e-5]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
    }

def compute_objective(metrics):
    return metrics["f1"]

In [12]:
best_run = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=10, 
    hp_space=optuna_hp_space
)

[I 2023-07-04 13:33:37,304] A new study created in memory with name: no-name-dcbe787d-f66a-4bd2-af76-b8a98d4d3867
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.bias', 'logi

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.378833,0.837307,0.836011,0.855487,0.832568
400,No log,0.399713,0.863482,0.862281,0.88513,0.860991
600,0.452900,0.275815,0.909507,0.909022,0.912236,0.908609
800,0.452900,0.235704,0.916318,0.915778,0.92189,0.916091
1000,0.219200,0.277892,0.921962,0.921714,0.922702,0.921687
1200,0.219200,0.260226,0.919529,0.918989,0.924739,0.91892
1400,0.219200,0.232552,0.926535,0.926139,0.930207,0.926628
1600,0.174500,0.258453,0.924978,0.924559,0.928125,0.9247
1800,0.174500,0.230073,0.927897,0.927676,0.929492,0.928107
2000,0.144800,0.285133,0.928773,0.928547,0.930368,0.928993


[I 2023-07-04 14:20:15,654] Trial 0 finished with value: 3.7242138598056367 and parameters: {'learning_rate': 3e-05, 'num_train_epochs': 5}. Best is trial 0 with value: 3.7242138598056367.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.461152,0.791379,0.789335,0.832482,0.775212
400,No log,0.341258,0.891603,0.89108,0.895289,0.890827
600,0.441000,0.289134,0.897344,0.896626,0.90587,0.896682
800,0.441000,0.286995,0.897538,0.896639,0.91193,0.896698
1000,0.216500,0.279071,0.920016,0.919603,0.923609,0.920059
1200,0.216500,0.222393,0.928384,0.928012,0.930923,0.928179
1400,0.216500,0.232925,0.920989,0.920545,0.925446,0.921088
1600,0.168700,0.245319,0.924297,0.923877,0.928008,0.924271
1800,0.168700,0.217089,0.932471,0.932189,0.933861,0.932297
2000,0.136200,0.293239,0.926146,0.925934,0.927853,0.926438


[I 2023-07-04 14:57:40,601] Trial 1 finished with value: 3.7221037008785203 and parameters: {'learning_rate': 3e-05, 'num_train_epochs': 4}. Best is trial 0 with value: 3.7242138598056367.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.366577,0.849372,0.8482,0.86452,0.845575
400,No log,0.298726,0.903376,0.902899,0.907475,0.903262
600,0.411500,0.242055,0.920405,0.920086,0.921525,0.919872
800,0.411500,0.230889,0.92634,0.925953,0.929219,0.926231
1000,0.197900,0.261186,0.925075,0.924794,0.926284,0.924887
1200,0.197900,0.231314,0.927703,0.927321,0.930795,0.92772
1400,0.197900,0.235613,0.924881,0.924411,0.929643,0.924939
1600,0.153100,0.229402,0.929357,0.929089,0.931431,0.92956
1800,0.153100,0.228252,0.928092,0.927719,0.931376,0.928217


[I 2023-07-04 15:16:22,343] Trial 2 finished with value: 3.7154045118570203 and parameters: {'learning_rate': 2e-05, 'num_train_epochs': 2}. Best is trial 0 with value: 3.7242138598056367.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.287858,0.891895,0.892042,0.893595,0.892326
400,No log,0.263472,0.908144,0.907696,0.911921,0.908032
600,0.378800,0.246835,0.912718,0.912102,0.919548,0.912368
800,0.378800,0.224521,0.920794,0.920321,0.925199,0.920693


[I 2023-07-04 15:25:23,697] Trial 3 finished with value: 3.6870075852707855 and parameters: {'learning_rate': 2e-05, 'num_train_epochs': 1}. Best is trial 0 with value: 3.7242138598056367.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.364059,0.843145,0.842029,0.856091,0.839488
400,No log,0.276462,0.899095,0.898529,0.903813,0.898503
600,0.428800,0.244128,0.910188,0.909583,0.916224,0.909672
800,0.428800,0.241427,0.915734,0.915275,0.919557,0.915553


[I 2023-07-04 15:34:24,406] Trial 4 finished with value: 3.666118683211651 and parameters: {'learning_rate': 1e-05, 'num_train_epochs': 1}. Best is trial 0 with value: 3.7242138598056367.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.s

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.303253,0.885278,0.884799,0.886488,0.884151
400,No log,0.272751,0.910577,0.910428,0.910577,0.910338
600,0.372800,0.228079,0.916902,0.916491,0.919466,0.916446
800,0.372800,0.219647,0.925659,0.925345,0.928093,0.925784


[I 2023-07-04 15:43:25,938] Trial 5 finished with value: 3.7048807723537664 and parameters: {'learning_rate': 3e-05, 'num_train_epochs': 1}. Best is trial 0 with value: 3.7242138598056367.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.366577,0.849372,0.8482,0.86452,0.845575
400,No log,0.298726,0.903376,0.902899,0.907475,0.903262
600,0.411500,0.242055,0.920405,0.920086,0.921525,0.919872
800,0.411500,0.230889,0.92634,0.925953,0.929219,0.926231
1000,0.197900,0.261186,0.925075,0.924794,0.926284,0.924887
1200,0.197900,0.231314,0.927703,0.927321,0.930795,0.92772
1400,0.197900,0.235613,0.924881,0.924411,0.929643,0.924939
1600,0.153100,0.229402,0.929357,0.929089,0.931431,0.92956
1800,0.153100,0.228252,0.928092,0.927719,0.931376,0.928217


[I 2023-07-04 16:02:08,697] Trial 6 finished with value: 3.7154045118570203 and parameters: {'learning_rate': 2e-05, 'num_train_epochs': 2}. Best is trial 0 with value: 3.7242138598056367.
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.622866,0.652525,0.648553,0.794777,0.54524


[I 2023-07-04 16:04:12,811] Trial 7 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.654347,0.647271,0.643296,0.820709,0.539004


[I 2023-07-04 16:06:16,679] Trial 8 pruned. 
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-strea

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
200,No log,0.468949,0.780383,0.778189,0.830208,0.760902


[I 2023-07-04 16:08:20,431] Trial 9 pruned. 


In [13]:
best_run

BestRun(run_id='0', objective=3.7242138598056367, hyperparameters={'learning_rate': 3e-05, 'num_train_epochs': 5}, run_summary=None)

## Train with best hyperparameters

In [9]:
training_args = TrainingArguments(
    model_store_path,
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=5,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    seed=SEED,
    data_seed=SEED,
    warmup_ratio=warmup_ratio,
    report_to="tensorboard",
    tf32=True
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)
    
trainer.train()

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
1000,0.2277,0.224416,0.930233,0.929964,0.931968,0.930336
2000,0.1827,0.205268,0.935876,0.935546,0.937728,0.935671
3000,0.1756,0.158618,0.942006,0.941941,0.942293,0.942076
4000,0.1658,0.189042,0.945217,0.945116,0.94588,0.945376
5000,0.1324,0.192125,0.946774,0.946525,0.948194,0.946793
6000,0.13,0.189683,0.945996,0.945689,0.947967,0.945945
7000,0.1238,0.205602,0.948623,0.948414,0.949432,0.948554
8000,0.1205,0.178512,0.950569,0.950399,0.951255,0.95059
9000,0.1167,0.175734,0.95125,0.950976,0.952675,0.951161
10000,0.0946,0.202459,0.948331,0.948073,0.949947,0.94841


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [10]:
trainer.save_model(model_store_path.parent / (model_store_path.stem + ".out"))

# Evaluate

In [11]:
import pandas as pd
from sklearn.metrics import classification_report

## Test on validation data

In [12]:
preds = trainer.predict(data["validation"])
val = generate_doc_df(data["validation"], preds)

In [13]:
print(classification_report(y_true=val["actual"], y_pred=val["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       1.00      0.99      0.99      3428
        REFUTES       0.96      0.91      0.93      3384
       SUPPORTS       0.91      0.96      0.94      3465

       accuracy                           0.96     10277
      macro avg       0.96      0.96      0.96     10277
   weighted avg       0.96      0.96      0.96     10277



## Test on test data

### FEVER

In [14]:
preds = trainer.predict(data["fever_test"])
ftes = generate_doc_df(data["fever_test"], preds)

In [15]:
print(classification_report(y_true=ftes["actual"], y_pred=ftes["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       1.00      1.00      1.00      3333
        REFUTES       0.96      0.89      0.92      3333
       SUPPORTS       0.89      0.97      0.93      3333

       accuracy                           0.95      9999
      macro avg       0.95      0.95      0.95      9999
   weighted avg       0.95      0.95      0.95      9999



### Climate-FEVER

In [16]:
preds = trainer.predict(data["climatefever_test"])
cftes = generate_doc_df(data["climatefever_test"], preds)

In [18]:
print(classification_report(y_true=cftes["actual"], y_pred=cftes["predicted"]))

                 precision    recall  f1-score   support

NOT ENOUGH INFO       0.93      0.87      0.90        47
        REFUTES       0.77      0.40      0.53        25
       SUPPORTS       0.74      0.91      0.81        65

       accuracy                           0.80       137
      macro avg       0.81      0.73      0.75       137
   weighted avg       0.81      0.80      0.79       137

