In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset, Dataset
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

from utils import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [2]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=250)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1.mean(),
        'precision': precision.mean(),
        'recall': recall.mean()
    }

def acc_at_k(y_true, y_pred, k=2):
    y_true = torch.tensor(y_true) if type(y_true) != torch.Tensor else y_true
    y_pred = torch.tensor(y_pred) if type(y_pred) != torch.Tensor else y_pred
    total = len(y_true)
    y_weights, y_idx = torch.topk(y_true, k=k, dim=-1)
    out_weights, out_idx = torch.topk(y_pred, k=k, dim=-1)
    correct = torch.sum(torch.eq(y_idx, out_idx) * y_weights)
    acc = correct / total
    return acc.item()

def CEwST_loss(logits, target, reduction='mean'):
    """
    Cross Entropy with Soft Target (CEwST) Loss
    :param logits: (batch, *)
    :param target: (batch, *) same shape as logits, each item must be a valid distribution: target[i, :].sum() == 1.
    """
    logprobs = torch.nn.functional.log_softmax(logits.view(logits.shape[0], -1), dim=1)
    batchloss = - torch.sum(target.view(target.shape[0], -1) * logprobs, dim=1)
    if reduction == 'none':
        return batchloss
    elif reduction == 'mean':
        return torch.mean(batchloss)
    elif reduction == 'sum':
        return torch.sum(batchloss)
    else:
        raise NotImplementedError('Unsupported reduction mode.')

def compute_metrics_w_soft_target(pred):
    labels = pred.label_ids
    preds = pred.predictions
    acc = acc_at_k(labels, preds, k=2)
    return {
        'accuracy': acc,
    }

class Trainer_w_soft_target(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        loss = CEwST_loss(logits, labels)
        if return_outputs:
            return loss, outputs
        return loss
    
class DefaultCollator:
    def __init__(self):
        pass
    def __call__(self, batch):
        return torch.utils.data.dataloader.default_collate(batch)

In [3]:
MODEL_NAMES = ['bert-base-uncased'] #, 'xlnet-base-cased']

In [4]:
use_pretrain = False
soft_target = False

for t in ['ORIG', 'INV', 'SIB', 'INVSIB', 'TextMix', 'SentMix', 'WordMix']: 
    for MODEL_NAME in MODEL_NAMES:
        
        eval_only = False
        
        checkpoint = 'pretrained/' + MODEL_NAME + "-sst2-" + t 
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        
        if t == 'ORIG':
            train_dataset = load_dataset('glue', 'sst2', split='train[:90%]')
            train_dataset.rename_column_('sentence', 'text')
        else:
            # load custom data    
            text = npy_load("./assets/SST2/" + t + "/text.npy")
            label = npy_load("./assets/SST2/" + t + "/label.npy")
            if len(label.shape) > 1:
                df = pd.DataFrame({'text': text, 'label': label.tolist()})
                df.text = df.text.astype(str)
                df.label = df.label.map(lambda y: np.array(y))
            else:
                df = pd.DataFrame({'text': text, 'label': label})
                df.text = df.text.astype(str)
                df.label = df.label.astype(int)
            train_dataset = Dataset.from_pandas(df)  
            
        if use_pretrain and os.path.exists(checkpoint):
            print('loading {}...'.format(checkpoint))
            MODEL_NAME = checkpoint
            eval_only = True
            
        # split to get train
        dataset_dict = train_dataset.train_test_split(
            test_size = 0.1,
            train_size = 0.9,
            shuffle = True
        )
        train_dataset = dataset_dict['train']
        eval_dataset = dataset_dict['test']
        test_dataset = load_dataset('glue', 'sst2', split='train[-10%:]')
        test_dataset.rename_column_('sentence', 'text')
        
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
            
        train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
        eval_dataset = eval_dataset.map(tokenize, batched=True, batch_size=len(eval_dataset))
        test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
        train_dataset.rename_column_('label', 'labels')
        eval_dataset.rename_column_('label', 'labels')
        test_dataset.rename_column_('label', 'labels')
        train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        
        if len(np.array(train_dataset['labels']).shape) > 1:
            soft_target = True
        
        train_batch_size = 8
        eval_batch_size = 32
        num_epoch = 10
        max_steps = int((len(train_dataset) * num_epoch) / train_batch_size)

        training_args = TrainingArguments(
            output_dir=checkpoint,
            overwrite_output_dir=True,
            max_steps=max_steps,
            save_steps=int(max_steps / 10),
            save_total_limit=1,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            warmup_steps=int(max_steps / 10),
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=int(max_steps / 10),
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
            evaluation_strategy="steps"
        )

        if soft_target:
            trainer = Trainer_w_soft_target(
                model=model,
                args=training_args,
                compute_metrics=compute_metrics_w_soft_target,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                data_collator=DefaultCollator(),
                callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
            )
        else: 
            trainer = Trainer(
                model=model,
                args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
            )

        if not eval_only:
            trainer.train()
        
        trainer.compute_metrics = compute_metrics
            
        # test ORIG
        trainer.eval_dataset = test_dataset
        out = trainer.evaluate()
        print('ORIG for {}\n{}'.format(checkpoint, out))

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Loading cached split indices for dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-ff9f2b7ae46b32a7.arrow and C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-982522f400bc97b3.arrow
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.trans

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
6819,0.3338,0.280366,0.925909,0.924538,0.926292,0.923139,23.1351,291.117
13638,0.2547,0.265556,0.929176,0.928578,0.92706,0.931793,23.0039,292.776
20457,0.2343,0.312254,0.924128,0.922785,0.92409,0.921697,23.1299,291.181
27276,0.247,0.295555,0.921604,0.920568,0.919923,0.92131,22.9837,293.033
34095,0.2039,0.310876,0.928137,0.92735,0.926083,0.929184,22.9988,292.841
40914,0.1689,0.320987,0.932294,0.931258,0.931419,0.9311,22.9693,293.218
47733,0.1401,0.325624,0.937639,0.936767,0.93638,0.937183,22.9616,293.316
54552,0.1124,0.290879,0.94046,0.9396,0.939398,0.939809,22.9408,293.581
61371,0.0796,0.322999,0.941054,0.940167,0.940216,0.940119,22.8959,294.158
68190,0.0567,0.339193,0.939866,0.938916,0.939307,0.938548,22.9624,293.305


ORIG for pretrained/bert-base-uncased-sst2-ORIG
{'eval_loss': 0.265555739402771, 'eval_accuracy': 0.9291759465478842, 'eval_f1': 0.9285784776156227, 'eval_precision': 0.9270600601929716, 'eval_recall': 0.9317925400673612, 'eval_runtime': 23.0604, 'eval_samples_per_second': 292.059, 'epoch': 10.0}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificati

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-ffd37d6b7b587b4d.arrow





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
6819,0.5022,0.27815,0.900371,0.898552,0.900075,0.897329,23.1272,291.216
13638,0.4602,0.290945,0.896511,0.89589,0.894935,0.900394,23.045,292.254
20457,0.4566,0.349102,0.887899,0.88515,0.890302,0.882112,23.0443,292.263
27276,0.4981,0.376165,0.874536,0.873378,0.872114,0.875825,22.949,293.477
34095,0.4585,0.364585,0.879436,0.877239,0.878665,0.8761,23.1437,291.008
40914,0.437,0.366629,0.878099,0.874845,0.881332,0.871361,22.8231,295.095
47733,0.426,0.375363,0.875724,0.874908,0.873889,0.878858,22.8887,294.25
54552,0.431,0.381279,0.871269,0.870089,0.868837,0.872546,22.9471,293.502
61371,0.4245,0.357075,0.882851,0.881324,0.88069,0.88207,22.9682,293.232
68190,0.3888,0.346167,0.891759,0.88994,0.890731,0.889247,22.9188,293.864


ORIG for pretrained/bert-base-uncased-sst2-INV
{'eval_loss': 0.27815037965774536, 'eval_accuracy': 0.9003711952487008, 'eval_f1': 0.8985515646630078, 'eval_precision': 0.900074699135242, 'eval_recall': 0.8973294004645409, 'eval_runtime': 22.9151, 'eval_samples_per_second': 293.911, 'epoch': 10.0}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificati

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-ffd37d6b7b587b4d.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
6819,0.665,0.897967,0.747736,23.1018,291.536
13638,0.6406,0.91328,0.726355,22.7993,295.404
20457,0.6208,1.016656,0.877803,22.8992,294.115
27276,0.6072,0.826896,1.041425,22.8204,295.131
34095,0.5933,0.927685,0.930067,22.9406,293.584
40914,0.5812,0.81601,0.991834,22.8705,294.485
47733,0.5715,0.938874,1.007275,22.6313,297.597
54552,0.5634,0.923562,0.947587,22.6956,296.753
61371,0.5564,0.940771,1.039941,22.7657,295.84
68190,0.5516,0.974348,1.055085,22.7994,295.403


ORIG for pretrained/bert-base-uncased-sst2-SIB
{'eval_loss': 0.8160098791122437, 'eval_accuracy': 0.852264291017075, 'eval_f1': 0.8516425534228604, 'eval_precision': 0.8517924386866914, 'eval_recall': 0.8570226714247255, 'eval_runtime': 22.7143, 'eval_samples_per_second': 296.509, 'epoch': 10.0}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificati

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-ffd37d6b7b587b4d.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
6819,0.6284,1.089573,0.787825,22.7591,295.926
13638,0.5932,1.130238,0.848701,22.7678,295.812
20457,0.5628,1.287471,0.947884,22.7431,296.134
27276,0.5386,1.445761,0.80386,22.7608,295.903
34095,0.5177,1.547184,0.866221,22.8072,295.302
40914,0.4982,1.478244,0.820787,22.7802,295.652
47733,0.4841,1.247416,0.835635,22.7884,295.545
54552,0.473,1.349495,0.902747,22.8471,294.785
61371,0.4639,1.575682,0.866221,22.7961,295.446
68190,0.4571,1.580923,0.868894,22.7787,295.671


ORIG for pretrained/bert-base-uncased-sst2-INVSIB
{'eval_loss': 1.0895730257034302, 'eval_accuracy': 0.8874536005939124, 'eval_f1': 0.8842350419155984, 'eval_precision': 0.8922425741577718, 'eval_recall': 0.8801789176657253, 'eval_runtime': 22.7117, 'eval_samples_per_second': 296.543, 'epoch': 10.0}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificati

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-ffd37d6b7b587b4d.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
6819,0.4624,2.311278,0.84781,22.8614,294.601
13638,0.391,2.474469,0.842168,22.9271,293.758
20457,0.3575,2.936913,0.861173,22.9372,293.628
27276,0.3356,3.454438,0.832665,22.98,293.08
34095,0.3216,3.140266,0.884633,22.8551,294.683
40914,0.309,3.661655,0.833556,22.694,296.774
47733,0.3008,3.586969,0.876912,22.8231,295.096
54552,0.2932,4.252845,0.868597,22.6929,296.788
61371,0.2891,4.408055,0.866815,22.8757,294.418
68190,0.2865,4.542927,0.863252,22.7519,296.019


ORIG for pretrained/bert-base-uncased-sst2-TextMix
{'eval_loss': 2.3112778663635254, 'eval_accuracy': 0.9233853006681515, 'eval_f1': 0.9219252954440313, 'eval_precision': 0.9239744885984076, 'eval_recall': 0.9203398959508766, 'eval_runtime': 22.6331, 'eval_samples_per_second': 297.573, 'epoch': 10.0}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificati

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-ffd37d6b7b587b4d.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
13638,0.3301,2.025437,0.849295,22.7414,296.156
27276,0.2799,2.753082,0.903044,22.6706,297.081
40914,0.2561,3.098736,0.862955,22.7661,295.835
54552,0.2415,2.904123,0.888196,22.7524,296.012
68190,0.2313,3.762186,0.888493,22.9365,293.637
81828,0.2234,4.025983,0.87216,22.8379,294.904
95466,0.2176,4.380283,0.865033,22.5244,299.009
109104,0.2135,4.180207,0.859094,22.7366,296.219
122742,0.2099,4.634286,0.874239,24.3453,276.644
136380,0.2079,4.929869,0.858203,23.2061,290.226


ORIG for pretrained/bert-base-uncased-sst2-SentMix
{'eval_loss': 2.0254368782043457, 'eval_accuracy': 0.9273942093541203, 'eval_f1': 0.9260258642355056, 'eval_precision': 0.9279736070381231, 'eval_recall': 0.9245000798340026, 'eval_runtime': 22.9978, 'eval_samples_per_second': 292.854, 'epoch': 10.0}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificati

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-ffd37d6b7b587b4d.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
20457,0.271,1.999339,0.943133,23.1318,291.158
40914,0.2248,2.216923,0.892056,23.2942,289.128
61371,0.2029,2.871724,0.891759,23.1795,290.559
81828,0.1892,2.990594,0.889384,23.212,290.152
102285,0.1799,3.62501,0.865924,23.2858,289.232
122742,0.1728,3.562417,0.867706,23.1807,290.543
143199,0.1678,4.116907,0.867706,23.1158,291.359
163656,0.164,4.217583,0.890869,23.1498,290.931
184113,0.1615,4.233397,0.868894,23.1633,290.761
204570,0.16,4.605904,0.870379,23.2207,290.044


ORIG for pretrained/bert-base-uncased-sst2-WordMix
{'eval_loss': 1.9993385076522827, 'eval_accuracy': 0.9244246473645137, 'eval_f1': 0.9238200475664288, 'eval_precision': 0.9223412652352605, 'eval_recall': 0.9272627659224085, 'eval_runtime': 23.0944, 'eval_samples_per_second': 291.629, 'epoch': 10.0}
