In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset, concatenate_datasets, Dataset
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

from utils import *

# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Uncomment the next line to force CPU execution:
# device = torch.device("cpu")

In [2]:
def one_hot_encode(y, nb_classes=2):
    """One-hot encode integer class labels.

    :param y: a scalar label, a (nested) list of labels, or an ndarray of labels.
        Values must be integers in ``[0, nb_classes)``.
    :param nb_classes: number of classes, i.e. the length of each one-hot vector.
    :return: float ndarray of shape ``np.shape(y) + (nb_classes,)``; a scalar
        label therefore yields a 1-D vector of length ``nb_classes``.
    """
    labels = np.asarray(y)
    # Row i of the identity matrix is the one-hot vector for class i.
    flat = np.eye(nb_classes)[labels.reshape(-1)]
    # Restore the input's own shape. The previous version always stripped the
    # leading axis, which silently dropped every row but the first whenever the
    # caller passed an ndarray instead of a scalar or list.
    return flat.reshape(labels.shape + (nb_classes,))

def tokenize(batch):
    """Tokenize the ``'text'`` field of a batch with the module-level ``tokenizer``.

    Sequences are truncated to at most 250 tokens and padded (``padding=True``).

    :param batch: mapping with a ``'text'`` entry holding the raw strings.
    :return: whatever the tokenizer call returns for those strings.
    """
    texts = batch['text']
    return tokenizer(texts, max_length=250, truncation=True, padding=True)

def compute_metrics(pred):
    """Score hard-label predictions.

    :param pred: object exposing ``label_ids`` (true class ids) and
        ``predictions`` (per-class scores; the arg-max over the last axis is
        taken as the predicted class).
    :return: dict with ``accuracy`` plus ``f1``, ``precision`` and ``recall``,
        each the unweighted mean of the per-class scores.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # average=None yields one score per class; they are averaged below.
    per_class_precision, per_class_recall, per_class_f1, _support = (
        precision_recall_fscore_support(y_true, y_hat, average=None)
    )
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': per_class_f1.mean(),
        'precision': per_class_precision.mean(),
        'recall': per_class_recall.mean(),
    }

def acc_at_k(y_true, y_pred, k=2):
    """Weighted top-k agreement between target and predicted score vectors.

    For every row, the top-``k`` entries of ``y_true`` and ``y_pred`` are taken
    along the last axis. Wherever the two rankings name the same index at the
    same rank, the corresponding ``y_true`` value is credited. The credited
    values are summed and divided by the number of rows.

    :param y_true: target scores (tensor or anything ``torch.tensor`` accepts).
    :param y_pred: predicted scores, same layout as ``y_true``.
    :param k: how many top entries per row to compare.
    :return: the score as a Python float.
    """
    if type(y_pred) is not torch.Tensor:
        y_pred = torch.tensor(y_pred)
    if type(y_true) is not torch.Tensor:
        y_true = torch.tensor(y_true)

    n_rows = len(y_true)
    true_weights, true_rank = torch.topk(y_true, k=k, dim=-1)
    _pred_weights, pred_rank = torch.topk(y_pred, k=k, dim=-1)

    credited = torch.eq(true_rank, pred_rank) * true_weights
    score = (torch.sum(credited) / n_rows).item()

    # A value above 1 means the targets were not valid distributions; dump the
    # inputs to help diagnose it.
    if score > 1:
        print(y_true.shape, y_true)
        print(y_pred.shape, y_pred)
    return score

def CEwST_loss(logits, target, reduction='mean'):
    """
    Cross Entropy with Soft Target (CEwST) Loss.

    Both inputs are flattened to (batch, -1); a log-softmax is taken over the
    flattened logits and the per-sample loss is ``-sum(target * log_softmax)``.

    :param logits: (batch, *) raw scores.
    :param target: (batch, *) same shape as logits, each item must be a valid
        distribution: target[i, :].sum() == 1.
    :param reduction: 'none' (per-sample losses), 'mean' or 'sum'.
    :raises NotImplementedError: for any other ``reduction`` value.
    """
    flat_logits = logits.view(logits.shape[0], -1)
    flat_target = target.view(target.shape[0], -1)

    log_probs = torch.nn.functional.log_softmax(flat_logits, dim=1)
    per_sample = -(flat_target * log_probs).sum(dim=1)

    if reduction == 'none':
        return per_sample
    if reduction == 'mean':
        return per_sample.mean()
    if reduction == 'sum':
        return per_sample.sum()
    raise NotImplementedError('Unsupported reduction mode.')

def compute_metrics_w_soft_target(pred):
    """Score predictions against soft (distribution-valued) labels.

    :param pred: object exposing ``label_ids`` and ``predictions``; both are
        forwarded unchanged to ``acc_at_k`` with ``k=2``.
    :return: dict with a single ``accuracy`` entry.
    """
    top2_score = acc_at_k(pred.label_ids, pred.predictions, k=2)
    return {'accuracy': top2_score}

class Trainer_w_soft_target(Trainer):
    """Trainer whose loss is cross entropy against soft target distributions."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the soft-target cross-entropy loss for one batch.

        :param model: the model being trained or evaluated.
        :param inputs: batch dict; its ``"labels"`` entry is removed before the
            forward pass so the model does not compute its own loss.
        :param return_outputs: when True, return ``(loss, outputs)``.
        :param num_items_in_batch: accepted and ignored; newer Trainer versions
            pass this keyword, older ones do not.
        :return: the scalar loss, or ``(loss, outputs)`` if requested.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # With labels removed from the inputs, the first output is the logits.
        logits = outputs[0]
        # This trainer is also used to evaluate on hard-labelled data. Class
        # indices have one dimension fewer than the logits; fed to CEwST_loss
        # as-is they would be broadcast against the log-probabilities and give
        # a meaningless loss, so expand them into one-hot distributions first.
        if labels.dim() == logits.dim() - 1:
            labels = torch.nn.functional.one_hot(
                labels.long(), num_classes=logits.shape[-1]
            ).to(logits.dtype)
        loss = CEwST_loss(logits, labels)
        if return_outputs:
            return loss, outputs
        return loss
    
class DefaultCollator:
    """Stateless callable that batches samples with PyTorch's ``default_collate``."""

    def __call__(self, batch):
        """Collate a list of samples into a single batch.

        :param batch: list of samples as yielded by the dataset.
        :return: the result of ``default_collate`` on that list.
        """
        collate = torch.utils.data.dataloader.default_collate
        return collate(batch)

In [3]:
# Supported base models: ['bert-base-uncased', 'roberta-base', 'xlnet-base-cased']
# Supported training-set variants: ['ORIG', 'INV', 'SIB', 'INVSIB', 'TextMix', 'SentMix', 'WordMix']

In [4]:
# Base model identifiers; the training loop below fine-tunes each one in turn.
MODEL_NAMES = ['bert-base-uncased', 'roberta-base', 'xlnet-base-cased']

In [None]:
# Fine-tune every base model on SST-2 combined with one augmentation variant,
# then score it on a held-out slice of the original SST-2 training split.

# When True, an existing checkpoint directory is loaded and training is skipped.
# NOTE(review): nothing in this cell calls trainer.save_model(checkpoint), so the
# directory may only hold step sub-directories; confirm from_pretrained(checkpoint)
# can actually load it before enabling this flag.
use_pretrain = False

# Loop-invariant settings.
SEED = 1
train_batch_size = 8
eval_batch_size = 32
num_epoch = 10
gradient_accumulation_steps = 1
model_input_columns = ['input_ids', 'attention_mask', 'labels']

results = []
for MODEL_NAME in MODEL_NAMES:
    for t in ['ORIG', 'INV', 'SIB', 'INVSIB', 'TextMix', 'SentMix', 'WordMix']:

        soft_target = False
        eval_only = False

        checkpoint = 'pretrained/' + MODEL_NAME + "-sst2-ORIG+" + t
        # Where the model weights come from. Kept separate from MODEL_NAME so
        # that switching to a saved checkpoint cannot leak into the checkpoint
        # name or tokenizer of the following iterations.
        model_source = MODEL_NAME
        # `tokenize` reads this module-level name.
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        if t == 'ORIG':
            train_dataset = load_dataset('glue', 'sst2', split='train[:90%]')
            train_dataset = train_dataset.rename_column('sentence', 'text')
        else:

            # load custom data
            text = npy_load("./assets/SST2/" + t + "/text.npy")
            label = npy_load("./assets/SST2/" + t + "/label.npy")
            if len(label.shape) > 1:
                # Multi-dimensional labels: keep one array per row.
                df = pd.DataFrame({'text': text, 'label': label.tolist()})
                df.text = df.text.astype(str)
                df.label = df.label.map(lambda y: np.array(y))
            else:
                df = pd.DataFrame({'text': text, 'label': label})
                df.text = df.text.astype(str)
                df.label = df.label.astype(object)
            train_dataset = Dataset.from_pandas(df)

            # load orig data
            orig_dataset = load_dataset('glue', 'sst2', split='train[:90%]')
            orig_dataset = orig_dataset.remove_columns(['idx'])
            orig_dataset = orig_dataset.rename_column('sentence', 'text')
            df = orig_dataset.to_pandas()
            df = df[df.columns[::-1]]
            df.text = df.text.astype(str)
            if len(label.shape) > 1:
                # Match the custom data's label layout so the two sets can be merged.
                df.label = df.label.map(one_hot_encode)
            else:
                df.label = df.label.astype(object)
            orig_dataset = Dataset.from_pandas(df)

            # merge orig + custom data
            train_dataset = concatenate_datasets([orig_dataset, train_dataset])
            # Dataset.shuffle returns a new dataset; the result must be kept.
            train_dataset = train_dataset.shuffle(seed=SEED)

        if use_pretrain and os.path.exists(checkpoint):
            print('loading {}...'.format(checkpoint))
            model_source = checkpoint
            eval_only = True

        # split to get train / validation; seeded so reruns see the same split
        dataset_dict = train_dataset.train_test_split(
            test_size=0.05,
            train_size=0.95,
            shuffle=True,
            seed=SEED
        )
        train_dataset = dataset_dict['train']
        eval_dataset = dataset_dict['test']
        # The last 10% of the original training split serves as the test set.
        test_dataset = load_dataset('glue', 'sst2', split='train[-10%:]')
        test_dataset = test_dataset.rename_column('sentence', 'text')

        model = AutoModelForSequenceClassification.from_pretrained(model_source).to(device)

        # Each dataset is tokenized as a single batch so every row in it is
        # padded to the same length.
        train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
        eval_dataset = eval_dataset.map(tokenize, batched=True, batch_size=len(eval_dataset))
        test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
        train_dataset = train_dataset.rename_column('label', 'labels')
        eval_dataset = eval_dataset.rename_column('label', 'labels')
        test_dataset = test_dataset.rename_column('label', 'labels')
        train_dataset.set_format('torch', columns=model_input_columns)
        eval_dataset.set_format('torch', columns=model_input_columns)
        test_dataset.set_format('torch', columns=model_input_columns)

        # Labels with more than one dimension are per-class distributions.
        if len(np.array(train_dataset['labels']).shape) > 1:
            soft_target = True

        max_steps = int((len(train_dataset) * num_epoch / gradient_accumulation_steps) / train_batch_size)

        training_args = TrainingArguments(
            seed=SEED,
            # adafactor=True,
            output_dir=checkpoint,
            overwrite_output_dir=True,
            max_steps=max_steps,
            # Guard against a zero save interval on very small datasets.
            save_steps=max(1, int(max_steps / 10)),
            save_total_limit=1,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            # gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=int(max_steps / 10),
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=2000,
            logging_first_step=True,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            evaluation_strategy="steps",
            # run_name=checkpoint
        )

        # Arguments shared by both trainer flavours.
        trainer_kwargs = dict(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
        )
        if soft_target:
            trainer = Trainer_w_soft_target(
                compute_metrics=compute_metrics_w_soft_target,
                data_collator=DefaultCollator(),
                **trainer_kwargs
            )
        else:
            trainer = Trainer(
                compute_metrics=compute_metrics,
                **trainer_kwargs
            )

        if not eval_only:
            trainer.train()

        # The test split always has hard labels, so use the hard-label metrics.
        trainer.compute_metrics = compute_metrics

        # test ORIG
        trainer.eval_dataset = test_dataset
        out = trainer.evaluate()
        out['run'] = checkpoint
        print('ORIG for {}\n{}'.format(checkpoint, out))

        results.append(out)

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
rename_column_ is deprecated and will be removed in the next major version of datasets. Use the dataset.rename_column method instead.
Loading cached split indices for dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-32a6a23548aa6274.arrow and C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-0d3572f739d46d53.arrow
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.trans

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.3973,0.294396,0.91422,0.913627,0.912439,0.91618,12.3236,245.952
4000,0.3118,0.344711,0.906631,0.903933,0.915343,0.89907,12.3366,245.691
6000,0.3063,0.313438,0.917849,0.91746,0.916657,0.921431,12.3569,245.288
8000,0.2762,0.313195,0.928736,0.927734,0.928604,0.926985,12.3299,245.825
10000,0.2556,0.322501,0.931706,0.931101,0.930032,0.932669,12.3397,245.629
12000,0.2574,0.280137,0.931376,0.930789,0.929666,0.932519,12.3313,245.798
14000,0.2641,0.287617,0.929396,0.928151,0.931116,0.926112,12.3242,245.939
16000,0.2049,0.299354,0.935995,0.935232,0.935107,0.935359,12.3125,246.172
18000,0.2192,0.254932,0.937974,0.937198,0.93733,0.93707,12.3444,245.537
20000,0.2237,0.278389,0.939624,0.939119,0.937941,0.940978,12.3251,245.92


ORIG for pretrained/bert-base-uncased-sst2-ORIG+ORIG
{'eval_loss': 0.35198429226875305, 'eval_accuracy': 0.9453600593912398, 'eval_f1': 0.9446036526435704, 'eval_precision': 0.9441538801196128, 'eval_recall': 0.9450924194929575, 'eval_runtime': 27.4117, 'eval_samples_per_second': 245.698, 'epoch': 10.0, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+ORIG'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
remove_columns_ is deprecated and will be removed in the next major version of datasets. Use the dataset.remove_columns method instead.
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architectu

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-80412efd19620de1.arrow





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.5198,0.40699,0.827945,0.82094,0.841553,0.816177,102.903,58.91
4000,0.3838,0.321684,0.857638,0.854407,0.860959,0.851381,102.8513,58.939
6000,0.3609,0.407728,0.865556,0.86346,0.865522,0.862047,102.9431,58.887
8000,0.3587,0.327549,0.86935,0.868234,0.8675,0.869261,102.8635,58.932
10000,0.3675,0.322734,0.86836,0.865325,0.872318,0.8621,102.8017,58.968
12000,0.3656,0.331175,0.858628,0.856278,0.858877,0.854622,102.8721,58.928
14000,0.3966,0.463466,0.836688,0.829699,0.852733,0.82451,102.8316,58.951
16000,0.3751,0.48412,0.84708,0.845307,0.845472,0.845151,102.8488,58.941
18000,0.3867,0.399292,0.855823,0.854787,0.853915,0.856388,102.9031,58.91
20000,0.3693,0.391869,0.867865,0.865806,0.867879,0.864382,102.9795,58.866


ORIG for pretrained/bert-base-uncased-sst2-ORIG+INV
{'eval_loss': 0.22934286296367645, 'eval_accuracy': 0.9272457312546399, 'eval_f1': 0.9265553141535987, 'eval_precision': 0.9250828171225598, 'eval_recall': 0.929157122983028, 'eval_runtime': 27.3978, 'eval_samples_per_second': 245.823, 'epoch': 1.94, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+INV'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initiali

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-80412efd19620de1.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.5822,0.483918,0.757427,65.8866,92.007
4000,0.4937,0.46572,0.810361,65.8389,92.073
6000,0.4771,0.458156,0.811933,65.8952,91.995
8000,0.4695,0.463236,0.831102,65.9129,91.97
10000,0.4754,0.524379,0.838256,65.9192,91.961
12000,0.4574,0.457948,0.832105,65.8952,91.995
14000,0.4668,0.469572,0.83495,65.8891,92.003
16000,0.4407,0.487929,0.81731,65.8628,92.04
18000,0.4454,0.437296,0.822175,65.8709,92.028
20000,0.4394,0.450881,0.852468,65.8883,92.004


ORIG for pretrained/bert-base-uncased-sst2-ORIG+SIB
{'eval_loss': 3.8303322792053223, 'eval_accuracy': 0.940014847809948, 'eval_f1': 0.9392432285855458, 'eval_precision': 0.938451741012869, 'eval_recall': 0.9401792273357352, 'eval_runtime': 27.419, 'eval_samples_per_second': 245.633, 'epoch': 9.72, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+SIB'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initiali

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-80412efd19620de1.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.5626,0.459765,0.812479,102.7363,59.005
4000,0.4623,0.423294,0.832323,102.7658,58.988
6000,0.4408,0.453584,0.842464,102.8045,58.966
8000,0.4439,0.452885,0.846708,102.9252,58.897
10000,0.4449,0.440391,0.850654,102.8314,58.951
12000,0.43,0.435332,0.843656,102.8733,58.927
14000,0.4455,0.468316,0.851173,102.8555,58.937
16000,0.4265,0.427456,0.861764,102.7902,58.974
18000,0.423,0.426851,0.861085,102.8193,58.958
20000,0.413,0.450499,0.856776,102.878,58.924


ORIG for pretrained/bert-base-uncased-sst2-ORIG+INVSIB
{'eval_loss': 2.4658586978912354, 'eval_accuracy': 0.9349665924276169, 'eval_f1': 0.9340570007822142, 'eval_precision': 0.9336723618090452, 'eval_recall': 0.9344706039030594, 'eval_runtime': 27.414, 'eval_samples_per_second': 245.677, 'epoch': 3.33, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+INVSIB'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initiali

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-80412efd19620de1.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.4999,0.383299,0.851534,36.7915,164.766
4000,0.3718,0.316063,0.887001,36.7568,164.922
6000,0.3299,0.338949,0.902507,36.8167,164.654
8000,0.3261,0.301207,0.910261,36.8139,164.666
10000,0.3238,0.377753,0.900033,36.7859,164.791
12000,0.3067,0.317735,0.910426,36.8092,164.687
14000,0.3222,0.293654,0.926097,36.7866,164.788
16000,0.2929,0.329496,0.913725,36.679,165.272
18000,0.2923,0.315154,0.922963,36.7981,164.737
20000,0.2785,0.290189,0.922963,36.7944,164.753


ORIG for pretrained/bert-base-uncased-sst2-ORIG+TextMix
{'eval_loss': 3.459319591522217, 'eval_accuracy': 0.9459539717891611, 'eval_f1': 0.9451746127107483, 'eval_precision': 0.9449550204900994, 'eval_recall': 0.9454027160028868, 'eval_runtime': 27.4066, 'eval_samples_per_second': 245.743, 'epoch': 5.42, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+TextMix'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initiali

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-80412efd19620de1.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.474,0.338679,0.717515,54.9812,165.384
4000,0.3313,0.307838,0.74625,54.95,165.478
6000,0.3008,0.270661,0.76915,54.9081,165.604
8000,0.2836,0.261724,0.784238,55.0189,165.27
10000,0.273,0.253895,0.786836,54.9827,165.379
12000,0.2672,0.248588,0.792271,54.9772,165.396
14000,0.266,0.252973,0.79702,54.9837,165.376
16000,0.2712,0.240353,0.791805,54.986,165.369
18000,0.2639,0.256544,0.791607,54.9486,165.482
20000,0.2662,0.242523,0.790439,54.987,165.366


ORIG for pretrained/bert-base-uncased-sst2-ORIG+SentMix
{'eval_loss': 5.534175395965576, 'eval_accuracy': 0.9505567928730512, 'eval_f1': 0.9498420051845728, 'eval_precision': 0.9496353869170993, 'eval_recall': 0.950056000151792, 'eval_runtime': 27.4023, 'eval_samples_per_second': 245.783, 'epoch': 9.82, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+SentMix'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initiali

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-80412efd19620de1.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.4145,0.311611,0.58529,84.327,143.762
4000,0.3069,0.288362,0.5923,84.3279,143.76
6000,0.2778,0.256058,0.626746,84.3628,143.701
8000,0.2712,0.238809,0.639301,84.387,143.66
10000,0.2569,0.232733,0.645995,84.3104,143.79
12000,0.2435,0.24501,0.641539,84.3849,143.663
14000,0.2355,0.219021,0.648233,84.3883,143.657
16000,0.233,0.219463,0.646787,84.3586,143.708
18000,0.2335,0.224067,0.653201,84.3537,143.716
20000,0.2333,0.223048,0.639807,84.3841,143.665


ORIG for pretrained/bert-base-uncased-sst2-ORIG+WordMix
{'eval_loss': 3.940965414047241, 'eval_accuracy': 0.9447661469933185, 'eval_f1': 0.9439615883843995, 'eval_precision': 0.9438034857270928, 'eval_recall': 0.9441240294619689, 'eval_runtime': 27.4068, 'eval_samples_per_second': 245.742, 'epoch': 5.14, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+WordMix'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Loading cached split indices for dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-16356167c47d9ed8.arrow and C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-dec3809c51f6522d.arrow
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected i

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.4646,0.393213,0.892445,0.890267,0.894217,0.887833,11.4408,264.929
4000,0.3755,0.333075,0.903992,0.902587,0.903386,0.901897,11.4178,265.462
6000,0.3744,0.315249,0.901023,0.900414,0.899323,0.903664,11.4334,265.1
8000,0.3927,0.371964,0.891125,0.890248,0.889106,0.892361,11.4199,265.414
10000,0.3434,0.335921,0.901353,0.899877,0.900829,0.899073,11.4206,265.398
12000,0.3539,0.327914,0.902342,0.900638,0.902936,0.898985,11.4378,264.998
14000,0.4297,0.478937,0.823821,0.823655,0.827554,0.830885,11.4095,265.655
16000,0.5352,0.687046,0.556252,0.357431,0.278126,0.5,11.4378,264.998
18000,0.6907,0.68684,0.556252,0.357431,0.278126,0.5,11.4139,265.554
20000,0.6895,0.687298,0.556252,0.357431,0.278126,0.5,11.4199,265.414


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ORIG for pretrained/roberta-base-sst2-ORIG+ORIG
{'eval_loss': 0.3174646496772766, 'eval_accuracy': 0.9091314031180401, 'eval_f1': 0.9075567763438361, 'eval_precision': 0.9086340328344555, 'eval_recall': 0.9066408912625086, 'eval_runtime': 27.2926, 'eval_samples_per_second': 246.77, 'epoch': 3.33, 'run': 'pretrained/roberta-base-sst2-ORIG+ORIG'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect 

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-5361578f715a8ede.arrow





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.5083,0.448942,0.841636,0.836365,0.85096,0.831964,102.6887,59.033
4000,0.4088,0.315076,0.859122,0.855947,0.862412,0.852935,102.8212,58.957
6000,0.3988,0.377053,0.852194,0.848505,0.856635,0.845124,102.7667,58.988
8000,0.4069,0.336144,0.856318,0.854471,0.855151,0.853896,102.7897,58.975
10000,0.4182,0.35589,0.854833,0.852745,0.854133,0.851713,102.7419,59.002
12000,0.4432,0.433356,0.835698,0.835161,0.835242,0.838944,102.7614,58.991
14000,0.491,0.487246,0.817717,0.816675,0.815976,0.818821,102.8141,58.961
16000,0.541,0.558281,0.767074,0.765293,0.76471,0.766371,102.7324,59.008
18000,0.67,0.687856,0.552458,0.35586,0.276229,0.5,102.7891,58.975
20000,0.6926,0.68766,0.552458,0.35586,0.276229,0.5,102.8119,58.962


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ORIG for pretrained/roberta-base-sst2-ORIG+INV
{'eval_loss': 0.21874354779720306, 'eval_accuracy': 0.9131403118040089, 'eval_f1': 0.9119959859193577, 'eval_precision': 0.911353330569336, 'eval_recall': 0.9127387949576415, 'eval_runtime': 28.2108, 'eval_samples_per_second': 238.738, 'epoch': 1.67, 'run': 'pretrained/roberta-base-sst2-ORIG+INV'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect 

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-5361578f715a8ede.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.5808,0.534998,0.798791,59.7624,101.435
4000,0.5237,0.514567,0.806445,58.6227,103.407
6000,0.5113,0.494694,0.815696,58.5516,103.533
8000,0.5114,0.456588,0.768023,57.9903,104.535
10000,0.5169,0.57377,0.798421,57.9481,104.611
12000,0.5104,0.521803,0.777465,57.8048,104.87
14000,0.5171,0.51326,0.782137,57.6877,105.083
16000,0.4991,0.55119,0.797481,57.9582,104.593
18000,0.4998,0.47857,0.805379,57.9046,104.689
20000,0.5134,0.562723,0.735629,57.9801,104.553


ORIG for pretrained/roberta-base-sst2-ORIG+SIB
{'eval_loss': 2.6049084663391113, 'eval_accuracy': 0.912249443207127, 'eval_f1': 0.9103195355109415, 'eval_precision': 0.9140740892870709, 'eval_recall': 0.907812938550014, 'eval_runtime': 24.3833, 'eval_samples_per_second': 276.214, 'epoch': 1.81, 'run': 'pretrained/roberta-base-sst2-ORIG+SIB'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect 

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-5361578f715a8ede.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.5578,0.484918,0.819946,93.4012,64.903
4000,0.4789,0.482283,0.837243,93.3874,64.912
6000,0.4786,0.456596,0.835451,93.353,64.936
8000,0.4727,0.455429,0.842848,93.5014,64.833
10000,0.487,0.509132,0.828326,93.4212,64.889
12000,0.4858,0.477208,0.831463,93.2867,64.982
14000,0.4988,0.447735,0.837782,93.3257,64.955
16000,0.4783,0.485917,0.828162,93.4349,64.879
18000,0.492,0.617844,0.54961,93.5694,64.786
20000,0.5287,0.537561,0.775004,93.461,64.861


ORIG for pretrained/roberta-base-sst2-ORIG+INVSIB
{'eval_loss': 2.4703521728515625, 'eval_accuracy': 0.9162583518930958, 'eval_f1': 0.9151405000270763, 'eval_precision': 0.9145500801449354, 'eval_recall': 0.9158120013059841, 'eval_runtime': 24.3979, 'eval_samples_per_second': 276.048, 'epoch': 1.94, 'run': 'pretrained/roberta-base-sst2-ORIG+INVSIB'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect 

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-5361578f715a8ede.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.493,0.354478,0.866876,32.7717,184.977
4000,0.3767,0.345161,0.896569,32.7867,184.892
6000,0.3671,0.354805,0.916694,32.8041,184.794
8000,0.3681,0.30859,0.91521,32.7914,184.866
10000,0.3621,0.3746,0.913395,33.3378,181.836
12000,0.359,0.400968,0.901683,32.7864,184.894
14000,0.3655,0.317257,0.916364,32.7838,184.908
16000,0.3467,0.351028,0.912735,35.9654,168.551
18000,0.3415,0.33457,0.913065,35.9825,168.471
20000,0.3301,0.341216,0.922633,35.9671,168.543


ORIG for pretrained/roberta-base-sst2-ORIG+TextMix
{'eval_loss': 2.455782413482666, 'eval_accuracy': 0.9287305122494433, 'eval_f1': 0.9277940538465882, 'eval_precision': 0.9271141219390674, 'eval_recall': 0.9285801164645637, 'eval_runtime': 26.735, 'eval_samples_per_second': 251.917, 'epoch': 3.47, 'run': 'pretrained/roberta-base-sst2-ORIG+TextMix'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect 

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-5361578f715a8ede.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.4428,0.306651,0.736133,53.8987,168.705
4000,0.3267,0.325114,0.735637,53.9247,168.624
6000,0.314,0.281911,0.781712,53.899,168.704
8000,0.2995,0.277646,0.78636,54.0728,168.162
10000,0.3028,0.299366,0.793283,53.952,168.539
12000,0.2964,0.261602,0.786774,54.0734,168.16
14000,0.2961,0.274705,0.785402,53.9557,168.527
16000,0.3066,0.254407,0.796274,53.9047,168.687
18000,0.2961,0.289614,0.785323,53.8596,168.828
20000,0.2963,0.307247,0.775591,53.9076,168.678


ORIG for pretrained/roberta-base-sst2-ORIG+SentMix
{'eval_loss': 2.464665651321411, 'eval_accuracy': 0.9315515961395694, 'eval_f1': 0.9304062881630002, 'eval_precision': 0.9312374332768985, 'eval_recall': 0.9296701369994445, 'eval_runtime': 26.722, 'eval_samples_per_second': 252.039, 'epoch': 3.24, 'run': 'pretrained/roberta-base-sst2-ORIG+SentMix'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect 

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-5361578f715a8ede.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.3993,0.27926,0.607026,85.3045,142.114
4000,0.2959,0.270901,0.626571,85.2659,142.179
6000,0.2798,0.269105,0.633852,85.2777,142.159
8000,0.2831,0.249096,0.642622,85.2813,142.153
10000,0.2716,0.262596,0.647533,85.2985,142.124
12000,0.2662,0.254958,0.650659,85.3373,142.06
14000,0.2604,0.254526,0.648893,85.3014,142.12
16000,0.2615,0.232856,0.65198,85.2972,142.127
18000,0.2546,0.229717,0.647144,85.8723,141.175
20000,0.2607,0.237214,0.6478,85.2831,142.15


ORIG for pretrained/roberta-base-sst2-ORIG+WordMix
{'eval_loss': 2.6165454387664795, 'eval_accuracy': 0.927691165553081, 'eval_f1': 0.9267435587803126, 'eval_precision': 0.9260538696514973, 'eval_recall': 0.9275435274313929, 'eval_runtime': 26.7068, 'eval_samples_per_second': 252.183, 'epoch': 1.46, 'run': 'pretrained/roberta-base-sst2-ORIG+WordMix'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Loading cached split indices for dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-16356167c47d9ed8.arrow and C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-dec3809c51f6522d.arrow
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSe

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.4848,0.312352,0.892775,0.890635,0.894401,0.88828,18.8299,160.967
4000,0.3579,0.381782,0.908611,0.907923,0.906697,0.910409,18.8487,160.807
6000,0.3479,0.429949,0.904982,0.903807,0.903632,0.903989,18.9093,160.292
8000,0.3611,0.435633,0.897724,0.897017,0.895852,0.899796,18.8565,160.74
10000,0.3966,0.417101,0.874299,0.87297,0.872275,0.873853,18.8344,160.929
12000,0.509,0.688235,0.556252,0.357431,0.278126,0.5,18.8066,161.167
14000,0.6595,0.515749,0.817552,0.816136,0.815259,0.818033,18.8791,160.548
16000,0.5284,0.632806,0.736391,0.716224,0.758463,0.715455,18.8842,160.505
18000,0.6911,0.72105,0.556252,0.357431,0.278126,0.5,18.8081,161.154
20000,0.6923,0.705244,0.556252,0.357431,0.278126,0.5,18.8413,160.87


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+ORIG
{'eval_loss': 0.3458961248397827, 'eval_accuracy': 0.9167037861915367, 'eval_f1': 0.9158046376389559, 'eval_precision': 0.914547097908931, 'eval_recall': 0.9176716771154936, 'eval_runtime': 41.1171, 'eval_samples_per_second': 163.8, 'epoch': 3.33, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+ORIG'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-09c1e84e8f5fbf93.arrow





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.5885,0.396362,0.837347,0.832646,0.843483,0.828887,242.2597,25.023
4000,0.419,0.387593,0.839987,0.833751,0.853355,0.828791,242.0612,25.043
6000,0.4031,0.393578,0.837842,0.836866,0.836063,0.83896,242.1054,25.039
8000,0.3952,0.332733,0.84708,0.84683,0.848546,0.852221,242.0571,25.044
10000,0.3961,0.443054,0.838502,0.830953,0.85797,0.825417,242.1349,25.036
12000,0.4033,0.351211,0.854174,0.850674,0.858121,0.847441,242.078,25.042
14000,0.4415,0.437923,0.833718,0.830494,0.834524,0.828368,242.0781,25.042
16000,0.4832,0.533394,0.762455,0.737825,0.817856,0.738426,242.062,25.043
18000,0.6269,0.534666,0.791818,0.790153,0.789529,0.791181,242.1502,25.034
20000,0.5399,0.632205,0.693665,0.693097,0.7114,0.705953,242.1718,25.032


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+INV
{'eval_loss': 0.265116423368454, 'eval_accuracy': 0.9101707498144024, 'eval_f1': 0.9080109152230734, 'eval_precision': 0.9131023574909436, 'eval_recall': 0.9048988632067636, 'eval_runtime': 41.31, 'eval_samples_per_second': 163.036, 'epoch': 2.22, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+INV'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-09c1e84e8f5fbf93.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.6161,0.505184,0.77802,123.6942,49.008
4000,0.5205,0.495983,0.789968,123.9063,48.924
6000,0.5008,0.439925,0.821562,123.6244,49.036
8000,0.5035,0.445891,0.825906,123.6573,49.023
10000,0.5043,0.559553,0.822018,123.6198,49.037
12000,0.4947,0.486782,0.805589,123.6753,49.015
14000,0.5381,0.512846,0.811192,123.6205,49.037
16000,0.5262,0.533421,0.801672,123.7337,48.992
18000,0.6513,0.694392,0.555779,123.666,49.019
20000,0.7019,0.698962,0.555284,123.6986,49.006


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+SIB
{'eval_loss': 2.053861379623413, 'eval_accuracy': 0.9083890126206384, 'eval_f1': 0.9065926264078867, 'eval_precision': 0.9088810045927722, 'eval_recall': 0.9048819924562238, 'eval_runtime': 41.4256, 'eval_samples_per_second': 162.581, 'epoch': 1.94, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+SIB'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-09c1e84e8f5fbf93.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.6059,0.476557,0.810192,243.1817,24.928
4000,0.4952,0.464699,0.796834,242.7544,24.972
6000,0.4803,0.474495,0.833785,243.0226,24.944
8000,0.4728,0.463861,0.850495,242.9888,24.948
10000,0.4777,0.464701,0.845448,242.7849,24.969
12000,0.4655,0.422936,0.839863,242.7125,24.976
14000,0.4812,0.488287,0.829658,242.7272,24.975
16000,0.4969,0.495681,0.838411,242.6158,24.986
18000,0.4993,0.487839,0.829714,242.8482,24.962
20000,0.5193,0.537047,0.819009,242.9118,24.956


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+INVSIB
{'eval_loss': 3.0075347423553467, 'eval_accuracy': 0.9146250927988122, 'eval_f1': 0.9136101696900272, 'eval_precision': 0.9125951547670516, 'eval_recall': 0.914940405528093, 'eval_runtime': 41.7222, 'eval_samples_per_second': 161.425, 'epoch': 1.94, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+INVSIB'}


Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-09c1e84e8f5fbf93.arrow





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.5423,0.39448,0.850874,68.6296,88.329
4000,0.3947,0.364768,0.876773,68.5548,88.426
6000,0.3711,0.356549,0.89327,68.5695,88.407
8000,0.3576,0.321589,0.897559,68.5903,88.38
10000,0.3548,0.324251,0.91092,68.6383,88.318
12000,0.3474,0.343049,0.900693,68.6806,88.264
14000,0.3488,0.336528,0.91422,56.863,106.607


In [None]:
# Gather the accumulated `results` records into one table
# (presumably the per-run evaluation dicts printed above — confirm against the training loop).
df = pd.DataFrame(data=results)
# Bare last expression so the notebook renders the table.
df

In [None]:
# Write the results table to a CSV file in the current working directory.
# The index option is left at its pandas default.
df.to_csv(path_or_buf='train_SST2_r3.csv')

In [None]:
# Copy the results table to the system clipboard; excel=True asks pandas for its
# spreadsheet-friendly layout so the table can be pasted straight into a sheet.
# NOTE(review): needs a clipboard backend — presumably fails on a headless machine; confirm.
df.to_clipboard(excel=True)