In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset, concatenate_datasets, Dataset
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

from utils import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [2]:
def one_hot_encode(y, nb_classes=2):
    if not isinstance(y, np.ndarray):
        y = np.expand_dims(np.array(y), 0)
    res = np.eye(nb_classes)[np.array(y).reshape(-1)]
    return res.reshape(list(y.shape)+[nb_classes])[0]

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=250)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1.mean(),
        'precision': precision.mean(),
        'recall': recall.mean()
    }

def acc_at_k(y_true, y_pred, k=2):
    y_pred = torch.tensor(y_pred) if type(y_pred) != torch.Tensor else y_pred
    y_true = torch.tensor(y_true) if type(y_true) != torch.Tensor else y_true
    total = len(y_true)
    y_weights, y_idx = torch.topk(y_true, k=k, dim=-1)
    out_weights, out_idx = torch.topk(y_pred, k=k, dim=-1)
    correct = torch.sum(torch.eq(y_idx, out_idx) * y_weights)
    acc = correct / total
    if acc.item() > 1:
        print(y_true.shape, y_true)
        print(y_pred.shape, y_pred)
    return acc.item()

def CEwST_loss(logits, target, reduction='mean'):
    """
    Cross Entropy with Soft Target (CEwST) Loss
    :param logits: (batch, *)
    :param target: (batch, *) same shape as logits, each item must be a valid distribution: target[i, :].sum() == 1.
    """
    logprobs = torch.nn.functional.log_softmax(logits.view(logits.shape[0], -1), dim=1)
    batchloss = - torch.sum(target.view(target.shape[0], -1) * logprobs, dim=1)
    if reduction == 'none':
        return batchloss
    elif reduction == 'mean':
        return torch.mean(batchloss)
    elif reduction == 'sum':
        return torch.sum(batchloss)
    else:
        raise NotImplementedError('Unsupported reduction mode.')

def compute_metrics_w_soft_target(pred):
    labels = pred.label_ids
    preds = pred.predictions
    acc = acc_at_k(labels, preds, k=2)
    return {
        'accuracy': acc,
    }

class Trainer_w_soft_target(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        loss = CEwST_loss(logits, labels)
        if return_outputs:
            return loss, outputs
        return loss
    
class DefaultCollator:
    def __init__(self):
        pass
    def __call__(self, batch):
        return torch.utils.data.dataloader.default_collate(batch)

In [3]:
# ['bert-base-uncased', 'roberta-base', 'xlnet-base-cased']
# ['ORIG', 'INV', 'SIB', 'INVSIB', 'TextMix', 'SentMix', 'WordMix']

In [4]:
MODEL_NAMES = ['bert-base-uncased', 'roberta-base', 'xlnet-base-cased']

In [5]:
use_pretrain = False

results = []
for MODEL_NAME in MODEL_NAMES:
    for t in ['ORIG', 'INV', 'SIB', 'INVSIB', 'TextMix', 'SentMix', 'WordMix']: 
        
        soft_target = False
        eval_only = False
        
        checkpoint = 'pretrained/' + MODEL_NAME + "-sst2-ORIG+" + t 
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        
        if t == 'ORIG':
            train_dataset = load_dataset('glue', 'sst2', split='train[:90%]')
            train_dataset.rename_column_('sentence', 'text')
        else: 
            
            # load custom data    
            text = npy_load("./assets/SST2/" + t + "/text.npy")
            label = npy_load("./assets/SST2/" + t + "/label.npy")
            if len(label.shape) > 1:
                df = pd.DataFrame({'text': text, 'label': label.tolist()})
                df.text = df.text.astype(str)
                df.label = df.label.map(lambda y: np.array(y))
            else:
                df = pd.DataFrame({'text': text, 'label': label})
                df.text = df.text.astype(str)
                df.label = df.label.astype(object)
            train_dataset = Dataset.from_pandas(df) 
            
            # load orig data
            orig_dataset = load_dataset('glue', 'sst2', split='train[:90%]')
            orig_dataset.remove_columns_(['idx'])
            orig_dataset.rename_column_('sentence', 'text')
            df = orig_dataset.to_pandas()
            df = df[df.columns[::-1]]
            df.text = df.text.astype(str)
            if len(label.shape) > 1:
                df.label = df.label.map(one_hot_encode)
            else:
                df.label = df.label.astype(object)
            orig_dataset = Dataset.from_pandas(df)
            
            # merge orig + custom data
            train_dataset = concatenate_datasets([orig_dataset, train_dataset])
            train_dataset.shuffle()
            
        if use_pretrain and os.path.exists(checkpoint):
            print('loading {}...'.format(checkpoint))
            MODEL_NAME = checkpoint
            eval_only = True
            
        # split to get train
        dataset_dict = train_dataset.train_test_split(
            test_size = 0.05,
            train_size = 0.95,
            shuffle = True
        )
        train_dataset = dataset_dict['train']
        eval_dataset = dataset_dict['test']
        test_dataset = load_dataset('glue', 'sst2', split='train[-10%:]')
        test_dataset.rename_column_('sentence', 'text')
        
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
            
        train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
        eval_dataset = eval_dataset.map(tokenize, batched=True, batch_size=len(eval_dataset))
        test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
        train_dataset.rename_column_('label', 'labels')
        eval_dataset.rename_column_('label', 'labels')
        test_dataset.rename_column_('label', 'labels')
        train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        
        if len(np.array(train_dataset['labels']).shape) > 1:
            soft_target = True
        
        train_batch_size = 8
        eval_batch_size = 32
        num_epoch = 10
        gradient_accumulation_steps=1
        max_steps = int((len(train_dataset) * num_epoch / gradient_accumulation_steps) / train_batch_size)

        training_args = TrainingArguments(
            seed=1,
            # adafactor=True,
            output_dir=checkpoint,
            overwrite_output_dir=True,
            max_steps=max_steps,
            save_steps=int(max_steps / 10),
            save_total_limit=1,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            # gradient_accumulation_steps=gradient_accumulation_steps, 
            warmup_steps=int(max_steps / 10),
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=2000,
            logging_first_step=True,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            evaluation_strategy="steps",
            # run_name=checkpoint
        )

        if soft_target:
            trainer = Trainer_w_soft_target(
                model=model,
                args=training_args,
                compute_metrics=compute_metrics_w_soft_target,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                data_collator=DefaultCollator(),
                callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
            )
        else: 
            trainer = Trainer(
                model=model,
                args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
            )

        if not eval_only:
            trainer.train()
        
        trainer.compute_metrics = compute_metrics
            
        # test ORIG
        trainer.eval_dataset = test_dataset
        out = trainer.evaluate()
        out['run'] = checkpoint
        print('ORIG for {}\n{}'.format(checkpoint, out))   
        
        results.append(out)

Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
rename_column_ is deprecated and will be removed in the next major version of datasets. Use the dataset.rename_column method instead.
Loading cached split indices for dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-f091fffe09931764.arrow and C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-b8768bf7f4b494b6.arrow
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.den

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.4149,0.292297,0.91323,0.912701,0.911576,0.915729,7.5996,398.839
4000,0.3143,0.282485,0.916859,0.914942,0.921683,0.911367,7.6143,398.066
6000,0.2949,0.322373,0.925437,0.924978,0.923799,0.928048,7.602,398.713
8000,0.2838,0.33235,0.920488,0.91863,0.925671,0.914931,7.5962,399.015
10000,0.2754,0.299632,0.931376,0.930746,0.929743,0.932152,7.5918,399.246
12000,0.2609,0.26462,0.928736,0.928145,0.926992,0.929994,7.5969,398.98
14000,0.2536,0.332025,0.931046,0.929848,0.932649,0.927892,7.5904,399.322
16000,0.2042,0.297886,0.933025,0.932449,0.931331,0.934152,7.591,399.29
18000,0.2194,0.324828,0.925437,0.923869,0.929176,0.920784,7.5889,399.401
20000,0.2325,0.28399,0.938964,0.938267,0.937938,0.938622,7.5867,399.516


ORIG for pretrained/bert-base-uncased-sst2-ORIG+ORIG
{'eval_loss': 0.28216299414634705, 'eval_accuracy': 0.9383815887156645, 'eval_f1': 0.9373408328474395, 'eval_precision': 0.9382654159668381, 'eval_recall': 0.9365290144689284, 'eval_runtime': 16.9324, 'eval_samples_per_second': 397.758, 'epoch': 7.5, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+ORIG'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
remove_columns_ is deprecated and will be removed in the next major version of datasets. Use the dataset.remove_columns method instead.
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-58c5c95f3f3c7bcb.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.522,0.397224,0.833223,0.831385,0.83135,0.83142,78.8219,76.908
4000,0.392,0.327296,0.856813,0.853541,0.860192,0.850494,78.8666,76.864
6000,0.3666,0.336336,0.860607,0.857529,0.863688,0.854593,78.886,76.845
8000,0.3617,0.354546,0.866381,0.864359,0.866175,0.863074,78.8704,76.86
10000,0.3598,0.346059,0.862587,0.85851,0.870678,0.85418,78.8924,76.839
12000,0.3727,0.362666,0.855328,0.851567,0.86056,0.847961,78.9133,76.819
14000,0.4108,0.34657,0.861597,0.859049,0.862748,0.856924,78.9745,76.759
16000,0.3683,0.412682,0.866216,0.864153,0.866126,0.862784,79.0562,76.68
18000,0.3695,0.38419,0.851039,0.847425,0.855011,0.844184,79.06,76.676
20000,0.3954,0.470594,0.841142,0.833594,0.861561,0.827911,78.9389,76.794


ORIG for pretrained/bert-base-uncased-sst2-ORIG+INV
{'eval_loss': 0.25182414054870605, 'eval_accuracy': 0.9257609502598366, 'eval_f1': 0.9247647432059267, 'eval_precision': 0.9241842167590271, 'eval_recall': 0.9254199608634388, 'eval_runtime': 17.0465, 'eval_samples_per_second': 395.095, 'epoch': 1.94, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+INV'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-58c5c95f3f3c7bcb.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.5836,0.509834,0.792949,39.5777,153.167
4000,0.4934,0.455351,0.818984,39.581,153.154
6000,0.4793,0.491604,0.819063,39.5588,153.24
8000,0.47,0.45814,0.786365,39.5377,153.322
10000,0.4709,0.491376,0.832549,39.5746,153.179
12000,0.4652,0.443572,0.816951,39.5647,153.217
14000,0.4813,0.495706,0.832122,39.6125,153.032
16000,0.4472,0.469546,0.842631,39.609,153.046
18000,0.4519,0.458105,0.849021,39.6,153.081
20000,0.4473,0.481938,0.849602,39.5538,153.26


ORIG for pretrained/bert-base-uncased-sst2-ORIG+SIB
{'eval_loss': 3.5992980003356934, 'eval_accuracy': 0.9431328878990349, 'eval_f1': 0.9422386560646349, 'eval_precision': 0.9425967851442318, 'eval_recall': 0.94189968589079, 'eval_runtime': 17.0787, 'eval_samples_per_second': 394.35, 'epoch': 9.31, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+SIB'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-58c5c95f3f3c7bcb.arrow





W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.5679,0.45441,0.810525,79.9178,75.853
4000,0.4733,0.43495,0.8425,79.6951,76.065
6000,0.4464,0.408567,0.854189,79.6679,76.091
8000,0.4419,0.425516,0.856829,79.6493,76.109
10000,0.4442,0.544261,0.835693,79.6591,76.099
12000,0.4414,0.405728,0.858344,79.6389,76.119
14000,0.4409,0.444437,0.85254,79.7022,76.058
16000,0.411,0.431702,0.858422,79.7071,76.053
18000,0.4163,0.433078,0.857126,79.7241,76.037
20000,0.4179,0.422203,0.865341,79.7848,75.979


ORIG for pretrained/bert-base-uncased-sst2-ORIG+INVSIB
{'eval_loss': 2.6374518871307373, 'eval_accuracy': 0.9334818114328136, 'eval_f1': 0.9325562170110473, 'eval_precision': 0.9321457550488259, 'eval_recall': 0.9330002083560067, 'eval_runtime': 16.9467, 'eval_samples_per_second': 397.422, 'epoch': 4.17, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+INVSIB'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-58c5c95f3f3c7bcb.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.4905,0.379431,0.8484,29.855,203.048
4000,0.3617,0.330609,0.886176,29.8492,203.087
6000,0.3361,0.310528,0.901353,29.8402,203.149
8000,0.327,0.316083,0.909271,29.845,203.116
10000,0.3215,0.302671,0.885186,29.9091,202.681
12000,0.3098,0.333733,0.917684,29.9202,202.606
14000,0.3208,0.336578,0.920323,29.8842,202.849
16000,0.2954,0.312856,0.920158,29.9041,202.715
18000,0.2914,0.307231,0.914715,29.8817,202.866
20000,0.2778,0.292981,0.923788,29.9142,202.646


ORIG for pretrained/bert-base-uncased-sst2-ORIG+TextMix
{'eval_loss': 5.39630651473999, 'eval_accuracy': 0.9478841870824053, 'eval_f1': 0.9471608629562017, 'eval_precision': 0.9467223042155277, 'eval_recall': 0.9476359648243509, 'eval_runtime': 16.984, 'eval_samples_per_second': 396.55, 'epoch': 10.0, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+TextMix'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-58c5c95f3f3c7bcb.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.4785,0.343714,0.717633,45.7061,198.945
4000,0.3342,0.309481,0.74803,45.6573,199.158
6000,0.3038,0.278553,0.774221,45.6355,199.253
8000,0.2779,0.254339,0.782258,45.5996,199.41
10000,0.2708,0.270143,0.782673,45.6001,199.407
12000,0.2666,0.263715,0.784243,45.6044,199.389
14000,0.2719,0.250669,0.778033,45.6633,199.131
16000,0.2732,0.251409,0.784808,45.6244,199.301
18000,0.2629,0.267568,0.784437,45.6473,199.201
20000,0.2657,0.25565,0.791285,45.663,199.133


ORIG for pretrained/bert-base-uncased-sst2-ORIG+SentMix
{'eval_loss': 5.498501300811768, 'eval_accuracy': 0.9499628804751299, 'eval_f1': 0.9492755360871598, 'eval_precision': 0.9487796560710117, 'eval_recall': 0.9498188251442023, 'eval_runtime': 16.9856, 'eval_samples_per_second': 396.512, 'epoch': 10.0, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+SentMix'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-58c5c95f3f3c7bcb.arrow





W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.4311,0.308022,0.583661,54.3054,223.237
4000,0.3083,0.281898,0.604936,54.31,223.218
6000,0.2784,0.25303,0.625251,54.3579,223.022
8000,0.2626,0.263535,0.625218,54.3147,223.199
10000,0.2585,0.241769,0.637543,54.3297,223.138
12000,0.2478,0.250115,0.641227,54.3144,223.2
14000,0.2398,0.225766,0.648061,54.3062,223.234
16000,0.2335,0.236058,0.645486,54.3179,223.186
18000,0.2299,0.236771,0.650597,54.3133,223.205
20000,0.2338,0.218487,0.650493,54.3224,223.167


ORIG for pretrained/bert-base-uncased-sst2-ORIG+WordMix
{'eval_loss': 4.707504749298096, 'eval_accuracy': 0.9452115812917594, 'eval_f1': 0.9444115030632203, 'eval_precision': 0.9442686863576941, 'eval_recall': 0.9445578359758507, 'eval_runtime': 16.9568, 'eval_samples_per_second': 397.186, 'epoch': 7.64, 'run': 'pretrained/bert-base-uncased-sst2-ORIG+WordMix'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to b

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.458,0.392995,0.903992,0.902964,0.902247,0.903852,7.3565,412.019
4000,0.3618,0.396985,0.907291,0.906836,0.906055,0.910952,7.3533,412.195
6000,0.3771,0.610443,0.860112,0.860079,0.867134,0.869371,7.3551,412.092
8000,0.388,0.35492,0.905312,0.904599,0.903386,0.907068,7.3535,412.186
10000,0.394,0.374456,0.87001,0.869829,0.872681,0.876915,7.3622,411.698
12000,0.4992,0.681548,0.556252,0.357431,0.278126,0.5,7.3717,411.167
14000,0.69,0.688401,0.556252,0.357431,0.278126,0.5,7.3523,412.254
16000,0.6908,0.688703,0.556252,0.357431,0.278126,0.5,7.376,410.926
18000,0.6898,0.687287,0.556252,0.357431,0.278126,0.5,7.3512,412.314
20000,0.6888,0.686938,0.556252,0.357431,0.278126,0.5,7.3607,411.78


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ORIG for pretrained/roberta-base-sst2-ORIG+ORIG
{'eval_loss': 0.3848177194595337, 'eval_accuracy': 0.910913140311804, 'eval_f1': 0.910330272180576, 'eval_precision': 0.9091432971976775, 'eval_recall': 0.9145914450169262, 'eval_runtime': 16.5429, 'eval_samples_per_second': 407.124, 'epoch': 3.33, 'run': 'pretrained/roberta-base-sst2-ORIG+ORIG'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to b

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-b89985fedfbb9e0f.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.5148,0.350331,0.846255,0.842631,0.849628,0.839574,78.9295,76.803
4000,0.4071,0.370621,0.855163,0.852245,0.857002,0.849771,78.8777,76.853
6000,0.3985,0.363835,0.856813,0.851883,0.868051,0.846994,78.9806,76.753
8000,0.4003,0.381902,0.852359,0.847801,0.860687,0.843488,78.9978,76.736
10000,0.4194,0.444136,0.854009,0.849729,0.861357,0.845611,78.9801,76.754
12000,0.463,0.434728,0.830254,0.823581,0.843033,0.818862,79.0522,76.683
14000,0.4825,0.427461,0.828769,0.823348,0.836146,0.819409,79.0297,76.705
16000,0.4942,0.575824,0.78159,0.774861,0.785518,0.771949,79.0059,76.728
18000,0.603,0.689445,0.552458,0.35586,0.276229,0.5,78.7665,76.962
20000,0.691,0.687633,0.552458,0.35586,0.276229,0.5,78.7051,77.022


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ORIG for pretrained/roberta-base-sst2-ORIG+INV
{'eval_loss': 0.2837463915348053, 'eval_accuracy': 0.9086859688195991, 'eval_f1': 0.9061458273404981, 'eval_precision': 0.9139480235294204, 'eval_recall': 0.9020025983640831, 'eval_runtime': 16.495, 'eval_samples_per_second': 408.305, 'epoch': 1.81, 'run': 'pretrained/roberta-base-sst2-ORIG+INV'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to b

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-b89985fedfbb9e0f.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.581,0.499616,0.815664,36.4868,166.142
4000,0.5211,0.523229,0.773213,36.493,166.114
6000,0.5085,0.477794,0.815535,36.6325,165.481
8000,0.5007,0.487641,0.808998,36.6355,165.468
10000,0.5081,0.565346,0.818425,36.6316,165.486
12000,0.5096,0.479479,0.832698,36.6417,165.44
14000,0.5317,0.52629,0.784091,36.6703,165.311
16000,0.5162,0.531673,0.806469,36.6598,165.358
18000,0.5134,0.472271,0.81443,36.6741,165.294
20000,0.507,0.550031,0.768398,36.6248,165.516


ORIG for pretrained/roberta-base-sst2-ORIG+SIB
{'eval_loss': 1.9173798561096191, 'eval_accuracy': 0.9121009651076466, 'eval_f1': 0.910556565044754, 'eval_precision': 0.9117629220685963, 'eval_recall': 0.9095451216054439, 'eval_runtime': 16.5944, 'eval_samples_per_second': 405.859, 'epoch': 2.22, 'run': 'pretrained/roberta-base-sst2-ORIG+SIB'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to b

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-b89985fedfbb9e0f.arrow





W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.5681,0.51279,0.826125,79.532,76.221
4000,0.486,0.447259,0.843316,79.5353,76.218
6000,0.4707,0.432479,0.834872,79.2735,76.469
8000,0.4803,0.439096,0.840223,79.2198,76.521
10000,0.484,0.620165,0.795299,79.24,76.502
12000,0.4868,0.471553,0.823666,79.3343,76.411
14000,0.4848,0.444277,0.844916,79.4394,76.31
16000,0.4793,0.500896,0.801584,79.4571,76.293
18000,0.4932,0.502388,0.821818,79.4905,76.261
20000,0.4773,0.515537,0.808649,79.476,76.275


ORIG for pretrained/roberta-base-sst2-ORIG+INVSIB
{'eval_loss': 2.224684476852417, 'eval_accuracy': 0.9162583518930958, 'eval_f1': 0.9151228430011014, 'eval_precision': 0.9146038762143631, 'eval_recall': 0.9157023190524742, 'eval_runtime': 16.5837, 'eval_samples_per_second': 406.122, 'epoch': 2.36, 'run': 'pretrained/roberta-base-sst2-ORIG+INVSIB'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to b

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-b89985fedfbb9e0f.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.485,0.388339,0.873144,25.0855,241.654
4000,0.3734,0.347085,0.904322,25.0867,241.642
6000,0.3731,0.379388,0.91455,25.081,241.697
8000,0.37,0.352828,0.91422,25.0786,241.72
10000,0.3671,0.37359,0.899373,25.0419,242.074
12000,0.3692,0.442638,0.885846,25.0247,242.241
14000,0.3728,0.341575,0.896734,24.9908,242.569
16000,0.3463,0.448308,0.850544,24.9555,242.913
18000,0.3407,0.328496,0.918674,24.9895,242.581
20000,0.3319,0.344003,0.919169,25.0756,241.749


ORIG for pretrained/roberta-base-sst2-ORIG+TextMix
{'eval_loss': 3.7596583366394043, 'eval_accuracy': 0.9478841870824053, 'eval_f1': 0.9470961333137561, 'eval_precision': 0.9471789245912419, 'eval_recall': 0.9470144320544618, 'eval_runtime': 16.5743, 'eval_samples_per_second': 406.353, 'epoch': 10.0, 'run': 'pretrained/roberta-base-sst2-ORIG+TextMix'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to b

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-b89985fedfbb9e0f.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.4452,0.323766,0.740683,42.1816,215.568
4000,0.3264,0.34934,0.736903,42.1336,215.814
6000,0.3128,0.292594,0.782381,42.1266,215.849
8000,0.3051,0.271406,0.787195,42.1179,215.894
10000,0.3029,0.283517,0.782676,42.1334,215.815
12000,0.2966,0.291495,0.776922,42.1049,215.961
14000,0.298,0.258915,0.785981,42.1108,215.931
16000,0.301,0.275139,0.776525,42.1307,215.828
18000,0.2961,0.281267,0.789144,42.1161,215.903
20000,0.2984,0.281268,0.790901,42.1329,215.817


ORIG for pretrained/roberta-base-sst2-ORIG+SentMix
{'eval_loss': 1.9896941184997559, 'eval_accuracy': 0.9358574610244988, 'eval_f1': 0.9350498827758339, 'eval_precision': 0.9341799024303312, 'eval_recall': 0.9361059927053919, 'eval_runtime': 16.5316, 'eval_samples_per_second': 407.401, 'epoch': 2.69, 'run': 'pretrained/roberta-base-sst2-ORIG+SentMix'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to b

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-b89985fedfbb9e0f.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.3932,0.28107,0.602371,52.8324,229.462
4000,0.2964,0.273747,0.618054,52.7772,229.702
6000,0.2822,0.246531,0.640134,52.8119,229.551
8000,0.2734,0.260096,0.639924,52.8424,229.418
10000,0.2756,0.252496,0.644632,52.8109,229.555
12000,0.2708,0.280762,0.618442,52.8271,229.485
14000,0.2676,0.241989,0.6418,52.8381,229.437
16000,0.2638,0.276112,0.634867,53.0654,228.454
18000,0.2565,0.251022,0.643711,52.9341,229.021
20000,0.264,0.240891,0.638753,52.6951,230.059


ORIG for pretrained/roberta-base-sst2-ORIG+WordMix
{'eval_loss': 3.0437018871307373, 'eval_accuracy': 0.9293244246473645, 'eval_f1': 0.9282264835871024, 'eval_precision': 0.9284875181185134, 'eval_recall': 0.9279763941952446, 'eval_runtime': 16.5389, 'eval_samples_per_second': 407.221, 'epoch': 1.04, 'run': 'pretrained/roberta-base-sst2-ORIG+WordMix'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Loading cached split indices for dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-82065d53de91e6a2.arrow and C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-4c7cdb0a6bd6d7fd.arrow
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceCl

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.4714,0.356708,0.903332,0.902733,0.901624,0.905965,11.8893,254.936
4000,0.3651,0.335287,0.91488,0.914263,0.913014,0.916946,11.8773,255.194
6000,0.3444,0.40729,0.916529,0.915478,0.915386,0.915572,11.9231,254.212
8000,0.3711,0.417115,0.893435,0.892762,0.891679,0.895865,11.8962,254.787
10000,0.3613,0.439453,0.875619,0.873799,0.874542,0.873159,11.9022,254.659
12000,0.403,0.400613,0.838667,0.838627,0.854139,0.851223,11.9451,253.745
14000,0.4222,0.456384,0.840977,0.838938,0.838938,0.838938,11.9301,254.063
16000,0.5111,0.690142,0.556582,0.358249,0.778218,0.500372,11.8892,254.937
18000,0.6966,0.688048,0.556252,0.357431,0.278126,0.5,11.8723,255.301
20000,0.694,0.702018,0.556252,0.357431,0.278126,0.5,11.8783,255.172


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+ORIG
{'eval_loss': 0.3966030478477478, 'eval_accuracy': 0.9184855233853007, 'eval_f1': 0.9172037132623617, 'eval_precision': 0.9175418485411704, 'eval_recall': 0.9168842113402947, 'eval_runtime': 26.6179, 'eval_samples_per_second': 253.026, 'epoch': 3.61, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+ORIG'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-0617f0b0800fe559.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
2000,0.5697,0.387528,0.834708,0.829872,0.840915,0.826113,188.8887,32.093
4000,0.4117,0.339949,0.854668,0.851719,0.856567,0.849218,188.7274,32.12
6000,0.4053,0.356635,0.839327,0.832797,0.853931,0.827704,188.6936,32.126
8000,0.4041,0.366489,0.854833,0.85206,0.856134,0.849823,188.6847,32.128
10000,0.4133,0.458004,0.838337,0.835157,0.839412,0.832933,188.6967,32.126
12000,0.4221,0.47117,0.827285,0.819159,0.84574,0.814005,188.7055,32.124
14000,0.4925,0.499205,0.790663,0.779551,0.809228,0.775646,188.6508,32.133
16000,0.4933,0.570057,0.736556,0.704413,0.799621,0.709771,188.6345,32.136
18000,0.5381,0.583486,0.706203,0.66325,0.777151,0.676176,188.5703,32.147
20000,0.6058,0.664222,0.663312,0.597644,0.750014,0.627838,188.5115,32.157


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+INV
{'eval_loss': 0.2776615619659424, 'eval_accuracy': 0.9095768374164811, 'eval_f1': 0.9085893819342337, 'eval_precision': 0.9073790564335193, 'eval_recall': 0.9103651653816853, 'eval_runtime': 26.4643, 'eval_samples_per_second': 254.494, 'epoch': 1.94, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+INV'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-0617f0b0800fe559.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.6217,0.515871,0.774126,91.2093,66.463
4000,0.5219,0.499466,0.77967,91.2372,66.442
6000,0.5001,0.443831,0.814927,91.2874,66.406
8000,0.4967,0.517605,0.797418,91.2672,66.42
10000,0.5084,0.556405,0.818988,91.3314,66.374
12000,0.5043,0.516226,0.811517,91.3132,66.387
14000,0.5246,0.478876,0.829072,91.2341,66.444
16000,0.5005,0.529117,0.818593,91.2167,66.457
18000,0.5562,0.546141,0.765836,91.237,66.442
20000,0.5476,0.653192,0.718221,91.1867,66.479


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+SIB
{'eval_loss': 1.9360039234161377, 'eval_accuracy': 0.9048255382331106, 'eval_f1': 0.9032405252138185, 'eval_precision': 0.903983997692196, 'eval_recall': 0.9025814843826074, 'eval_runtime': 26.5381, 'eval_samples_per_second': 253.786, 'epoch': 2.36, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+SIB'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-0617f0b0800fe559.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.6057,0.472485,0.823622,188.4705,32.164
4000,0.4931,0.466087,0.8375,188.2461,32.203
6000,0.4789,0.446362,0.832852,188.163,32.217
8000,0.4704,0.450005,0.8263,188.2441,32.203
10000,0.4691,0.530234,0.838073,188.3141,32.191
12000,0.4629,0.420377,0.843609,188.4283,32.171
14000,0.4934,0.482716,0.826287,188.2924,32.195
16000,0.4667,0.461281,0.831581,188.4472,32.168
18000,0.4814,0.482679,0.827402,188.2194,32.207
20000,0.4988,0.531757,0.807617,188.2211,32.207


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+INVSIB
{'eval_loss': 2.087074041366577, 'eval_accuracy': 0.9236822568671121, 'eval_f1': 0.9226847571189281, 'eval_precision': 0.9219906083708108, 'eval_recall': 0.9234930258017768, 'eval_runtime': 26.6206, 'eval_samples_per_second': 253.0, 'epoch': 2.22, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+INVSIB'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-0617f0b0800fe559.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.5376,0.379835,0.854339,52.1539,116.233
4000,0.3958,0.369713,0.872979,52.045,116.476
6000,0.3735,0.368484,0.885681,51.8578,116.896
8000,0.3601,0.322444,0.899208,51.8987,116.804
10000,0.3592,0.349202,0.89261,51.9981,116.581
12000,0.3496,0.394981,0.884856,51.9646,116.656
14000,0.3572,0.317636,0.905807,52.0974,116.359
16000,0.3338,0.339281,0.897394,52.1279,116.291
18000,0.3301,0.371489,0.91257,52.0062,116.563
20000,0.3285,0.330251,0.913065,52.1655,116.207


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+TextMix
{'eval_loss': 4.150676250457764, 'eval_accuracy': 0.9518930957683742, 'eval_f1': 0.9511852848332614, 'eval_precision': 0.9510878462701455, 'eval_recall': 0.9512842981910976, 'eval_runtime': 26.6727, 'eval_samples_per_second': 252.505, 'epoch': 10.0, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+TextMix'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-0617f0b0800fe559.arrow





W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.4916,0.350863,0.715953,100.5134,90.466
4000,0.3544,0.389889,0.712904,100.6312,90.36
6000,0.332,0.326924,0.760228,100.7063,90.292
8000,0.3082,0.294704,0.771255,100.652,90.341
10000,0.3018,0.282394,0.779281,100.6943,90.303
12000,0.2961,0.29179,0.75894,100.7957,90.212
14000,0.2905,0.27474,0.771956,100.6954,90.302
16000,0.2998,0.261266,0.772919,100.7958,90.212
18000,0.2906,0.268101,0.784646,100.779,90.227
20000,0.2958,0.281862,0.774433,100.8256,90.185


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+SentMix
{'eval_loss': 2.606091260910034, 'eval_accuracy': 0.9291759465478842, 'eval_f1': 0.9277320216590118, 'eval_precision': 0.9306033664763365, 'eval_recall': 0.9256503338708106, 'eval_runtime': 26.574, 'eval_samples_per_second': 253.443, 'epoch': 1.76, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+SentMix'}


Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Loading cached processed dataset at C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4\cache-0617f0b0800fe559.arrow
W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.





Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
2000,0.4321,0.298179,0.59613,107.0784,113.216
4000,0.3195,0.330469,0.575991,106.7826,113.53
6000,0.3005,0.262498,0.622632,106.565,113.762
8000,0.288,0.291122,0.628319,106.3787,113.961
10000,0.2759,0.242179,0.63248,106.3704,113.97
12000,0.2724,0.257062,0.643878,106.3351,114.007
14000,0.2613,0.235138,0.645365,106.7356,113.58
16000,0.2556,0.261517,0.643793,106.6244,113.698
18000,0.2488,0.236509,0.638936,106.6641,113.656
20000,0.252,0.241187,0.644712,106.7883,113.524


ORIG for pretrained/xlnet-base-cased-sst2-ORIG+WordMix
{'eval_loss': 2.8750319480895996, 'eval_accuracy': 0.9365998515219005, 'eval_f1': 0.9357062694840792, 'eval_precision': 0.9353618379608855, 'eval_recall': 0.9360734147043492, 'eval_runtime': 26.6857, 'eval_samples_per_second': 252.383, 'epoch': 1.53, 'run': 'pretrained/xlnet-base-cased-sst2-ORIG+WordMix'}


In [6]:
df = pd.DataFrame(results)
df

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,epoch,run
0,0.282163,0.938382,0.937341,0.938265,0.936529,16.9324,397.758,7.5,pretrained/bert-base-uncased-sst2-ORIG+ORIG
1,0.251824,0.925761,0.924765,0.924184,0.92542,17.0465,395.095,1.94,pretrained/bert-base-uncased-sst2-ORIG+INV
2,3.599298,0.943133,0.942239,0.942597,0.9419,17.0787,394.35,9.31,pretrained/bert-base-uncased-sst2-ORIG+SIB
3,2.637452,0.933482,0.932556,0.932146,0.933,16.9467,397.422,4.17,pretrained/bert-base-uncased-sst2-ORIG+INVSIB
4,5.396307,0.947884,0.947161,0.946722,0.947636,16.984,396.55,10.0,pretrained/bert-base-uncased-sst2-ORIG+TextMix
5,5.498501,0.949963,0.949276,0.94878,0.949819,16.9856,396.512,10.0,pretrained/bert-base-uncased-sst2-ORIG+SentMix
6,4.707505,0.945212,0.944412,0.944269,0.944558,16.9568,397.186,7.64,pretrained/bert-base-uncased-sst2-ORIG+WordMix
7,0.384818,0.910913,0.91033,0.909143,0.914591,16.5429,407.124,3.33,pretrained/roberta-base-sst2-ORIG+ORIG
8,0.283746,0.908686,0.906146,0.913948,0.902003,16.495,408.305,1.81,pretrained/roberta-base-sst2-ORIG+INV
9,1.91738,0.912101,0.910557,0.911763,0.909545,16.5944,405.859,2.22,pretrained/roberta-base-sst2-ORIG+SIB


In [7]:
df.to_csv('train_SST2_r3.csv')

In [8]:
df.to_clipboard(excel=True)