# Targeted SIB Training

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments, 
    TrainerCallback, 
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers.trainer_callback import TrainerControl
from datasets import load_dataset
import torch
import pandas as pd
from torch.utils.data import DataLoader
from transforms import TextMix, SentMix, WordMix

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')



In [3]:
def tokenize_fn(text):
    return tokenizer(text, padding=True, truncation=True, max_length=250, return_tensors='pt')

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=250)

def acc_at_k(y_true, y_pred, k=2):
    y_true = torch.tensor(y_true) if type(y_true) != torch.Tensor else y_true
    y_pred = torch.tensor(y_pred) if type(y_pred) != torch.Tensor else y_pred
    total = len(y_true)
    y_weights, y_idx = torch.topk(y_true, k=k, dim=-1)
    out_weights, out_idx = torch.topk(y_pred, k=k, dim=-1)
    correct = torch.sum(torch.eq(y_idx, out_idx) * y_weights)
    acc = correct / total
    return acc.item()

def CEwST_loss(logits, target, reduction='mean'):
    """
    Cross Entropy with Soft Target (CEwST) Loss
    :param logits: (batch, *)
    :param target: (batch, *) same shape as logits, each item must be a valid distribution: target[i, :].sum() == 1.
    """
    logprobs = torch.nn.functional.log_softmax(logits.view(logits.shape[0], -1), dim=1)
    batchloss = - torch.sum(target.view(target.shape[0], -1) * logprobs, dim=1)
    if reduction == 'none':
        return batchloss
    elif reduction == 'mean':
        return torch.mean(batchloss)
    elif reduction == 'sum':
        return torch.sum(batchloss)
    else:
        raise NotImplementedError('Unsupported reduction mode.')

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1.mean(),
        'precision': precision.mean(),
        'recall': recall.mean()
    }        
        
def compute_metrics_w_soft_target(pred):
    labels = pred.label_ids
    preds = pred.predictions
    acc = acc_at_k(labels, preds, k=2)
    return {
        'accuracy': acc,
    }

class TargetedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        loss = CEwST_loss(logits, labels)
        if return_outputs:
            return loss, outputs
        return loss

class TargetedMixturesCallback(TrainerCallback):
    """
    A callback that calculates a confusion matrix on the validation
    data and returns the most confused class pairings.
    """
    def __init__(self, dataloader, device):
        self.dataloader = dataloader
        self.device = device
        
    def on_evaluate(self, args, state, control, model, tokenizer, **kwargs):
        cnf_mat = self.get_confusion_matrix(model, tokenizer, self.dataloader)
        new_targets = self.get_most_confused_per_class(cnf_mat)
        print("New targets:", new_targets)
        control = TrainerControl
        control.new_targets = new_targets
        if state.global_step < state.max_steps:
            control.should_training_stop = False
        else:
            control.should_training_stop = True
        return control
        
    def get_confusion_matrix(self, model, tokenizer, dataloader, normalize=True):
        n_classes = max(dataloader.dataset['label']) + 1
        confusion_matrix = torch.zeros(n_classes, n_classes)
        with torch.no_grad():
            for batch in iter(self.dataloader):
                data, targets = batch['text'], batch['label']
                data = tokenizer(data, padding=True, truncation=True, max_length=250, return_tensors='pt')
                input_ids = data['input_ids'].to(self.device)
                attention_mask = data['attention_mask'].to(self.device)
                targets = targets.to(self.device)
                outputs = model(input_ids, attention_mask=attention_mask).logits
                preds = torch.argmax(outputs, dim=1).cpu()
                for t, p in zip(targets.view(-1), preds.view(-1)):
                    confusion_matrix[t.long(), p.long()] += 1    
            if normalize:
                confusion_matrix = confusion_matrix / confusion_matrix.sum(dim=0)
        return confusion_matrix

    def get_most_confused_per_class(self, confusion_matrix):
        idx = torch.arange(len(confusion_matrix))
        cnf = confusion_matrix.fill_diagonal_(0).max(dim=1)[1]
        return torch.stack((idx, cnf)).T.tolist()

class TargetedMixturesCollator:
    def __init__(self, tokenize_fn, transform, target_pairs=[], target_prob=1.0, num_classes=4):
        self.tokenize_fn = tokenize_fn
        self.transform = transform
        self.target_pairs = target_pairs
        self.target_prob = target_prob
        self.num_classes = num_classes
        print("TargetedMixturesCollator initialized with {}".format(transform.__class__.__name__))
        
    def __call__(self, batch):
        text = [x['text'] for x in batch]
        labels = [x['label'] for x in batch]
        batch = (text, labels)
        batch = self.transform(
            batch, 
            self.target_pairs,   
            self.target_prob,
            self.num_classes
        )
        text, labels = batch
        batch = self.tokenize_fn(text)
        batch['labels'] = torch.tensor(labels)
        return batch
    
class DefaultCollator:
    def __init__(self):
        pass
    def __call__(self, batch):
        return torch.utils.data.dataloader.default_collate(batch)

In [4]:
MODEL_NAMES = ['bert-base-uncased', 'roberta-base', 'xlnet-base-cased']
ts = [TextMix(), SentMix(), WordMix()]

In [5]:
results = []

for MODEL_NAME in MODEL_NAMES:
        
    for t in ts: 
        
        t_str = t.__class__.__name__
        checkpoint = './results/' + MODEL_NAME + '-targeted-' + t_str
        
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4).to(device)

        dataset = load_dataset('ag_news', split='train') 
        dataset_dict = dataset.train_test_split(
            test_size = 0.05,
            train_size = 0.95,
            shuffle = True
        )
        train_dataset = dataset_dict['train']
        eval_dataset = dataset_dict['test']

        test_dataset = load_dataset('ag_news', split='test') 
        test_dataset.rename_column_('label', 'labels')
        test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
        test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
        
        train_batch_size = 8
        eval_batch_size = 32
        num_epoch = 10
        gradient_accumulation_steps = 1
        max_steps = int((len(train_dataset) * num_epoch / gradient_accumulation_steps) / train_batch_size)

        tmcb = TargetedMixturesCallback(
            dataloader=DataLoader(eval_dataset, batch_size=32),
            device=device
        )
        escb = EarlyStoppingCallback(
            early_stopping_patience=10
        )
        tmc = TargetedMixturesCollator(
            tokenize_fn=tokenize_fn, 
            transform=t,
            target_prob=0.5
        )

        training_args = TrainingArguments(
            seed=1,
            output_dir=checkpoint,
            overwrite_output_dir=True,
            max_steps=max_steps,
            save_steps=int(max_steps / 10),
            save_total_limit=1,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps, 
            warmup_steps=int(max_steps / 10),
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=1000,
            logging_first_step=True,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            evaluation_strategy="steps",
            remove_unused_columns=False
        )

        trainer = TargetedTrainer(
            model=model, 
            tokenizer=tokenizer,
            args=training_args,
            compute_metrics=compute_metrics_w_soft_target,                  
            train_dataset=train_dataset,         
            eval_dataset=eval_dataset,
            data_collator=tmc,
            callbacks=[tmcb, escb]
        )

        trainer.train()

        # test with ORIG data
        trainer.eval_dataset = test_dataset
        trainer.compute_metrics = compute_metrics
        trainer.data_collator = DefaultCollator()
        trainer.remove_callback(tmcb)

        out_orig = trainer.evaluate()
        out_orig['run'] = checkpoint
        out_orig['test'] = "ORIG"
        print('ORIG for {}\n{}'.format(checkpoint, out_orig))

        results.append(out_orig)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


TargetedMixturesCollator initialized with TextMix


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1000,1.1827,0.833958,0.652891,69.2162,86.685
2000,0.8144,0.784703,0.672584,69.4867,86.347
3000,0.7778,0.782599,0.67051,68.7291,87.299
4000,0.7589,0.772537,0.701771,69.5368,86.285
5000,0.7525,0.756096,0.702339,69.6488,86.146
6000,0.7488,0.763436,0.746416,69.327,86.546
7000,0.7523,0.759252,0.735518,70.1258,85.561
8000,0.7568,0.806503,0.687324,69.3124,86.565
9000,0.7575,0.767603,0.762755,69.4513,86.391
10000,0.7321,0.749025,0.73747,69.3042,86.575


New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 

ORIG for ./results/bert-base-uncased-targeted-TextMix
{'eval_loss': 24.60407829284668, 'eval_accuracy': 0.9381578947368421, 'eval_f1': 0.9381579117190124, 'eval_precision': 0.9383738209275129, 'eval_recall': 0.9381578947368422, 'eval_runtime': 127.4248, 'eval_samples_per_second': 59.643, 'epoch': 2.67, 'run': './results/bert-base-uncased-targeted-TextMix', 'test': 'ORIG'}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


TargetedMixturesCollator initialized with SentMix


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1000,1.1746,0.863394,0.660165,66.8795,89.714
2000,0.8157,0.823462,0.623097,66.5788,90.119
3000,0.7731,0.781701,0.648245,66.9594,89.607
4000,0.7744,0.829549,0.658423,66.2593,90.553
5000,0.7718,0.787813,0.641309,67.2455,89.225
6000,0.7435,0.792298,0.650088,65.8354,91.136
7000,0.7706,0.823656,0.591991,66.7635,89.869
8000,0.7458,0.792736,0.702534,66.4505,90.293
9000,0.7437,0.794693,0.671797,66.1996,90.635
10000,0.7501,0.783926,0.680735,66.5138,90.207


New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 1], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 

ORIG for ./results/bert-base-uncased-targeted-SentMix
{'eval_loss': 23.754152297973633, 'eval_accuracy': 0.9301315789473684, 'eval_f1': 0.9300557925165926, 'eval_precision': 0.9300518992798888, 'eval_recall': 0.9301315789473684, 'eval_runtime': 127.3892, 'eval_samples_per_second': 59.66, 'epoch': 1.54, 'run': './results/bert-base-uncased-targeted-SentMix', 'test': 'ORIG'}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


TargetedMixturesCollator initialized with WordMix


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1000,1.1519,0.914932,0.620536,70.9657,84.548
2000,0.8694,0.872325,0.589514,70.9087,84.616
3000,0.8209,0.821688,0.617198,71.5446,83.864
4000,0.8168,0.8367,0.614437,65.033,92.261
5000,0.8054,0.821856,0.570331,71.5033,83.912
6000,0.7934,0.800716,0.621906,71.4852,83.934
7000,0.7823,0.801144,0.609909,69.3331,86.539
8000,0.7928,0.837964,0.613703,67.0498,89.486
9000,0.785,0.796257,0.632827,66.3448,90.437
10000,0.7947,0.817158,0.590539,64.5849,92.901


New targets: [[0, 1], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 

ORIG for ./results/bert-base-uncased-targeted-WordMix
{'eval_loss': 21.714515686035156, 'eval_accuracy': 0.9153947368421053, 'eval_f1': 0.9153811071568305, 'eval_precision': 0.9160972740976473, 'eval_recall': 0.9153947368421053, 'eval_runtime': 128.1311, 'eval_samples_per_second': 59.314, 'epoch': 3.58, 'run': './results/bert-base-uncased-targeted-WordMix', 'test': 'ORIG'}


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


TargetedMixturesCollator initialized with TextMix


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1000,1.1343,0.765374,0.716245,68.9153,87.063
2000,0.7768,0.800435,0.680535,68.4015,87.717
3000,0.7782,0.805896,0.67001,68.8644,87.128
4000,0.7767,0.800256,0.706893,68.1097,88.093
5000,0.7586,0.770295,0.704319,68.3689,87.759
6000,0.7611,0.814719,0.677127,69.0381,86.909
7000,0.7692,0.748191,0.758278,68.7181,87.313
8000,0.7537,0.774299,0.716635,68.858,87.136
9000,0.7712,0.765151,0.771367,67.8364,88.448
10000,0.7718,0.768043,0.67831,68.126,88.072


New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 0], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 0], [3, 2]]
New targets: [[0, 3], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 

ORIG for ./results/roberta-base-targeted-TextMix
{'eval_loss': 24.987064361572266, 'eval_accuracy': 0.9331578947368421, 'eval_f1': 0.9329715926506041, 'eval_precision': 0.9334148742540789, 'eval_recall': 0.9331578947368422, 'eval_runtime': 1084.046, 'eval_samples_per_second': 7.011, 'epoch': 3.23, 'run': './results/roberta-base-targeted-TextMix', 'test': 'ORIG'}


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


TargetedMixturesCollator initialized with SentMix


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1000,1.1007,0.7834,0.696285,67.1077,89.409
2000,0.7993,0.80588,0.705148,67.2569,89.21
3000,0.7666,0.787899,0.729695,67.4978,88.892
4000,0.7663,0.783725,0.674652,67.7545,88.555
5000,0.7684,0.763079,0.647183,67.3259,89.119
6000,0.7543,0.797541,0.648583,67.2116,89.27
7000,0.7536,0.794311,0.716171,67.9666,88.279
8000,0.7651,0.772531,0.671297,67.3172,89.13
9000,0.756,0.819323,0.662982,67.0499,89.486
10000,0.7438,0.82188,0.599392,67.4136,89.003


New targets: [[0, 2], [1, 0], [2, 3], [3, 0]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]


ORIG for ./results/roberta-base-targeted-SentMix
{'eval_loss': 22.001941680908203, 'eval_accuracy': 0.9157894736842105, 'eval_f1': 0.9155353226859555, 'eval_precision': 0.9155848007271494, 'eval_recall': 0.9157894736842105, 'eval_runtime': 128.1835, 'eval_samples_per_second': 59.29, 'epoch': 0.91, 'run': './results/roberta-base-targeted-SentMix', 'test': 'ORIG'}


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


TargetedMixturesCollator initialized with WordMix


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1000,1.0947,0.847293,0.62586,69.7242,86.053
2000,0.8587,0.848029,0.575985,69.5197,86.307
3000,0.8063,0.846949,0.584822,68.8248,87.178
4000,0.7968,0.85323,0.559916,68.7244,87.305
5000,0.796,0.837976,0.573284,69.2285,86.669
6000,0.8164,0.834973,0.5819,69.0656,86.874
7000,0.801,0.845411,0.605646,69.1864,86.722
8000,0.8024,0.819453,0.601753,69.5117,86.316
9000,0.8066,0.872504,0.560564,68.7708,87.246
10000,0.8095,0.846931,0.60149,69.3087,86.569


New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 0]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 0], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 1], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 

ORIG for ./results/roberta-base-targeted-WordMix
{'eval_loss': 18.879497528076172, 'eval_accuracy': 0.8980263157894737, 'eval_f1': 0.8980891878968906, 'eval_precision': 0.8988408262938834, 'eval_recall': 0.8980263157894737, 'eval_runtime': 128.3595, 'eval_samples_per_second': 59.209, 'epoch': 2.11, 'run': './results/roberta-base-targeted-WordMix', 'test': 'ORIG'}


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


TargetedMixturesCollator initialized with TextMix


Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1000,1.0999,0.787001,0.683776,156.4148,38.36
2000,0.8135,0.792994,0.616497,157.5783,38.076
3000,0.7821,0.772943,0.696355,140.2722,42.774
4000,0.7644,0.77716,0.674428,144.2825,41.585
5000,0.7584,0.775821,0.632179,142.5785,42.082
6000,0.753,0.780183,0.685985,142.348,42.15
7000,0.7504,0.784473,0.712454,142.6952,42.048
8000,0.7481,0.771263,0.670011,141.1493,42.508
9000,0.7408,0.797307,0.725105,142.259,42.177
10000,0.753,0.78144,0.734887,142.2278,42.186


New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 1], [1, 3], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 0], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 2], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 2], [2, 3], [3, 2]]
New targets: [[0, 3], [1, 3], [2, 

ORIG for ./results/xlnet-base-cased-targeted-TextMix
{'eval_loss': 26.102365493774414, 'eval_accuracy': 0.9273684210526316, 'eval_f1': 0.9271413058476453, 'eval_precision': 0.9272047321159348, 'eval_recall': 0.9273684210526316, 'eval_runtime': 265.1047, 'eval_samples_per_second': 28.668, 'epoch': 1.61, 'run': './results/xlnet-base-cased-targeted-TextMix', 'test': 'ORIG'}


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


TargetedMixturesCollator initialized with SentMix


Step,Training Loss,Validation Loss


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 6.00 GiB total capacity; 4.18 GiB already allocated; 18.44 MiB free; 4.56 GiB reserved in total by PyTorch)

In [None]:
df = pd.DataFrame(results)
df

In [None]:
df.to_csv('train_AG_NEWS_targeted_r1.csv')

In [None]:
df.to_clipboard(excel=True)

In [None]:
# ORIG for ./results/bert-base-uncased-targeted-TextMix
# {'eval_loss': 31.2364559173584, 'eval_accuracy': 0.9381578947368421, 'eval_f1': 0.9381945850526017, 'eval_precision': 0.938240633851668, 'eval_recall': 0.9381578947368421, 'eval_runtime': 117.2622, 'eval_samples_per_second': 64.812, 'epoch': 5.0, 'run': 'TextMix', 'test': 'ORIG'}