This is a development scratchpad for the targeted mixture mutations (TextMix, SentMix and WordMix).

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import itertools
from datasets import load_dataset

In [4]:
def concat_text(np_char1, np_char2):
    """Join two arrays of text element-wise with a single space in between.

    Both inputs are cast to NumPy byte strings, so the result is a
    byte-string array.

    Args:
        np_char1: array-like of leading text fragments.
        np_char2: array-like of trailing text fragments; must broadcast
            against ``np_char1``.

    Returns:
        Byte-string array whose element ``i`` is
        ``np_char1[i] + b" " + np_char2[i]``.
    """
    # np.bytes_ is the canonical name of the type that np.string_ used to
    # alias; the alias no longer exists in NumPy 2.0.
    left = np.asarray(np_char1).astype(np.bytes_)
    right = np.asarray(np_char2).astype(np.bytes_)
    # The scalar separator broadcasts, so no separator array has to be built.
    return np.char.add(np.char.add(left, b" "), right)

In [5]:
def find_value_idx(array, value):
    """Return the first-axis indices at which ``array`` equals ``value``."""
    is_match = np.asarray(array == value)
    return np.nonzero(is_match)[0]

def find_other_idx(array, value):
    """Return the first-axis indices at which ``array`` differs from ``value``."""
    is_other = np.asarray(array != value)
    return np.nonzero(is_other)[0]

In [6]:
def one_hot_encode(y, nb_classes):
    """One-hot encode integer class labels.

    Args:
        y: integer label(s). An ndarray is used as is; anything else is
            converted to an array and given an extra leading axis.
        nb_classes: number of classes, i.e. the length of each one-hot row.

    Returns:
        Float array of shape ``labels.shape + (nb_classes,)`` where
        ``labels`` is ``y`` after the conversion described above.
    """
    labels = y if isinstance(y, np.ndarray) else np.expand_dims(np.array(y), 0)
    # Rows of the identity matrix are the one-hot vectors; pick one per label.
    identity = np.eye(nb_classes)
    flat_rows = identity[np.array(labels).reshape(-1)]
    out_shape = list(labels.shape) + [nb_classes]
    return flat_rows.reshape(out_shape)

In [7]:
# Load the AG News test split and keep its first 100 texts and labels as a
# small (texts, labels) batch for developing the transformation below.
dataset = load_dataset('ag_news', split='test')
batch = (dataset['text'][:100], dataset['label'][:100])

Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)


# Transformation

In [8]:
# Threshold compared against a uniform random draw for every pair; a pair is
# transformed when the draw falls below it (1.0 => always).
target_prob = 1.0
# (source_class, target_class) pairs whose examples get mixed together.
target_pairs = [(0,1), (1,2), (2,3), (3,0)]
# Width of the one-hot / soft label vectors.
num_classes = 4

In [9]:
# unpack batch
data, targets = batch
batch_size = len(data)

# convert to numpy if not already
if isinstance(data, list):
    # np.bytes_ replaces the np.string_ alias, which is gone in NumPy 2.0
    data = np.array(data, dtype=np.bytes_)
if isinstance(targets, list):
    # a list of per-example label arrays is stacked into one 2-D array;
    # the truthiness check keeps an empty label list from raising IndexError
    if targets and isinstance(targets[0], np.ndarray):
        targets = np.stack(targets)
    else:
        targets = np.array(targets)

In [10]:
def concat_labels(source_text, 
                  target_text, 
                  source_labels, 
                  target_labels, 
                  num_classes):
    """Build length-weighted soft labels for concatenated text pairs.

    For every pair the source class receives weight
    ``lam = len(source) / (len(source) + len(target))`` and the target class
    receives ``1 - lam``; the two weighted one-hot encodings are summed.

    Args:
        source_text: array of source texts (used only for their lengths).
        target_text: array of target texts, aligned with ``source_text``.
        source_labels: class labels of the source texts.
        target_labels: class labels of the target texts.
        num_classes: number of classes in the one-hot encoding.

    Returns:
        Array of soft labels, one row per pair
        (presumably ``(n, num_classes)`` for 1-D integer labels -- TODO confirm
        for other label layouts).
    """
    # create soft target labels 
    source_ohe = one_hot_encode(source_labels, num_classes)
    target_ohe = one_hot_encode(target_labels, num_classes)

    # Inspect the labels that were actually passed in. The previous version
    # looked at the notebook-global `targets`, which made the function depend
    # on whatever that name happened to hold in the kernel.
    if np.shape(source_labels)[-1:] == (1,):
        source_cls = source_labels
        target_cls = target_labels
    else:
        source_cls = np.argmax(source_ohe, axis=1)
        target_cls = np.argmax(target_ohe, axis=1)

    # calculate length of each data and use that
    # to determine the lambda weight assigned to
    # the index for the target
    # NOTE(review): a pair of two empty texts yields 0/0 here (NaN weight).
    len_data_source = np.char.str_len(source_text)
    len_data_target = np.char.str_len(target_text)
    lam = len_data_source / (len_data_source + len_data_target)

    idx_ = np.arange(len(source_ohe))

    # scale only the "hot" entry of each row
    source_ohe[idx_, source_cls] *= lam
    target_ohe[idx_, target_cls] *= 1-lam

    ohe_targets = source_ohe + target_ohe

    return ohe_targets

In [11]:
# track indices for targeted cutmix to exclude later
idx = [x for x in range(batch_size)]
ex_idx = []

new_data = []
new_targets = []

# transform targeted pairings
for pair in target_pairs:
    
    # apply the targeted transformation with probability target_prob
    # (i.e. skip the pair whenever the uniform draw is >= target_prob)
    use_targets = np.random.uniform() < target_prob
    if not use_targets:
        continue
        
    # unpack source and target pairs
    source_class, target_class = pair

    # find indices of both source and target 
    s_idx = find_value_idx(targets, source_class)
    t_idx = find_value_idx(targets, target_class)
    
    # if none of the source or target classes are in this batch, skip it
    if len(s_idx) == 0 or len(t_idx) == 0:
        continue
        
    # enforce source==target array size via sampling
    tt_idx = np.random.choice(np.arange(len(t_idx)), size=len(s_idx), replace=True)
    t_idx = t_idx[tt_idx]
    
    # create concatenated data
    textmix = concat_text(data[s_idx], data[t_idx])
    ohe_targets = concat_labels(data[s_idx], 
                                data[t_idx], 
                                targets[s_idx],
                                targets[t_idx],
                                num_classes)
    
    new_data.append(textmix)
    new_targets.append(ohe_targets)
    ex_idx.append(s_idx.tolist())
    
ex_idx = list(itertools.chain(*ex_idx))
# set membership keeps the exclusion linear in the batch size
ex_idx_set = set(ex_idx)
s_idx = [i for i in idx if i not in ex_idx_set]

if s_idx:
    # Mix the remaining examples with partners drawn from those same
    # examples. The partners must be real batch indices: sampling
    # arange(len(s_idx)) and indexing `data` with it (as before) silently
    # picked the first len(s_idx) rows of the batch instead.
    # Sampling inside the guard also avoids calling choice() on an empty set.
    t_idx = np.random.choice(s_idx, size=len(s_idx), replace=True)

    textmix = concat_text(data[s_idx], data[t_idx])
    ohe_targets = concat_labels(data[s_idx], 
                                data[t_idx], 
                                targets[s_idx],
                                targets[t_idx],
                                num_classes)

    new_data.append(textmix)
    new_targets.append(ohe_targets)
else:
    # keep the name bound, as it was before, when nothing is left to mix
    t_idx = np.array([], dtype=int)
    
new_data = np.concatenate(new_data)
new_targets = np.concatenate(new_targets)

In [4]:
from transforms import TextMix, SentMix, WordMix



In [None]:
# One instance of each mixture transform, built with default arguments.
tm = TextMix()
sm = SentMix()
wm = WordMix()

In [None]:
# Run the same development batch through each transform with identical
# targeting settings so the three outputs can be compared side by side.
new_batch1 = tm(
    batch=batch,
    target_pairs=[(0,1), (1,2), (2,3), (3,0)], 
    target_prob=1
  )

new_batch2 = sm(
    batch=batch,
    target_pairs=[(0,1), (1,2), (2,3), (3,0)], 
    target_prob=1
  )


new_batch3 = wm(
    batch=batch,
    target_pairs=[(0,1), (1,2), (2,3), (3,0)], 
    target_prob=1
  )

In [None]:
# Sanity check: shapes of element 0 and element 1 of every transformed batch
# (presumably the mixed texts and their labels -- TODO confirm).
print(new_batch1[0].shape, new_batch1[1].shape)
print(new_batch2[0].shape, new_batch2[1].shape)
print(new_batch3[0].shape, new_batch3[1].shape)

# Target Training

### Native Approach

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, AdamW
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from transforms import TextMix, SentMix, WordMix

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



In [4]:
class TargetedMixturesCollator:
    """Collate function that applies a mixture transform to every batch.

    Args:
        transform: callable invoked as
            ``transform(batch, target_pairs, target_prob, num_classes)``
            where ``batch`` is a ``(texts, labels)`` tuple.
        target_pairs: class pairs forwarded to the transform. Defaults to a
            fresh empty list per instance.
        target_prob: probability setting forwarded to the transform.
        num_classes: number of classes forwarded to the transform.
    """
    def __init__(self, transform, target_pairs=None, target_prob=1.0, num_classes=4):
        self.transform = transform
        # A literal [] default would be one list shared by every instance;
        # create a new one instead so instances cannot affect each other.
        self.target_pairs = [] if target_pairs is None else target_pairs
        self.target_prob = target_prob
        self.num_classes = num_classes
        
    def __call__(self, batch):
        """Split a list of ``{'text', 'label'}`` records and transform them.

        Returns whatever the transform returns for the ``(texts, labels)``
        tuple.
        """
        text = [x['text'] for x in batch]
        labels = [x['label'] for x in batch]
        batch = (text, labels)
        batch = self.transform(
            batch, 
            self.target_pairs,   
            self.target_prob,
            self.num_classes
        )
        return batch

In [5]:
# Pretrained checkpoint used for the hand-written training loop; the
# classification head is sized for 4 labels and the model is moved to `device`.
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
# First 10% of the AG News training split -- a small slice for a quick loop.
dataset = load_dataset('ag_news', split='train[:10%]') 

Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)


In [None]:
# Hand-written training loop: batches are mixed by the collator, tokenized
# here, and passed to the model together with their labels.
model.train()

# The collator applies TextMix with its default targeting settings.
train_loader = DataLoader(
    dataset, 
    batch_size=2, 
    shuffle=True,
    collate_fn=TargetedMixturesCollator(TextMix())
)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        
        data, labels = batch
        # presumably the transform returns byte strings, hence the decode
        # before tokenizing -- TODO confirm against the transforms module
        data = [x.decode() for x in data]
        data = tokenizer(data, padding=True, truncation=True, max_length=250, return_tensors='pt')
        
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        # NOTE(review): if the transform emits soft-label rows these become a
        # floating-point tensor; confirm the model's built-in loss accepts it.
        labels = torch.tensor(labels).to(device)
        
        # debug output: tensor shapes of the current batch
        print(input_ids.shape, attention_mask.shape, labels.shape)
        
        optim.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # first output element is used as the loss (labels were supplied)
        loss = outputs[0]
        loss.backward()
        optim.step()

# model.eval()

### Trainer Approach

In [4]:
%reload_ext autoreload
%autoreload 2

In [5]:
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments, 
    TrainerCallback, 
    EarlyStoppingCallback
)
from transformers.trainer_callback import TrainerControl
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from transforms import TextMix, SentMix, WordMix

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# Pretrained checkpoint for the Trainer-based approach; the classification
# head is sized for 4 labels and the model is moved to `device`.
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
def tokenize(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer``.

    Padding and truncation are enabled, sequences are capped at 250 tokens,
    and PyTorch tensors are requested.
    """
    encode_options = dict(
        padding=True,
        truncation=True,
        max_length=250,
        return_tensors='pt',
    )
    return tokenizer(text, **encode_options)

def acc_at_k(y_true, y_pred, k=2):
    """Weighted rank-wise agreement between the top-k of targets and predictions.

    The top-k entries of ``y_true`` and ``y_pred`` are taken along the last
    axis. Wherever the class index at a given rank is the same in both, the
    true weight at that rank is counted. The total is divided by
    ``len(y_true)``.

    Args:
        y_true: soft targets; converted with ``torch.tensor`` unless already
            a ``torch.Tensor``.
        y_pred: model scores; converted the same way.
        k: number of top entries compared.

    Returns:
        The score as a Python float.
    """
    if type(y_true) != torch.Tensor:
        y_true = torch.tensor(y_true)
    if type(y_pred) != torch.Tensor:
        y_pred = torch.tensor(y_pred)

    true_weights, true_ranked = torch.topk(y_true, k=k, dim=-1)
    pred_ranked = torch.topk(y_pred, k=k, dim=-1)[1]

    # a match at rank r contributes the true weight found at rank r
    weighted_hits = (true_ranked == pred_ranked) * true_weights
    score = weighted_hits.sum() / len(y_true)
    return score.item()

def CEwST_loss(logits, target, reduction='mean'):
    """
    Cross Entropy with Soft Target (CEwST) Loss

    :param logits: (batch, *) raw scores; flattened to (batch, -1) before the softmax.
    :param target: (batch, *) same shape as logits, each item must be a valid
        distribution: target[i, :].sum() == 1.
    :param reduction: 'none' returns the per-example losses, 'mean' their
        average, 'sum' their total.
    :raises NotImplementedError: for any other reduction value.
    """
    n_examples = logits.shape[0]
    flat_logits = logits.view(n_examples, -1)
    flat_target = target.view(target.shape[0], -1)

    log_probs = torch.log_softmax(flat_logits, dim=1)
    per_example = -(flat_target * log_probs).sum(dim=1)

    if reduction == 'mean':
        return per_example.mean()
    if reduction == 'sum':
        return per_example.sum()
    if reduction == 'none':
        return per_example
    raise NotImplementedError('Unsupported reduction mode.')

def compute_metrics_w_soft_target(pred):
    """Metrics callback for soft targets: reports ``acc_at_k`` with ``k=2``.

    ``pred`` must expose ``label_ids`` and ``predictions``; the score is
    returned under the key ``'accuracy'``.
    """
    top2_accuracy = acc_at_k(pred.label_ids, pred.predictions, k=2)
    return {'accuracy': top2_accuracy}

class MyTrainer(Trainer):
    """Trainer that optimizes cross entropy against soft (probability) targets."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the soft-target cross entropy for one batch.

        Args:
            model: model to run.
            inputs: batch dict; its ``"labels"`` entry is removed and used as
                the soft targets, the rest is passed to the model.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted (and ignored) because newer Trainer
                versions pass it to ``compute_loss``; without the parameter
                those versions fail with a TypeError.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # the model is called without labels, so the first output element is
        # presumably the logits -- TODO confirm for every model family used
        logits = outputs[0]
        loss = CEwST_loss(logits, labels)
        if return_outputs:
            return loss, outputs
        return loss

class TargetedMixturesCallback(TrainerCallback):
    """
    A callback that calculates a confusion matrix on a held-out dataloader
    after every evaluation and selects, for each class, the class it is most
    often confused with.

    The selected ``[class, most_confused_class]`` pairs are printed and
    attached to the ``control`` object as ``new_targets``.
    """
    def __init__(self, dataloader, device):
        # dataloader: yields dicts with 'text' and 'label' entries
        # device: torch device the model inputs are moved to
        self.dataloader = dataloader
        self.device = device
        
    def on_evaluate(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Recompute the most-confused class pairs after an evaluation."""
        if tokenizer is None:
            # newer Trainer versions hand the tokenizer over as
            # `processing_class` instead of `tokenizer`
            tokenizer = kwargs.get("processing_class")
        cnf_mat = self.get_confusion_matrix(model, tokenizer, self.dataloader)
        new_targets = self.get_most_confused_per_class(cnf_mat)
        print("Selecting new targets:", new_targets)
        # Annotate the control *instance* we were given. Rebinding `control`
        # to the TrainerControl class (as before) mutated the class itself
        # and handed the Trainer a class instead of its control object.
        control.new_targets = new_targets
        return control
        
    def get_confusion_matrix(self, model, tokenizer, dataloader, normalize=True):
        """Count (true class, predicted class) occurrences over ``dataloader``.

        Rows are true classes and columns are predicted classes. With
        ``normalize`` each column is divided by its total.
        """
        n_classes = max(dataloader.dataset['label']) + 1
        confusion_matrix = torch.zeros(n_classes, n_classes)
        with torch.no_grad():
            for batch in dataloader:
                data, targets = batch['text'], batch['label']
                data = tokenizer(data, padding=True, truncation=True, max_length=250, return_tensors='pt')
                input_ids = data['input_ids'].to(self.device)
                attention_mask = data['attention_mask'].to(self.device)
                outputs = model(input_ids, attention_mask=attention_mask).logits
                preds = torch.argmax(outputs, dim=1).cpu()
                # the matrix lives on the CPU, so index it with CPU tensors
                for t, p in zip(targets.view(-1).cpu(), preds.view(-1)):
                    confusion_matrix[t.long(), p.long()] += 1    
        if normalize:
            # NOTE(review): this normalizes per predicted class (columns);
            # confirm that is intended rather than per true class (rows).
            # clamp keeps never-predicted classes from producing 0/0 = NaN.
            column_totals = confusion_matrix.sum(dim=0).clamp(min=1)
            confusion_matrix = confusion_matrix / column_totals
        return confusion_matrix

    def get_most_confused_per_class(self, confusion_matrix):
        """Return ``[[cls, argmax of row cls ignoring the diagonal], ...]``."""
        # work on a copy so the caller's matrix keeps its diagonal
        off_diagonal = confusion_matrix.clone().fill_diagonal_(0)
        idx = torch.arange(len(off_diagonal))
        cnf = off_diagonal.max(dim=1)[1]
        return torch.stack((idx, cnf)).T.tolist()

class TargetedMixturesCollator:
    """Collate function that mixes a batch and then tokenizes it.

    Args:
        tokenize_fn: callable mapping a list of strings to a dict-like batch
            of model inputs.
        transform: callable invoked as
            ``transform(batch, target_pairs, target_prob, num_classes)``
            where ``batch`` is a ``(texts, labels)`` tuple; it must return
            ``(texts, labels)`` with texts that support ``.decode()``.
        target_pairs: class pairs forwarded to the transform. Defaults to a
            fresh empty list per instance.
        target_prob: probability setting forwarded to the transform.
        num_classes: number of classes forwarded to the transform.
    """
    def __init__(self, tokenize_fn, transform, target_pairs=None, target_prob=1.0, num_classes=4):
        self.tokenize_fn = tokenize_fn
        self.transform = transform
        # A literal [] default would be one list shared by every instance;
        # create a new one instead so instances cannot affect each other.
        self.target_pairs = [] if target_pairs is None else target_pairs
        self.target_prob = target_prob
        self.num_classes = num_classes
        print("TargetedMixturesCollator initialized with {}".format(transform.__class__.__name__))
        
    def __call__(self, batch):
        """Transform a list of ``{'text', 'label'}`` records into model inputs.

        Returns the tokenized batch with the transformed labels stored under
        ``'labels'`` as a tensor.
        """
        text = [x['text'] for x in batch]
        labels = [x['label'] for x in batch]
        batch = (text, labels)
        batch = self.transform(
            batch, 
            self.target_pairs,   
            self.target_prob,
            self.num_classes
        )
        text, labels = batch
        # the transform's texts are decoded to str before tokenizing
        batch = self.tokenize_fn([x.decode() for x in text])
        batch['labels'] = torch.tensor(labels)
        return batch

In [14]:
# Data layout for the Trainer experiments:
#   - first 97% of the AG News train split -> shuffled 95/5 train/eval split
#   - last 3% of the train split           -> targ_dataset (held out; used by
#                                             the targeting callback below)
#   - official test split                  -> test_dataset
# NOTE(review): train_test_split is called without a seed, so the split
# differs between runs.
dataset = load_dataset('ag_news', split='train[:97%]') 
dataset_dict = dataset.train_test_split(
    test_size = 0.05,
    train_size = 0.95,
    shuffle = True
)
train_dataset = dataset_dict['train']
eval_dataset = dataset_dict['test']
targ_dataset = load_dataset('ag_news', split='train[97%:]')
test_dataset = load_dataset('ag_news', split='test') 

Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)
Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)
Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)


In [11]:
# Experiment grid: every checkpoint is paired with every mixture transform.
MODEL_NAMES = ['bert-base-uncased', 'roberta-base', 'xlnet-base-cased']
ts = [TextMix(), SentMix(), WordMix()]

In [None]:
results = []

# hyperparameters shared by every run (hoisted out of the loops)
train_batch_size = 8
eval_batch_size = 32
num_epoch = 3
gradient_accumulation_steps = 1
max_steps = int((len(train_dataset) * num_epoch / gradient_accumulation_steps) / train_batch_size)

for MODEL_NAME in MODEL_NAMES:
    for t in ts: 
        t_str = t.__class__.__name__

        # Start every run from the pretrained checkpoint named by MODEL_NAME.
        # Previously the single notebook-level BERT model and tokenizer were
        # reused for all runs, so every "model" was the same, already
        # fine-tuned network. `tokenize` reads the global `tokenizer`, so
        # rebinding it here is what the collator will pick up.
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4).to(device)

        tmcb = TargetedMixturesCallback(
            dataloader=DataLoader(targ_dataset, batch_size=32),
            device=device
        )
        escb = EarlyStoppingCallback(
            early_stopping_patience=10
        )
        tmc = TargetedMixturesCollator(
            tokenize_fn=tokenize, 
            transform=t,
            target_prob=0.5
        )

        training_args = TrainingArguments(
            output_dir='./results/' + MODEL_NAME + '-targeted-' + t_str,
            overwrite_output_dir=True,
            max_steps=max_steps,
            save_steps=int(max_steps / 10),
            save_total_limit=1,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=eval_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps, 
            warmup_steps=int(max_steps / 10),
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=1000,
            logging_first_step=True,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            evaluation_strategy="steps",
            remove_unused_columns=False,
            # label_names lists the *input keys* that hold labels (the
            # collator stores them under 'labels'), not the class names.
            label_names=['labels']
        )

        trainer = MyTrainer(
            model=model, 
            tokenizer=tokenizer,
            args=training_args,
            compute_metrics=compute_metrics_w_soft_target,                  
            train_dataset=train_dataset,         
            eval_dataset=eval_dataset,
            data_collator=tmc,
            callbacks=[tmcb, escb]
        )

        trainer.train()

        # test with ORIG data
        # NOTE(review): `compute_metrics` is only defined in a later cell, so
        # this fails on a fresh kernel. The test set also still passes through
        # the mixing collator `tmc` -- confirm that this is really "ORIG" data.
        trainer.compute_metrics = compute_metrics
        trainer.eval_dataset = test_dataset
        out_orig = trainer.evaluate()
        out_orig['model'] = MODEL_NAME
        out_orig['run'] = t_str
        out_orig['test'] = "ORIG"
        # `checkpoint` was never defined; the run is identified by MODEL_NAME
        print('ORIG for {}\n{}'.format(MODEL_NAME, out_orig))

        results.append(out_orig)

TargetedMixturesCollator initialized with TextMix


Step,Training Loss,Validation Loss


In [4]:
def tokenize(batch):
    """Tokenize the ``'text'`` column of a dataset batch.

    Uses the notebook-level ``tokenizer`` with padding and truncation enabled.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)

# Baseline data: 5% of the AG News train split plus the full test split,
# tokenized up front. Each split is mapped as one single batch
# (batch_size=len(...)).
train_dataset, test_dataset = load_dataset('ag_news', split=['train[:5%]', 'test'])
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
# Expose only the model inputs and the label column, as torch tensors.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
def compute_metrics(pred):
    """Accuracy and macro-averaged precision/recall/F1 for hard labels.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        Dict with ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    # NOTE(review): precision_recall_fscore_support and accuracy_score are
    # not imported in any visible cell; they must be in scope before this runs.
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # AG News has four classes; average='binary' is only valid for binary
    # targets, so average over the classes instead.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Baseline (no mixing) setup with the stock Trainer on the pre-tokenized
# datasets; the test split doubles as the evaluation set here.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# NOTE(review): `model` is whatever the kernel currently holds under that
# name -- confirm it is a freshly loaded checkpoint before training.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# trainer.train()

{'labels': tensor([3, 0, 1, 1, 0, 2, 2, 0, 0, 3, 2, 3, 0, 0, 1, 2]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'input_ids': tensor([[  101, 19413,  2678,  ...,     0,     0,     0],
        [  101,  6643,  2229,  ...,     0,     0,     0],
        [  101, 27669,  2015,  ...,     0,     0,     0],
        ...,
        [  101,  4586, 12642,  ...,     0,     0,     0],
        [  101,  2137, 10654,  ...,     0,     0,     0],
        [  101,  2149,  1024,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Fabrice\AppData\Local\Continuum\anaconda3\envs\python38\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-73ce61d0d4be>", line 31, in <module>
    trainer.train()
  File "C:\Users\Fabrice\AppData\Local\Continuum\anaconda3\envs\python38\lib\site-packages\transformers\trainer.py", line 1122, in train
    tr_loss += self.training_step(model, inputs)
  File "C:\Users\Fabrice\AppData\Local\Continuum\anaconda3\envs\python38\lib\site-packages\transformers\trainer.py", line 1526, in training_step
    loss = self.compute_loss(model, inputs)
  File "C:\Users\Fabrice\AppData\Local\Continuum\anaconda3\envs\python38\lib\site-packages\transformers\trainer.py", line 1558, in compute_loss
    outputs = model(**inputs)
  File "C:\Users\Fabrice\AppData\Local\Continuum\anaconda3\envs\python38\lib\site-packages\torch\nn\modules\module.py", line 727, i