In [None]:
!nvidia-smi

Thu Apr 22 13:49:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05    Driver Version: 450.51.05    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100S-PCI...  Off  | 00000000:00:0A.0 Off |                    0 |
| N/A   40C    P0    53W / 250W |      0MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import sys
# Colab-only bootstrap: the installs below run only when the notebook is executed
# inside Google Colab (detected through the already-imported `google.colab` module).
if 'google.colab' in sys.modules:
    !pip install -Uqq fastcore sentencepiece
    # fastai is installed with --no-deps, i.e. without pulling in its own dependencies
    !pip install -Uqq --no-deps fastai
    !pip install -Uqq transformers datasets wandb 

In [None]:
from transformers import *
from datasets import load_dataset, concatenate_datasets, load_metric

  '"sox" backend is being deprecated. '


## Setup

In [None]:
# Pretrained checkpoint that is fine-tuned in this notebook.
model_name = 'roberta-base'

# Data: inputs are truncated to `max_length` tokens; the evaluation batch size
# is four times the training batch size.
max_length = 512
bs = 16
val_bs = 4 * bs

# Training: learning rate passed to the training arguments.
lr = 3e-5

## Data preprocessing

In [None]:
ds_name = 'imdb'

In [None]:
dataset = load_dataset(ds_name)

Reusing dataset imdb (/workspace/.cache/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [None]:
train_ds = dataset['train']
valid_ds = dataset['test']

In [None]:
# train_ds = train_ds.select(range(100))
# valid_ds = valid_ds.select(range(100))

In [None]:
len(train_ds), len(valid_ds)

(25000, 25000)

In [None]:
train_ds.column_names

['label', 'text']

In [None]:
train_ds[2]

{'label': 1,
 'text': 'Brilliant over-acting by Lesley Ann Warren. Best dramatic hobo lady I have ever seen, and love scenes in clothes warehouse are second to none. The corn on face is a classic, as good as anything in Blazing Saddles. The take on lawyers is also superb. After being accused of being a turncoat, selling out his boss, and being dishonest the lawyer of Pepto Bolt shrugs indifferently "I\'m a lawyer" he says. Three funny words. Jeffrey Tambor, a favorite from the later Larry Sanders show, is fantastic here too as a mad millionaire who wants to crush the ghetto. His character is more malevolent than usual. The hospital scene, and the scene where the homeless invade a demolition site, are all-time classics. Look for the legs scene and the two big diggers fighting (one bleeds). This movie gets better each time I see it (which is quite often).'}

In [None]:
from collections import Counter

In [None]:
Counter(train_ds['label'])

Counter({1: 12500, 0: 12500})

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
return_special_tokens_mask = True
return_token_type_ids = False

In [None]:
def tokenize(batch):
    """Tokenize a batch of raw examples and attach their labels.

    Reads `batch['text']` and `batch['label']`. The texts are passed to the
    module-level `tokenizer` with truncation to `max_length` and no padding;
    the notebook-level flags `return_special_tokens_mask` and
    `return_token_type_ids` decide which extra fields are requested.

    Returns the tokenizer output with the original labels stored under the
    `'labels'` key.
    """
    targets = batch['label']
    tokenizer_options = dict(
        add_special_tokens=True,
        padding=False,
        truncation=True,
        max_length=max_length,
        return_special_tokens_mask=return_special_tokens_mask,
        return_token_type_ids=return_token_type_ids,
    )
    encoded = tokenizer(batch['text'], **tokenizer_options)
    encoded['labels'] = targets
    return encoded

In [None]:
# Tokenize both splits with identical settings (train first, then validation).
# The raw columns of each split are dropped, leaving only what `tokenize` returns.
train_ds, valid_ds = (
    split.map(
        tokenize,
        batched=True,
        batch_size=100,
        remove_columns=split.column_names,
        num_proc=4,
    )
    for split in (train_ds, valid_ds)
)

    

HBox(children=(FloatProgress(value=0.0, description='#0', max=63.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#3', max=63.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#1', max=63.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#2', max=63.0, style=ProgressStyle(description_width='ini…





    

HBox(children=(FloatProgress(value=0.0, description='#3', max=63.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#0', max=63.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#1', max=63.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description='#2', max=63.0, style=ProgressStyle(description_width='ini…







## Tracking

In [None]:
# import wandb

# WANDB_NAME = f'{ds_name}-{model_name}-hf'
# GROUP = f'{ds_name}-{model_name}-hf-{lr:.0e}'
# NOTES = f'HF finetuning {model_name} with AdamW lr={lr:.0e}'
# CONFIG = {}
# TAGS =[model_name,ds_name,'adamw']

In [None]:
import wandb

# Weights & Biases run metadata, later passed to `wandb.init` as
# name / group / notes / config / tags.
WANDB_NAME = f'{ds_name}-{model_name}-alum'
# NOTE(review): GROUP and NOTES keep the '-hf' / 'HF finetuning' wording of the
# commented-out baseline config, while the run name and tags say 'alum' -
# confirm this grouping is intended for the adversarial-training run.
GROUP = f'{ds_name}-{model_name}-hf-{lr:.0e}'
NOTES = f'HF finetuning {model_name} with AdamW lr={lr:.0e}'
# No extra hyperparameters are recorded in the run config.
CONFIG = {}
TAGS =[model_name,ds_name,'adamw','alum']

In [None]:
%env WANDB_LOG_MODEL = false
%env WANDB_WATCH = false

env: WANDB_LOG_MODEL=false
env: WANDB_WATCH=false


In [None]:
wandb.init(reinit=True, project="vat", entity="fastai_community",
           name=WANDB_NAME, group=GROUP, notes=NOTES, tags=TAGS, config=CONFIG);

[34m[1mwandb[0m: Currently logged in as: [33mfastai_community[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.27 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [None]:
training_args = TrainingArguments(
    'test',  # output directory; alternative: f'{ds_name}-{model_name}-2'
    # --- optimisation ---
    learning_rate=lr,
    num_train_epochs=5,
    lr_scheduler_type='cosine',
    warmup_ratio=0.2,
    fp16=True,
    seed=8,
    # --- batching / data loading ---
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=val_bs,
    group_by_length=True,
    dataloader_num_workers=4,
    # Keep every dataset column in the batches: VATrainer.compute_loss pops
    # `special_tokens_mask` from the inputs itself.
    remove_unused_columns=False,
    # --- evaluation, logging, checkpoints ---
    evaluation_strategy='epoch',
    logging_steps=200,
    report_to='none',  # alternative: 'wandb'
    save_strategy='epoch',
    save_total_limit=2,
)

In [None]:
metric = load_metric('accuracy')

In [None]:
import numpy as np

In [None]:
def compute_metric(eval_preds):
    """Score evaluation predictions with the module-level `metric`.

    `eval_preds` is unpacked into `(predictions, labels)`. The predicted class
    of each example is the argmax of the predictions along axis 1; those
    classes and the reference labels are handed to `metric.compute`, whose
    result is returned unchanged.
    """
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

## Regular training

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# trainer = Trainer(
#     model,
#     training_args,
#     train_dataset=train_ds,
#     eval_dataset=valid_ds,
#     tokenizer=tokenizer,
#     # data_collator=DataCollatorWithPadding(),
#     compute_metrics=compute_metric
# )

In [None]:
# out = trainer.train()

In [None]:
# wandb.finish()

## VATrainer

In [None]:
from core import compute_adversarial_loss

In [None]:
class VATrainer(Trainer):
    """`Trainer` that adds a virtual adversarial loss term (ALUM procedure) to the task loss.

    All positional and keyword arguments are forwarded to `Trainer`, except
    `vat_kwargs`, a dict of adversarial-training options:

    - ``'alpha'`` (default ``1.``): weight of the adversarial loss in the total loss.
    - ``'mask_special_tokens'`` (default ``False``): pass an inverted
      `special_tokens_mask` to `compute_adversarial_loss`; requires the batches
      to contain a `special_tokens_mask` entry.
    - ``'one_token_type'`` (default ``False``): when true, no token-type mask is
      passed to `compute_adversarial_loss`.
    - ``'start_epoch'`` (default ``1``): the adversarial term is added once
      `self.state.epoch` reaches this value.

    Any remaining keys are forwarded verbatim to `compute_adversarial_loss`.
    """

    def __init__(self, *args, vat_kwargs=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Work on a private copy: popping straight from the caller's dict (or from
        # a shared mutable default) would strip the options from it, so building a
        # second trainer from the same dict would silently fall back to defaults.
        vat_kwargs = dict(vat_kwargs) if vat_kwargs else {}
        self.adv_alpha = vat_kwargs.pop('alpha', 1.)
        self.mask_special_tokens = vat_kwargs.pop('mask_special_tokens', False)
        self.one_token_type = vat_kwargs.pop('one_token_type', False)
        self.vat_start_epoch = vat_kwargs.pop('start_epoch', 1)
        # Whatever is left is passed through to `compute_adversarial_loss`.
        self.vat_kwargs = vat_kwargs
        # Flips to True the first time the adversarial term is applied (used only
        # to print the start message once).
        self._do_vat = False

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Loss computation with virtual adversarial loss component.

        `inputs` is modified in place: `special_tokens_mask` and `token_type_ids`
        are always removed before the forward pass, and `labels` is removed when a
        label smoother is configured.

        While `model.training` is true and the current epoch has reached
        `vat_start_epoch`, `adv_alpha * compute_adversarial_loss(...)` is added to
        the task loss. Hidden states are requested from the model only in
        training mode, because the adversarial term starts from the embedding
        output (`hidden_states[0]`).

        Returns `(loss, outputs)` if `return_outputs` is true, otherwise `loss`.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        # These entries are only used to build masks for the adversarial loss and
        # must not reach the model's forward.
        special_tokens_mask = inputs.pop('special_tokens_mask', None)
        token_type_mask = inputs.pop('token_type_ids', None)
        # explicitly adding kwargs here, verify no conflicts may happen
        outputs = model(**inputs, output_hidden_states=model.training, return_dict=True)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            loss = outputs.loss

        # `state.epoch` may still be unset if this is called outside `train()`.
        epoch = self.state.epoch if self.state.epoch is not None else 0
        # TODO add option to use vat_start_step
        if model.training and epoch >= self.vat_start_epoch:
            if not self._do_vat:
                print(f'Starting virtual adversarial training at epoch {epoch}')
                self._do_vat = True
            # ALUM training procedure
            embed = outputs.hidden_states[0].detach()
            # The masks popped from `inputs` above are used here. They were
            # previously reset to None at this point, which made special-token
            # masking impossible and always triggered the "not found" message.
            if not self.mask_special_tokens:
                special_tokens_mask = None
            elif special_tokens_mask is None:
                print('`special_tokens_maks` not found in the inputs')
                # Disable the option so the message is printed only once.
                self.mask_special_tokens = False
            else:
                # Invert the mask: 1 for regular tokens, 0 for special tokens, with
                # a trailing singleton dimension added.
                special_tokens_mask = (1-special_tokens_mask).unsqueeze(-1)
            # NOTE(review): when `one_token_type` is false the popped
            # `token_type_ids` are forwarded as `token_type_mask` - confirm this is
            # the format `compute_adversarial_loss` expects.
            if self.one_token_type:
                token_type_mask = None

            adv_loss = compute_adversarial_loss(model, embed, outputs.logits,
                special_tokens_mask=special_tokens_mask, token_type_mask=token_type_mask,
                **self.vat_kwargs)
            # Out-of-place add so `outputs.loss` is not modified as a side effect.
            loss = loss + self.adv_alpha*adv_loss
        return (loss, outputs) if return_outputs else loss

## Training

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
# Adversarial-training options consumed by VATrainer: start from epoch 0,
# weight the adversarial loss by 0.5 and request special-token masking.
vat_kwargs = dict(start_epoch=0, alpha=0.5, mask_special_tokens=True)

trainer = VATrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
    vat_kwargs=vat_kwargs,
)

In [None]:
out = trainer.train()

Starting virtual adversarial training at epoch 0
`special_tokens_maks` not found in the inputs


Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.4811,0.210747,0.93136,250.0133,99.995
2,0.3036,0.14307,0.9496,249.9832,100.007
3,0.2597,0.12756,0.95424,250.0298,99.988
4,0.2371,0.123138,0.95596,250.0219,99.991
5,0.2455,0.135696,0.957,250.1752,99.93
