In [None]:
!nvidia-smi

Tue Apr 13 14:49:56 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import sys
if 'google.colab' in sys.modules:
    !pip install -Uqq fastcore onnx onnxruntime sentencepiece seqeval rouge-score
    !pip install -Uqq --no-deps fastai ohmeow-blurr
    !pip install -Uqq transformers datasets wandb 

In [None]:
from fastai.text.all import *
from fastai.callback.wandb import *

In [None]:
def read_text(fn):
    """Return the full text content of the file at `fn`.

    The file is opened in text mode with the default encoding and is closed
    before returning, so calling this for every item of a large dataset does
    not leave file handles open.
    """
    with open(fn) as f:
        return f.read()

In [None]:
path = untar_data(URLs.IMDB)

## Setup

In [None]:
# Hugging Face checkpoint name; passed to `get_hf_objects` and embedded in the W&B run metadata.
model_name = 'distilbert-base-uncased'

# NOTE(review): presumably the maximum sequence length in tokens; the tokenizer is
# configured with the same value (512) — confirm the two are meant to agree.
max_len = 512
# Batch sizes handed to `dblock.dataloaders` as `bs` and `val_bs`.
bs = 8
val_bs = 16

# Learning rate passed to `fit_one_cycle` for the simple fine-tuning run.
lr = 3e-5

## Tracking

In [None]:
# !wandb login

In [None]:
import wandb

# Run metadata passed to `wandb.init`.
WANDB_NAME = f'imdb-{model_name}-simple'
GROUP = f'IMDB-{model_name}-simple-lr3e-5'
NOTES = f'Simple finetuning {model_name} with RAdam lr=3e-5'
# Log the hyperparameters with the run (previously an empty dict, so nothing was
# recorded) so that runs can be compared and reproduced from the W&B side.
CONFIG = dict(model_name=model_name, max_len=max_len, bs=bs, val_bs=val_bs, lr=lr)
TAGS = [model_name, 'imdb', 'radam', 'simple']

In [None]:
# Start the W&B run using the metadata defined in the previous cell.
# The trailing `;` suppresses the display of the call's return value in the cell output.
wandb.init(reinit=True, project="vat", entity="fastai_community",
           name=WANDB_NAME, group=GROUP, notes=NOTES, tags=TAGS, config=CONFIG);

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Training

In [None]:
def _to_device(e, device):
    if hasattr(e, 'to'): return e.to(device)
    elif isinstance(e, dict):
        for _, v in e.items():
            if hasattr(v, 'to'): v.to(device)
        return {k:(v.to(device) if hasattr(v, 'to') else v) for k, v in e.items()}

In [None]:
@patch
def one_batch(self:Learner, i, b):
    "Run one batch: record the iteration, move `b` to the dataloaders' device (if set), split it and fire the batch events."
    self.iter = i
    target_device = self.dls.device
    if target_device is None:
        # No device configured: use the batch as it was given.
        batch = b
    else:
        # `_to_device` also handles dict elements, not just objects with `.to`.
        batch = tuple(_to_device(item, target_device) for item in b)
    self._split(batch)
    self._with_events(self._do_one_batch, 'batch', CancelBatchException)

In [None]:
from transformers import *

from blurr.data.all import *
from blurr.modeling.all import *

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the architecture name, config, tokenizer and model for `model_name`.
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(
    model_name,
    model_cls=AutoModelForSequenceClassification,
    tokenizer_cls=AutoTokenizer,
    # Use the notebook-level `max_len` setting instead of repeating the literal 512.
    tokenizer_kwargs={'max_len': max_len},
)

In [None]:
# Inputs go through the Hugging Face text block; targets are the sentiment categories.
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock)
dblock = DataBlock(blocks=blocks,
                   # Only list the labelled folders. Files elsewhere under `path`
                   # (e.g. the 50k reviews in `unsup`) have no `train`/`test`
                   # grandparent, so the splitter would drop them anyway.
                   get_items=partial(get_text_files, folders=['train', 'test']),
                   get_x=read_text,
                   # The label is the name of the file's parent folder.
                   get_y=parent_label,
                   splitter=GrandparentSplitter(valid_name='test'))

dls = dblock.dataloaders(path, bs=bs, val_bs=val_bs)

In [None]:
# Wrap the Hugging Face model and build the Learner for the supervised baseline:
# RAdam as optimizer, accuracy as metric, blurr's callback and parameter splitter.
model = HF_BaseModelWrapper(hf_model)
learn = Learner(dls,
                model,
                opt_func=RAdam,
                metrics=[accuracy],
                cbs=[HF_BaseModelCallback],
                # NOTE(review): `to_fp16()` presumably switches the learner to mixed-precision training — see fastai docs.
                splitter=hf_splitter).to_fp16()

# learn.blurr_summary()

In [None]:
# learn.show_training_loop()

### simple finetuning

In [None]:
learn.fit_one_cycle(4, lr, cbs=WandbCallback(log_preds=False, log_model=False))

Could not gather input dimensions


epoch,train_loss,valid_loss,accuracy,time
0,0.245041,0.233028,0.90556,08:35
1,0.149086,0.181933,0.9298,08:47
2,0.065082,0.208454,0.93468,08:45
3,0.020567,0.288048,0.93568,08:46




In [None]:
learn.save('distilbert-imdb-clf')

Path('models/distilbert-imdb-clf.pth')

In [None]:
learn = learn.load('distilbert-imdb-clf')

## Pseudo-labels

### Get dataloader for unlabeled data

In [None]:
path.ls()

(#7) [Path('/root/.fastai/data/imdb/unsup'),Path('/root/.fastai/data/imdb/train'),Path('/root/.fastai/data/imdb/test'),Path('/root/.fastai/data/imdb/tmp_clas'),Path('/root/.fastai/data/imdb/README'),Path('/root/.fastai/data/imdb/imdb.vocab'),Path('/root/.fastai/data/imdb/tmp_lm')]

In [None]:
# Collect the unlabeled review files and display how many there are.
# (Previously this displayed `len(fns)`, a name that is never defined in the notebook.)
unsup_fns = get_text_files(path/'unsup')
len(unsup_fns)

50000

The unlabeled set is twice the size of the labeled training set (50,000 vs. 25,000 reviews).

In [None]:
unsup_dl = dls.test_dl(unsup_fns)

In [None]:
unsup_dl.one_batch()

({'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'),
  'input_ids': tensor([[  101,  1008,  1008,  ...,  1056,  2066,   102],
          [  101,  1026,  7987,  ...,  1005,  1056,   102],
          [  101,  1026,  7987,  ...,  1011, 18856,   102],
          ...,
          [  101,  2000,  1996,  ...,  6519,  2015,   102],
          [  101, 27594,  2121,  ...,  2613,  1010,   102],
          [  101,  2108,  3943,  ...,  1996,  2154,   102]], device='cuda:0')},)

TODOs:
- use the model checkpoint with the lowest validation loss for pseudo-labeling
- use an ensemble of models for pseudo-labeling

In [None]:
model_preds = learn.get_preds(dl=unsup_dl)

In [None]:
unsup_plabels = model_preds[0]

In [None]:
torch.save(unsup_plabels, 'unsup_plabels')

In [None]:
unsup_plabels = torch.load('unsup_plabels')

### Hard pseudolabels

In [None]:
unsup_plabels_hard = unsup_plabels.argmax(-1)

In [None]:
train_fns = get_text_files(path/'train')
valid_fns = get_text_files(path/'test')
len(train_fns), len(valid_fns)

In [None]:
train_labels = train_fns.map(parent_label)
train_labels

(#25000) ['neg','neg','neg','neg','neg','neg','neg','neg','neg','neg'...]

In [None]:
valid_labels = valid_fns.map(parent_label)

In [None]:
unsup_labels = L([dls.vocab[e.item()] for e in unsup_plabels_hard])

In [None]:
# Order matters: the validation files come first so that the index-based splitter
# built from `range_of(valid_fns)` selects exactly them. Labels use the same order.
all_fns = valid_fns + train_fns + unsup_fns
all_labels = valid_labels + train_labels + unsup_labels

In [None]:
from fastai.basics import Pipeline as TfmPipeline

In [None]:
# DataBlock over (file, label) pairs that include the hard pseudo-labeled files.
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock)
dblock = DataBlock(blocks=blocks, 
                   # x: take the file name (item[0]) and read its text
                   get_x = TfmPipeline([ItemGetter(0), read_text]),
                   # y: the label stored alongside the file name (item[1])
                   get_y=ItemGetter(1),
                   # `all_fns` starts with `valid_fns`, so these leading indices form the validation set
                   splitter=IndexSplitter(L(range_of(valid_fns))))

dls = dblock.dataloaders(all_fns.zipwith(all_labels), bs=bs, val_bs=val_bs)

In [None]:
# Reload the pretrained objects so that training on the pseudo-labeled data
# starts from a newly loaded `model_name` model.
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(
    model_name,
    model_cls=AutoModelForSequenceClassification,
    tokenizer_cls=AutoTokenizer,
    # Use the notebook-level `max_len` setting instead of repeating the literal 512.
    tokenizer_kwargs={'max_len': max_len},
)

In [None]:
# Learner for the hard pseudo-label run: same setup as the baseline, but on the
# dataloaders that also contain the pseudo-labeled files.
model = HF_BaseModelWrapper(hf_model)
learn = Learner(dls,
                model,
                opt_func=RAdam,
                metrics=[accuracy],
                cbs=[HF_BaseModelCallback],
                splitter=hf_splitter).to_fp16()

In [None]:
learn.fit_one_cycle(4, 2e-5)

epoch,train_loss,valid_loss,accuracy,time
0,0.173654,0.180957,0.92796,21:27
1,0.093212,0.208649,0.93228,21:28
2,0.017471,0.27084,0.93684,21:22
3,0.001908,0.312814,0.93808,21:20


### Soft pseudolabels

TODOs:
- Try other loss functions
- Combine hard and soft labels (potentially with weights)
- Take confidence of the pseudo-labels into account

In [None]:
unsup_fns = get_text_files(path/'unsup')
train_fns = get_text_files(path/'train')
valid_fns = get_text_files(path/'test')
len(train_fns), len(valid_fns)

(25000, 25000)

In [None]:
train_labels = train_fns.map(lambda x: parent_label(x) == 'pos')
valid_labels = valid_fns.map(lambda x: parent_label(x) == 'pos')

In [None]:
# One-hot encode the boolean "is positive" flags (column 0 = False, column 1 = True).
# `num_classes=2` fixes the width instead of inferring it from the largest value seen,
# and `.float()` makes the dtype explicit before these rows are concatenated with the
# model predictions in `unsup_plabels`.
train_labels = F.one_hot(tensor(train_labels).long(), num_classes=2).float()
valid_labels = F.one_hot(tensor(valid_labels).long(), num_classes=2).float()

In [None]:
all_labels = torch.cat([valid_labels, train_labels, unsup_plabels])

In [None]:
all_fns = valid_fns + train_fns + unsup_fns

In [None]:
# Reload the pretrained objects so that the soft-label run starts from a newly
# loaded `model_name` model.
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(
    model_name,
    model_cls=AutoModelForSequenceClassification,
    tokenizer_cls=AutoTokenizer,
    # Use the notebook-level `max_len` setting instead of repeating the literal 512.
    tokenizer_kwargs={'max_len': max_len},
)

In [None]:
# DataBlock for the soft-label run: targets are the label rows in `all_labels`
# (one-hot rows for labeled data, model predictions for the unlabeled files),
# handled by a RegressionBlock instead of a CategoryBlock.
blocks = (HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), RegressionBlock)
dblock = DataBlock(blocks=blocks,
                   # x: take the file name (item[0]) and read its text
                   get_x = TfmPipeline([ItemGetter(0), read_text]),
                   # y: the label row stored alongside the file name (item[1])
                   get_y=ItemGetter(1),
                   # `all_fns` starts with `valid_fns`, so these leading indices form the validation set
                   splitter=IndexSplitter(L(range_of(valid_fns))))

dls = dblock.dataloaders(all_fns.zipwith(all_labels), bs=bs, val_bs=val_bs)

In [None]:
# accuracy??  # interactive only: uncomment to view the source of fastai's `accuracy` metric

In [None]:
def accuracy_(inp, targ):
    "Accuracy against one-hot / soft targets: the target class is the arg-max of `targ` over its last axis."
    hard_targ = targ.argmax(-1)
    return accuracy(inp, hard_targ)

In [None]:
class OutputRangeCallback(Callback):
    "Replace the learner's prediction with its element-wise sigmoid in the `after_pred` event."
    def after_pred(self):
        squashed = self.pred.sigmoid()
        self.learn.pred = squashed

In [None]:
# Learner for the soft-label run: predictions are passed through a sigmoid by
# `OutputRangeCallback`, and accuracy is measured with `accuracy_`, which takes
# the arg-max of the target rows.
model = HF_BaseModelWrapper(hf_model)
learn = Learner(dls,
                model,
                opt_func=RAdam,
                metrics=[accuracy_],
                cbs=[HF_BaseModelCallback, OutputRangeCallback],
                splitter=hf_splitter).to_fp16()

In [None]:
learn.fit_one_cycle(4, 2e-5)

epoch,train_loss,valid_loss,accuracy_,time
0,0.037959,0.055724,0.92816,21:22
1,0.031615,0.053254,0.92872,21:21
2,0.016389,0.050811,0.93664,21:20
3,0.007336,0.050041,0.938,21:22
