In [None]:
# default_exp learner

In [None]:
#default_cls_lvl 3

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastai.basics import *
from fastai.text.all import TensorText
from inspect import signature
from fasthugs.data import TransformersTextBlock

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, BatchEncoding
from transformers.modeling_outputs import QuestionAnsweringModelOutput

# Learner for transformers

## Parameter groups

TODOs:
- [x] exclude modules w/o params
- [ ] add layerwise splitter for Transformers

In [None]:
#skip
#hide
# for n, m in model.base_model.named_children(): print(n)

In [None]:
# export
def default_splitter(model):
    "Parameter groups for `model`: one per child of `model.base_model`, then one per later top-level child that has parameters"
    backbone_parts = L(model.base_model.children())
    # The first top-level child is skipped (presumably it is the base model itself — confirm);
    # the remaining children are kept only when `params` returns something truthy for them.
    top_level = list(model.children())
    head_parts = L(child for child in top_level[1:] if params(child))
    all_parts = backbone_parts + head_parts
    return all_parts.map(params)

In [None]:
def layerwise_splitter(model):
    "Placeholder for a layerwise splitter; always raises `NotImplementedError`"
    message = 'use default_splitter for now'
    raise NotImplementedError(message)

## TransLearner and utils

In [None]:
#export
@typedispatch
def show_results(x: TensorText, y, samples, outs, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    "Show results as a dataframe, truncating the leading text field(s) to `trunc_at` (no truncation when `None`)"
    if ctxs is None:
        n_rows = min(len(samples), max_n)
        ctxs = get_empty_df(n_rows)
    do_trunc = trunc_at is not None
    if isinstance(samples[0][0], tuple):
        # The first field of each sample is itself a tuple: unpack it so its items become top-level fields.
        flat = L((*smp[0], *smp[1:]) for smp in samples)
        if do_trunc:
            # After flattening, the first two fields are truncated.
            samples = L((smp[0].truncate(trunc_at), smp[1].truncate(trunc_at), *smp[2:]) for smp in flat)
        else:
            samples = flat
    elif do_trunc:
        samples = L((smp[0].truncate(trunc_at), *smp[1:]) for smp in samples)
    # Hand the prepared samples to the implementation registered for `object`, then display the table.
    ctxs = show_results[object](x, y, samples, outs, ctxs=ctxs, max_n=max_n, **kwargs)
    display_df(pd.DataFrame(ctxs))
    return ctxs

In [None]:
#export
def to_device(b, device=None):
    """Recursively put `b` on `device`. Handles `BatchEncoding`s

    `device` is forced to `'cpu'` when `defaults.use_cuda` is `False`, and falls back to
    `default_device()` when it is `None`. Tensors are moved with `non_blocking=True`;
    `dict`/`BatchEncoding` values are moved recursively and returned as a plain `dict`;
    objects exposing `to_device` are asked to move themselves; anything else is returned as is.
    """
    if defaults.use_cuda==False: device='cpu'
    elif device is None: device=default_device()
    def _inner(o):
        if isinstance(o,Tensor): return o.to(device, non_blocking=True)
        elif isinstance(o, (dict, BatchEncoding)):
            # Forward the resolved `device`: without it, nested values would be sent to the
            # default device regardless of what the caller asked for.
            return {k:to_device(v, device) for k,v in o.items()}
        elif hasattr(o, "to_device"): return o.to_device(device)
        else: return o
    return apply(_inner, b)

In [None]:
#cuda
# Smoke test for `to_device` (requires a CUDA device): a plain dict of tensors moved
# without an explicit `device` argument is expected to end up on `cuda:0`.
device = torch.device('cuda:0')
d = {'a':tensor([1,2,3])}
d_cuda = to_device(d)
assert d_cuda['a'].device == device

In [None]:
#export
class TransCallback(Callback):
    "Uses the loss returned by a HuggingFace model (when present) and replaces `pred` with its logits"
    def after_pred(self):
        output = self.pred
        if 'loss' in output:
            # The model already computed the loss: store it and tell the learner not to recompute it.
            model_loss = output.loss
            self.learn.loss_grad = model_loss
            self.learn.loss = model_loss.clone()
            inputs = self.xb[0]
            if 'labels' in inputs.keys():
                # Expose the labels that were fed to the model as the targets.
                self.learn.yb = (inputs['labels'], )
            self.learn.compute_loss = False
        # Replace the model output object with its logits.
        if isinstance(output, QuestionAnsweringModelOutput):
            self.learn.pred = (output.start_logits, output.end_logits)
        else:
            self.learn.pred = output.logits

In [None]:
#skip
#hide
# Constructor kept in a skipped, hidden cell and defined at module level (not inside a class).
# NOTE(review): looks like a reference copy of the base `Learner.__init__` — confirm.
def __init__(self, dls, model, loss_func=None, opt_func=Adam, lr=defaults.lr, splitter=trainable_params, cbs=None,
                 metrics=None, path=None, model_dir='models', wd=None, wd_bn_bias=False, train_bn=True,
                 moms=(0.95,0.85,0.95)):
        # An explicit `path` wins; otherwise use `dls.path` if it exists, else the current directory.
        path = Path(path) if path is not None else getattr(dls, 'path', Path('.'))
        if loss_func is None:
            # No loss given: try to read one from the training dataset and fail loudly if there is none.
            loss_func = getattr(dls.train_ds, 'loss_func', None)
            assert loss_func is not None, "Could not infer loss function from the data, please pass a loss function."
        self.dls,self.model = dls,model
        # Presumably stores the remaining arguments as attributes, skipping `dls`, `model` and `cbs` — confirm.
        store_attr(but='dls,model,cbs')
        # Initial state; `cbs` starts empty and is filled by `add_cbs` below.
        self.training,self.create_mbar,self.logger,self.opt,self.cbs = False,True,print,None,L()
        # Default callbacks first, then the ones passed by the caller.
        self.add_cbs(L(defaults.callbacks)+L(cbs))
        # Calls the instance with the event name "after_create".
        self("after_create")

In [None]:
#export
@delegates(Learner.__init__)
class TransLearner(Learner):
    """Learner for training transformers from HuggingFace

    Differences from the base `Learner` visible in this class:
    - `default_splitter` is used when no `splitter` is passed (or it is `None`);
    - a `TransCallback` is always added;
    - the first element of `xb` is treated as a mapping of keyword arguments for the model, and
      keys that `model.forward` does not accept are removed before the call;
    - the loss function is applied only while `compute_loss` is true.
    """
    def __init__(self, dls, model, **kwargs):
        # Use `default_splitter` only as a fallback; a splitter supplied by the caller must be
        # passed on to the parent constructor, not dropped.
        splitter = kwargs.pop('splitter', None)
        kwargs['splitter'] = default_splitter if splitter is None else splitter
        super().__init__(dls, model, **kwargs)
        # Names of the arguments accepted by the model's `forward`, used to filter the inputs.
        self.model_args = set(signature(model.forward).parameters.keys())
        self.add_cb(TransCallback())
        self.compute_loss = True

    def one_batch(self, i, b):
        "Move batch `b` to the `dls` device (when one is set), split it and run one step"
        self.iter = i
        # Pass the dataloaders' device explicitly so the batch follows `dls.device`
        # rather than whatever the default device happens to be.
        b_on_device = tuple(to_device(e, self.dls.device) for e in b) if self.dls.device is not None else b
        self._split(b_on_device)
        self._with_events(self._do_one_batch, 'batch', CancelBatchException)

    def _do_one_batch(self):
        "Run the model on the filtered inputs, compute the loss if needed and, when training, step the optimizer"
        x = self.xb[0]
        # Iterate over a snapshot of the keys: deleting from a mapping while iterating
        # over its live key view raises `RuntimeError`.
        for k in list(x.keys()):
            if k not in self.model_args: del x[k]
        self.pred = self.model(**self.x)
        self('after_pred')
        # `compute_loss` is switched off by `TransCallback` when the model returned its own loss.
        if len(self.yb) and self.compute_loss:
            self.loss_grad = self.loss_func(self.pred, *self.yb)
            self.loss = self.loss_grad.clone()
        self('after_loss')
        if not self.training or not len(self.yb): return
        self('before_backward')
        self.loss_grad.backward()
        self._with_events(self.opt.step, 'step', CancelStepException)
        self.opt.zero_grad()

### Using TransLearner for sequence classification

In [None]:
#slow
# Fetch the IMDB sample and read its `texts.csv` table.
path = untar_data(URLs.IMDB_SAMPLE)
texts = pd.read_csv(path/'texts.csv')

# Settings for this example.
# NOTE(review): `max_len` is not referenced again in the cells visible here — confirm it is still needed.
model_name = 'distilbert-base-uncased'
max_len = 128
bs = 8
val_bs = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inputs come from the `text` column (processed with the tokenizer above), targets from the `label` column.
# `ColSplitter()` is used with its defaults; presumably it splits on a validation-flag column — confirm.
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer), CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())
dls = dblock.dataloaders(texts, bs=bs, val_bs=val_bs)

In [None]:
#slow
# Load a sequence-classification model for `model_name`, wrap it in a `TransLearner`
# that tracks accuracy, and train for 2 epochs with a learning rate of 2e-5.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
learn = TransLearner(dls, model, metrics=accuracy)
learn.fit(2, 2e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

epoch,train_loss,valid_loss,accuracy,time
0,0.44615,0.542738,0.765,00:21
1,0.288374,0.300514,0.895,00:21


## Fin

In [None]:
#hide
# Run nbdev's notebook-to-script conversion; the recorded output lists the notebooks it converted.
from nbdev.export import notebook2script; notebook2script()

Converted 00_data.ipynb.
Converted 01_learner.ipynb.
Converted 10_examples.classification-imdb.ipynb.
Converted 11_examples.mlm-imdb.ipynb.
Converted 12_examples.glue-benchmark.ipynb.
Converted index.ipynb.
