In [None]:
# default_exp learner

In [None]:
#default_cls_lvl 3

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastai.basics import *
from fastai.text.all import TensorText
from fastai.learner import _ConstantFunc
from inspect import signature
from collections import namedtuple
from fasthugs.data import TransformersTextBlock, TransTensorText

from transformers import (AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, BatchEncoding,
                          PreTrainedModel)
from transformers.modeling_outputs import QuestionAnsweringModelOutput

# Learner for transformers

## Parameter groups

TODOs:
- [x] exclude modules w/o params
- [ ] add layerwise splitter for Transformers

In [None]:
#skip
#hide
# for n, m in model.base_model.named_children(): print(n)

In [None]:
# export
def default_splitter(model):
    "Parameter groups for `model`: one per child of `model.base_model`, then one per later top-level child that has parameters"
    body_modules = L(model.base_model.children())
    # The first top-level child is skipped (presumably it is the base model itself -- TODO confirm);
    # of the remaining children only those that own parameters are kept.
    head_modules = L(m for m in list(model.children())[1:] if params(m))
    all_modules = body_modules + head_modules
    return all_modules.map(params)

In [None]:
def layerwise_splitter(model):
    "Placeholder for a layer-wise splitter; always raises `NotImplementedError`"
    message = 'use default_splitter for now'
    raise NotImplementedError(message)

## Utils

In [None]:
#export
@typedispatch
def show_results(x:TransTensorText, y, samples, outs, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    "Show `samples` and `outs` as a dataframe, truncating the text field(s) to `trunc_at` unless it is `None`"
    if ctxs is None:
        ctxs = get_empty_df(min(len(samples), max_n))
    # When the first element of a sample is itself a tuple, the input holds two texts
    is_pair = isinstance(samples[0][0], tuple)
    if is_pair:
        # flatten the nested input tuple so both texts become top-level fields
        samples = L((*s[0], *s[1:]) for s in samples)
    if trunc_at is not None:
        if is_pair:
            samples = L((s[0].truncate(trunc_at), s[1].truncate(trunc_at), *s[2:]) for s in samples)
        else:
            samples = L((s[0].truncate(trunc_at), *s[1:]) for s in samples)
    # delegate the per-row rendering to the generic implementation, then display the collected rows
    ctxs = show_results[object](x, y, samples, outs, ctxs=ctxs, max_n=max_n, **kwargs)
    display_df(pd.DataFrame(ctxs))
    return ctxs

In [None]:
#export
def to_device(b, device=None):
    "Recursively put `b` on `device`. Handles `BatchEncoding`s and namedtuples"
    # `defaults.use_cuda==False` overrides any requested device; otherwise `None` means `default_device()`
    if defaults.use_cuda==False: device='cpu'
    elif device is None: device=default_device()
    # put custom `namedtuple` on `device`
    # there might be no need in it, mb to remove
    if (isinstance(b, tuple) and
        hasattr(b, "_asdict") and
        hasattr(b, "_fields")):
        # forward the resolved `device`: without it every field would silently go to the default device
        return type(b)(**{k:to_device(v, device) for k,v in b._asdict().items()})
    def _inner(o):
        if isinstance(o,Tensor): return o.to(device, non_blocking=True)
        elif isinstance(o,BatchEncoding): return o.to(device)
        # elif hasattr(o, "to_device"): return o.to_device(device)
        else: return o  # anything else (e.g. `None`) is returned untouched
    return apply(_inner, b)

In [None]:
#cuda
# Tensors nested in a dict must be moved; with no explicit device `to_device` falls back to
# `default_device()`, which this check expects to be `cuda:0`.
device = torch.device('cuda:0')  # also used by the namedtuple check in the following cell
d = {'a':tensor([1,2,3])}
d_cuda = to_device(d)
assert d_cuda['a'].device == device

In [None]:
#hide
#cuda
# namedtuple inputs: tensor fields are moved, fields left at their `None` default stay `None`
ModelInputs = namedtuple("ModelInputs", ["a", "b", "c"], defaults=[None]*3)
b = ModelInputs(a=tensor([1,2,3]))
b = to_device(b)
assert b.a.device == device  # `device` is defined in the previous `#cuda` cell
assert b.b is None

In [None]:
#export
def nested_reorder(t, idxs):
    "Reorder all tensors in `t` using `idxs`; recurses into lists, tuples (incl. namedtuples) and dicts, `None` passes through"
    if t is None: return t
    if isinstance(t, (Tensor,L)): return t[idxs]
    if is_listy(t):
        items = (nested_reorder(t_, idxs) for t_ in t)
        # namedtuples take their fields positionally, so they cannot be rebuilt from a single iterable
        return type(t)(*items) if hasattr(t, '_fields') else type(t)(items)
    if isinstance(t, dict): return {k:nested_reorder(v, idxs) for k,v in t.items()}
    # message lists every accepted input kind (dict and None were previously omitted)
    raise TypeError(f"Expected tensor, tuple, list, L, dict or None but got {type(t)}")

In [None]:
# Reordering a dict of tensors must match indexing the original tensor with the same indices
rows = tensor([[1,2], [3,4]])
d = nested_reorder({"a":rows}, tensor([1,0]))
assert torch.all(d["a"] == rows[tensor(1,0)])

## Callbacks 

In [None]:
#export
class TransCallback(Callback):
    "Handles HuggingFace model inputs and outputs"
    # The docstring must be the first statement of the class body; placed after `order` it was a
    # discarded expression and the class had no `__doc__`.
    order = 1
    def __init__(self, model):
        # Build a namedtuple type mirroring `model.forward`: one field per parameter, same defaults
        sig = signature(model.forward)
        ModelInputs = namedtuple('ModelInputs', sig.parameters.keys(), defaults=[v.default for v in sig.parameters.values()])
        self._model_inputs = ModelInputs

    def before_batch(self):
        "Repack the dict in `xb[0]` into `ModelInputs`, dropping keys that `forward` does not accept"
        self.learn.xb = self._model_inputs(**{k:v for k,v in self.xb[0].items() if k in self._model_inputs._fields})

    def after_pred(self):
        "Take the loss from the model output when present and reduce `pred` to the logits"
        if 'loss' in self.pred:
            self.learn.loss_grad = self.pred.loss
            self.learn.loss = self.pred.loss.clone()
        if isinstance(self.pred, QuestionAnsweringModelOutput):
            # question answering outputs carry two sets of logits
            self.learn.pred = (self.pred.start_logits, self.pred.end_logits)
        else: self.learn.pred = self.pred.logits

    def after_loss(self):
        "Expose the labels passed as model input through `yb`"
        if getattr(self.xb, 'labels', None) is not None:
            self.learn.yb = (self.xb.labels, )

In [None]:
#export
class GeneratePreds(Callback):
    "Stores `model.generate` output in `learn.generated_tokens` for each validation batch, for use in metrics"
    # one step ahead of `TransCallback`, which replaces the dict in `xb[0]` with a namedtuple
    order = TransCallback.order-1
    run_train = False
    run_valid = True
    @delegates(PreTrainedModel.generate)
    def __init__(self, **kwargs):
        # keyword arguments are kept and forwarded to `generate` on every batch
        self.gen_kwargs = kwargs
    def before_fit(self):
        self.learn.predict_with_generate = True
    def before_batch(self):
        batch = self.xb[0]
        tokens = self.model.generate(input_ids=batch['input_ids'],
                                     attention_mask=batch['attention_mask'],
                                     **self.gen_kwargs)
        self.learn.generated_tokens = tokens

## Learner

In [None]:
Learner.get_preds??

[0;31mSignature:[0m
[0mLearner[0m[0;34m.[0m[0mget_preds[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mds_idx[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdl[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwith_input[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwith_decoded[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwith_loss[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mact[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minner[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreorder[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcbs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msave_preds[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msave_targs[0m[0;34m=[0m[0;32

In [None]:
#export
@delegates(Learner.__init__)
class TransLearner(Learner):
    "Learner for training transformers from HuggingFace"
    def __init__(self, dls, model:PreTrainedModel, predict_with_generate:bool=False, **kwargs):
        # use `default_splitter` unless the caller supplied a splitter
        splitter = kwargs.get('splitter', None)
        if splitter is None: kwargs['splitter'] = default_splitter
        super().__init__(dls, model, **kwargs)
        # `TransCallback` adapts batches and outputs to the HuggingFace model interface
        self.add_cb(TransCallback(model))
        self.predict_with_generate = predict_with_generate
    # temporary patch to make nested_reorder work with dicts
    @delegates(GatherPredsCallback.__init__)
    def get_preds(self, ds_idx=1, dl=None, with_input=False, with_decoded=False, with_loss=False, act=None,
                  inner=False, reorder=True, cbs=None, **kwargs):
        "Get predictions on `dl` (or `self.dls[ds_idx]`) as a tuple, optionally with inputs, decoded preds and losses"
        if dl is None: dl = self.dls[ds_idx].new(shuffle=False, drop_last=False)
        else:
            try: len(dl)
            except TypeError as e:
                raise TypeError("`dl` is something other than a single `DataLoader` object") from e
        if reorder and hasattr(dl, 'get_idxs'):
            # freeze the iteration order so the results can be mapped back to it afterwards
            idxs = dl.get_idxs()
            dl = dl.new(get_idxs = _ConstantFunc(idxs))
        cb = GatherPredsCallback(with_input=with_input, with_loss=with_loss, **kwargs)
        ctx_mgrs = self.validation_context(cbs=L(cbs)+[cb], inner=inner)
        if with_loss: ctx_mgrs.append(self.loss_not_reduced())
        with ContextManagers(ctx_mgrs):
            self._do_epoch_validate(dl=dl)
            # default activation / decoding come from the loss function when it defines them
            if act is None: act = getattr(self.loss_func, 'activation', noop)
            res = cb.all_tensors()
            pred_i = 1 if with_input else 0
            if res[pred_i] is not None:
                res[pred_i] = act(res[pred_i])
                if with_decoded: res.insert(pred_i+2, getattr(self.loss_func, 'decodes', noop)(res[pred_i]))
            # uses the dict-aware `nested_reorder` defined in this module
            if reorder and hasattr(dl, 'get_idxs'): res = nested_reorder(res, tensor(idxs).argsort())
        # The cleanup used to sit after a `return` inside the `with` block and never ran.
        # NOTE(review): skipped for `inner=True` so state of an enclosing fit loop is left alone -- confirm.
        if not inner: self._end_cleanup()
        return tuple(res)

In [None]:
#export
@patch
def _set_device(self:TransLearner, b):
    "Move batch `b` to the device the model's parameters live on"
    # Read the device from the parameters themselves: `torch.cuda.current_device()` is wrong whenever
    # the model sits on a GPU other than the current one.
    model_device = next(self.model.parameters()).device
    # Both branches of the previous `dls.device` comparison sent the batch to the model's device,
    # so the batch simply follows the model.
    return to_device(b, model_device)

### Using TransLearner for sequence classification

In [None]:
#slow
# Load the IMDB sample texts into a dataframe
path = untar_data(URLs.IMDB_SAMPLE)
texts = pd.read_csv(path/'texts.csv')

model_name = 'google/electra-small-discriminator'
max_len = 128  # NOTE(review): not referenced by any of the visible cells
bs = 8
val_bs = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inputs come from the `text` column and go through the model's tokenizer; targets are the `label` column
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer), CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())
dls = dblock.dataloaders(texts, bs=bs, val_bs=val_bs)

In [None]:
#slow
# Fine-tune the pretrained checkpoint in fp16: `fit(2, 5e-5)` (see the results table in the output below)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
learn = TransLearner(dls, model, metrics=accuracy).to_fp16()
learn.fit(2, 5e-5)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

epoch,train_loss,valid_loss,accuracy,time
0,0.597513,0.440168,0.84,00:39
1,0.423275,0.33434,0.89,00:39


In [None]:
#slow
# Show samples with target and predicted category (presumably rendered by the `TransTensorText` `show_results` overload above -- TODO confirm)
learn.show_results()

Unnamed: 0,text,category,category_
0,"the trouble with the book, "" memoirs of a geisha "" is that it had japanese surfaces but underneath the surfaces it was all an american man's way of thinking. reading the book is like watching a magnificent ballet with great music, sets, and costumes yet performed by barnyard animals dressed in those costumesso far from japanese ways of thinking were the characters. < br / > < br / > the movie isn't about japan or real geisha. it is a story about a few american men's mistaken ideas about japan and geisha filtered through their own ignorance and misconceptions. so what is this movie if it isn't about japan or geisha? is it pure fantasy as so many people have said? yes, but then why make it into an american fantasy? < br / > < br / > there were so many missed opportunities. imagine a culture",negative,negative
1,"< br / > < br / > i'm sure things didn't exactly go the same way in the real life of homer hickam as they did in the film adaptation of his book, rocket boys, but the movie "" october sky "" ( an anagram of the book's title ) is good enough to stand alone. i have not read hickam's memoirs, but i am still able to enjoy and understand their film adaptation. the film, directed by joe johnston and written by lewis colick, records the story of teenager homer hickam ( jake gyllenhaal ), beginning in october of 1957. it opens with the sound of a radio broadcast, bringing news of the russian satellite sputnik, the first artificial satellite in orbit. we see a images of a blue - gray town and its people : mostly miners working for the olga coal company. one of the miners",positive,positive
2,"how viewers react to this new "" adaption "" of shirley jackson's book, which was promoted as not being a remake of the original 1963 movie ( true enough ), will be based, i suspect, on the following : those who were big fans of either the book or original movie are not going to think much of this one... and those who have never been exposed to either, and who are big fans of hollywood's current trend towards "" special effects "" being the first and last word in how "" good "" a film is, are going to love it. < br / > < br / > things i did not like about this adaption : < br / > < br / > 1. it was not a true adaption of the book. from the articles i had read, this movie was supposed to cover other",negative,negative
3,"to review this movie, i without any doubt would have to quote that memorable scene in tarantino's "" pulp fiction "" ( 1994 ) when jules and vincent are talking about mia wallace and what she does for a living. jules tells vincent that the "" only thing she did worthwhile was pilot "". vincent asks "" what the hell is a pilot? "" and jules goes into a very well description of what a tv pilot is : "" well, the way they make shows is, they make one show. that show's called a'pilot '. then they show that show to the people who make shows, and on the strength of that one show they decide if they're going to make more shows. some pilots get picked and become television programs. some don't, become nothing. she starred in one of the ones that became nothing. "" now to stretch",negative,negative
4,"bonanza had a great cast of wonderful actors. lorne greene, pernell whitaker, michael landon, dan blocker, and even guy williams ( as the cousin who was brought in for several episodes during 1964 to replace adam when he was leaving the series ). the cast had chemistry, and they seemed to genuinely like each other. that made many of their weakest stories work a lot better than they should have. it also made many of their best stories into great western drama. < br / > < br / > like any show that was shooting over thirty episodes every season, there are bound to be some weak ones. however, most of the time each episode had an interesting story, some kind of conflict, and a resolution that usually did not include violence. while bonanza was a western, the gunfighting was never featured as the main attraction. while i am",positive,positive
5,"we saw the silent version of this film, and it is quite simply shimmeringly beautiful. it's quite hard to see how a sound version could have been created, since it is shot with pure silent technique, long wordless sweeps of narrative without a single intertitle - - save for a few disconcerting sequences where louise brooks, playing a french typist, is quite visibly speaking in english... the only section that obviously cries out for sound is the final scene, where brooks is watching the rushes for her test'for a sound film': footage which plays constantly in the background as the action unfolds, with her mouth moving in ceaseless soundless song. i was unsurprised to learn afterwards that this passage alone in the talkie version had been hailed as an exemplar of new technique! < br / > < br / > in the sunny beauty of its opening scenes and",positive,positive
6,"i couldn't believe that this movie dates from 2007, it had all the looks of a below - average seventies horror - flick. didn't they have any knowledge of modern special effects or cgi?!? didn't they know that in the post - millennium the violence in a supposed horror - and / or scifi - movie should at least be a little bit graphic? or did i get the purpose wrong, was it supposed to be a deep and meaningful story of man and animal, bound together in the big cycle of life, or a warning to mankind not to mess with nature, or something like that?? it doesn't really matter, either way it turned out wrong and to me this movie failed on all accounts. < br / > < br / > first of all : the premise is very improbable. if at a given time you're capable",negative,negative
7,"for me the only reason for having a look at this remake was to see how bad and funny it could be. there was no doubt about it being funny and bad, because i had seen "" voyna i mir "" ( 1968 ). shall we begin? here we go... < br / > < br / > robert dornhelm & brendan donnison's pierre bezukhov - a lean fellow that lacks the depth of the original ; robert dornhelm & brendan donnison's natasha rostova - a scarecrow, her image can cause insomnia ; robert dornhelm & brendan donnison's andrej bolkonsky - an ok incarnation which, like the lean fellow ( cf. above ), lacks depth of a russian soul and "" struggle within "" ; robert dornhelm & brendan donnison's napoleon - a rather unimpressive leader ; robert dornhelm & brendan donnison's prince bolkonsky - a turd with an english",negative,negative
8,"i don't usually write a comment when there are so many others but this time i feel i have to. i have spoken of taste in another review, saying it's all in the eye of the beholder but when it comes to this film, if you like it, it simply means you have bad taste. < br / > < br / > i love films. i loved "" isle of the dead "" which is pretty much an unknown b & w film. i even liked "" scream "" and "" scary movie "" i liked these films because they have, if not a lot, at least something good about them. i appreciate 99. 9 % of the films i've seen because they tell a story which i haven't heard before, and most directors only make films with a good storyline. throughout this film i was thinking "" where",negative,negative


In [None]:
#hide
#slow
# Same pipeline as the sequence classification example above, but with `with_labels=True` on the text block.
# NOTE(review): presumably this puts the labels into the model inputs so the model returns its own loss,
# which `TransCallback.after_pred` then picks up -- confirm against `TransformersTextBlock`.
path = untar_data(URLs.IMDB_SAMPLE)
texts = pd.read_csv(path/'texts.csv')

model_name = 'google/electra-small-discriminator'
max_len = 128  # NOTE(review): not referenced by any of the visible cells
bs = 8
val_bs = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer, with_labels=True), CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())
dls = dblock.dataloaders(texts, bs=bs, val_bs=val_bs)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
learn = TransLearner(dls, model, metrics=accuracy).to_fp16()
learn.fit(2, 5e-5)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

epoch,train_loss,valid_loss,accuracy,time
0,0.597677,0.41316,0.86,00:37
1,0.420096,0.47256,0.8,00:38


## Fin

In [None]:
#hide
# Run nbdev's notebook-to-script conversion (the output below lists the converted notebooks)
from nbdev.export import notebook2script; notebook2script()

Converted 00_data.ipynb.
Converted 01_learner.ipynb.
Converted 10_examples.classification-imdb.ipynb.
Converted 11_examples.mlm-imdb.ipynb.
Converted 12_examples.glue-benchmark.ipynb.
Converted 12a_examples.glue-benchmark-sweeps.ipynb.
Converted index.ipynb.
