In [None]:
# default_exp data

In [None]:
#default_cls_lvl 3

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastcore.all import *
from fastai.basics import Transform, ItemTransform
from fastai.text.all import *
from functools import partial
from transformers import (AutoTokenizer, AutoConfig, BatchEncoding,
                          DataCollatorForLanguageModeling,
                          DataCollatorForWholeWordMask)

# Data

> Transforms and DataBlocks.

## Utils

In [None]:
#export
def get_splits(dataset, train='train', valid='validation'):
    "Index lists for the `train` and `valid` parts of `dataset`, with validation indices following the training ones"
    n_train = len(dataset[train])
    n_valid = len(dataset[valid])
    train_idxs = L(range(n_train))
    # validation indices continue right after the last training index
    valid_idxs = L(range(n_train, n_train + n_valid))
    return train_idxs, valid_idxs

# def DatasetSplitter(train='train', valid='validation'):
#     return partial(get_splits, train=train, valid=valid)

In [None]:
#export
class TextGetter(ItemTransform):
    "Pick one field (`s1`) or a pair of fields (`s1`, `s2`) out of a sample"
    def __init__(self, s1='text', s2=None):
        self.s1 = s1
        self.s2 = s2

    def encodes(self, sample):
        # single-field mode returns the bare value, pair mode returns a 2-tuple
        first = sample[self.s1]
        if self.s2 is not None: return first, sample[self.s2]
        return first

In [None]:
#export
@typedispatch
def show_batch(x:TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    """Show decoded text samples as a dataframe.

    The first element of every item in `samples` is either a single text or a tuple of
    two texts; a pair is flattened into two leading columns. When `trunc_at` is not
    None, each text column is shortened by calling its `truncate(trunc_at)` method.
    """
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    n_texts = 1
    if isinstance(samples[0][0], tuple):
        # text pairs arrive nested in position 0: unpack them into two columns
        n_texts = 2
        samples = L((*s[0], *s[1:]) for s in samples)
    if trunc_at is not None:
        # truncate every text column exactly once, leave the remaining columns untouched
        samples = L((*(t.truncate(trunc_at) for t in s[:n_texts]), *s[n_texts:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))

In [None]:
#export
def find_first(t, e):
    "Index of the first item of `t` equal to `e`, or `None` when there is no such item"
    matches = (pos for pos, item in enumerate(t) if item == e)
    return next(matches, None)
        
def split_by_sep(t, sep_tok_id):
    """Split `t` at the first occurrence of `sep_tok_id`.

    Returns `(head, tail)`: `head` holds everything before the separator and `tail`
    starts with the separator itself. When the separator is absent, `head` is all of
    `t` and `tail` is an empty slice of `t`.
    """
    idx = find_first(t, sep_tok_id)
    # `find_first` yields None when nothing matches; None as a slice bound means
    # "no bound", which would return the whole sequence twice
    if idx is None: return t, t[:0]
    return t[:idx], t[idx:]

## Transforms

In [None]:
#export
class TokTransform(Transform):
    "Tokenizes single piece of text using pretrained tokenizer"
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, is_lm=False,
                 padding=False, truncation=False, max_length=None,
                 pre_tokenized=False, **kwargs):
        # Rebind the local name (not just the attribute): `store_attr()` runs afterwards
        # with this value in scope, exactly as in the original ordering.
        tokenizer = (tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
                     if tokenizer is None else tokenizer)
        self.kwargs = kwargs
        self.tokenizer = tokenizer
        store_attr()

    def encodes(self, x):
        # input flagged as already tokenized is passed through untouched
        if self.pre_tokenized: return x
        return self.tokenizer(x,
                              add_special_tokens=True,
                              padding=self.padding,
                              truncation=self.truncation,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)

    def decodes(self, x:TensorText):
        # special tokens are kept in the decoded string
        text = self.tokenizer.decode(x.cpu(), skip_special_tokens=False)
        return TitledStr(text)

In [None]:
#export
class TokBatchTransform(Transform):
    """
    Tokenizes texts in batches using pretrained HuggingFace tokenizer.
    The first element in a batch can be single string or 2-tuple of strings.
    If `with_labels=True` the "labels" are added to the output dictionary.
    If `do_targets=True` and the second element of a sample is a string, it is tokenized
    as the target text and stored under "labels".
    Batches that contain texts only (no targets) are accepted whatever these flags are.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None, 
                 do_targets=False, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        self.kwargs = kwargs
        # switched on by `encodes` once a batch of text pairs is seen; read by `decodes`
        self._two_texts = False
        store_attr()
    
    def encodes(self, batch):
        # batch is a list of tuples of ({text or (text1, text2)}, {targets...})
        # The sample itself is inspected first so that a bare (possibly empty) string is
        # never indexed.
        if not is_listy(batch[0]): # batch is list of texts
            texts = (list(batch),)
            batch = [(s, ) for s in batch]
        elif is_listy(batch[0][0]): # 1st element is tuple
            self._two_texts = True
            texts = ([s[0][0] for s in batch], [s[0][1] for s in batch])
        else:
            texts = ([s[0] for s in batch],)
        inps = self.tokenizer(*texts,
                              add_special_tokens=True,
                              padding=self.padding,
                              truncation=self.truncation,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)

        # text-only batches (e.g. at inference) carry nothing beyond the input
        has_targets = len(batch[0]) > 1
        if self.do_targets and has_targets and isinstance(batch[0][1], str):
            target_texts = [s[1] for s in batch]
            with self.tokenizer.as_target_tokenizer():
                targets = self.tokenizer(target_texts,
                                  padding=self.padding,
                                  truncation=self.truncation,
                                  max_length=self.max_length,
                                  return_tensors='pt', 
                                  **self.kwargs).input_ids
            inps['labels'] = targets
            return (inps, )

        # inps are batched, collate targets into batches too
        labels = default_collate([s[1:] for s in batch]) if has_targets else []
        if self.with_labels and len(labels) > 0:
            # TODO consider cases when there are multiple labels
            inps['labels'] = labels[0]
            return (inps, )
        return (inps, ) + tuple(labels)
    
    def decodes(self, x:TensorText):
        if self._two_texts:
            # text pairs are shown as two separate strings, split at the first separator token
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStr(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStr(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        return TitledStr(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [None]:
#export
class PadBatchTransform(Transform):
    """
    Pads already tokenized samples into a batch using the tokenizer's `pad` method.
    The first element of every sample is passed to `pad`; remaining elements are collated.
    If `with_labels=True` the first collated target is added to the output dictionary
    under "labels", mirroring `TokBatchTransform`.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None,
                 do_targets=False, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        self.kwargs = kwargs
        # NOTE(review): never set to True inside this class; `decodes` only takes the
        # two-text path if something external flips it
        self._two_texts = False
        store_attr()

    def encodes(self, samples):
        toks = [s[0] for s in samples]
        inps = self.tokenizer.pad(toks,
                              padding=self.padding,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)

        # inps are batched, collate targets into batches too
        # (samples without targets yield no labels at all)
        has_targets = len(samples[0]) > 1
        labels = default_collate([s[1:] for s in samples]) if has_targets else []

        if self.with_labels and len(labels) > 0:
            # same convention as `TokBatchTransform`: labels travel inside the input dict
            inps['labels'] = labels[0]
            return (inps, )
        return (inps, ) + tuple(labels)

    def decodes(self, x:TensorText):
        if self._two_texts:
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStr(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStr(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        return TitledStr(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [None]:
#export
def untuple(l):
    "List made of the first element of every item in `l`"
    firsts = []
    for item in l:
        firsts.append(item[0])
    return firsts

def to_tuple(x):
    "Wrap `x` into a 1-tuple"
    wrapped = tuple([x])
    return wrapped

TODOs:
- verify that CLM works as well, and maybe rename `masking_func`, since it would not be used only for masking
- add permutation LM

In [None]:
#export
class LMBatchTfm(Transform):
    "Collates batch of pretokenized and chunked inputs into a batch and creates labels as defined by `masking_func`"
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False, mlm_probability=0.15):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        if masking_func is None:
            # default to one of the HuggingFace collators when no custom function is given
            collator_cls = DataCollatorForWholeWordMask if whole_word_masking else DataCollatorForLanguageModeling
            # keyword arguments keep the call independent of the collators' field order
            masking_func = collator_cls(tokenizer, mlm=mlm, mlm_probability=mlm_probability)
        self.masking_func = masking_func
        # list of 1-tuples -> list of items -> `masking_func` -> 1-tuple holding its result
        self.batch_processor = compose(untuple, masking_func, to_tuple)
            
    def encodes(self, b):
        # we get list of tuples but need a list of dicts
        return self.batch_processor(b)
    
    def decodes(self, b:(dict, BatchEncoding)):
        # without `input_ids` there is nothing to turn into text: hand the batch back
        # unchanged instead of failing on an unbound local
        if 'input_ids' not in b: return b
        return TensorText(b['input_ids'])

In [None]:
#export
class Undict(ItemTransform):
    "Decode-only transform: unpacks `input_ids` (as `TensorText`) and `labels` from the dict in first position of a batch"
    
    def decodes(self, b):
        # this is done hacky way to make show_batch work both when labels are separate and when in dict
        # should be a better way
        x = b[0]
        # start from an empty tuple so a dict without `input_ids` does not hit an unbound `res`
        res = ()
        if 'input_ids' in x: res += (TensorText(x['input_ids']), )
        if 'labels' in x: res += (x['labels'], )
        return res + tuple(b[1:])

In [None]:
#export 
class UndictS2S(ItemTransform):
    "Decode-only transform for text targets: unpacks both `input_ids` and `labels` as `TensorText` from the dict in first position of a batch"

    def decodes(self, b):
        x = b[0]
        # start from an empty tuple so a dict without `input_ids` does not hit an unbound `res`
        res = ()
        if 'input_ids' in x: res += (TensorText(x['input_ids']), )
        # labels are token ids here, so they are wrapped as text too
        if 'labels' in x: res += (TensorText(x['labels']), )
        return res + tuple(b[1:])

## DataBlocks

In [None]:
#export
class TransformersTextBlock(TransformBlock):
    "A `TransformBlock` for texts using pretrained tokenizers from Huggingface"
    @delegates(TokBatchTransform)
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, preprocessed=False, do_targets=False, 
                 **kwargs):
        # preprocessed data only needs padding; raw texts are tokenized batch by batch
        if preprocessed: batch_tfm_cls = PadBatchTransform
        else: batch_tfm_cls = TokBatchTransform
        before_batch_tfm = batch_tfm_cls(pretrained_model_name=pretrained_model_name,
                                         tokenizer_cls=tokenizer_cls,
                                         config=config,
                                         tokenizer=tokenizer,
                                         do_targets=do_targets,
                                         **kwargs)
        loader_kwargs = {'before_batch': before_batch_tfm,
                         'create_batch': fa_convert}
        # text targets need the variant that also wraps the labels as text
        undict_tfm = UndictS2S() if do_targets else Undict()
        return super().__init__(dl_type=SortedDL,
                                dls_kwargs=loader_kwargs,
                                batch_tfms=undict_tfm)

#     @classmethod
#     def from_pretrained(cls, ):
#         pass

#     @classmethod
#     def from_tokenizer(cls, ):
#         pass

#     @classmethod
#     def from_config(cls, ):
#         pass

### DataLoaders for classification

In [None]:
#slow
# Fetch the IMDB sample and read its `texts.csv` into a dataframe
path = untar_data(URLs.IMDB_SAMPLE)
texts = pd.read_csv(path/'texts.csv')

# Example settings
# NOTE(review): `max_len` is defined here but not referenced by the cells shown below
model_name = 'distilbert-base-uncased'
max_len = 128
bs = 8
val_bs = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inputs come from the `text` column and go through `TransformersTextBlock`;
# targets come from the `label` column and go through `CategoryBlock`
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer),
                             CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())

In [None]:
#slow
#hide
# dblock.summary(texts)

In [None]:
#slow
# Build the dataloaders from the dataframe and display a few decoded samples
dls = dblock.dataloaders(texts, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"many neglect that this isn't just a classic due to the fact that it's the first 3d game, or even the first shoot -'em - up. it's also one of the first stealth games, one of the only ( and definitely the first ) truly claustrophobic games, and just a pretty well - rounded gaming experience in general. with graphics that are terribly dated today, the game thrusts you into the role of b. j. ( don't even * think * i'm going to attempt spelling his last name! ), an american p. o. w. caught in an underground bunker. you fight and search your way through tunnels in order to achieve different objectives for the six episodes ( but, let's face it, most of them are just an excuse to hand you a weapon, surround you with nazis and send you out to waste one of the nazi leaders",positive
2,"i really wanted to love this show. i truly, honestly did. < br / > < br / > for the first time, gay viewers get their own version of the "" the bachelor "". with the help of his obligatory "" hag "" andra, james, a good looking, well - to - do thirty - something has the chance of love with 15 suitors ( or "" mates "" as they are referred to in the show ). the only problem is half of them are straight and james doesn't know this. if james picks a gay one, they get a trip to new zealand, and if he picks a straight one, straight guy gets $ 25, 000. how can this not be fun?! take my hand, lets stroll : < br / > < br / > the most glaring problem with this show is the bachelor himself.",negative
3,"this film sat on my tivo for weeks before i watched it. i dreaded a self - indulgent yuppie flick about relationships gone bad. i was wrong ; this was an engrossing excursion into the screwed - up libidos of new yorkers. < br / > < br / > the format is the same as max ophuls'"" la ronde, "" based on a play by arthur schnitzler, who is given an "" inspired by "" credit. it starts from one person, a prostitute, standing on a street corner in brooklyn. she is picked up by a home contractor, who has sex with her on the hood of a car, but can't come. he refuses to pay her. when he's off peeing, she answers his cell phone and takes a message. she runs away with his keys. < br / > < br / > then the story switches to",positive


In [None]:
#slow
#hide
# Look at the raw batch: here the labels are a separate element next to the input dict
b = dls.one_batch()
b

({'input_ids': tensor([[  101,  6274,  5125,  ...,  1998,  2130,   102],
          [  101,  1996,  4497,  ...,  2090,  1005,   102],
          [  101,  2116, 19046,  ...,  1010,  2004,   102],
          ...,
          [  101,  1000,  1996,  ...,  3683,  1010,   102],
          [  101,  2023,  2003,  ...,  2137,  1999,   102],
          [  101,  1045,  2123,  ...,  3773,  2023,   102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')},
 TensorCategory([0, 1, 1, 0, 0, 1, 1, 0], device='cuda:0'))

HuggingFace models can compute the loss themselves. To use the loss computed by the model, pass `with_labels=True` to the `TransformersTextBlock` constructor. The `show_batch` output does not change, but the labels are now stored in the `dict` object that is the first element of the batch.

In [None]:
#slow
# Same data block as before, except `with_labels=True` is passed to the text block
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer, with_labels=True), CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())
dls = dblock.dataloaders(texts, bs=8)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any other ever told, * could * happen to you... a feeling you don't often get from other romantic comedies, however sweet and heart - warming they may be. < br / > < br / > alfred kralik ( james stewart ) and clara novak ( margaret",positive
2,"many neglect that this isn't just a classic due to the fact that it's the first 3d game, or even the first shoot -'em - up. it's also one of the first stealth games, one of the only ( and definitely the first ) truly claustrophobic games, and just a pretty well - rounded gaming experience in general. with graphics that are terribly dated today, the game thrusts you into the role of b. j. ( don't even * think * i'm going to attempt spelling his last name! ), an american p. o. w. caught in an underground bunker. you fight and search your way through tunnels in order to achieve different objectives for the six episodes ( but, let's face it, most of them are just an excuse to hand you a weapon, surround you with nazis and send you out to waste one of the nazi leaders",positive
3,"i really wanted to love this show. i truly, honestly did. < br / > < br / > for the first time, gay viewers get their own version of the "" the bachelor "". with the help of his obligatory "" hag "" andra, james, a good looking, well - to - do thirty - something has the chance of love with 15 suitors ( or "" mates "" as they are referred to in the show ). the only problem is half of them are straight and james doesn't know this. if james picks a gay one, they get a trip to new zealand, and if he picks a straight one, straight guy gets $ 25, 000. how can this not be fun?! take my hand, lets stroll : < br / > < br / > the most glaring problem with this show is the bachelor himself.",negative


In [None]:
#slow
#hide
# Look at the raw batch: the labels now sit inside the input dict under `labels`
b = dls.one_batch()
b

({'input_ids': tensor([[ 101, 6274, 5125,  ..., 1998, 2130,  102],
          [ 101, 2085, 2008,  ..., 2839, 1025,  102],
          [ 101, 2023, 2143,  ..., 1037, 2265,  102],
          ...,
          [ 101, 2024, 2017,  ..., 2015, 2000,  102],
          [ 101, 1000, 2298,  ..., 2486, 6980,  102],
          [ 101, 1045, 2018,  ..., 1007, 1012,  102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'),
  'labels': TensorCategory([0, 0, 1, 0, 0, 0, 0, 0], device='cuda:0')},)

## Language modeling

In [None]:
#export
class TransformersLMBlock(TransformBlock):
    "A `TransformBlock` for language modeling on texts using pretrained tokenizers from Huggingface"
    # @delegates
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False, 
                 mlm_probability=0.15, **kwargs):
        # Load the tokenizer once and share it between the item and batch transforms.
        # Extra `kwargs` are forwarded to `TokTransform`.
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        tok_tfm = TokTransform(pretrained_model_name=pretrained_model_name, tokenizer_cls=tokenizer_cls, 
                 config=config, tokenizer=tokenizer, return_special_tokens_mask=True, is_lm=True, **kwargs)
        
        batch_tfms = LMBatchTfm(pretrained_model_name, tokenizer_cls, config, tokenizer, 
                                mlm=mlm, masking_func=masking_func, whole_word_masking=whole_word_masking,
                                mlm_probability=mlm_probability)
        # The former unused `create_batch` local was dropped: it built a collator from the
        # raw `tokenizer` argument and failed when only `pretrained_model_name` was given.
        return super().__init__(dl_type=TfmdDL,
                                type_tfms=tok_tfm,
                                batch_tfms=batch_tfms,
                                dls_kwargs={'create_batch': fa_convert},
                               )

### Dataloaders for language modeling

In [None]:
#hide
import datasets

In [None]:
#export
def tokenize(batch, tok=None):
    """Tokenize the `text` field of `batch`, returning attention and special-tokens masks too.

    `tok` is the tokenizer callable to use. When omitted, the module-level `tokenizer`
    is used, which must then be defined before this function is called.
    """
    # direct reference to the global keeps the original single-argument usage working
    if tok is None: tok = tokenizer
    return tok(batch['text'], return_attention_mask=True, return_special_tokens_mask=True, verbose=False)

def group_texts(examples, chunk_size=None):
    """Concatenate the lists stored under every key of `examples` and cut them into equal chunks.

    `examples` maps each key to a list of lists. For every key the inner lists are joined
    and then split into consecutive pieces of `chunk_size` items; the trailing remainder
    shorter than `chunk_size` is dropped. The length used for the cut is taken from the
    first key. An empty `examples` gives an empty dict.
    When `chunk_size` is omitted, the module-level `block_size` is used, which must then
    be defined before this function is called.
    """
    from itertools import chain
    # direct reference to the global keeps the original single-argument usage working
    if chunk_size is None: chunk_size = block_size
    # Concatenate all texts (in linear time, unlike repeated list addition).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    if not concatenated_examples: return {}
    total_length = len(next(iter(concatenated_examples.values())))
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of `chunk_size`.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [None]:
#slow
# Example settings; `tokenizer` is the global read by `tokenize` when it is called
# with a batch only
path = untar_data(URLs.IMDB_SAMPLE)
model_name = 'distilbert-base-uncased'
max_length = 128
bs = 8
val_bs = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Read the csv as a `datasets.Dataset` and tokenize it, dropping the original columns
ds = datasets.Dataset.from_csv((path/'texts.csv').as_posix())
ds = ds.map(tokenize, remove_columns=ds.column_names)
# `block_size` is the global read by `group_texts`; it has to be set before the map call below
block_size = max_length
lm_ds = ds.map(group_texts, batched=True, batch_size=1000)

Using custom data configuration default-baca2dc48733f0f6
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-baca2dc48733f0f6/0.0.0)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-baca2dc48733f0f6/0.0.0/cache-032c7a403e6a8cc8.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-baca2dc48733f0f6/0.0.0/cache-bd4c0fdd3d96b02e.arrow


In [None]:
#slow
# A single LM block (no separate target block), split with `RandomSplitter`
dblock = DataBlock(blocks=[TransformersLMBlock(tokenizer=tokenizer)],
                   splitter=RandomSplitter())

In [None]:
#slow
#hide
# dblock.summary(lm_ds)

In [None]:
#slow
# Build the dataloaders from the chunked dataset and display a few decoded samples
dls = dblock.dataloaders(lm_ds, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text
0,"actually kinda funny. after awhile the pool shot feels like a [MASK] channel [MASK] s station identification logo, reminding us that we are watching "" grey matter "". < br / > < br / > i also enjoyed [MASK] boutshore name - calling. at one [MASK] an angry test subject taunts somebody in charge by [MASK] her a "" scientific b * tch! "". it'ambrose just a [MASK] inadequate [MASK]. several scenes later a different subject lets off steam by muttering about that "" scientific b * * tard! "". it just sounded very awkward to me. < br / > < br / > someday this movie [MASK] disappear forever"
1,": the [MASK] men [MASK] all hotter than the [MASK] guys. < br [MASK] > < br / [MASK] don [MASK] t get me wrong, im not saying all the gay guys were ugly and [MASK], [MASK] [MASK] [MASK] of fact i found some of them [MASK] cute. it's just that overall they were just blah compared to the men you [MASK] purse see on shows like avocation at love with [MASK]a tequila or [MASK] bachelor [MASK]. < br / [MASK] [MASK] br / > i don't know how many times i hit fast [MASK] during this show. i can accept a lead character as interesting as a cardboard [MASK], i can accept the mundane"
2,"i 1789 ever experienced. [MASK] possible line in the movie [MASK] unoriginal [MASK] cliche, or just [MASK] stupid. for [MASK] the name of the camp is "" camp blood "" ( lame ), the name of the clown is "" the [MASK] clown "" ( lame ). what is a clown doing in a forest anyway? was that [MASK] only mask they could find? 3. the last but certainly the least was the acting [MASK] [MASK] the worst [MASK] of actors and actresses ever assembled. a [MASK] cornucopia of shitty lines and poor acting. worst part by far was when [MASK] [MASK] flash back to this fat foreign girl getting naked for"
3,"s daughter )! as in the later film, too, fanfan ( [MASK] ideally cast gerard philipe [MASK], ironically, is so full of life here that one [MASK] it [MASK] [MASK] believe that he would be stricken down by cancer within 7 years'time ) [MASK] flanked by two fun - [MASK] yet [MASK]ly men ( one of them is actually his superior [MASK] and [MASK] heroine's own father ) and opposed [MASK] an unscrupulous figure within [MASK] own ranks ( the [MASK] [MASK] ro [MASK]vert, with whom the hero [MASK] engages in a rooftop duel since [MASK] too has amorous designs [MASK] the gypsy girl )! ; for the record"


In [None]:
#slow
#hide
# Look at the raw batch: a single dict holding `attention_mask`, `input_ids` and `labels`
b = dls.one_batch()
b

({'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]), 'input_ids': tensor([[ 2015,  2021,  2002,  ...,  2028, 16278,  2129],
         [ 2685,  2024,  1037,  ...,  1037,  4393,  1010],
         [ 2020,  2525,  5936,  ...,  3036,  6481,  1997],
         ...,
         [ 4516,  8223, 23242,  ..., 17529, 10652,  5092],
         [ 2065,  2023,   103,  ...,   103,  2478,  7658],
         [ 2369,  1996,  2616,  ...,  2166,  1010,  1998]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         ...,
         [ -100,  -100,  -100,  ...,  -100, 12954,  -100],
         [ -100,  -100,  2276,  ...,  2003,  -100,  -100],
         [ -100,  -100,  -100,  ...,  -100,  -100,  -10

## Fin

In [None]:
#hide
# Export the notebooks to library modules with nbdev
from nbdev.export import notebook2script; notebook2script()

Converted 00_data.ipynb.
Converted 01_learner.ipynb.
Converted 10_examples.classification-imdb.ipynb.
Converted 11_examples.mlm-imdb.ipynb.
Converted 12_examples.glue-benchmark.ipynb.
Converted 12a_examples.glue-benchmark-sweeps.ipynb.
Converted index.ipynb.
