In [None]:
# default_exp data

In [None]:
#default_cls_lvl 3

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastcore.all import *
from fastai.basics import Transform, ItemTransform
from fastai.text.all import *
from functools import partial
from transformers import (AutoTokenizer, AutoConfig, BatchEncoding,
                          DataCollatorForLanguageModeling,
                          DataCollatorForWholeWordMask)

# Data

> Transforms and DataBlocks.

## Utils

In [None]:
#export
def get_splits(dataset, train='train', valid='validation'):
    "Index lists for the `train` and `valid` parts of `dataset`, numbered as if `valid` directly followed `train`"
    n_train = len(dataset[train])
    n_valid = len(dataset[valid])
    train_idxs = L(range(n_train))
    # validation indices continue right after the last training index
    valid_idxs = L(range(n_train, n_train + n_valid))
    return train_idxs, valid_idxs

# def DatasetSplitter(train='train', valid='validation'):
#     return partial(get_splits, train=train, valid=valid)

In [None]:
#export
class TextGetter(ItemTransform):
    "Reads the field `s1` from a sample, or the pair of fields (`s1`, `s2`) when `s2` is given"
    def __init__(self, s1='text', s2=None):
        self.s1 = s1
        self.s2 = s2

    def encodes(self, sample):
        # two configured fields -> 2-tuple of values, a single field -> the bare value
        if self.s2 is not None:
            return sample[self.s1], sample[self.s2]
        return sample[self.s1]

In [None]:
#export
@typedispatch
def show_batch(x:TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    """
    Display decoded text `samples` as a dataframe.
    Samples whose first element is a tuple (two-text inputs) are flattened so that every text
    gets its own column. When `trunc_at` is not None each text is shortened with `.truncate(trunc_at)`.
    """
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    # guard the peek at the first sample so that an empty `samples` does not raise IndexError
    paired = len(samples) > 0 and isinstance(samples[0][0], tuple)
    if paired: samples = L((*s[0], *s[1:]) for s in samples)
    if trunc_at is not None:
        # truncate every text column exactly once: two leading columns for paired inputs, one otherwise
        n_texts = 2 if paired else 1
        samples = L((*(t.truncate(trunc_at) for t in s[:n_texts]), *s[n_texts:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))

In [None]:
#export
def find_first(t, e):
    "Position of the first item of `t` that equals `e`; `None` when no item matches"
    return next((pos for pos, item in enumerate(t) if item == e), None)
        
def split_by_sep(t, sep_tok_id):
    """
    Split the sequence `t` at the first occurrence of `sep_tok_id`.
    Returns `(before, rest)` where `rest` starts with the separator itself.
    When the separator is not present the result is `(t, <empty slice of t>)`.
    """
    idx = find_first(t, sep_tok_id)
    # `find_first` yields None for a missing separator; slicing with None would
    # hand back the whole sequence for both halves
    if idx is None: return t, t[:0]
    return t[:idx], t[idx:]

## Transforms

In [None]:
#export
class TokTransform(Transform):
    """
    Item transform that holds a pretrained tokenizer.
    `encodes` hands its input back untouched (the tokenizer call is disabled here),
    `decodes` converts token ids to text and keeps special tokens.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, is_lm=False,
                 padding=False, truncation=False, max_length=None,
                 **kwargs):
        # build the tokenizer from its pretrained name only when none was supplied
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer, self.kwargs = tokenizer, kwargs
        store_attr()

    def encodes(self, text):
        # pass-through: no tokenization is applied at this stage
        return text

    def decodes(self, x:TensorText):
        decoded = self.tokenizer.decode(x.cpu(), skip_special_tokens=False)
        return TitledStr(decoded)

In [None]:
#export
class TokBatchTransform(Transform):
    """
    Tokenizes texts in batches using pretrained HuggingFace tokenizer.
    The first element in a batch can be single string or 2-tuple of strings.
    If `with_labels=True` the "labels" are added to the output dictionary.
    If `do_targets=True` and the second element of a sample is a string, the target strings are
    tokenized (without special tokens) and their `input_ids` are used as the labels.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None,
                 do_targets=False, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        self.kwargs = kwargs      # extra keyword arguments forwarded to every tokenizer call
        self._two_texts = False   # set by `encodes` on paired inputs, read by `decodes`
        store_attr()

    def encodes(self, batch):
        # batch is a list of tuples of ({text or (text1, text2)}, {targets...})
        if is_listy(batch[0][0]): # 1st element is tuple
            self._two_texts = True
            texts = ([s[0][0] for s in batch], [s[0][1] for s in batch])
        elif is_listy(batch[0]):
            texts = ([s[0] for s in batch],)
        else: # batch is list of texts
            texts = (list(batch),)
            batch = [(s, ) for s in batch]
        inps = self.tokenizer(*texts,
                              add_special_tokens=True,
                              padding=self.padding,
                              truncation=self.truncation,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)

        # text-only samples have nothing at position 1, so check the length before peeking
        has_targets = len(batch[0]) > 1
        if self.do_targets and has_targets and isinstance(batch[0][1], str):
            target_texts = [s[1] for s in batch]
            # NOTE(review): padded target positions keep the tokenizer's pad id -
            # confirm whether they should be replaced by the loss ignore index
            targets = self.tokenizer(target_texts,
                                     add_special_tokens=False,
                                     padding=self.padding,
                                     truncation=self.truncation,
                                     max_length=self.max_length,
                                     return_tensors='pt',
                                     **self.kwargs).input_ids
            # whatever follows the target text is collated the usual way
            extra = default_collate([s[2:] for s in batch]) if len(batch[0]) > 2 else []
            labels = (targets, *extra)
        else:
            # inps are batched, collate targets into batches too
            labels = default_collate([s[1:] for s in batch])

        if self.with_labels and len(labels) > 0:
            # TODO consider cases when there are multiple labels
            inps['labels'] = labels[0]
            return (inps, )
        return (inps, ) + tuple(labels)

    def decodes(self, x:TensorText):
        if self._two_texts:
            # paired inputs: split at the first separator token and decode both halves
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStr(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStr(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        return TitledStr(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [None]:
#export
class PadBatchTransform(Transform):
    """
    Pads already tokenized samples into a batch with `tokenizer.pad`.
    The first element of every sample is handed to the tokenizer, the remaining elements
    are collated with `default_collate`.
    If `with_labels=True` the first collated label is stored under "labels" in the output dictionary.
    `truncation`, `is_lm` and `do_targets` are accepted for signature parity with
    `TokBatchTransform` but are not used by this transform.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None,
                 do_targets=False, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        self.kwargs = kwargs      # extra keyword arguments forwarded to `tokenizer.pad`
        self._two_texts = False   # never set to True in this class, so `decodes` takes the single-text path
        store_attr()

    def encodes(self, samples):
        toks = [s[0] for s in samples]
        inps = self.tokenizer.pad(toks,
                              padding=self.padding,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)

        # inps are batched, collate targets into batches too
        labels = default_collate([s[1:] for s in samples])

        if self.with_labels and len(labels) > 0:
            # same convention as `TokBatchTransform`: the first label moves into the inputs dict
            inps['labels'] = labels[0]
            return (inps, )
        return (inps, ) + tuple(labels)

    def decodes(self, x:TensorText):
        if self._two_texts:
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStr(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStr(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        return TitledStr(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [None]:
#export
def untuple(l):
    "List holding the first element of every item in `l`"
    firsts = []
    for item in l:
        firsts.append(item[0])
    return firsts

def to_tuple(x):
    "Wrap `x` in a 1-tuple"
    wrapped = x,
    return wrapped

TODOs:
- verify that CLM works as well, and maybe rename `masking_func`, since it would then not be used only for masking
- add permutation LM

In [None]:
#export
class LMBatchTfm(Transform):
    """
    Collates batch of pretokenized and chunked inputs into a batch and creates labels as defined by `masking_func`.
    When `masking_func` is None a HuggingFace data collator is built from the tokenizer:
    `DataCollatorForWholeWordMask` if `whole_word_masking` else `DataCollatorForLanguageModeling`.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False, mlm_probability=0.15):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        if masking_func is None:
            collator_cls = DataCollatorForWholeWordMask if whole_word_masking else DataCollatorForLanguageModeling
            # keyword arguments keep this independent of the collator's field order
            masking_func = collator_cls(tokenizer=tokenizer, mlm=mlm, mlm_probability=mlm_probability)
        self.masking_func = masking_func
        # list of 1-tuples -> list of samples -> collated/masked batch -> 1-tuple holding the batch
        self.batch_processor = compose(untuple, masking_func, to_tuple)

    def encodes(self, b):
        # we get list of tuples but need a list of dicts
        return self.batch_processor(b)

    def decodes(self, b:(dict, BatchEncoding)):
        # without "input_ids" there is nothing to decode here; hand the batch back unchanged
        if 'input_ids' not in b: return b
        return TensorText(b['input_ids'])

In [None]:
#export
class Undict(ItemTransform):
    "On decode, unpacks the inputs dictionary (first batch element) into `TensorText` input ids and, if present, labels"

    def decodes(self, b):
        # this is done hacky way to make show_batch work both when labels are separate and when in dict
        # should be a better way
        x = b[0]
        # nothing to unpack without input ids: leave the batch as it is
        if 'input_ids' not in x: return b
        res = (TensorText(x['input_ids']), )
        if 'labels' in x: res += (x['labels'], )
        return res + tuple(b[1:])

## DataBlocks

In [None]:
#export
class TransformersTextBlock(TransformBlock):
    "A `TransformBlock` for texts using pretrained tokenizers from Huggingface"
    @delegates(TokBatchTransform)
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, preprocessed=False, **kwargs):
        # `preprocessed` inputs only get padded, anything else is tokenized per batch
        if preprocessed: batch_tfm_cls = PadBatchTransform
        else: batch_tfm_cls = TokBatchTransform
        before_batch_tfm = batch_tfm_cls(pretrained_model_name=pretrained_model_name,
                                         tokenizer_cls=tokenizer_cls,
                                         config=config,
                                         tokenizer=tokenizer,
                                         **kwargs)
        dls_kwargs = dict(before_batch=before_batch_tfm, create_batch=fa_convert)
        super().__init__(dl_type=SortedDL, dls_kwargs=dls_kwargs, batch_tfms=Undict())

#     @classmethod
#     def from_pretrained(cls, ):
#         pass

#     @classmethod
#     def from_tokenizer(cls, ):
#         pass

#     @classmethod
#     def from_config(cls, ):
#         pass

### DataLoaders for classification

In [None]:
#slow
# Download the IMDB sample and read its csv of texts into a dataframe
path = untar_data(URLs.IMDB_SAMPLE)
texts = pd.read_csv(path/'texts.csv')

# Settings for the classification example
model_name = 'distilbert-base-uncased'
# NOTE(review): `max_len` is defined here but not passed to the block below - confirm whether
# `max_length=max_len` was intended
max_len = 128
bs = 8
val_bs = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inputs come from the 'text' column and targets from the 'label' column
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer),
                             CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())

In [None]:
#slow
#hide
# dblock.summary(texts)

In [None]:
#slow
# Build the dataloaders from the dataframe and preview a few decoded samples
dls = dblock.dataloaders(texts, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any other ever told, * could * happen to you... a feeling you don't often get from other romantic comedies, however sweet and heart - warming they may be. < br / > < br / > alfred kralik ( james stewart ) and clara novak ( margaret",positive
2,"i really wanted to love this show. i truly, honestly did. < br / > < br / > for the first time, gay viewers get their own version of the "" the bachelor "". with the help of his obligatory "" hag "" andra, james, a good looking, well - to - do thirty - something has the chance of love with 15 suitors ( or "" mates "" as they are referred to in the show ). the only problem is half of them are straight and james doesn't know this. if james picks a gay one, they get a trip to new zealand, and if he picks a straight one, straight guy gets $ 25, 000. how can this not be fun?! take my hand, lets stroll : < br / > < br / > the most glaring problem with this show is the bachelor himself.",negative
3,"well, what can i say. < br / > < br / > "" what the bleep do we know "" has achieved the nearly impossible - leaving behind such masterpieces of the genre as "" the postman "", "" the dungeon master "", "" merlin "", and so fourth, it will go down in history as the single worst movie i have ever seen in its entirety. and that, ladies and gentlemen, is impressive indeed, for i have seen many a bad movie. < br / > < br / > this masterpiece of modern cinema consists of two interwoven parts, alternating between a silly and contrived plot about an extremely annoying photographer, abandoned by her husband and forced to take anti - depressants to survive, and a bunch of talking heads going on about how quantum physics supposedly justifies their new - agy pseudo - philosophy. basically, if",negative


In [None]:
#slow
#hide
# Pull a single batch to inspect its raw structure: (inputs dict, labels)
b = dls.one_batch()
b

({'input_ids': tensor([[ 101, 6274, 5125,  ..., 1998, 2130,  102],
          [ 101, 1996, 4497,  ..., 2090, 1005,  102],
          [ 101, 2085, 2008,  ..., 2839, 1025,  102],
          ...,
          [ 101, 2024, 2017,  ..., 2015, 2000,  102],
          [ 101, 1045, 3427,  ..., 2091, 1012,  102],
          [ 101, 1045, 2018,  ..., 1007, 1012,  102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')},
 TensorCategory([0, 1, 0, 1, 1, 0, 0, 0], device='cuda:0'))

HuggingFace models can compute the loss themselves. To use the loss computed by the model, pass `with_labels=True` to `TransformersTextBlock`. The `show_batch` output does not change, but the labels are now stored under the `labels` key of the `dict` that is the first (and only) element of the batch.

In [None]:
#slow
# Same pipeline as before, but `with_labels=True` places the labels inside the inputs dict
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer, with_labels=True), CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())
dls = dblock.dataloaders(texts, bs=8)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any other ever told, * could * happen to you... a feeling you don't often get from other romantic comedies, however sweet and heart - warming they may be. < br / > < br / > alfred kralik ( james stewart ) and clara novak ( margaret",positive
2,"now that che ( 2008 ) has finished its relatively short australian cinema run ( extremely limited release : 1 screen in sydney, after 6wks ), i can guiltlessly join both hosts of "" at the movies "" in taking steven soderbergh to task. < br / > < br / > it's usually satisfying to watch a film director change his style / subject, but soderbergh's most recent stinker, the girlfriend experience ( 2009 ), was also missing a story, so narrative ( and editing? ) seem to suddenly be soderbergh's main challenge. strange, after 20 - odd years in the business. he was probably never much good at narrative, just hid it well inside "" edgy "" projects. < br / > < br / > none of this excuses him this present, almost diabolical failure. as david stratton warns, "" two parts of che don't ( even",negative
3,"i really wanted to love this show. i truly, honestly did. < br / > < br / > for the first time, gay viewers get their own version of the "" the bachelor "". with the help of his obligatory "" hag "" andra, james, a good looking, well - to - do thirty - something has the chance of love with 15 suitors ( or "" mates "" as they are referred to in the show ). the only problem is half of them are straight and james doesn't know this. if james picks a gay one, they get a trip to new zealand, and if he picks a straight one, straight guy gets $ 25, 000. how can this not be fun?! take my hand, lets stroll : < br / > < br / > the most glaring problem with this show is the bachelor himself.",negative


In [None]:
#slow
#hide
# Pull a single batch: a 1-tuple whose dict now also carries the 'labels' key
b = dls.one_batch()
b

({'input_ids': tensor([[  101,  6274,  5125,  ...,  1998,  2130,   102],
          [  101,  1996,  4497,  ...,  2090,  1005,   102],
          [  101,  2085,  2008,  ...,  2839,  1025,   102],
          ...,
          [  101,  1996,  2095,  ..., 13433, 21565,   102],
          [  101,  1045,  3427,  ...,  2091,  1012,   102],
          [  101,  2348,  3858,  ...,  1997,  1996,   102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'),
  'labels': TensorCategory([0, 1, 0, 1, 0, 1, 0, 1], device='cuda:0')},)

## Language modeling

In [None]:
#export
class TransformersLMBlock(TransformBlock):
    """
    A `TransformBlock` for language modeling with pretrained tokenizers from Huggingface.
    Items go through `TokTransform` as type transform and batches through `LMBatchTfm`, which
    creates the labels (see `mlm`, `masking_func`, `whole_word_masking`, `mlm_probability`).
    Extra `kwargs` are forwarded to `TokTransform`.
    """
    # @delegates
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False,
                 mlm_probability=0.15, **kwargs):
        # load the tokenizer once here so that both transforms share the same instance
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        tok_tfm = TokTransform(pretrained_model_name=pretrained_model_name, tokenizer_cls=tokenizer_cls,
                 config=config, tokenizer=tokenizer, return_special_tokens_mask=True, is_lm=True, **kwargs)

        batch_tfms = LMBatchTfm(pretrained_model_name, tokenizer_cls, config, tokenizer,
                                mlm=mlm, masking_func=masking_func, whole_word_masking=whole_word_masking,
                                mlm_probability=mlm_probability)
        super().__init__(dl_type=TfmdDL,
                         type_tfms=tok_tfm,
                         batch_tfms=batch_tfms,
                         dls_kwargs={'create_batch': fa_convert},
                        )

### Dataloaders for language modeling

In [None]:
#hide
import datasets

In [None]:
#export
def tokenize(batch, tok=None):
    """
    Tokenize `batch['text']`, requesting attention and special-tokens masks.
    `tok` is the tokenizer callable to use; when it is None the module-level `tokenizer`
    is looked up at call time (the original behavior).
    """
    if tok is None: tok = tokenizer
    return tok(batch['text'], return_attention_mask=True, return_special_tokens_mask=True, verbose=False)

def group_texts(examples, chunk_size=None):
    """
    Concatenate the lists stored under every key of `examples` and cut them into chunks.
    `chunk_size` is the length of every chunk; when it is None the module-level `block_size`
    is looked up at call time (the original behavior).
    The total length is taken from the first key and the remainder that does not fill a
    whole chunk is dropped. Returns a dict with the same keys mapping to lists of chunks.
    """
    from itertools import chain
    if chunk_size is None: chunk_size = block_size
    keys = list(examples.keys())
    # nothing to group for an empty batch
    if not keys: return {}
    # Concatenate all texts (linear time, unlike repeated list addition with `sum`).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in keys}
    total_length = len(concatenated_examples[keys[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of `chunk_size`.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [None]:
#slow
# Settings for the language modeling example
path = untar_data(URLs.IMDB_SAMPLE)
model_name = 'distilbert-base-uncased'
max_length = 128
bs = 8
val_bs = 16
# `tokenize` reads this global `tokenizer`
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the csv as a HuggingFace dataset and replace its columns with the tokenizer outputs
ds = datasets.Dataset.from_csv((path/'texts.csv').as_posix())
ds = ds.map(tokenize, remove_columns=ds.column_names)
# `group_texts` reads this global `block_size`, so it has to be set before the next `map`
block_size = max_length
lm_ds = ds.map(group_texts, batched=True, batch_size=1000)

Using custom data configuration default-baca2dc48733f0f6
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-baca2dc48733f0f6/0.0.0)


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
#slow
# Single-block DataBlock: `TransformersLMBlock` creates the labels itself at batch time
dblock = DataBlock(blocks=[TransformersLMBlock(tokenizer=tokenizer)],
                   splitter=RandomSplitter())

In [None]:
#slow
#hide
# dblock.summary(lm_ds)

In [None]:
#slow
# Build the dataloaders from the chunked dataset and preview a few decoded samples
dls = dblock.dataloaders(lm_ds, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text
0,"[MASK] hour and [MASK] half to kill and [MASK] soviets you [MASK] to end [MASK] feeling frustrated and confused 1716 rent this winner. [SEP] [CLS] "" gunga [MASK] [MASK] : one of the greatest adventure domingofra told! a story about [MASK] british foreign legion in 19th century india and a [MASK]ly "" [MASK] - bearer "" named gunga din, a local denize [MASK] who aspire [MASK] to be [MASK] like his tristan [MASK] ; three british [MASK] [MASK] whose loyalty and cam [MASK]derie for each other extend far beyond [MASK] boundsdie mere patriotism. their's is a true and abiding friendship [MASK] one another and each would be willing [MASK] sacrifice"
1,"meeting again an old friend, whom i had not seen for almost 15 years, which he spent in several panamanian jails. the young man i used to know is gone, not only because he insides older [MASK] but due to his exposure for a [MASK] time to the penal system. there are jails and there are jails, one must [MASK], but this one prisoner in "" shot in the heart "" is definitely out of this world. [SEP] [CLS] i [MASK] this movie. [MASK] [MASK] because i am a big fan of moritz bleibtreu, although he is in practically [MASK] german movies [MASK] count. but also because he fat"
2,"and replaced with songs. i've read this [MASK] [MASK] and recently was lucky enough to see it [MASK], at it remains one of [MASK] favourite shakespearian comedies, but this russian seems to take all that i [MASK] about it away [MASK] the princess, [MASK] no doubt doing what [MASK] was directed to do, had no regal bearing, [MASK] all the girls seemed to lose the clever [MASK] of their characters - gonzalez affected by unwise cuts, which not only took away the female characters already sparse [MASK], but took comments out of context - it [MASK] a little unnerving to hear the princess proclaim ; "" we are wise girls to mock"
3,for [MASK] the year that he ocean. i'm guessing that he stayed because he was supposed to see how good [MASK] life was [MASK] to the poverty [MASK] [MASK] frightening parador. i think he took too many things for granted in life and he needed to get a serious reality check by remaining in [MASK] country for as long as he did. < br / > < br / [MASK] but......... [unused9]. anyways. [MASK]... [MASK].. [MASK].. [MASK] is why i [MASK] this lit recital [MASK] out of 10. [SEP] [CLS] i don't quite get the rating for the amati


In [None]:
#slow
#hide
# Pull a single batch to inspect its raw structure
b = dls.one_batch()
b

({'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]), 'input_ids': tensor([[ 1010,  2743,  3215,  ...,  4569,   103,  1996],
         [10933,  2018,  2517,  ...,  1012,  1999,  2804],
         [ 8502,  2599,  5889,  ...,  2055,  2023,   103],
         ...,
         [ 2862,  2073,  2045,  ...,   103,  2000,  9191],
         [  103,  1010,  2069,  ...,  1997,  4438,   103],
         [ 2821,  2092,  1012,  ...,  2051,  2027,   103]]), 'labels': tensor([[-100, -100, -100,  ..., -100, 1010, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, 2061],
         ...,
         [-100, -100, -100,  ..., 2038, -100, -100],
         [4808, -100, -100,  ..., -100, -100, 2143],
         [-100, -100, -100,  ..., -100, -100, 3863]])},)

## Fin

In [None]:
#hide
# Run nbdev's notebook-to-script export
from nbdev.export import notebook2script; notebook2script()

Converted 00_data.ipynb.
Converted 01_learner.ipynb.
Converted 10_examples.classification-imdb.ipynb.
Converted 11_examples.mlm-imdb.ipynb.
Converted 12_examples.glue-benchmark.ipynb.
Converted 12a_examples.glue-benchmark-sweeps.ipynb.
Converted index.ipynb.
