In [None]:
# default_exp data

In [None]:
#default_cls_lvl 3

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastcore.all import *
from fastai.basics import Transform, ItemTransform
from fastai.text.all import *
from functools import partial
from collections import UserString
from transformers import (AutoTokenizer, AutoConfig, BatchEncoding,
                          DataCollatorForLanguageModeling,
                          DataCollatorForWholeWordMask)

from typing import Iterable

# Data

> Transforms and DataBlocks.

## Utils

In [None]:
#export
def get_splits(dataset, train='train', valid='validation'):
    "Index splits for `dataset`: train indices start at 0, validation indices follow right after"
    n_train = len(dataset[train])
    n_valid = len(dataset[valid])
    train_idxs = L(range(n_train))
    # validation indices continue where the train indices stop
    valid_idxs = L(range(n_train, n_train + n_valid))
    return train_idxs, valid_idxs

# def DatasetSplitter(train='train', valid='validation'):
#     return partial(get_splits, train=train, valid=valid)

In [None]:
#export
class TitledStrEx(UserString):
    "A `UserString` that can be shown with a title; the title label is configurable via `label`"
    _show_args = {'label':'text'}
    def __init__(self, *args, **kwargs):
        # `label` is consumed here; everything else is forwarded to `UserString`
        custom_label = kwargs.pop('label', None)
        if custom_label is not None:
            self._show_args = {'label':custom_label}
        super().__init__(*args, **kwargs)
    @property
    def label(self):
        "Label used as the title when showing"
        return self._show_args['label']
    def truncate(self, n):
        "Keep only the first `n` space-separated words; the label is preserved"
        shortened = ' '.join(self.split(' ')[:n])
        return TitledStrEx(shortened, label=self.label)
    def show(self, ctx=None, **kwargs):
        "Display the text via `show_title`; `kwargs` are merged with the stored show arguments"
        show_kwargs = merge(self._show_args, kwargs)
        return show_title(str(self), ctx=ctx, **show_kwargs)

In [None]:
#hide
# Sanity checks: equality with a plain `str`, word-level truncation and the custom label
text = TitledStrEx('here some words for you, boy', label='words')
assert text == 'here some words for you, boy'
assert text.truncate(3) == 'here some words'
assert text.label == 'words'

In [None]:
#export
class TextGetter(ItemTransform):
    "Pulls text field `s1` (and `s2`, when given) out of a sample, prepending `prefix1`/`prefix2`"
    def __init__(self, s1:str='text', s2:str=None, prefix1:str='', prefix2:str=''):
        store_attr()

    def encodes(self, sample):
        first = self.prefix1 + sample[self.s1]
        # single-text mode: return a bare string rather than a tuple
        if self.s2 is None: return first
        second = self.prefix2 + sample[self.s2]
        return first, second

In [None]:
#export
class KeyGetter(ItemTransform):
    "Returns a dict with `keys` retrieved from input sample"
    def __init__(self, keys:Iterable):
        # a set drops duplicate keys and gives fast membership tests
        self.keys = set(keys)

    def encodes(self, sample):
        # local import keeps this block self-contained
        import warnings
        # requested keys the sample does not provide are skipped, but reported
        missing = self.keys.difference(sample.keys())
        if missing:
            warnings.warn(f"KeyGetter: keys not found in sample: {sorted(missing, key=str)}")
        return {k:v for k,v in sample.items() if k in self.keys}

In [None]:
#export
class TransTensorText(TensorText):
    "`TensorText` subclass used as the dispatch type for `show_batch` and the `decodes` methods in this module"

In [None]:
#export
@typedispatch
def show_batch(x:TransTensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    """Show decoded `samples` as a dataframe.

    When the first element of each sample is a tuple (a pair of texts), it is flattened
    so each text is shown separately. Texts are truncated to `trunc_at` words via
    their `truncate` method unless `trunc_at` is None.
    """
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    # guard the peek at the first sample so empty `samples` do not raise IndexError
    two_texts = len(samples) > 0 and isinstance(samples[0][0], tuple)
    if two_texts:
        # ((text1, text2), *rest) -> (text1, text2, *rest)
        samples = L((*s[0], *s[1:]) for s in samples)
    if trunc_at is not None:
        # truncate each leading text element exactly once
        n_texts = 2 if two_texts else 1
        samples = L((*(t.truncate(trunc_at) for t in s[:n_texts]), *s[n_texts:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))

In [None]:
#export
def find_first(t, e):
    "Index of the first element of `t` equal to `e`, or `None` when there is no such element"
    for i, v in enumerate(t):
        if v == e: return i
    return None

def split_by_sep(t, sep_tok_id):
    """Split `t` in two at the first occurrence of `sep_tok_id`.

    Returns `(t[:idx], t[idx:])`, so the separator is the first element of the second part.
    When the separator is absent, returns `(t, <empty slice of t>)`.
    """
    idx = find_first(t, sep_tok_id)
    # without this guard `t[:None], t[None:]` would return the whole sequence twice
    if idx is None: return t, t[len(t):]
    return t[:idx], t[idx:]

## Transforms

In [None]:
#export
class TokTransform(Transform):
    "Tokenizes a single piece of text with a pretrained tokenizer; pretokenized input is passed through"
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, is_lm=False,
                 padding=False, truncation=False, max_length=None, 
                 preprocessed=False, **kwargs):
        # load a tokenizer only when the caller did not supply one
        tokenizer = (tokenizer if tokenizer is not None else
                     tokenizer_cls.from_pretrained(pretrained_model_name, config=config))
        self.tokenizer = tokenizer
        # extra keyword arguments are forwarded to every tokenizer call
        self.kwargs = kwargs
        store_attr()

    def encodes(self, x):
        # `preprocessed=True` means `x` is already tokenized
        if self.preprocessed: return x
        call_kwargs = dict(add_special_tokens=True,
                           padding=self.padding,
                           truncation=self.truncation,
                           max_length=self.max_length,
                           return_tensors='pt')
        return self.tokenizer(x, **call_kwargs, **self.kwargs)

    def decodes(self, x:TransTensorText):
        # special tokens are kept in the decoded text
        decoded = self.tokenizer.decode(x.cpu(), skip_special_tokens=False)
        return TitledStrEx(decoded)

In [None]:
#export
class TokBatchTransform(Transform):
    """
    Tokenizes texts in batches using pretrained HuggingFace tokenizer.
    The first element in a batch can be single string or 2-tuple of strings.
    If `with_labels=True` the "labels" are added to the output dictionary.
    If `do_targets=True` and the second element of each sample is a string, the target
    texts are tokenized as well; their padded positions are set to `target_pad_id`
    when it differs from the tokenizer's pad token id.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None, 
                 do_targets=False, target_pad_id=-100, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        # extra keyword arguments are forwarded to every tokenizer call
        self.kwargs = kwargs
        # set by `encodes` once a batch of text pairs is seen; read by `decodes`
        self._two_texts = False
        store_attr()

    def encodes(self, batch):
        # batch is a list of tuples of ({text or (text1, text2)}, {targets...})
        if is_listy(batch[0][0]): # 1st element is tuple
            self._two_texts = True
            texts = ([s[0][0] for s in batch], [s[0][1] for s in batch])
        elif is_listy(batch[0]):
            texts = ([s[0] for s in batch],)
        else: # batch is list of texts
            texts = (list(batch),)
            batch = [(s, ) for s in batch]
        inps = self.tokenizer(*texts,
                              add_special_tokens=True,
                              padding=self.padding,
                              truncation=self.truncation,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)

        # the length check avoids an IndexError when samples carry no target element
        has_text_targets = len(batch[0]) > 1 and isinstance(batch[0][1], str)
        if self.do_targets and has_text_targets:
            target_texts = [s[1] for s in batch]
            with self.tokenizer.as_target_tokenizer():
                target_enc = self.tokenizer(target_texts,
                                  padding=self.padding,
                                  truncation=self.truncation,
                                  max_length=self.max_length,
                                  return_tensors='pt',
                                  **self.kwargs)
            targets = target_enc.input_ids
            if self.target_pad_id != self.tokenizer.pad_token_id:
                tgt_attn_mask = target_enc.attention_mask.to(torch.bool)
                # use the configured pad id rather than a hard-coded -100
                targets = torch.where(tgt_attn_mask, targets, self.target_pad_id)
            targets = (TransTensorText(targets), )
        else:
            # inps are batched, collate targets into batches too
            targets = default_collate([s[1:] for s in batch])
        if self.with_labels:
            # TODO consider cases when there are multiple labels
            inps['labels'] = targets[0]
            res = (inps, )
        else:
            res = (inps, ) + tuple(targets)
        return res

    def decodes(self, x:TransTensorText):
        if self._two_texts:
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStrEx(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStrEx(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        if self.do_targets:
            # map the configured target pad id back to the tokenizer's pad token before decoding
            x = torch.where(x == self.target_pad_id, self.tokenizer.pad_token_id, x)
        return TitledStrEx(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [None]:
#export
class PadBatchTransform(Transform):
    """
    Pads a list of pretokenized samples into a batch using `tokenizer.pad`.
    When `do_targets=True` and the token dicts contain "labels", the labels are first padded
    to the longest label length in the batch with `target_pad_id`, on the tokenizer's padding side.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None,
                 do_targets=False, target_pad_id=-100, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        # extra keyword arguments are forwarded to `tokenizer.pad`
        self.kwargs = kwargs
        self._two_texts = False
        store_attr()

    def encodes(self, samples):
        toks = [s[0] for s in samples]
        if self.do_targets and ('labels' in toks[0].keys()):
            label_lens = [len(s['labels']) for s in toks]
            max_label_length = max(label_lens)
            pad_right = self.tokenizer.padding_side=="right"
            padded_toks = []
            for tok, label_len in zip(toks, label_lens):
                remainder = [self.target_pad_id] * (max_label_length - label_len)
                # pad a shallow copy so the caller's sample dict is left untouched;
                # otherwise padding would pile up if the same dicts are served again
                tok = dict(tok)
                tok["labels"] = (tok["labels"] + remainder
                                 if pad_right else
                                 remainder + tok["labels"])
                padded_toks.append(tok)
            toks = padded_toks
        inps = self.tokenizer.pad(toks,
                              padding=self.padding,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)
        # keep only batched (more than 1-D) tensors and tag them for type-dispatched decoding
        inps = {k:TransTensorText(v) for k, v in inps.items() if (isinstance(v, torch.Tensor) and v.dim()>1)}
        # inps are batched, collate targets into batches too
        labels = default_collate([s[1:] for s in samples])
        res = (inps, ) + tuple(labels)
        return res

    def decodes(self, x:TransTensorText):
        if self._two_texts:
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStrEx(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStrEx(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        if self.do_targets:
            # map the configured target pad id back to the tokenizer's pad token before decoding
            x = torch.where(x == self.target_pad_id, self.tokenizer.pad_token_id, x)
        return TitledStrEx(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [None]:
#export
def untuple(l):
    "Take the first element of every item in `l`"
    firsts = []
    for item in l:
        firsts.append(item[0])
    return firsts

def to_tuple(x):
    "Wrap `x` in a 1-tuple"
    wrapped = (x,)
    return wrapped

TODOs:
- verify that CLM works as well, and maybe rename `masking_func`, since it would then be used for more than masking
- add permutation LM

In [None]:
#export
class LMBatchTfm(Transform):
    "Collates batch of pretokenized and chunked inputs into a batch and creates labels as defined by `masking_func`"
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False,
                 mlm_probability=0.15):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        if masking_func is None:
            # default to a HuggingFace data collator; a caller-supplied `masking_func` takes precedence
            masking_func = (DataCollatorForWholeWordMask(tokenizer, mlm, mlm_probability) 
                            if whole_word_masking else 
                            DataCollatorForLanguageModeling(tokenizer, mlm, mlm_probability))
        self.masking_func = masking_func
        # strip the 1-tuples, collate/mask, then wrap the result back into a tuple
        self.batch_processor = compose(untuple, masking_func, to_tuple)

    def encodes(self, b):
        # we get list of tuples but need a list of dicts
        return self.batch_processor(b)

    def decodes(self, b:(dict, BatchEncoding)):
        # only the token ids are decoded; a mapping without them is returned unchanged
        # (previously this raised UnboundLocalError)
        if 'input_ids' in b: return TransTensorText(b['input_ids'])
        return b

In [None]:
#export
class Undict(ItemTransform):
    "On decode, unpacks the input dict (first batch element) into a tuple of `input_ids` and, if present, `labels`"

    def decodes(self, b):
        # this is done hacky way to make show_batch work both when labels are separate and when in dict
        # should be a better way
        x = b[0]
        # start from an empty tuple so a missing 'input_ids' key cannot leave `res` undefined
        res = ()
        if 'input_ids' in x: res += (TransTensorText(x['input_ids']), )
        if 'labels' in x: res += (x['labels'], )
        return res + tuple(b[1:])

In [None]:
#export 
class UndictS2S(ItemTransform):
    "Like `Undict`, but `labels` are wrapped in `TransTensorText` so they are decoded as text too"

    def decodes(self, b):
        x = b[0]
        # start from an empty tuple so a missing 'input_ids' key cannot leave `res` undefined
        res = ()
        if 'input_ids' in x: res += (TransTensorText(x['input_ids']), )
        if 'labels' in x: res += (TransTensorText(x['labels']), )
        return res + tuple(b[1:])

## DataBlocks

In [None]:
#export
class TransformersTextBlock(TransformBlock):
    "A `TransformBlock` for texts using pretrained tokenizers from Huggingface"
    @delegates(TokBatchTransform)
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, preprocessed=False, do_targets=False, 
                 group_by_len=True, **kwargs):
        # `preprocessed` inputs go to the padding transform, anything else to the tokenizing one
        if preprocessed: batch_tfm_cls = PadBatchTransform
        else: batch_tfm_cls = TokBatchTransform
        before_batch_tfm = batch_tfm_cls(pretrained_model_name=pretrained_model_name,
                                         tokenizer_cls=tokenizer_cls,
                                         config=config,
                                         tokenizer=tokenizer,
                                         do_targets=do_targets,
                                         **kwargs)
        loader_cls = SortedDL if group_by_len else TfmdDL
        loader_kwargs = {'before_batch': before_batch_tfm,
                         'create_batch': fa_convert}
        # with `do_targets` the labels are decoded as text as well
        undict_tfm = UndictS2S() if do_targets else Undict()
        return super().__init__(dl_type=loader_cls,
                                dls_kwargs=loader_kwargs,
                                batch_tfms=undict_tfm)

#     @classmethod
#     def from_pretrained(cls, ):
#         pass

#     @classmethod
#     def from_tokenizer(cls, ):
#         pass

#     @classmethod
#     def from_config(cls, ):
#         pass

### DataLoaders for classification

In [None]:
#slow
# Load the IMDB sample texts into a dataframe
path = untar_data(URLs.IMDB_SAMPLE)
texts = pd.read_csv(path/'texts.csv')

# Settings for this example
model_name = 'distilbert-base-uncased'
# NOTE(review): `max_len` is not passed to `TransformersTextBlock` below -- confirm whether it should be
max_len = 128
bs = 8
val_bs = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inputs come from the `text` column, category targets from the `label` column
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer),
                             CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())

In [None]:
#slow
#hide
# dblock.summary(texts)

In [None]:
#slow
# Build the dataloaders and preview a few decoded samples
dls = dblock.dataloaders(texts, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"now that che ( 2008 ) has finished its relatively short australian cinema run ( extremely limited release : 1 screen in sydney, after 6wks ), i can guiltlessly join both hosts of "" at the movies "" in taking steven soderbergh to task. < br / > < br / > it's usually satisfying to watch a film director change his style / subject, but soderbergh's most recent stinker, the girlfriend experience ( 2009 ), was also missing a story, so narrative ( and editing? ) seem to suddenly be soderbergh's main challenge. strange, after 20 - odd years in the business. he was probably never much good at narrative, just hid it well inside "" edgy "" projects. < br / > < br / > none of this excuses him this present, almost diabolical failure. as david stratton warns, "" two parts of che don't ( even",negative
2,"many neglect that this isn't just a classic due to the fact that it's the first 3d game, or even the first shoot -'em - up. it's also one of the first stealth games, one of the only ( and definitely the first ) truly claustrophobic games, and just a pretty well - rounded gaming experience in general. with graphics that are terribly dated today, the game thrusts you into the role of b. j. ( don't even * think * i'm going to attempt spelling his last name! ), an american p. o. w. caught in an underground bunker. you fight and search your way through tunnels in order to achieve different objectives for the six episodes ( but, let's face it, most of them are just an excuse to hand you a weapon, surround you with nazis and send you out to waste one of the nazi leaders",positive
3,"i really wanted to love this show. i truly, honestly did. < br / > < br / > for the first time, gay viewers get their own version of the "" the bachelor "". with the help of his obligatory "" hag "" andra, james, a good looking, well - to - do thirty - something has the chance of love with 15 suitors ( or "" mates "" as they are referred to in the show ). the only problem is half of them are straight and james doesn't know this. if james picks a gay one, they get a trip to new zealand, and if he picks a straight one, straight guy gets $ 25, 000. how can this not be fun?! take my hand, lets stroll : < br / > < br / > the most glaring problem with this show is the bachelor himself.",negative


In [None]:
#slow
#hide
# Inspect a raw batch: the dict of tokenizer outputs followed by the category targets
b = dls.one_batch()
b

({'input_ids': tensor([[  101,  6274,  5125,  ...,  1998,  2130,   102],
          [  101,  1996,  4497,  ...,  2090,  1005,   102],
          [  101,  1996,  1038,  ...,  6300, 10376,   102],
          ...,
          [  101,  1045,  2018,  ...,  1007,  1012,   102],
          [  101,  2348,  3858,  ...,  1997,  1996,   102],
          [  101,  2023,  2003,  ...,  2137,  1999,   102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')},
 TensorCategory([0, 1, 0, 0, 0, 0, 1, 1], device='cuda:0'))

HuggingFace models can compute the loss themselves. To use the loss computed by the model, pass `with_labels=True` to the `TransformersTextBlock` constructor. The `show_batch` result does not change, but the labels are now moved into the `dict` object, which is the first (and only) element of a batch.

In [None]:
#slow
# Same pipeline as before, but `with_labels=True` is passed to the text block
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer, with_labels=True), CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())
dls = dblock.dataloaders(texts, bs=8)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"many neglect that this isn't just a classic due to the fact that it's the first 3d game, or even the first shoot -'em - up. it's also one of the first stealth games, one of the only ( and definitely the first ) truly claustrophobic games, and just a pretty well - rounded gaming experience in general. with graphics that are terribly dated today, the game thrusts you into the role of b. j. ( don't even * think * i'm going to attempt spelling his last name! ), an american p. o. w. caught in an underground bunker. you fight and search your way through tunnels in order to achieve different objectives for the six episodes ( but, let's face it, most of them are just an excuse to hand you a weapon, surround you with nazis and send you out to waste one of the nazi leaders",positive
2,"the blob starts with one of the most bizarre theme songs ever, sung by an uncredited burt bacharach of all people! you really have to hear it to believe it, the blob may be worth watching just for this song alone & my user comment summary is just a little taste of the classy lyrics... after this unnerving opening credits sequence the blob introduces us, the viewer that is, to steve andrews ( steve mcqueen as steven mcqueen ) & his girlfriend jane martin ( aneta corsaut ) who are parked on their own somewhere & witness what looks like a meteorite falling to earth in nearby woods. an old man ( olin howland as olin howlin ) who lives in a cabin also sees it & goes to investigate, he finds a crater & a strange football sized rock which splits open when he unwisely pokes it with a",negative
3,"i rented the dubbed - english version of lensman, hoping that since it came from well - known novels it would have some substance. while there were hints of substance in the movie, it mostly didn't rise above the level of kiddie cartoon. maybe the movie was a bad adaptation of the book, or it lost a lot in the dubbed version. or maybe even the source novels were lightweight. but for whatever reason, there wasn't much there. < br / > < br / > i noticed lots of details that were derivative, sloppy, poorly dramatized, or otherwise deficient. some examples : the opening scenes looked borrowed from the 2001 "" star gate "" scene and the star wars image of hyperspace. the robot on the harvester looked like an anthropomorphized "" r2 - d2 "". < br / > < br / > it starts out trying to",negative


In [None]:
#slow
#hide
# Inspect a raw batch: a 1-tuple whose dict now also holds the `labels`
b = dls.one_batch()
b

({'input_ids': tensor([[  101,  6274,  5125,  ...,  1998,  2130,   102],
          [  101,  1996,  4497,  ...,  2090,  1005,   102],
          [  101,  2085,  2008,  ...,  2839,  1025,   102],
          ...,
          [  101,  2023,  2143,  ...,  1037,  2265,   102],
          [  101,  2092,  1010,  ..., 15345,  1012,   102],
          [  101,  1996,  1038,  ...,  6300, 10376,   102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'),
  'labels': TensorCategory([0, 1, 0, 1, 0, 1, 0, 0], device='cuda:0')},)

## Language modeling

In [None]:
#export
class TransformersLMBlock(TransformBlock):
    "A `TransformBlock` for language modelling using pretrained tokenizers from Huggingface"
    # @delegates
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False, 
                 mlm_probability=0.15, preprocessed=True, **kwargs):
        # item-level transform: tokenizes the text, or passes it through when `preprocessed`
        tok_tfm = TokTransform(pretrained_model_name=pretrained_model_name, tokenizer_cls=tokenizer_cls, 
                               config=config, tokenizer=tokenizer, return_special_tokens_mask=True, is_lm=True,
                               preprocessed=preprocessed, **kwargs)

        # Reuse the tokenizer resolved by `tok_tfm` so it is loaded only once.
        # The previous unused `DataCollatorForLanguageModeling(tokenizer)` local was dropped:
        # it was built with the raw `tokenizer` argument (possibly None) and the default
        # `mlm=True`, so it could fail even though its result was never used.
        batch_tfms = LMBatchTfm(pretrained_model_name, tokenizer_cls, config, tok_tfm.tokenizer, 
                                mlm=mlm, masking_func=masking_func, whole_word_masking=whole_word_masking,
                                mlm_probability=mlm_probability)
        return super().__init__(dl_type=TfmdDL,
                                type_tfms=tok_tfm,
                                batch_tfms=batch_tfms,
                                dls_kwargs={'create_batch': fa_convert},
                               )

### Dataloaders for language modeling

In [None]:
#hide
import datasets

In [None]:
#export
def tokenize(batch, tokenizer=None):
    """Tokenize `batch['text']`, requesting attention and special-tokens masks.

    `tokenizer` is the callable to use. When omitted, the module-level global named
    `tokenizer` is used (the original behaviour); a `NameError` is raised if there is none.
    """
    if tokenizer is None:
        # backward-compatible path: fall back to the global `tokenizer`
        try: tokenizer = globals()['tokenizer']
        except KeyError: raise NameError("name 'tokenizer' is not defined") from None
    return tokenizer(batch['text'], return_attention_mask=True, return_special_tokens_mask=True, verbose=False)

def group_texts(examples, block_size=None):
    """Concatenate the sequences under every key of `examples` and cut them into `block_size` chunks.

    `examples` maps each key to a list of lists. The trailing remainder shorter than
    `block_size` is dropped; the cut-off length is computed from the first key.
    When `block_size` is omitted, the module-level global `block_size` is used
    (the original behaviour); a `NameError` is raised if there is none.
    """
    if block_size is None:
        # backward-compatible path: fall back to the global `block_size`
        try: block_size = globals()['block_size']
        except KeyError: raise NameError("name 'block_size' is not defined") from None
    # nothing to group
    if not examples: return {}
    # Concatenate all texts (a flat comprehension avoids the quadratic `sum(lists, [])`).
    concatenated_examples = {k: [tok for seq in examples[k] for tok in seq] for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [None]:
#slow
# Settings for the language-modeling example
path = untar_data(URLs.IMDB_SAMPLE)
model_name = 'distilbert-base-uncased'
max_length = 128
bs = 8
val_bs = 16
# `tokenize` picks up this global `tokenizer`
tokenizer = AutoTokenizer.from_pretrained(model_name)

ds = datasets.Dataset.from_csv((path/'texts.csv').as_posix())
# Tokenize every row and drop the original columns
ds = ds.map(tokenize, remove_columns=ds.column_names)
# `group_texts` picks up this global `block_size`
block_size = max_length
# Concatenate the tokenized rows and cut them into chunks of `block_size` tokens
lm_ds = ds.map(group_texts, batched=True, batch_size=1000)

Using custom data configuration default-fcd59879e8ec0cc2
Reusing dataset csv (/home/arto/.cache/huggingface/datasets/csv/default-fcd59879e8ec0cc2/0.0.0)
Loading cached processed dataset at /home/arto/.cache/huggingface/datasets/csv/default-fcd59879e8ec0cc2/0.0.0/cache-e48d62f22574817d.arrow
Loading cached processed dataset at /home/arto/.cache/huggingface/datasets/csv/default-fcd59879e8ec0cc2/0.0.0/cache-f4a996ce554fd629.arrow


In [None]:
#slow
# A single LM block: with the block defaults, labels are created by the batch-level masking transform
dblock = DataBlock(blocks=[TransformersLMBlock(tokenizer=tokenizer)],
                   splitter=RandomSplitter())

In [None]:
#slow
#hide
# dblock.summary(lm_ds)

In [None]:
#slow
# Build the dataloaders from the chunked dataset and preview a few masked samples
dls = dblock.dataloaders(lm_ds, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text
0,"as diego garcia is a british colony, much like mauritius, the nearby island [MASK] where [MASK] natives were exiled, used [MASK] be chronicle but the british [MASK] has ignored their pleas to return to their homeland, [MASK] the island is [MASK] [MASK] military base for the [MASK] states army, who have used it as a basis for the bombing of iraq and afghanistan. < br / [MASK] < br / > as usual, pilger [MASK] s coverage is shocking, especially as he documents the treatment [MASK] the current impoverished [MASK] conditions of the surviving islanders [MASK] his interviews [MASK] round are excellent, and his cornering of [MASK] parliament representative where he uses the government's"
1,"re a vonnegut fan or [MASK] completely insane, don't see it. please. [SEP] [CLS] three stooges - have [MASK], will travel - [MASK] this was the first feature length film [MASK] [MASK] the st [MASK]ges and it is pretty och. it makes the three stooges go around the world in a daze ( from 1963 ) [MASK] [MASK] a masterpiece. < br / [MASK] < br / > the [MASK] [MASK] [MASK] [MASK] janitors at a [MASK] place. they climb into a rocket and [MASK] goes to venus. they [MASK] some stuff there including a talking unicorn they call "" [MASK] [MASK] "" which they bring back to earth with"
2,"they [MASK] d have to see this lame attempt at movie - [MASK] on their account [MASK] the [MASK] overall is bad enough to be funny, and that'[MASK] about the best [MASK] i can [MASK] for it. [SEP] [CLS] as horror [MASK] we all know that blind rentals are a crap - shoot. sometimes we [MASK] a real gem, [MASK] many times we find that [MASK] film we'[MASK] just spent our hard earned money on [MASK] [MASK] more than [MASK] putrid steamer made worse by [MASK] completely und [MASK]rved rave reviews [MASK] film fest awards listed on the box. such [MASK] the case with [MASK] across the eyes ( a title i'm sure"
3,"montages. those were easily the zee songs and best performances in the film. [MASK] the "" rise to [MASK] top "" portion of the film was the only [MASK] of [MASK] film that had a consistent point of [MASK] or any momentum. [MASK] remaining hour and 45 minutes was a formless, rambling mess that was [MASK] realistic nor fantastic enough [MASK] [MASK] interesting. [MASK] was also visually dull and included too many sound - alike [MASK]. [MASK] br / [MASK] < [MASK] / > condon didn [MASK] t try to turn any of the tunes into big show pieces as i'd expected they would. each number in the 2nd half was just one closeup"


In [None]:
#slow
#hide
# Inspect a raw batch: a 1-tuple holding a dict with `attention_mask`, `input_ids` and `labels`
b = dls.one_batch()
b

({'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]), 'input_ids': tensor([[ 2102,  1024,  3533,  ...,  2045,  2024,  2053],
         [ 3377,   103,  1996,  ...,  2000,  3786,  3168],
         [ 2070,  9414, 12817,  ...,  1037,  1000,  2200],
         ...,
         [ 1010,  2065,  2017,  ...,  5691,  2000,  2191],
         [ 2681,  1996,  2181,  ...,  4288,   103,  2015],
         [ 2022,   103,  3294,  ...,  7275,  9409,  3122]]), 'labels': tensor([[-100, -100, -100,  ..., -100, -100, 2053],
         [-100, 1997, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, -100, -100],
         ...,
         [-100, -100, -100,  ..., -100, -100, -100],
         [-100, -100, -100,  ..., -100, 5405, -100],
         [-100, 1037, -100,  ..., -100, -100, -100]])},)

## Multiple Choice

In [None]:
#export
class MultiChoiceTransform(Transform):
    """
    Prepares multiple-choice inputs: the first sentence of every sample is paired with
    each "second sentence + ending" combination and tokenized as a text pair.
    Tokenizer outputs are reshaped to (-1, number of endings, sequence length).
    """
    def __init__(self, sentence_keys, ending_keys,
                 pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, with_labels=False, padding=True, 
                 truncation=True, max_length=None, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        # extra keyword arguments are forwarded to the tokenizer call
        self.kwargs = kwargs
        store_attr()

    def encodes(self, batch):
        # each batch item is a tuple of (dict, label)
        questions = [item[0] for item in batch]
        first_key, second_key = self.sentence_keys
        n_choices = len(self.ending_keys)
        first_texts, second_texts = [], []
        for q in questions:
            # the first sentence is repeated once per ending
            first_texts += [q[first_key]] * n_choices
            second_texts += [f"{q[second_key]} {q[end_key]}" for end_key in self.ending_keys]
        encoded = self.tokenizer(first_texts, second_texts,
                                 add_special_tokens=True,
                                 padding=self.padding,
                                 truncation=self.truncation,
                                 max_length=self.max_length,
                                 return_tensors='pt',
                                 **self.kwargs)
        # regroup the flat list of pairs so each sample holds its `n_choices` candidates
        inps = {name: tensor.reshape(-1, n_choices, tensor.size(1)) for name, tensor in encoded.items()}

        targets = default_collate([item[1:] for item in batch])
        if not self.with_labels:
            return (inps, ) + tuple(targets)
        inps['labels'] = targets[0]
        return (inps, )

    def decodes(self, x:TransTensorText):
        decoded_endings = []
        for row, key in enumerate(self.ending_keys):
            first_ids, ending_ids = split_by_sep(x[row, :], self.tokenizer.sep_token_id)
            ending_text = self.tokenizer.decode(ending_ids.cpu(), skip_special_tokens=True)
            decoded_endings.append(TitledStrEx(ending_text, label=key))
        # the first sentence is decoded from the last row processed above
        first_text = self.tokenizer.decode(first_ids.cpu(), skip_special_tokens=True)
        first = TitledStrEx(first_text, label=self.sentence_keys[0])
        return (first, ) + tuple(decoded_endings)

In [None]:
#export
class MultiChoiceBlock(TransformBlock):
    "A `TransformBlock` for multiple choice using pretrained tokenizers from Huggingface"
    @delegates(MultiChoiceTransform)
    def __init__(self, sentence_keys, ending_keys, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, preprocessed=False, group_by_len=True,
                 **kwargs):
        """Build the block around a `MultiChoiceTransform` applied before batching.

        All tokenizer-related arguments and extra `kwargs` are handed to
        `MultiChoiceTransform`. `group_by_len` selects `SortedDL` over `TfmdDL` as the
        dataloader type. `preprocessed` is accepted but not referenced in this constructor.
        """
        tfm_kwargs = dict(pretrained_model_name=pretrained_model_name,
                          tokenizer_cls=tokenizer_cls,
                          config=config,
                          tokenizer=tokenizer,
                          **kwargs)
        collate_tfm = MultiChoiceTransform(sentence_keys, ending_keys, **tfm_kwargs)
        loader_cls = SortedDL if group_by_len else TfmdDL
        super().__init__(dl_type=loader_cls,
                         dls_kwargs=dict(before_batch=collate_tfm, create_batch=fa_convert),
                         batch_tfms=Undict())

## Token Classification

In [None]:
#export
class PadTokBatchTransform(Transform):
    """Collate pre-tokenized samples and their per-token labels into padded tensors.

    `encodes` receives the list of samples of one batch, pads the label sequences with
    `target_pad_id`, and lets `tokenizer.pad` pad the tokenizer fields and convert
    everything to PyTorch tensors. The two `decodes` definitions are dispatched on the
    argument type: token ids (`TransTensorText`) are decoded to text, anything else is
    treated as a sequence of label ids.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, with_labels=False, padding=True, 
                 truncation=True, max_length=None, label_vocab=None,
                 target_pad_id=-100, **kwargs):
        """Load (or reuse) the tokenizer and store the padding options.

        When `tokenizer` is None it is created with
        `tokenizer_cls.from_pretrained(pretrained_model_name, config=config)`.
        `label_vocab`, if given, is indexed with label ids when decoding.
        `target_pad_id` is the value used to pad label sequences.
        Extra `kwargs` are forwarded to `tokenizer.pad`.
        """
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        self.kwargs = kwargs
        # Never switched on inside this class; it only gates the two-text path in `decodes`.
        self._two_texts = False
        store_attr()

    def encodes(self, samples):
        """Pad one batch of samples.

        Element 0 of each sample is a dict-like of tokenizer fields. The labels are
        taken from its 'labels' entry when present, otherwise from element 1 of the
        sample. Returns `(inputs,)` with the labels under `inputs['labels']` when
        `with_labels` is set, otherwise `(inputs, labels)`.
        """
        # Shallow-copy every token dict: the padded labels are written into the copy, so
        # the caller's samples are left untouched and are not re-padded on later batches.
        toks = [dict(s[0]) for s in samples]
        # the labels are expected to be found either in dictionary with tokens
        # or as element 1 of each sample
        labels = ([tok['labels'] for tok in toks]
                  if 'labels' in toks[0] else
                  [s[1] for s in samples])

        label_lens = [len(label) for label in labels]
        max_label_length = max(label_lens)
        # Labels are padded on the same side the tokenizer pads its own fields.
        pad_right = self.tokenizer.padding_side == "right"
        for tok, label, label_len in zip(toks, labels, label_lens):
            remainder = [self.target_pad_id] * (max_label_length - label_len)
            tok["labels"] = label + remainder if pad_right else remainder + label
        inps = self.tokenizer.pad(toks,
                                  padding=self.padding,
                                  max_length=self.max_length,
                                  return_tensors='pt',
                                  **self.kwargs)
        labels = inps.pop('labels')
        # Keep only tensor fields with more than one dimension.
        inps = {k: TransTensorText(v) for k, v in inps.items()
                if (isinstance(v, torch.Tensor) and v.dim() > 1)}
        if self.with_labels:
            inps['labels'] = labels
            return (inps, )
        return (inps, labels)

    def decodes(self, x:TransTensorText):
        "Decode token ids back to text (two texts split at the separator when `_two_texts` is set)"
        if self._two_texts:
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStrEx(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStrEx(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        return TitledStrEx(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))
    # Second `decodes` overload (no type annotation): handles the label ids.
    def decodes(self, x):
        "Render label ids as a comma-separated string of tags, skipping padded positions"
        # Skip the configured padding value rather than a hard-coded -100, so decoding
        # stays consistent with what `encodes` inserted.
        pad_id = self.target_pad_id
        if self.label_vocab is not None:
            res = [self.label_vocab[e] for e in x if e != pad_id]
        else:
            res = [e for e in x if e != pad_id]
        return TitledStrEx(''.join(f'{tag}, ' for tag in res), label='tags')

In [None]:
#export
class TokenClassificationBlock(TransformBlock):
    "A `TransformBlock` for token classification using pretrained tokenizers from Huggingface"
    @delegates(PadTokBatchTransform)
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, with_labels=True, label_vocab=None, 
                 group_by_len=True, **kwargs):
        """Build the block around a `PadTokBatchTransform` applied before batching.

        The tokenizer arguments, `label_vocab`, `with_labels` and any extra `kwargs`
        are passed to `PadTokBatchTransform`. `group_by_len` selects `SortedDL` over
        `TfmdDL` as the dataloader type.
        """
        # Forward the caller's `with_labels` instead of a hard-coded True; the default
        # is still True, so callers that do not pass it get the same behaviour as before.
        before_batch_tfm = PadTokBatchTransform(pretrained_model_name=pretrained_model_name,
                                                tokenizer_cls=tokenizer_cls,
                                                config=config, tokenizer=tokenizer,
                                                label_vocab=label_vocab,
                                                with_labels=with_labels, **kwargs)
        super().__init__(dl_type=SortedDL if group_by_len else TfmdDL,
                         dls_kwargs={'before_batch': before_batch_tfm,
                                     'create_batch': fa_convert},
                         batch_tfms=Undict())

## Fin

In [None]:
#hide
# Export the project's notebooks to their Python modules with nbdev
from nbdev.export import notebook2script
notebook2script()

Converted 00_data.ipynb.
Converted 01_learner.ipynb.
Converted 02_metrics.ipynb.
Converted 10_examples.classification-imdb.ipynb.
Converted 11_examples.mlm-imdb.ipynb.
Converted 12_examples.glue-benchmark.ipynb.
Converted 12a_examples.glue-benchmark-sweeps.ipynb.
Converted 14_examples.machine_translation.ipynb.
Converted 15_examples.summarization.ipynb.
Converted index.ipynb.
