In [None]:
# default_exp data

In [None]:
#default_cls_lvl 3

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastcore.all import *
from fastai.basics import Transform, ItemTransform
from fastai.text.all import *
from functools import partial
from collections import UserString
from transformers import (AutoTokenizer, AutoConfig, BatchEncoding,
                          DataCollatorForLanguageModeling,
                          DataCollatorForWholeWordMask)

from typing import Iterable, Union, List

# Data

> Transforms and DataBlocks.

## Utils

In [None]:
#export
def get_splits(dataset, train='train', valid='validation'):
    "Return two `L`s of indices: `0..nt-1` and `nt..nt+nv-1`, where `nt`/`nv` are the lengths of `dataset[train]`/`dataset[valid]`"
    n_train = len(dataset[train])
    n_valid = len(dataset[valid])
    train_idxs = L(range(n_train))
    # validation indices continue right after the training ones
    valid_idxs = L(range(n_train, n_train + n_valid))
    return train_idxs, valid_idxs

# def DatasetSplitter(train='train', valid='validation'):
#     return partial(get_splits, train=train, valid=valid)

In [None]:
#export
class TitledStrEx(UserString):
    "A `UserString` shown under a configurable `label` (the default label is 'text')"
    _show_args = {'label':'text'}
    def __init__(self, *args, **kwargs):
        # `label` is consumed here; every other argument is forwarded to `UserString`
        custom_label = kwargs.pop('label', None)
        if custom_label is not None:
            # instance attribute shadows the class-level default
            self._show_args = {'label': custom_label}
        super().__init__(*args, **kwargs)

    @property
    def label(self):
        "Label stored in the show arguments"
        return self._show_args['label']

    def truncate(self, n):
        "Keep the first `n` space-separated words of self, preserving the label"
        kept_words = self.split(' ')[:n]
        shortened = ' '.join(kept_words)
        return TitledStrEx(shortened, label=self.label)

    def show(self, ctx=None, **kwargs):
        "Show self with `show_title`, combining the stored show arguments with `kwargs` via `merge`"
        show_kwargs = merge(self._show_args, kwargs)
        return show_title(str(self), ctx=ctx, **show_kwargs)

In [None]:
#hide
# Sanity checks for `TitledStrEx`: comparison with a plain `str`,
# word-level truncation and the custom label passed at construction.
text = TitledStrEx('here some words for you, boy', label='words')
assert text == 'here some words for you, boy'
assert text.truncate(3) == 'here some words'
assert text.label == 'words'

In [None]:
#export
class PreprocCategorize(DisplayedTransform):
    "Transform for properly displaying preprocessed categorical labels"
    loss_func = CrossEntropyLossFlat()
    order = 1
    def __init__(self, vocab=None):
        store_attr()

    def encodes(self, o):
        "Wrap the label `o` in a `TensorCategory`"
        return TensorCategory(o)

    def decodes(self, o):
        "Look `o` up in `vocab` when one was given, otherwise show its raw value as a string"
        if self.vocab is None: return Category(str(o.item()))
        return Category(self.vocab[o])

In [None]:
#export
def PreprocCategoryBlock(vocab:Union[List, NoneType]=None):
    "`TransformBlock` for preprocessed categorical labels; `vocab` optionally supplies the label names"
    categorize = PreprocCategorize(vocab)
    return TransformBlock(type_tfms=categorize)

In [None]:
#export
class TextGetter(ItemTransform):
    "Fetches text field `s1` and, when `s2` is given, `s2` from a sample, prepending `prefix1`/`prefix2`"
    def __init__(self, s1:str='text', s2:str=None, prefix1:str='', prefix2:str=''):
        store_attr()

    def encodes(self, sample):
        "Return the prefixed first text, or a 2-tuple of prefixed texts when `s2` is set"
        first = self.prefix1 + sample[self.s1]
        if self.s2 is None: return first
        second = self.prefix2 + sample[self.s2]
        return first, second

In [None]:
#export
class KeyGetter(ItemTransform):
    "Reduces an input sample to a dict holding only the entries whose key is in `keys`"
    def __init__(self, keys:Iterable):
        self.keys = set(keys)

    def encodes(self, sample):
        # TODO warn when key is not in sample.keys()
        picked = {}
        for name, value in sample.items():
            if name in self.keys: picked[name] = value
        return picked

In [None]:
#export
# `TensorText` subclass that adds no members of its own; in this file it is used as the
# type annotation that `show_batch` and the `decodes` methods of the transforms dispatch on.
class TransTensorText(TensorText): pass

In [None]:
#export
@typedispatch
def show_batch(x:TransTensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    """
    Display decoded text `samples` as a DataFrame.

    `samples` whose first element is a tuple (two-text inputs) are flattened so that each text
    gets its own column. When `trunc_at` is not None every text is truncated once, to
    `trunc_at` words, via its `truncate` method.
    """
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    if isinstance(samples[0][0], tuple):
        # ((text1, text2), *rest) -> (text1, text2, *rest)
        samples = L((*s[0], *s[1:]) for s in samples)
        if trunc_at is not None: samples = L((s[0].truncate(trunc_at), s[1].truncate(trunc_at), *s[2:]) for s in samples)
    elif trunc_at is not None:
        # `elif` rather than a second `if`: the two-text branch above has already truncated
        # the first text, so it must not be truncated again here.
        samples = L((s[0].truncate(trunc_at),*s[1:]) for s in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))

In [None]:
#export
def find_first(t, e):
    "Return the index of the first element of `t` equal to `e`, or `None` when there is no such element"
    for i, v in enumerate(t):
        if v == e: return i
    return None

def split_by_sep(t, sep_tok_id):
    """
    Split `t` in two right before the first occurrence of `sep_tok_id`.

    The separator itself is the first element of the second part. When `t` contains no
    separator the first part is the whole of `t` and the second part is empty.
    """
    idx = find_first(t, sep_tok_id)
    # A missing separator used to produce `t[:None], t[None:]`, i.e. two full copies of `t`
    if idx is None: idx = len(t)
    return t[:idx], t[idx:]

## Transforms

In [None]:
#export
class TokTransform(Transform):
    "Tokenizes a single piece of text using a pretrained tokenizer"
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, is_lm=False,
                 padding=False, truncation=False, max_length=None,
                 preprocessed=False, skip_special_tokens=False, **kwargs):
        # The local name is rebound on purpose: `store_attr()` below presumably snapshots the
        # argument values, so it has to see the resolved tokenizer rather than `None`.
        tokenizer = (tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
                     if tokenizer is None else tokenizer)
        self.tokenizer, self.kwargs = tokenizer, kwargs
        store_attr()

    def encodes(self, x):
        "Tokenize `x`; inputs flagged as `preprocessed` are returned untouched"
        if self.preprocessed: return x
        return self.tokenizer(x,
                              add_special_tokens=True,
                              padding=self.padding,
                              truncation=self.truncation,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)

    def decodes(self, x:TransTensorText):
        "Decode token ids `x` back into a `TitledStrEx`"
        decoded = self.tokenizer.decode(x.cpu(), skip_special_tokens=self.skip_special_tokens)
        return TitledStrEx(decoded)

In [None]:
#export
class TokBatchTransform(Transform):
    """
    Tokenizes texts in batches using pretrained HuggingFace tokenizer.
    The first element in a batch can be single string or 2-tuple of strings.
    If `with_labels=True` the "labels" are added to the output dictionary.
    If `do_targets=True` and the second element of a sample is a string, the second elements
    are tokenized as target texts and their padded positions (per the target attention mask)
    are set to `target_pad_id`.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None, 
                 do_targets=False, target_pad_id=-100, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        self.kwargs = kwargs
        # Flipped to True by `encodes` once a batch of text pairs is seen; read by `decodes`
        self._two_texts = False
        store_attr()
    
    def encodes(self, batch):
        "Tokenize the texts of `batch` and collate the remaining elements of each sample"
        # batch is a list of tuples of ({text or (text1, text2)}, {targets...})
        if is_listy(batch[0][0]): # 1st element is tuple
            self._two_texts = True
            texts = ([s[0][0] for s in batch], [s[0][1] for s in batch])
        elif is_listy(batch[0]): 
            texts = ([s[0] for s in batch],)
        else: # batch is list of texts
            texts = (list(batch),)
            batch = [(s, ) for s in batch]
        inps = self.tokenizer(*texts,
                              add_special_tokens=True,
                              padding=self.padding,
                              truncation=self.truncation,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)
        
        if self.do_targets and isinstance(batch[0][1], str):
            target_texts = [s[1] for s in batch]
            with self.tokenizer.as_target_tokenizer():
                target_enc = self.tokenizer(target_texts,
                                  padding=self.padding,
                                  truncation=self.truncation,
                                  max_length=self.max_length,
                                  return_tensors='pt', 
                                  **self.kwargs)
            targets = target_enc.input_ids
            if self.target_pad_id != self.tokenizer.pad_token_id:
                tgt_attn_mask = target_enc.attention_mask.to(torch.bool)
                # Use the configured id; a literal -100 here silently ignored `target_pad_id`
                targets = torch.where(tgt_attn_mask, targets, self.target_pad_id)
            targets = (TransTensorText(targets), )
        else:
            # inps are batched, collate targets into batches too
            targets = default_collate([s[1:] for s in batch])
        if self.with_labels:
            # TODO consider cases when there are multiple labels
            inps['labels'] = targets[0]
            res = (inps, )
        else:
            res = (inps, ) + tuple(targets)
        return res
    
    def decodes(self, x:TransTensorText):
        "Decode token ids into one `TitledStrEx`, or a pair of them for two-text inputs"
        if self._two_texts:
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStrEx(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStrEx(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        if self.do_targets:
            # Undo the target padding applied in `encodes` (same id, not a literal -100)
            x = torch.where(x == self.target_pad_id, self.tokenizer.pad_token_id, x)
        return TitledStrEx(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [None]:
#export
class PadBatchTransform(Transform):
    """
    Pads pretokenized samples into a batch with the tokenizer's `pad` method.
    If `do_targets=True` and the token dicts carry "labels", the labels are first padded to the
    longest label in the batch with `target_pad_id`, on the tokenizer's `padding_side`.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None,
                 do_targets=False, target_pad_id=-100, **kwargs):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        self.kwargs = kwargs
        # Never set to True in this class; `decodes` still checks it
        self._two_texts = False
        store_attr()

    def encodes(self, samples):
        "Pad the token dicts (first element of each sample) and collate the remaining elements"
        toks = [s[0] for s in samples]
        if self.do_targets and ('labels' in toks[0].keys()):
            # Shallow-copy each dict so that the padded labels are not written back into the
            # caller's samples (they would otherwise be seen, already padded, on a later pass)
            toks = [dict(tok) for tok in toks]
            label_lens = [len(s['labels']) for s in toks]
            max_label_length = max(label_lens)
            padding_side = self.tokenizer.padding_side
            for tok, label_len in zip(toks, label_lens):
                remainder = [self.target_pad_id] * (max_label_length - label_len)
                tok["labels"] = (tok["labels"] + remainder
                                 if padding_side=="right" else
                                 remainder + tok["labels"])
        inps = self.tokenizer.pad(toks,
                              padding=self.padding,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)
        # Keep only tensor entries with more than one dimension
        inps = {k:TransTensorText(v) for k, v in inps.items() if (isinstance(v, torch.Tensor) and v.dim()>1)}
        # inps are batched, collate targets into batches too
        labels = default_collate([s[1:] for s in samples])
        res = (inps, ) + tuple(labels)
        return res

    def decodes(self, x:TransTensorText):
        "Decode token ids into one `TitledStrEx`, or a pair of them when `_two_texts` is set"
        if self._two_texts:
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStrEx(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStrEx(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        if self.do_targets:
            # Map label padding back to the pad token using the configured id, not a literal -100
            x = torch.where(x == self.target_pad_id, self.tokenizer.pad_token_id, x)
        return TitledStrEx(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [None]:
#export
def untuple(l):
    "Return a list with the first element of every item in `l`"
    firsts = []
    for item in l:
        firsts.append(item[0])
    return firsts

def to_tuple(x):
    "Wrap `x` in a 1-tuple"
    wrapped = (x,)
    return wrapped

TODOs:
- verify that CLM works as well, and maybe rename `masking_func`, since it would then be used for more than just masking
- add permutation LM

In [None]:
#export
class LMBatchTfm(Transform):
    "Collates batch of pretokenized and chunked inputs into a batch and creates labels as defined by `masking_func`"
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False,
                 mlm_probability=0.15):
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        if masking_func is None:
            # Default to one of the HuggingFace data collators
            masking_func = (DataCollatorForWholeWordMask(tokenizer, mlm, mlm_probability)
                            if whole_word_masking else
                            DataCollatorForLanguageModeling(tokenizer, mlm, mlm_probability))
        self.masking_func = masking_func
        # list of 1-tuples -> list of items -> `masking_func` output -> 1-tuple
        self.batch_processor = compose(untuple, masking_func, to_tuple)

    def encodes(self, b):
        # we get list of tuples but need a list of dicts
        return self.batch_processor(b)

    def decodes(self, b:(dict, BatchEncoding)):
        "Return `b['input_ids']` as `TransTensorText`; `b` is returned unchanged when it has no 'input_ids'"
        # Previously a missing 'input_ids' key left `res` unbound and raised UnboundLocalError
        if 'input_ids' not in b: return b
        return TransTensorText(b['input_ids'])

In [None]:
#export
class Undict(ItemTransform):
    "On decode, unpacks the dict in the first element of a batch into a tuple of its 'input_ids' and 'labels'"
    
    def decodes(self, b):
        # this is done hacky way to make show_batch work both when labels are separate and when in dict
        # should be a better way
        x = b[0]
        # Start from an empty tuple so that a dict without 'input_ids' no longer leaves `res` unbound
        res = ()
        if 'input_ids' in x: res += (TransTensorText(x['input_ids']), )
        if 'labels' in x: res += (x['labels'], )
        return res + tuple(b[1:])

In [None]:
#export 
class UndictS2S(ItemTransform):
    "On decode, unpacks the dict in the first element of a batch; both 'input_ids' and 'labels' become `TransTensorText`"

    def decodes(self, b):
        x = b[0]
        # Start from an empty tuple so that a dict without 'input_ids' no longer leaves `res` unbound
        res = ()
        if 'input_ids' in x: res += (TransTensorText(x['input_ids']), )
        if 'labels' in x: res += (TransTensorText(x['labels']), )
        return res + tuple(b[1:])

## DataBlocks

In [None]:
#export
class TransformersTextBlock(TransformBlock):
    "A `TransformBlock` for texts using pretrained tokenizers from Huggingface"
    @delegates(TokBatchTransform)
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, preprocessed=False, do_targets=False, 
                 group_by_len=True, **kwargs):
        # Already-tokenized inputs only need padding; raw texts are tokenized per batch
        if preprocessed: collate_cls = PadBatchTransform
        else: collate_cls = TokBatchTransform
        collate_tfm = collate_cls(pretrained_model_name=pretrained_model_name,
                                  tokenizer_cls=tokenizer_cls,
                                  config=config,
                                  tokenizer=tokenizer,
                                  do_targets=do_targets,
                                  **kwargs)
        loader_cls = SortedDL if group_by_len else TfmdDL
        loader_kwargs = {'before_batch': collate_tfm,
                         'create_batch': fa_convert}
        undict_tfm = UndictS2S() if do_targets else Undict()
        super().__init__(dl_type=loader_cls,
                         dls_kwargs=loader_kwargs,
                         batch_tfms=undict_tfm)

#     @classmethod
#     def from_pretrained(cls, ):
#         pass

#     @classmethod
#     def from_tokenizer(cls, ):
#         pass

#     @classmethod
#     def from_config(cls, ):
#         pass

### DataLoaders for classification

In [None]:
#slow
# Fetch the IMDB sample and read its `texts.csv` into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)
texts = pd.read_csv(path/'texts.csv')

# Demo settings. NOTE(review): `max_len` is not referenced by the cells visible here.
model_name = 'distilbert-base-uncased'
max_len = 128
bs = 8
val_bs = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inputs come from the 'text' column and go through `TransformersTextBlock`;
# targets come from the 'label' column and go through `CategoryBlock`.
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer),
                             CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
#slow
#hide
# dblock.summary(texts)

In [None]:
#slow
# Build the DataLoaders with separate train/validation batch sizes and preview a few samples
dls = dblock.dataloaders(texts, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any other ever told, * could * happen to you... a feeling you don't often get from other romantic comedies, however sweet and heart - warming they may be. < br / > < br / > alfred kralik ( james stewart ) and clara novak ( margaret",positive
2,"now that che ( 2008 ) has finished its relatively short australian cinema run ( extremely limited release : 1 screen in sydney, after 6wks ), i can guiltlessly join both hosts of "" at the movies "" in taking steven soderbergh to task. < br / > < br / > it's usually satisfying to watch a film director change his style / subject, but soderbergh's most recent stinker, the girlfriend experience ( 2009 ), was also missing a story, so narrative ( and editing? ) seem to suddenly be soderbergh's main challenge. strange, after 20 - odd years in the business. he was probably never much good at narrative, just hid it well inside "" edgy "" projects. < br / > < br / > none of this excuses him this present, almost diabolical failure. as david stratton warns, "" two parts of che don't ( even",negative
3,"this film sat on my tivo for weeks before i watched it. i dreaded a self - indulgent yuppie flick about relationships gone bad. i was wrong ; this was an engrossing excursion into the screwed - up libidos of new yorkers. < br / > < br / > the format is the same as max ophuls'"" la ronde, "" based on a play by arthur schnitzler, who is given an "" inspired by "" credit. it starts from one person, a prostitute, standing on a street corner in brooklyn. she is picked up by a home contractor, who has sex with her on the hood of a car, but can't come. he refuses to pay her. when he's off peeing, she answers his cell phone and takes a message. she runs away with his keys. < br / > < br / > then the story switches to",positive


In [None]:
#slow
#hide
# Inspect a raw batch: a dict of tokenizer outputs followed by the category tensor
b = dls.one_batch()
b

({'input_ids': tensor([[  101,  6274,  5125,  ...,  1998,  2130,   102],
          [  101,  1996,  4497,  ...,  2090,  1005,   102],
          [  101,  2116, 19046,  ...,  1010,  2004,   102],
          ...,
          [  101,  2024,  2017,  ...,  2015,  2000,   102],
          [  101,  1045,  3427,  ...,  2091,  1012,   102],
          [  101,  2348,  3858,  ...,  1997,  1996,   102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')},
 TensorCategory([0, 1, 1, 0, 0, 0, 0, 1], device='cuda:0'))

HuggingFace models can compute the loss themselves. To use the loss computed by the model, pass `with_labels=True` to `TransformersTextBlock`. The `show_batch` output does not change, but the labels are now stored under the `labels` key of the `dict` that is the first element of the batch.

In [None]:
#slow
# Same DataBlock as before, but `with_labels=True` makes the text block put the labels
# inside the input dict under the 'labels' key.
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer, with_labels=True), CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())
dls = dblock.dataloaders(texts, bs=8)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any other ever told, * could * happen to you... a feeling you don't often get from other romantic comedies, however sweet and heart - warming they may be. < br / > < br / > alfred kralik ( james stewart ) and clara novak ( margaret",positive
2,"well, what can i say. < br / > < br / > "" what the bleep do we know "" has achieved the nearly impossible - leaving behind such masterpieces of the genre as "" the postman "", "" the dungeon master "", "" merlin "", and so fourth, it will go down in history as the single worst movie i have ever seen in its entirety. and that, ladies and gentlemen, is impressive indeed, for i have seen many a bad movie. < br / > < br / > this masterpiece of modern cinema consists of two interwoven parts, alternating between a silly and contrived plot about an extremely annoying photographer, abandoned by her husband and forced to take anti - depressants to survive, and a bunch of talking heads going on about how quantum physics supposedly justifies their new - agy pseudo - philosophy. basically, if",negative
3,"the year 2005 saw no fewer than 3 filmed productions of h. g. wells'great novel, "" war of the worlds "". this is perhaps the least well - known and very probably the best of them. no other version of wotw has ever attempted not only to present the story very much as wells wrote it, but also to create the atmosphere of the time in which it was supposed to take place : the last year of the 19th century, 1900 using wells'original setting, in and near woking, england. < br / > < br / > imdb seems unfriendly to what they regard as "" spoilers "". that might apply with some films, where the ending might actually be a surprise, but with regard to one of the most famous novels in the world, it seems positively silly. i have no sympathy for people who have neglected to",positive


In [None]:
#slow
#hide
# Inspect a raw batch: now a 1-tuple whose dict also holds the 'labels' entry
b = dls.one_batch()
b

({'input_ids': tensor([[ 101, 6274, 5125,  ..., 1998, 2130,  102],
          [ 101, 1996, 4497,  ..., 2090, 1005,  102],
          [ 101, 2085, 2008,  ..., 2839, 1025,  102],
          ...,
          [ 101, 2024, 2017,  ..., 2015, 2000,  102],
          [ 101, 1045, 2453,  ..., 1997, 2079,  102],
          [ 101, 2348, 3858,  ..., 1997, 1996,  102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'),
  'labels': TensorCategory([0, 1, 0, 1, 0, 0, 0, 1], device='cuda:0')},)

## Language modeling

In [None]:
#export
class TransformersLMBlock(TransformBlock):
    "A `TransformBlock` for language modelling using pretrained tokenizers from Huggingface"
    @delegates(TokTransform)
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False,
                 mlm_probability=0.15, preprocessed=True, group_by_len=False, **kwargs):
        # Item-level transform: tokenizes each sample unless it is already `preprocessed`
        item_tfm = TokTransform(pretrained_model_name=pretrained_model_name,
                                tokenizer_cls=tokenizer_cls,
                                config=config,
                                tokenizer=tokenizer,
                                return_special_tokens_mask=True,
                                is_lm=True,
                                preprocessed=preprocessed,
                                **kwargs)
        # Batch-level transform: collates the samples and creates the LM labels
        lm_collate_tfm = LMBatchTfm(pretrained_model_name, tokenizer_cls, config, tokenizer,
                                    mlm=mlm,
                                    masking_func=masking_func,
                                    whole_word_masking=whole_word_masking,
                                    mlm_probability=mlm_probability)
        loader_cls = SortedDL if group_by_len else TfmdDL
        super().__init__(dl_type=loader_cls,
                         type_tfms=item_tfm,
                         batch_tfms=lm_collate_tfm,
                         dls_kwargs={'create_batch': fa_convert})

### Dataloaders for language modeling

In [None]:
#hide
import datasets

In [None]:
#export
def tokenize(batch, tok=None):
    """
    Tokenize `batch['text']`, requesting the attention mask and the special tokens mask.

    `tok` is the tokenizer callable to use. When it is None the module-level `tokenizer`
    is used, so the global has to be defined before the call in that case.
    """
    # The global is only looked up when no tokenizer is passed explicitly
    tok = tokenizer if tok is None else tok
    return tok(batch['text'], return_attention_mask=True, return_special_tokens_mask=True, verbose=False)

def group_texts(examples, chunk_size=None):
    """
    Concatenate the lists stored under every key of `examples` and cut them into equal chunks.

    `examples` maps each key to a list of token lists. `chunk_size` is the chunk length; when
    it is None the module-level `block_size` is used, so the global has to be defined before
    the call in that case. Returns a dict with the same keys, each holding a list of chunks
    of exactly that length. The trailing remainder is dropped. The number of chunks is derived
    from the concatenated length of the first key. An empty `examples` yields an empty dict.
    """
    # The global is only looked up when no chunk size is passed explicitly
    size = block_size if chunk_size is None else chunk_size
    if not examples: return {}
    # Concatenate all texts (single pass per key instead of the quadratic `sum(lists, [])`).
    concatenated_examples = {k: [tok for seq in examples[k] for tok in seq] for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // size) * size
    # Split by chunks of `size`.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [None]:
#slow
# Demo settings for language modelling on the IMDB sample
path = untar_data(URLs.IMDB_SAMPLE)
model_name = 'distilbert-base-uncased'
max_length = 128
bs = 8
val_bs = 16
# `tokenize` (defined above) reads this module-level `tokenizer`
tokenizer = AutoTokenizer.from_pretrained(model_name)

ds = datasets.Dataset.from_csv((path/'texts.csv').as_posix())
# Apply `tokenize` and drop the original csv columns from the result
ds = ds.map(tokenize, remove_columns=ds.column_names)
# `group_texts` (defined above) reads this module-level `block_size`
block_size = max_length
lm_ds = ds.map(group_texts, batched=True, batch_size=1000)

Using custom data configuration default-f78ec54769bd79c4


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-f78ec54769bd79c4/0.0.0...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-f78ec54769bd79c4/0.0.0. Subsequent calls will reuse this data.


  0%|          | 0/1000 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
#slow
# A single LM block: the labels are created by the block's batch transform
dblock = DataBlock(blocks=[TransformersLMBlock(tokenizer=tokenizer)],
                   splitter=RandomSplitter())

In [None]:
#slow
#hide
# dblock.summary(lm_ds)

In [None]:
#slow
# Build the DataLoaders from the chunked dataset and preview a few masked samples
dls = dblock.dataloaders(lm_ds, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text
0,"get any better because [MASK] plot is flawed [MASK] begin with [MASK] it never works [MASK] and like [MASK] predecessors, [MASK] acting is med [MASK]cre. < treat / [MASK] < br [MASK] > [MASK] plot has a [MASK] ending which will surprise any one who has never seen [MASK] movie before [MASK] the ending doesn't [MASK] the story. [MASK] this movie ended [MASK] minutes earlier, it would have worked and have been very satisfying and i [MASK] have thought it more worthwhile [MASK] but here is the spoiler and that [MASK] the end crime does pay because the criminal 42 not caught. i never like this message resulting from a movie. [SEP] [CLS] warning! [MASK] review"
1,"girls and i think i was really kind giving it a 4 out of 10 kn what could [MASK] hart been a wonderful story [MASK] actually talked set of more or less decent actors became a total farce in my eyes. there are so [MASK] [MASK] [MASK]s in that flick, the women'[MASK] [MASK] is just awful [MASK] most of the scenes are more than unrealistic or seem fake andhra there's no real passion in this movie but a bunch [MASK] actors over [MASK] acting over any limits that it hurts. it's boise funny enough to be a [MASK], it's too [MASK] - sad to really touch, so in my eyes it"
2,"useless and the direction [MASK] quite unoriginal when it comes to [MASK]ogs scenes. but all that [MASK]'t really matter, for [MASK] the bourne ultimatum [MASK] is an action [MASK]. and the action scenes are rather impressive. [MASK] br / > < br / > everyone here is talking [MASK] the "" waterloo scene "" and the "" tanger pursuit "" and everyone [MASK] s right. i [MASK] enjoyed the fight in tanger, that reminds my [MASK] its exaggeration and crazi [MASK] the works of tsui hark. visually inventive scenes, [MASK] of intelligent [MASK] parts and a good reflection on [MASK]'s contemporary [MASK]s"
3,"close [MASK] is released from prison after being "" cured "" of her obsession with fur by a psychologist named dr. pavlov ( ugh! ) [MASK] but the "" cure "" is broken when cruella hears the toll of [MASK] ben, and she once again goes on a mad [MASK] to make herself the perfect coat out of dalmation [MASK]. < br / > < br / > this movie [MASK] bad on so [MASK] levels, starting with the [MASK] that it'[MASK] a "" thanksgiving [MASK] schlock "" movie designed to suck every last available dime out of the disney [MASK] machine [MASK] glenn [MASK] over - over - over - over - acts"


In [None]:
#slow
#hide
# Inspect a raw LM batch: a 1-tuple holding a dict with the inputs and the 'labels'
b = dls.one_batch()
b

({'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]), 'input_ids': tensor([[1997, 1037, 2321,  ..., 1028,  103, 7987],
         [3132,  103, 1997,  ..., 1010,  103,  103],
         [2323, 2191, 2178,  ..., 5574, 2000,  103],
         ...,
         [1038, 1007, 1996,  ..., 1013, 1028, 1996],
         [ 103, 2000, 3579,  ..., 1051, 1012, 1048],
         [2065, 3087, 4282,  ..., 8213, 2015, 1998]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  -100,  1026,  -100],
         [ -100,  2846,  -100,  ...,  -100, 27118,  6491],
         [ -100,  -100,  -100,  ...,  -100,  -100,  2033],
         ...,
         [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         [ 3754,  -100,  -100,  ...,  -100,  -100,  -100],
         [ -100,  -100,  -100,  ..., 15667,  -100,  -100]])},)

## Multiple Choice

In [None]:
#export
class MultiChoiceTransform(Transform):
    """
    Processes inputs for multiple choice.
    Every sample is expanded into one text pair per ending key; the tokenized result is
    reshaped so that its second dimension indexes the endings.
    """
    def __init__(self, sentence_keys, ending_keys,
                 pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, with_labels=False, padding=True, 
                 truncation=True, max_length=None, **kwargs):
        # The local name is rebound on purpose: `store_attr()` below presumably snapshots the
        # argument values, so it has to see the resolved tokenizer rather than `None`.
        tokenizer = (tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
                     if tokenizer is None else tokenizer)
        self.tokenizer, self.kwargs = tokenizer, kwargs
        store_attr()

    def encodes(self, batch):
        "Tokenize all (first sentence, second sentence + ending) pairs and collate the targets"
        # inputs are list of tuple(dict, label)
        records = [item[0] for item in batch]
        first_key, second_key = self.sentence_keys
        n_choices = len(self.ending_keys)
        texts1, texts2 = [], []
        for rec in records:
            # the first sentence is repeated once per ending
            texts1 += [rec[first_key]] * n_choices
            texts2 += [f"{rec[second_key]} {rec[e]}" for e in self.ending_keys]
        enc = self.tokenizer(texts1, texts2,
                             add_special_tokens=True,
                             padding=self.padding,
                             truncation=self.truncation,
                             max_length=self.max_length,
                             return_tensors='pt',
                             **self.kwargs)
        # (samples * endings, seq) -> (samples, endings, seq)
        enc = {name: ids.reshape(-1, n_choices, ids.size(1)) for name, ids in enc.items()}

        targets = default_collate([item[1:] for item in batch])
        if not self.with_labels: return (enc, ) + tuple(targets)
        enc['labels'] = targets[0]
        return (enc, )

    def decodes(self, x:TransTensorText):
        "Decode one sample into the first sentence followed by one text per ending key"
        sep_id = self.tokenizer.sep_token_id
        def _to_text(ids):
            return self.tokenizer.decode(ids.cpu(), skip_special_tokens=True)
        endings = []
        for row, key in enumerate(self.ending_keys):
            x1, x2 = split_by_sep(x[row, :], sep_id)
            endings.append(TitledStrEx(_to_text(x2), label=key))
        # `x1` is the first part of the last row processed by the loop above
        first_sentence = TitledStrEx(_to_text(x1), label=self.sentence_keys[0])
        return (first_sentence, *endings)

In [None]:
#export
class MultiChoiceBlock(TransformBlock):
    "`TransformBlock` for multiple choice tasks, tokenizing batches with a pretrained Huggingface tokenizer"
    @delegates(MultiChoiceTransform)
    def __init__(self, sentence_keys, ending_keys, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, preprocessed=False, group_by_len=True,
                 **kwargs):
        # NOTE: `preprocessed` is accepted but not used by this block.
        # The transform tokenizes a whole batch at once, hence it is hooked in as `before_batch`.
        tokenize_batch = MultiChoiceTransform(
            sentence_keys, ending_keys,
            pretrained_model_name=pretrained_model_name, tokenizer_cls=tokenizer_cls,
            config=config, tokenizer=tokenizer, **kwargs)
        if group_by_len:
            loader_cls = SortedDL
        else:
            loader_cls = TfmdDL
        loader_kwargs = {'before_batch': tokenize_batch,
                         'create_batch': fa_convert}
        super().__init__(dl_type=loader_cls, dls_kwargs=loader_kwargs, batch_tfms=Undict())

## Token Classification

In [None]:
#export
class PadTokBatchTransform(Transform):
    """
    Pads a batch of already tokenized samples together with their per-token labels.

    Labels are padded with `target_pad_id` on the tokenizer's `padding_side`; the token
    fields are padded by `tokenizer.pad`.
    """
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, with_labels=False, padding=True,
                 truncation=True, max_length=None, label_vocab=None,
                 target_pad_id=-100, **kwargs):
        """
        - `pretrained_model_name`, `tokenizer_cls`, `config`: only used to load a
          tokenizer via `tokenizer_cls.from_pretrained` when `tokenizer` is None
        - `tokenizer`: ready-made tokenizer; when given, nothing is loaded
        - `with_labels`: if true, `encodes` returns the labels inside the inputs dict
          under `'labels'`; otherwise they are returned as a second tuple element
        - `padding`, `max_length`: passed to `tokenizer.pad`
        - `truncation`: stored, but `encodes` does not use it
        - `label_vocab`: optional lookup used by `decodes` to map label ids to names
        - `target_pad_id`: value used to pad label sequences
        - `kwargs`: any further keyword arguments for `tokenizer.pad`
        """
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        self.kwargs = kwargs
        # never switched on inside this class; `decodes` checks it for two-segment inputs
        self._two_texts = False
        # must stay after the tokenizer is resolved: it stores the current argument values
        store_attr()

    def encodes(self, samples):
        "Pad tokens and labels of `samples` to a common length and return them as tensors"
        # Work on shallow copies: the padded labels are written into these dicts and
        # must not leak back into the dataset items that were passed in.
        toks = [dict(s[0]) for s in samples]
        # the labels are expected to be found either in dictionary with tokens
        # or as element 1 of each sample
        labels = ([t['labels'] for t in toks]
                  if 'labels' in toks[0] else
                  [s[1] for s in samples])
        # plain lists, so that `+` below concatenates (arrays would broadcast instead)
        labels = [l.tolist() if hasattr(l, 'tolist') else list(l) for l in labels]

        # NOTE(review): labels are padded to the longest label sequence of the batch;
        # this presumably matches the padded token length only when the tokenizer pads
        # to the longest sample - confirm for other padding strategies.
        max_label_length = max(len(l) for l in labels)
        pad_right = self.tokenizer.padding_side == "right"
        for tok, label in zip(toks, labels):
            remainder = [self.target_pad_id] * (max_label_length - len(label))
            tok["labels"] = label + remainder if pad_right else remainder + label
        inps = self.tokenizer.pad(toks,
                                  padding=self.padding,
                                  max_length=self.max_length,
                                  return_tensors='pt',
                                  **self.kwargs)
        labels = inps.pop('labels')
        # keep only batched (2D or higher) tensors, typed so that `decodes` can dispatch on them
        inps = {k: TransTensorText(v) for k, v in inps.items()
                if (isinstance(v, torch.Tensor) and v.dim() > 1)}
        if self.with_labels:
            inps['labels'] = labels
            return (inps, )
        return (inps, labels)

    def decodes(self, x:TransTensorText):
        "Decode token ids back to text (two strings when `_two_texts` is set)"
        if self._two_texts:
            x1, x2 = split_by_sep(x, self.tokenizer.sep_token_id)
            return (TitledStrEx(self.tokenizer.decode(x1.cpu(), skip_special_tokens=True)),
                    TitledStrEx(self.tokenizer.decode(x2.cpu(), skip_special_tokens=True)))
        return TitledStrEx(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))
    def decodes(self, x):
        "Decode label ids to a comma separated string, skipping padded positions"
        # Skip the configured pad id (what `encodes` pads with) and the previously
        # hard-coded -100, so nothing that used to be filtered shows up now.
        kept = [e for e in x if e != -100 and e != self.target_pad_id]
        if self.label_vocab is not None:
            kept = [self.label_vocab[e] for e in kept]
        return TitledStrEx(''.join(f'{x}, ' for x in kept), label='tags')

In [None]:
#export
class TokenClassificationBlock(TransformBlock):
    "A `TransformBlock` for token classification using pretrained tokenizers from Huggingface"
    @delegates(PadTokBatchTransform)
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer,
                 config=None, tokenizer=None, with_labels=True, label_vocab=None,
                 group_by_len=True, **kwargs):
        """
        - `pretrained_model_name`, `tokenizer_cls`, `config`, `tokenizer`, `label_vocab`,
          `with_labels` and `kwargs` are handed to `PadTokBatchTransform`
        - `group_by_len`: use `SortedDL` as the dataloader type if true, else `TfmdDL`
        """
        # `with_labels` is forwarded as given (it used to be overridden with a literal True,
        # which silently ignored the caller's choice); the default keeps the old behaviour.
        before_batch_tfm = PadTokBatchTransform(pretrained_model_name=pretrained_model_name, tokenizer_cls=tokenizer_cls,
                 config=config, tokenizer=tokenizer, label_vocab=label_vocab, with_labels=with_labels, **kwargs)
        return super().__init__(dl_type=SortedDL if group_by_len else TfmdDL,
                                dls_kwargs={'before_batch': before_batch_tfm,
                                            'create_batch': fa_convert},
                                batch_tfms=Undict())

## Fin

In [None]:
#hide
# Run nbdev's notebook export; the cell output lists every notebook that was converted.
from nbdev.export import notebook2script; notebook2script()

Converted 00_data.ipynb.
Converted 01_learner.ipynb.
Converted 02_metrics.ipynb.
Converted 10_examples.classification-imdb.ipynb.
Converted 11_examples.mlm-imdb.ipynb.
Converted 12_examples.glue-benchmark.ipynb.
Converted 12a_examples.glue-benchmark-sweeps.ipynb.
Converted 14_examples.machine_translation.ipynb.
Converted 15_examples.summarization.ipynb.
Converted 16_examples.multiple_choice.ipynb.
Converted 17_examples.token_classification.ipynb.
Converted index.ipynb.
