In [1]:
# default_exp data

In [2]:
#hide
%load_ext autoreload
%autoreload 2

In [3]:
#hide
from nbdev.showdoc import *

In [4]:
#export
from fastcore.all import *
from fastai.basics import Transform, ItemTransform
from fastai.text.all import *
from transformers import (AutoTokenizer, AutoConfig, BatchEncoding,
                          DataCollatorForLanguageModeling,
                          DataCollatorForWholeWordMask)

# Data

> Transforms and DataBlocks.

## Transforms

In [5]:
#export
class TokTransform(Transform):
    "Item transform that carries a pretrained tokenizer: `encodes` is a pass-through, `decodes` maps token ids back to text"
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, is_lm=False,
                 padding=False, truncation=False, max_length=None, 
                 **kwargs):
        # Only build a tokenizer from `pretrained_model_name` when the caller did not hand one in.
        tokenizer = (tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
                     if tokenizer is None else tokenizer)
        # Keep the named arguments on the instance (the resolved `tokenizer` included);
        # any extra keyword arguments are kept together in `self.kwargs`.
        store_attr()
        self.tokenizer, self.kwargs = tokenizer, kwargs

    def encodes(self, text):
        # The tokenizer call is disabled here, so the item is handed back exactly as received.
        return text

    def decodes(self, x:TensorText):
        # Special tokens are kept in the decoded string.
        decoded = self.tokenizer.decode(x.cpu(), skip_special_tokens=False)
        return TitledStr(decoded)

In [6]:
#export
class TokBatchTransform(Transform):
    "Tokenizes a list of samples with a pretrained tokenizer and collates them into a batch"
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, is_lm=False, with_labels=False,
                 padding=True, truncation=True, max_length=None, 
                 do_targets=False, **kwargs):
        # Load a tokenizer from `pretrained_model_name` only when none was passed in.
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        self.tokenizer = tokenizer
        # Extra keyword arguments are forwarded to every tokenizer call.
        self.kwargs = kwargs
        store_attr()

    def _tokenize(self, *texts, add_special_tokens=True):
        "Run the tokenizer on `texts` with the padding/truncation settings of this transform, returning 'pt' tensors"
        return self.tokenizer(*texts,
                              add_special_tokens=add_special_tokens,
                              padding=self.padding,
                              truncation=self.truncation,
                              max_length=self.max_length,
                              return_tensors='pt',
                              **self.kwargs)

    def encodes(self, batch):
        # batch is a list of tuples of ({text or (text1, text2)}, {targets...})
        if is_listy(batch[0][0]): # 1st element is a pair of texts
            texts = ([s[0][0] for s in batch], [s[0][1] for s in batch])
        elif is_listy(batch[0]):
            texts = ([s[0] for s in batch],)
        else: # batch is a list of bare texts: wrap them so the indexing below is uniform
            texts = (list(batch),)
            batch = [(s, ) for s in batch]
        inps = self._tokenize(*texts, add_special_tokens=True)

        # Text targets are only tokenized when requested AND the samples actually carry one
        # (the length check avoids an IndexError on target-less samples).
        has_text_targets = (self.do_targets and len(batch[0]) > 1
                            and isinstance(batch[0][1], str))
        if has_text_targets:
            targets = self._tokenize([s[1] for s in batch], add_special_tokens=False).input_ids
            # Anything that follows the target text is collated like ordinary labels.
            extras = tuple(default_collate([s[2:] for s in batch])) if len(batch[0]) > 2 else ()
            # Previously this branch never assigned `res`, so the method failed on return.
            if self.with_labels:
                inps['labels'] = targets
                res = (inps, ) + extras
            else:
                res = (inps, targets) + extras
        else:
            # inps are batched, collate targets into batches too
            labels = default_collate([s[1:] for s in batch])
            if self.with_labels:
                # TODO consider cases when there are multiple labels
                inps['labels'] = labels[0]
                res = (inps, )
            else:
                res = (inps, ) + tuple(labels)
        return res
    
    def decodes(self, x:TensorText):
        # Special tokens are stripped from the decoded string.
        return TitledStr(self.tokenizer.decode(x.cpu(), skip_special_tokens=True))

In [7]:
#export
def untuple(l):
    "Return a list holding the first element of every item in `l`"
    firsts = []
    for item in l:
        firsts.append(item[0])
    return firsts

def to_tuple(x):
    "Wrap `x` in a one-element tuple"
    wrapped = x,
    return wrapped

TODOs:
- verify that CLM works as well, and maybe rename `masking_func`, since it would not be used only for masking
- add permutation LM

In [8]:
#export
class LMBatchTfm(Transform):
    "Collates batch of pretokenized and chunked inputs into a batch and creates labels as defined by `masking_func`"
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False, mlm_probability=0.15):
        if masking_func is None:
            # The tokenizer is only needed to build the default collator, so it is
            # only loaded when no custom `masking_func` has been supplied.
            if tokenizer is None:
                tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
            collator_cls = DataCollatorForWholeWordMask if whole_word_masking else DataCollatorForLanguageModeling
            # Keyword arguments keep this independent of the collator's field order.
            masking_func = collator_cls(tokenizer=tokenizer, mlm=mlm, mlm_probability=mlm_probability)
        self.masking_func = masking_func
        # samples arrive as 1-tuples: unwrap them, collate/mask, then wrap the result back in a tuple
        self.batch_processor = compose(untuple, masking_func, to_tuple)
            
    def encodes(self, b):
        # we get list of tuples but need a list of dicts
        return self.batch_processor(b)
    
    def decodes(self, b:(dict, BatchEncoding)):
        # Without 'input_ids' there is nothing to show as text: hand the input back
        # unchanged instead of failing on an unassigned local.
        if 'input_ids' not in b: return b
        return TensorText(b['input_ids'])

In [9]:
#export
class Undict(ItemTransform):
    "Decode-only transform that unpacks the inputs dict of a batch into a flat tuple for showing"
    
    def decodes(self, b):
        # this is done hacky way to make show_batch work both when labels are separate and when in dict
        # should be a better way
        x = b[0]
        # Nothing to unpack when the first element carries no token ids: leave the
        # batch as it is rather than failing on an unassigned local.
        if 'input_ids' not in x: return b
        res = (TensorText(x['input_ids']), )
        # labels stored inside the dict are surfaced as a separate element
        if 'labels' in x: res += (x['labels'], )
        return res + tuple(b[1:])

## DataBlocks

In [10]:
#export
class TransformersTextBlock(TransformBlock):
    "A `TransformBlock` for texts using pretrained tokenizers from Huggingface"
    @delegates(TokBatchTransform)
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, is_lm=False, **kwargs):
        # Tokenization happens per batch (in `before_batch`), so padding is applied
        # by the tokenizer call on each batch. `is_lm` is forwarded so the value
        # stored on the transform agrees with the one used to pick the DataLoader type.
        before_batch_tfm = TokBatchTransform(pretrained_model_name=pretrained_model_name, tokenizer_cls=tokenizer_cls, 
                 config=config, tokenizer=tokenizer, is_lm=is_lm, **kwargs)
        super().__init__(dl_type=LMDataLoader if is_lm else SortedDL,
                         # the transform already returns a collated batch, so `create_batch` only converts it
                         dls_kwargs={'before_batch': before_batch_tfm,
                                     'create_batch': fa_convert},
                         # `Undict` only defines `decodes`: it unpacks the inputs dict when showing batches
                         batch_tfms=Undict()
                        )

#     @classmethod
#     def from_pretrained(cls, ):
#         pass

#     @classmethod
#     def from_tokenizer(cls, ):
#         pass

#     @classmethod
#     def from_config(cls, ):
#         pass

### DataLoaders for classification

In [12]:
#slow
# Fetch the IMDB sample and read its reviews CSV into a DataFrame.
path = untar_data(URLs.IMDB_SAMPLE)
texts = pd.read_csv(path/'texts.csv')

# Tokenizer checkpoint and the batch sizes used when building the DataLoaders below.
model_name = 'distilbert-base-uncased'
# NOTE(review): `max_len` is not passed to the block below - confirm whether it was meant as `max_length`.
max_len = 128
bs = 8
val_bs = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inputs are read from the 'text' column and targets from the 'label' column.
# `ColSplitter()` is called with its defaults - presumably it splits on an `is_valid` column; verify.
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer),
                             CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())

In [13]:
#slow
#hide
# dblock.summary(texts)

Setting-up type transforms pipelines
Collecting items from         label  \
0    negative   
1    positive   
2    negative   
3    positive   
4    negative   
..        ...   
995  negative   
996  positive   
997  negative   
998  negative   
999  positive   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        text  \
0                                                                                                                               

In [14]:
#slow
# Build the DataLoaders with separate train/validation batch sizes and preview up to 4 decoded samples.
dls = dblock.dataloaders(texts, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"many neglect that this isn't just a classic due to the fact that it's the first 3d game, or even the first shoot -'em - up. it's also one of the first stealth games, one of the only ( and definitely the first ) truly claustrophobic games, and just a pretty well - rounded gaming experience in general. with graphics that are terribly dated today, the game thrusts you into the role of b. j. ( don't even * think * i'm going to attempt spelling his last name! ), an american p. o. w. caught in an underground bunker. you fight and search your way through tunnels in order to achieve different objectives for the six episodes ( but, let's face it, most of them are just an excuse to hand you a weapon, surround you with nazis and send you out to waste one of the nazi leaders",positive
2,"well, what can i say. < br / > < br / > "" what the bleep do we know "" has achieved the nearly impossible - leaving behind such masterpieces of the genre as "" the postman "", "" the dungeon master "", "" merlin "", and so fourth, it will go down in history as the single worst movie i have ever seen in its entirety. and that, ladies and gentlemen, is impressive indeed, for i have seen many a bad movie. < br / > < br / > this masterpiece of modern cinema consists of two interwoven parts, alternating between a silly and contrived plot about an extremely annoying photographer, abandoned by her husband and forced to take anti - depressants to survive, and a bunch of talking heads going on about how quantum physics supposedly justifies their new - agy pseudo - philosophy. basically, if",negative
3,"the year 2005 saw no fewer than 3 filmed productions of h. g. wells'great novel, "" war of the worlds "". this is perhaps the least well - known and very probably the best of them. no other version of wotw has ever attempted not only to present the story very much as wells wrote it, but also to create the atmosphere of the time in which it was supposed to take place : the last year of the 19th century, 1900 using wells'original setting, in and near woking, england. < br / > < br / > imdb seems unfriendly to what they regard as "" spoilers "". that might apply with some films, where the ending might actually be a surprise, but with regard to one of the most famous novels in the world, it seems positively silly. i have no sympathy for people who have neglected to",positive


In [15]:
#slow
#hide
# Inspect the raw batch: a tuple of (tokenizer outputs dict, targets).
b = dls.one_batch()
b

({'input_ids': tensor([[  101,  6274,  5125,  ...,  1998,  2130,   102],
          [  101,  1996,  4497,  ...,  2090,  1005,   102],
          [  101,  2085,  2008,  ...,  2839,  1025,   102],
          ...,
          [  101,  1996,  2095,  ..., 13433, 21565,   102],
          [  101,  1045,  2018,  ...,  1007,  1012,   102],
          [  101,  1000,  1996,  ...,  3683,  1010,   102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')},
 TensorCategory([0, 1, 0, 0, 1, 1, 0, 1], device='cuda:0'))

HuggingFace models can compute the loss themselves. To use the loss computed by the model, pass `with_labels=True` to the `TransformersTextBlock` constructor. The `show_batch` output does not change, but the labels are now stored inside the `dict` that is the first (and only) element of the batch.

In [16]:
#slow
# Same columns and splitter as before, but `with_labels=True` makes the batch transform
# store the collated targets under the 'labels' key of the inputs dict.
dblock = DataBlock(blocks = [TransformersTextBlock(tokenizer=tokenizer, with_labels=True), CategoryBlock()],
                   get_x=ItemGetter('text'),
                   get_y=ItemGetter('label'),
                   splitter=ColSplitter())
dls = dblock.dataloaders(texts, bs=8)
dls.show_batch(max_n=4)

Unnamed: 0,text,category
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do",negative
1,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any other ever told, * could * happen to you... a feeling you don't often get from other romantic comedies, however sweet and heart - warming they may be. < br / > < br / > alfred kralik ( james stewart ) and clara novak ( margaret",positive
2,"now that che ( 2008 ) has finished its relatively short australian cinema run ( extremely limited release : 1 screen in sydney, after 6wks ), i can guiltlessly join both hosts of "" at the movies "" in taking steven soderbergh to task. < br / > < br / > it's usually satisfying to watch a film director change his style / subject, but soderbergh's most recent stinker, the girlfriend experience ( 2009 ), was also missing a story, so narrative ( and editing? ) seem to suddenly be soderbergh's main challenge. strange, after 20 - odd years in the business. he was probably never much good at narrative, just hid it well inside "" edgy "" projects. < br / > < br / > none of this excuses him this present, almost diabolical failure. as david stratton warns, "" two parts of che don't ( even",negative
3,"many neglect that this isn't just a classic due to the fact that it's the first 3d game, or even the first shoot -'em - up. it's also one of the first stealth games, one of the only ( and definitely the first ) truly claustrophobic games, and just a pretty well - rounded gaming experience in general. with graphics that are terribly dated today, the game thrusts you into the role of b. j. ( don't even * think * i'm going to attempt spelling his last name! ), an american p. o. w. caught in an underground bunker. you fight and search your way through tunnels in order to achieve different objectives for the six episodes ( but, let's face it, most of them are just an excuse to hand you a weapon, surround you with nazis and send you out to waste one of the nazi leaders",positive


In [17]:
#slow
#hide
# With `with_labels=True` the batch is a 1-tuple whose dict also carries the 'labels' entry.
b = dls.one_batch()
b

({'input_ids': tensor([[  101,  6274,  5125,  ...,  1998,  2130,   102],
          [  101,  1996,  4497,  ...,  2090,  1005,   102],
          [  101,  1045,  2428,  ...,  3272,  1010,   102],
          ...,
          [  101,  1996,  2095,  ..., 13433, 21565,   102],
          [  101,  1045, 12524,  ...,  1999,  2008,   102],
          [  101,  2348,  3858,  ...,  1997,  1996,   102]], device='cuda:0'),
  'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'),
  'labels': TensorCategory([0, 1, 0, 1, 0, 1, 0, 1], device='cuda:0')},)

## Language modeling

In [18]:
#export
class TransformersLMBlock(TransformBlock):
    "A `TransformBlock` for language modeling on pretokenized texts using pretrained tokenizers from Huggingface"
    # @delegates
    def __init__(self, pretrained_model_name=None, tokenizer_cls=AutoTokenizer, 
                 config=None, tokenizer=None, mlm=True, masking_func=None, whole_word_masking=False, 
                 mlm_probability=0.15, **kwargs):
        # Resolve the tokenizer once so both transforms share the same instance
        # instead of each loading its own copy from `pretrained_model_name`.
        if tokenizer is None:
            tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
        # Item-level transform: passes items through on encode, decodes token ids for showing.
        tok_tfm = TokTransform(pretrained_model_name=pretrained_model_name, tokenizer_cls=tokenizer_cls, 
                 config=config, tokenizer=tokenizer, return_special_tokens_mask=True, is_lm=True, **kwargs)
        
        # Batch-level transform: collates the samples and creates the labels
        # (masking behaviour is set by `mlm`, `whole_word_masking`, `mlm_probability` or a custom `masking_func`).
        batch_tfms = LMBatchTfm(pretrained_model_name, tokenizer_cls, config, tokenizer, 
                                mlm=mlm, masking_func=masking_func, whole_word_masking=whole_word_masking,
                                mlm_probability=mlm_probability)
        # NOTE: an unused default `DataCollatorForLanguageModeling(tokenizer)` used to be built here;
        # it ignored the `mlm` argument (its own default is mlm=True), so it was dropped.
        super().__init__(dl_type=TfmdDL,
                         type_tfms=tok_tfm,
                         batch_tfms=batch_tfms,
                         dls_kwargs={'create_batch': fa_convert},
                        )

### DataLoaders for language modeling

In [19]:
#hide
import datasets

In [20]:
#export
def tokenize(batch, tok=None):
    """Tokenize the 'text' field of `batch`, requesting attention and special-tokens masks.

    `tok` is the tokenizer callable to use. When it is omitted, the module-level
    `tokenizer` is looked up at call time, so existing callers keep working.
    """
    if tok is None:
        tok = tokenizer
    return tok(batch['text'], return_attention_mask=True, return_special_tokens_mask=True, verbose=False)

def group_texts(examples, chunk_size=None):
    """Concatenate every column of `examples` and re-split it into equally sized chunks.

    `examples` maps column names to lists of token lists. `chunk_size` is the
    length of each output chunk; when omitted, the module-level `block_size`
    is looked up at call time, so existing callers keep working.
    Returns a dict with the same keys whose values are lists of `chunk_size`-long lists.
    """
    from itertools import chain
    if chunk_size is None:
        chunk_size = block_size
    keys = list(examples.keys())
    # Nothing to group: avoid indexing into an empty key list.
    if not keys:
        return {}
    # Concatenate all texts (linear time, unlike repeated list addition via `sum`).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in keys}
    total_length = len(concatenated_examples[keys[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of `chunk_size`.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [21]:
#slow
# Fetch the IMDB sample and set the tokenizer checkpoint, chunk length and batch sizes.
path = untar_data(URLs.IMDB_SAMPLE)
model_name = 'distilbert-base-uncased'
max_length = 128
bs = 8
val_bs = 16
# `tokenize` reads this module-level name, so it has to exist before the `map` call below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the CSV as a `datasets.Dataset`, tokenize every row and drop the original columns.
ds = datasets.Dataset.from_csv((path/'texts.csv').as_posix())
ds = ds.map(tokenize, remove_columns=ds.column_names)
# `group_texts` reads the global `block_size`, so it must be set before the batched `map`.
block_size = max_length
# Concatenate the tokenized rows in batches of 1000 and re-split them into `block_size`-long chunks.
lm_ds = ds.map(group_texts, batched=True, batch_size=1000)

Using custom data configuration default-baca2dc48733f0f6
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-baca2dc48733f0f6/0.0.0)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-baca2dc48733f0f6/0.0.0/cache-5a397e1707d02d29.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-baca2dc48733f0f6/0.0.0/cache-d9aa489a6e89b70f.arrow


In [22]:
#slow
# Single-block DataBlock (no separate target block: labels are created by the block's batch transform),
# with a random train/validation split.
dblock = DataBlock(blocks=[TransformersLMBlock(tokenizer=tokenizer)],
                   splitter=RandomSplitter())

In [23]:
#slow
#hide
# dblock.summary(lm_ds)

Setting-up type transforms pipelines
Collecting items from Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
    num_rows: 2595
})
Found 2595 items
2 datasets of sizes 2076,519
Setting up Pipeline: TokTransform

Building one sample
  Pipeline: TokTransform
    starting from
      {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [2010, 2282, 1998, 2027, 8526, 1999, 2019, 5976, 4512, 1012, 1996, 5855, 1998, 3772, 15307, 2019, 9788, 23182, 2791, 2000, 1996, 8290, 2090, 1996, 2048, 1010, 4566, 2007, 2014, 17927, 2000, 2383, 2178, 2158, 2040, 2001, 1000, 3080, 1000, 1012, 2203, 1997, 2008, 2466, 101

In [24]:
#slow
# Build the DataLoaders from the chunked dataset and preview up to 4 decoded (masked) samples.
dls = dblock.dataloaders(lm_ds, bs=bs, val_bs=val_bs)
dls.show_batch(max_n=4)

Unnamed: 0,text
0,"family [MASK] the search for love and acceptance after grieving, all ӏ [MASK] is dealt with extremely [MASK]. [MASK] recommended cinematic masterpiece. < br / > < br / > please [MASK] : all of the [MASK] is opposite for the film in question. [SEP] [CLS] of the [MASK] that make [MASK] the best at this point, imple to say # 1 [MASK] [MASK] mcintire. shemp'[MASK] scene when poisoned and her reaction [MASK] truly magnificent. i imagine that, as [MASK] poster suggested, christine was trying [MASK] hold back laughter during that scene [MASK] but it actually [MASK] her seem even more deliciously [MASK], to be smiling at she [MASK] '"
1,"/ [MASK] for what it is, elvira [MASK] quite funny film, even though the script does leave a [MASK] of room for improvement. [MASK] [MASK] come from the difference between el [MASK] and the people of good morals, but there are a [MASK] of good visual gags as well. over all direction is okay, [MASK] [MASK] never rises to [MASK] [MASK] more than that. in all, a good, intentionally camp [MASK], comedy. if you like this kind of thing, that is. [SEP] [CLS] dipped [MASK] when i worlds saw this short [MASK] i [MASK] really laughing progressing hard, [MASK] like with a lot of other films that i have seen, no"
2,again. after watching the 1 1 / 2 i was like wow. all my expectations ( for rep [MASK]te [MASK]ess ) were broken. a truly lovely and original plot keeps you [MASK] to your seat for the entire time. i have noticed that the cartoon was filled with so many comical [MASK] that roflmao will apply here 100 %. < br [MASK] > ∧ br / > i definetly [MASK] seeing the cartoon. [SEP] [CLS] come on. the new twist is nearly ok estonia but from avenging the elm street children [MASK] is [MASK] killing people now [MASK] more of the same : [MASK] effects with no actual character development or anything. simply
3,"the end destroyed the whole story, i think most people aren't lame and when they goes a [MASK] experimental [MASK] [MASK] good end, even if it is [MASK].. [MASK] but the only lame here is the end... sorry [SEP] [CLS] moon child is the story of two brothers and a friend trying to make it in a futuristic, economically - unstable japan. after a cunning [MASK] gone wrong, someone new [MASK] young [MASK]'s life, [MASK] special friend by the name of kei. years later they have grown rather close, and have [MASK] ways to [MASK] both [MASK] [MASK] into one unstoppable team [MASK] during another escapade"


In [25]:
#slow
#hide
# Inspect the raw batch: a 1-tuple holding a dict with 'attention_mask', 'input_ids' and 'labels'.
b = dls.one_batch()
b

({'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]), 'input_ids': tensor([[ 1998,  2008,  1996,  ...,  1037,  3185,  4165],
         [ 3694,   103,  1062,  ...,  2187,  2033,  9364],
         [ 3232,  2003,  1037,  ...,  2024,  6581,  1012],
         ...,
         [ 2006,  1996,  2157,  ...,  1055,  2192,   103],
         [ 2000,  3288,  2014,  ..., 23503,  2040,  2038],
         [ 7770,  4319,  2332,  ...,  1997,  1996,  2466]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         [ -100,  1011,  -100,  ...,  -100,  -100,  -100],
         [ -100,  -100,  -100,  ...,  -100,  6581,  -100],
         ...,
         [ -100,  -100,  -100,  ...,  -100,  -100,  1012],
         [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         [18101,  -100,  -100,  ...,  -100,  -100,  -10

## Fin

In [26]:
#hide
# Convert the project's notebooks into Python modules (see the "Converted ..." lines in the output).
from nbdev.export import notebook2script; notebook2script()

Converted 00_data.ipynb.
Converted 01_learner.ipynb.
Converted 10_examples.classification-imdb.ipynb.
Converted 11_examples.mlm-imdb.ipynb.
Converted index.ipynb.
