# Text Main For Language Model - Streaming

> This module contains the main Python class for the **streaming** version of `TextDataLMController`


- skip_showdoc: true
- skip_exec: true

#| default_exp text_main_lm_streaming

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from datasets import DatasetDict,Dataset,IterableDataset
from pathlib import Path
from that_nlp_library.utils import *
from that_nlp_library.text_main import tokenize_function
from that_nlp_library.text_main_streaming import *
from functools import partial
import warnings
from transformers import DataCollatorForLanguageModeling

In [None]:
import pandas as pd
import numpy as np
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from importlib.machinery import SourceFileLoader
from datasets import load_dataset
import os

## Class TextDataLMControllerStreaming

In [None]:
#| export
class TextDataLMControllerStreaming(TextDataControllerStreaming):
    """Streaming text controller that prepares tokenized data for language modeling.

    The 'train' split is replaced by an `IterableDataset` built from a generator
    that transforms and tokenizes examples on the fly; a 'validation' split, when
    present, is transformed and tokenized through `hf_map_dset`.
    The label- and augmentation-related hooks inherited from the parent class
    raise `NotImplementedError`, since they do not apply to language modeling.
    """
    def __init__(self,
                 inp, # HuggingFace Dataset or DatasetDict
                 main_text:str, # Name of the main text column
                 filter_dict={}, # A dictionary: {feature: filtering_function_for_that_feature}
                 metadatas=[], # Names of the metadata columns
                 process_metas=True, # Whether to do simple text processing on the chosen metadatas
                 content_transformations=[], # A list of text transformations
                 seed=None, # Random seed
                 batch_size=1000, # CPU batch size
                 num_proc=1, # Number of process for multiprocessing
                 cols_to_keep=None, # Columns to keep after all processings
                 verbose=True, # Whether to print processing information
                ):
        # Every argument is forwarded unchanged to the streaming parent controller
        super().__init__(inp=inp,
                         main_text=main_text,
                         filter_dict=filter_dict,
                         metadatas=metadatas,
                         process_metas=process_metas,
                         content_transformations=content_transformations,
                         seed=seed,
                         batch_size=batch_size,
                         num_proc=num_proc,
                         cols_to_keep=cols_to_keep,
                         verbose=verbose
                        )

    def _do_label_transformation(self):
        raise NotImplementedError("There's no classification/regression label in text processing for Language Model")

    def _encode_labels(self):
        raise NotImplementedError("There's no classification/regression label in text processing for Language Model")

    def _do_transformation_augmentation_tokenization(self):
        raise NotImplementedError("There's no augmentation in text processing for Language Model")

    def save_as_pickles(self,
                        fname, # Name of the pickle file
                        parent='pickle_files', # Parent folder
                       ):
        """Pickle this controller via `save_to_pickle`."""
        save_to_pickle(self,fname,parent=parent)

    def _get_lm_tok_func(self):
        """Build the tokenizing callable shared by the validation and train paths.

        When texts are concatenated (`line_by_line` is False), `max_length=-1` is
        passed to `tokenize_function` instead of `self.max_length`; the windows of
        `max_length` tokens are cut later by `_group_texts_with_stride`.
        """
        return partial(tokenize_function,
                       tok=self.tokenizer,
                       max_length=self.max_length if self.line_by_line else -1,
                       return_special_tokens_mask=True
                      )

    def _group_texts_with_stride(self,examples):
        """Concatenate the token lists of a batch and cut them into fixed-size windows.

        `examples` maps each feature name to a list of per-example lists. For every
        feature, the lists are concatenated and split into windows of exactly
        `max_length` items (`self.max_length`, or the tokenizer's `model_max_length`
        when it is None). Consecutive windows start `max_length - self.stride` items
        apart (`max_length` apart when `self.stride` is None). A trailing remainder
        shorter than `max_length` is dropped.

        Raises `ValueError` when the window size is not positive or when the stride
        is equal to or larger than the window size (either would make the window
        start never advance).
        """
        max_length = self.max_length
        if max_length is None:
            max_length = self.tokenizer.model_max_length
        if max_length<=0:
            # A non-positive window can never be filled and would loop forever
            raise ValueError(f'Max length must be a positive integer to concatenate tokens, got {max_length}')

        overlap = self.stride
        step = max_length if overlap is None else max_length-overlap
        if step==0: raise ValueError(f'Stride cannot be equal to max length of {max_length}')
        if step<0: raise ValueError(f'Stride cannot be larger than max length of {max_length}')

        # Concatenate all texts (flat comprehension: linear time, unlike sum(lists, []))
        concatenated_examples = {k: [tok for seq in examples[k] for tok in seq]
                                 for k in examples.keys()}
        total_length = len(next(iter(concatenated_examples.values()), []))

        # Window start offsets are identical for every feature, so compute them once
        starts = range(0, total_length-max_length+1, step)
        return {k: [t[s:s+max_length] for s in starts]
                for k,t in concatenated_examples.items()}

    def _do_transformation_tokenization(self,dtrain):
        """Apply content transformations, tokenize, and optionally group tokens for `dtrain`.

        Returns the processed dataset with the columns listed in `self.cols_to_keep`
        removed after tokenization.
        """
        for tfm in self.content_tfms:
            _func = partial(lambda_map_batch,
                            feature=self.main_text,
                            func=tfm,
                            is_batched=self.is_batched)
            dtrain = hf_map_dset(dtrain,_func,self.is_batched,self.batch_size,self.num_proc)

        _func = partial(lambda_map_batch,
                        feature=self.main_text,
                        func=self._get_lm_tok_func(),
                        output_feature=None,
                        is_batched=self.is_batched)

        dtrain = hf_map_dset(dtrain,_func,self.is_batched,self.batch_size,self.num_proc)
        dtrain = dtrain.remove_columns(self.cols_to_keep)

        if not self.line_by_line: # token concatenation
            dtrain = hf_map_dset(dtrain,
                                 self._group_texts_with_stride,
                                 is_batched=True,
                                 batch_size=self.batch_size if self.batch_size>1 else 1000,
                                 num_proc=self.num_proc)
        return dtrain

    def _construct_generator_with_batch(self,dset,text_name,tok_func,func):
        """Yield tokenized examples from the iterable `dset`, one dictionary at a time.

        `func` is applied to `inp[text_name]` of every item. With `line_by_line`,
        each transformed text is tokenized and yielded on its own. Otherwise texts
        are buffered in batches (`self.batch_size`, or 1000 when it is not greater
        than 1), tokenized together, grouped by `_group_texts_with_stride`, and the
        resulting windows are yielded one by one. Tokens left over at the end of a
        batch that do not fill a whole window are dropped by the grouping step.
        """
        def _tokenize_and_group(texts):
            # tokenize, then token concatenation
            grouped = self._group_texts_with_stride(tok_func(texts))
            num_rows = len(next(iter(grouped.values()), []))
            for i in range(num_rows):
                yield {key: value[i] for key, value in grouped.items()}

        batch_size = self.batch_size if self.batch_size>1 else 1000
        pending=[]
        for row in dset: # dset is generator
            # row[text_name] will be a single item
            text = func(row[text_name])
            if self.line_by_line:
                yield tok_func(text)
                continue
            pending.append(text)
            if len(pending)==batch_size:
                batch, pending = pending, []
                yield from _tokenize_and_group(batch)

        if pending:
            # buffer hasn't reached batch_size (last batch); only non-empty when concatenating
            yield from _tokenize_and_group(pending)

    def _do_transformation_tokenization_generator(self):
        """Replace the 'train' split with an `IterableDataset` that transforms and tokenizes lazily."""
        all_tfms = self.content_tfms
        all_tfms = partial(func_all,functions=all_tfms) if len(all_tfms) else lambda x: x
        # `is not None` so that a seed of 0 is honored as well
        if self.seed is not None:
            seed_everything(self.seed)

        self.main_ddict['train'] = IterableDataset.from_generator(self._construct_generator_with_batch,
                                                   gen_kwargs={'dset': self.main_ddict['train'],
                                                               'text_name':self.main_text,
                                                               'tok_func':self._get_lm_tok_func(),
                                                               'func': all_tfms
                                                              }
                                                                 )

    def process_and_tokenize(self,
                             tokenizer, # Tokenizer (preferably from HuggingFace)
                             max_length=None, # pad to model's allowed max length (default is max_sequence_length). Use -1 for no padding at all
                             line_by_line=True, # Whether to tokenize each sentence separately, or concatenate them
                             stride=None, # option to do striding when line_by_line is False
                            ):
        """Filter, process metadatas, transform and tokenize every split of `self.main_ddict`.

        Results are stored in `self.main_ddict`. If the controller has already been
        processed, a warning is issued and the previously processed `main_ddict` is
        returned without doing any work.
        """
        if self._processed_call:
            warnings.warn('Your dataset has already been processed. Returning the previous processed DatasetDict...')
            return self.main_ddict

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.line_by_line = line_by_line
        self.stride = stride

        # Filtering
        print_msg('Data Filtering',20,verbose=self.verbose)
        for k in self.main_ddict.keys():
            self.main_ddict[k] = self._do_filtering(self.main_ddict[k])
        self.verboseprint('Done')

        # Process metadatas
        print_msg('Metadata Simple Processing & Concatenating to Main Content',verbose=self.verbose)
        for k in self.main_ddict.keys():
            self.main_ddict[k] = self._process_metadatas(self.main_ddict[k])
        self.verboseprint('Done')

        # Dropping unused columns
        self._simplify_ddict()

        # Content transformation + tokenization for validation
        if 'validation' in self.main_ddict.keys():
            print_msg('Performing Content Transformation and Tokenization on Validation Set',verbose=self.verbose)
            self.main_ddict['validation'] = self._do_transformation_tokenization(self.main_ddict['validation'])
            self.verboseprint('Done')

        # Content transformation + tokenization for train
        print_msg('Creating a generator for content transformation and tokenization on Train set',verbose=self.verbose)
        self._do_transformation_tokenization_generator()
        self.verboseprint('Done')

        self._processed_call=True

    def set_data_collator(self,
                          is_mlm=True, # Is this masked language model (True) or causal language model (False)
                          mlm_prob=0.15, # Mask probability for masked language model
                         ):
        """Create `self.data_collator`, a `DataCollatorForLanguageModeling` for the stored tokenizer.

        Raises `ValueError` when `max_length` has not been set on the controller yet.
        """
        if not hasattr(self,'max_length'):
            raise ValueError("Please call `process_and_tokenize' or `do_tokenization` to tokenize your dataset")

        # A negative max_length means no padding was requested at tokenization, so
        # let the collator pad. max_length may be None (the default of
        # `process_and_tokenize`), which must not be compared with an integer.
        pad_to_multiple_of_8 = self.max_length is not None and self.max_length<0
        self.data_collator = DataCollatorForLanguageModeling(tokenizer=self.tokenizer,
                                                             mlm=is_mlm,
                                                             mlm_probability=mlm_prob,
                                                             pad_to_multiple_of=8 if pad_to_multiple_of_8 else None
                                                            )

    def prepare_test_dataset(self,
                             test_dset, # The HuggingFace Dataset as Test set
                             do_filtering=False, # whether to perform data filtering on this test set
                            ):
        raise NotImplementedError("There's no test set preparation for Language Model")

In [None]:
show_doc(TextDataLMControllerStreaming)

---

### TextDataLMControllerStreaming

>      TextDataLMControllerStreaming (inp, main_text:str, filter_dict={},
>                                     metadatas=[], process_metas=True,
>                                     content_transformations=[], seed=None,
>                                     batch_size=1000, num_proc=1,
>                                     cols_to_keep=None, verbose=True)

Initialize self.  See help(type(self)) for accurate signature.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| inp |  |  | HuggingFace Dataset or DatasetDict |
| main_text | str |  | Name of the main text column |
| filter_dict | dict | {} | A dictionary: {feature: filtering_function_for_that_feature} |
| metadatas | list | [] | Names of the metadata columns |
| process_metas | bool | True | Whether to do simple text processing on the chosen metadatas |
| content_transformations | list | [] | A list of text transformations |
| seed | NoneType | None | Random seed |
| batch_size | int | 1000 | CPU batch size |
| num_proc | int | 1 | Number of process for multiprocessing |
| cols_to_keep | NoneType | None | Columns to keep after all processings |
| verbose | bool | True | Whether to print processing information |

## Load data + Basic use case

Dataset source: https://www.kaggle.com/datasets/kavita5/review_ecommerce

With line-by-line tokenization

In [None]:
# Load the reviews CSV as a single Dataset
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
# Hold out 10% of the rows; the held-out part is returned under the 'test' key
ddict_with_val = dset.train_test_split(test_size=0.1)
# Expose the held-out rows under the 'validation' key instead
ddict_with_val['validation'] = ddict_with_val['test']
# Only the train split is streamed: convert it to an iterable dataset
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

In [None]:
ddict_with_val

DatasetDict({
    train: <datasets.iterable_dataset.IterableDataset object>
    validation: Dataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        num_rows: 2349
    })
})

In [None]:
tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    main_text='Review Text',
                                   )

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True)

-------------------- Data Filtering --------------------


Filter:   0%|          | 0/2349 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing Content Transformation and Tokenization on Validation Set -----


Map:   0%|          | 0/2270 [00:00<?, ? examples/s]

Done
----- Creating a generator for content transformation and tokenization on Train set -----
Done


In [None]:
tdc.main_ddict

DatasetDict({
    train: <datasets.iterable_dataset.IterableDataset object>
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 2270
    })
})

In [None]:
for i,v in enumerate(tdc.main_ddict['validation']):
    if i==1:break
    print(f"Input ids: {v['input_ids']}\nDecoded: {tokenizer.decode(v['input_ids'])}\nAttention Mask: {v['attention_mask']}")
    

Input ids: [0, 100, 300, 42, 9540, 142, 939, 956, 402, 909, 8, 802, 5, 2968, 5526, 21, 909, 4, 24, 18, 45, 480, 53, 14, 18, 127, 7684, 8, 939, 657, 24, 6992, 328, 24, 18, 1969, 13, 84, 18586, 877, 842, 19598, 1136, 8, 2608, 480, 45, 350, 1109, 6, 45, 350, 2016, 4, 939, 120, 10, 4866, 9, 33391, 328, 939, 437, 195, 108, 466, 113, 8, 59, 17445, 6, 8, 10, 4761, 10698, 6683, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
for i,v in enumerate(tdc.main_ddict['train']):
    if i==1:break
    print(f"Input ids: {v['input_ids']}\n\nDecoded: {tokenizer.decode(v['input_ids'])}\n\nAttention Mask: {v['attention_mask']}")

Input ids: [0, 713, 16, 41, 15652, 299, 14, 939, 465, 7, 28, 2778, 3473, 4, 939, 218, 75, 2333, 907, 19553, 53, 42, 65, 16, 98, 27360, 8, 1326, 372, 19, 2933, 10397, 10844, 4, 939, 524, 10, 2491, 417, 8, 5, 4761, 21, 10, 1969, 2564, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Filtering + Metadatas + Content Transformation + Tokenization

Define our streaming dataset (with a non-streamed validation set)

In [None]:
# Load the reviews CSV as a single Dataset
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
# Hold out 10% of the rows with a fixed seed, matching the seeded split of the non-streamed comparison below
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
# Expose the held-out rows under the 'validation' key instead of 'test'
ddict_with_val['validation'] = ddict_with_val['test']
# Only the train split is streamed: convert it to an iterable dataset
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

Define our tokenization

In [None]:
from transformers import RobertaTokenizer
from underthesea import text_normalize

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
from that_nlp_library.text_main_lm import TextDataLMController

### Option 1: Tokenize our corpus line-by-line

In [None]:
tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    seed=42,
                                    batch_size=1000,
                                    verbose=False
                                    )

#### With no padding

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

Map:   0%|          | 0/2253 [00:00<?, ? examples/s]

In [None]:
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))

<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s>
<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn't 

Compare to non-streamed version

In [None]:
dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']

tdc2 = TextDataLMController(ddict_with_val2,
                            main_text='Review Text',
                            filter_dict={'Review Text': lambda x: x is not None},
                            metadatas=['Title','Division Name'],
                            content_transformations=[text_normalize,str.lower],
                            seed=42,
                            batch_size=1000,
                            verbose=False
                            )
tdc2.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

Filter (num_proc=4):   0%|          | 0/21137 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/2349 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/20388 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/20388 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/20388 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/20388 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/20388 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

In [None]:
tdc2.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 20388
    })
    validation: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 2253
    })
})

In [None]:
# check whether train set is the same

# check whether validation set is the same

In [None]:
# next(iter(tdc2.main_ddict['train']))

#### With padding 

(set `max_length` to `None` if you want to pad to model's maximum sequence length)

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

NameError: name 'TextDataLMController' is not defined

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
print(tokenizer.decode(tdc.main_ddict['train']['input_ids'][0]))
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


### Option 2: Tokenize every text, then concatenate them together before splitting them in smaller parts.


In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False,
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 13573
    })
    validation: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 3446
    })
})

In [None]:
for i in tdc.main_ddict['train']['input_ids'][:3]:
    print(tokenizer.decode(i))
    print('-'*100)

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made
----------------------------------------------------------------------------------------------------
 me look 6 month pregnant and i'm a petite size 2.</s><s>i love rompers and this one is really cute. i usually wear size 12 but should have got a 10, it runs big. it seems too long, and i'm 5'9 ". the prints cute but a little blah. i paid $ 158 which is too much, since i haven't worn it yet, i should have waited for it to go on sale.</s><s>... the print is so
------------------------------------------------------------------------------------------

In [None]:
for i in tdc.main_ddict['validation']['input_ids'][:3]:
    print(tokenizer.decode(i))
    print('-'*100)

<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s><s>easy to wear! cute, comfy... will be a go to for summer.</s><s>nice sweater, just did not look good on me. sorry, going back.</s><s>this jacket was a little shorter than i had expected, but i still really enjoy the cut and fit of it
----------------------------------------------------------------------------------------------------
.</s><s>i wasn't planning on loving this dress when i tried it on. i loved the the color which is what prompted me to buy it. this dress fit perfectly. it hugs my body without feeling tight. the ruching is perfect. i didn't want to take it off! it's also very comfortable. i'm 5'1 ", 107 lbs and the xs petite fit perfectly. the dress hits me at the same length that is pictured. i think it would
--------------------------------------------------------------------------------------------

### Striding (For Concatenation of tokens)

If your sentences (or paragraphs) are longer than `max_length`, they will be broken apart after concatenation, so a long paragraph loses part of its meaning. **Striding** is a way to partially preserve the sentence's meaning by repeating the end of the previous entry at the start of the next one. We will demonstrate it with an example, and you can compare it with the previous one (without striding) to see the differences.

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False,
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,stride=20)
# Stride is 20, meaning for the next entry, we go back 20 tokens

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
for i in tdc.main_ddict['train']['input_ids'][:3]:
    print(tokenizer.decode(i))
    print('-'*100)

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made
----------------------------------------------------------------------------------------------------
 comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made me look 6 month pregnant and i'm a petite size 2.</s><s>i love rompers and this one is really cute. i usually wear size 12 but should have got a 10, it runs big. it seems too long, and i'm 5'9 ". the prints cute but a little blah. i paid $ 158 which is too much, since i haven't worn it
----------------------------------------------------------------

For the second entry, we can see it starts with the last 20 tokens of the previous entry: `comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made`

In [None]:
for i in tdc.main_ddict['validation']['input_ids'][:3]:
    print(tokenizer.decode(i))
    print('-'*100)

<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s><s>easy to wear! cute, comfy... will be a go to for summer.</s><s>nice sweater, just did not look good on me. sorry, going back.</s><s>this jacket was a little shorter than i had expected, but i still really enjoy the cut and fit of it
----------------------------------------------------------------------------------------------------
 was a little shorter than i had expected, but i still really enjoy the cut and fit of it.</s><s>i wasn't planning on loving this dress when i tried it on. i loved the the color which is what prompted me to buy it. this dress fit perfectly. it hugs my body without feeling tight. the ruching is perfect. i didn't want to take it off! it's also very comfortable. i'm 5'1 ", 107 lbs and the xs pet
---------------------------------------------------------------------------------------------

## Data Collator

In [None]:
from underthesea import text_normalize
from transformers import AutoTokenizer

### For masked language model

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Let's define our text controller first

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

We will tokenize our corpus line-by-line

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

In [None]:
tdc.data_collator

DataCollatorForLanguageModeling(tokenizer=RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True), mlm=True, mlm_probability=0.15, pad_to_multiple_of=8, tf_experimental_compile=False, return_tensors='pt')

Before applying the collator...


In [None]:
print([tdc.main_ddict['train'][i] for i in range(2)])

[{'input_ids': [0, 118, 2740, 42, 804, 8, 21, 5779, 19, 5, 2564, 77, 24, 2035, 479, 939, 2740, 5, 3023, 29, 8, 24, 21, 202, 81, 10799, 7, 5, 477, 9, 145, 29747, 24203, 479, 939, 524, 6764, 195, 128, 361, 22, 59, 8325, 2697, 8, 33, 10, 5342, 7174, 28762, 8, 356, 275, 11, 21543, 29, 14, 33, 103, 3989, 479, 114, 47, 101, 10, 7082, 2564, 42, 429, 28, 13, 47, 479, 5, 1468, 16, 33997, 8, 3279, 8, 3473, 479, 939, 74, 3608, 12926, 159, 10, 1836, 479, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

We can see that the token lists have different lengths

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[91, 24, 79, 82, 121]

Let's apply the collator

In [None]:
out = tdc.data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
out.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

Now all token lists have the same length, which is 128: a multiple of 8 and larger than the longest list in the batch (which is 121)

In [None]:
out['input_ids'].shape

torch.Size([5, 128])

In [None]:
out['input_ids'][:2,:]

tensor([[    0,  8496,  2740,    42,   804,     8, 50264,  5779,    19,     5,
          2564, 50264,    24, 50264,   479, 50264,  2740,     5, 50264,    29,
             8,    24,    21,   202,    81, 50264,     7,     5, 50264,     9,
           145, 29747, 24203,   479,   939,   524,  6764, 50264,   128,   361,
            22,    59,  8325,  2697,     8,    33,    10, 50264,  7174, 28762,
         42013,   356,   275,    11, 21543,    29,    14,    33,   103,  3989,
           479,   114,    47,   101,    10,  7082, 50264,    42,   429,    28,
            13,    47,   479,     5,  1468,    16, 33997,     8,  3279,     8,
          3473,   479,   939,    74,  3608, 12926, 50264,    10,  1836,   479,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

The `labels` have also been constructed: every position whose value is not -100 corresponds to a masked token and holds the original token id that the model has to predict. To increase the number of masked tokens, increase `mlm_prob`

In [None]:
out['labels'][:2,:]

tensor([[ -100,   118,  -100,  -100,  -100,  -100,    21,  -100,  -100,  -100,
          -100,    77,  -100,  2035,  -100,   939,  -100,  -100,  3023,  -100,
          -100,  -100,  -100,  -100,  -100, 10799,  -100,  -100,   477,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,   195,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  5342,  -100,  -100,
             8,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  2564,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,   159,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

If you apply padding in the tokenization step (by adjusting the `max_length` argument), no matter whether it's line-by-line tokenization or not, the data collator will skip the padding step

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[100, 100, 100, 100, 100]

Let's apply the collator

In [None]:
out = tdc.data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
out['input_ids'][:2,:]

tensor([[    0,   118,  2740,    42, 50264,     8,    21,  5779,    19,     5,
          2564,    77,    24,  2035,   479,   939,  2740,     5,  3023,    29,
             8,    24,    21,   202,    81, 10799,     7,     5, 40728,     9,
         50264, 29747, 24203, 50264,   939,   524, 50264,   195,   128,   361,
            22, 50264,  8325,  2697,     8, 50264,    10,  5342,  7174, 28762,
             8,   356,   275,    11, 21543,    29,    14,    33,   103,  3989,
           479,   114, 50264,   101,    10,  7082,  2564,    42,   429,    28,
         50264,    47,   479,     5,  1468,    16, 33997,     8, 50264,     8,
         50264,   479, 50264,    74,  3608, 12926,   159,    10,  1836,   479,
             2,     0,  2527, 29747, 50264, 27785,   269,  5779,   479,   156],
        [  162,   356,   231,   353,  5283,     8,   939,   437, 50264,  4716,
          1459,  1836,   132,   479,     2,     0,   118,   657,   910,  7474,
           268,     8,    42,    65,    16,   269, 

In [None]:
out['labels'][:2,:]

tensor([[ -100,  -100,  -100,  -100,   804,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,    24,  -100,  -100,  -100,  -100,  -100,  -100,   477,  -100,
           145,  -100,  -100,   479,  -100,  -100,  6764,  -100,  -100,  -100,
          -100,    59,  -100,  -100,  -100,    33,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,    47,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
            13,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  3279,  -100,
          3473,  -100,   939,    74,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100, 24203,  -100,  -100,  -100,  -100,  -100],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,    10,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,    42,  -100,  -100,  -100, 

Since we are using the concatenation-of-tokenization technique, one smart thing that HuggingFace's `DataCollatorForLanguageModeling` (which is the data collator we use) does is to allow masking at every position. This is as opposed to the previous case (with line-by-line tokenization), where there is no masking near the end of each list, because those end tokens are padding tokens

### For causal language model

In [None]:
from transformers import AutoTokenizer
from tokenizers import processors

Let's define our GPT2 tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

GPT2 does not use start/end-of-sentence token:

In [None]:
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))

['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone']


If you want to perform concatenation-of-tokenization, and you want your causal LM to differentiate between sentences, you can add a special token to separate sentences, as follows:

In [None]:
tokenizer._tokenizer.post_processor = processors.TemplateProcessing(
    single="$A " + tokenizer.eos_token,
    special_tokens=[(tokenizer.eos_token, tokenizer.eos_token_id)],
)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))

['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone', '<|endoftext|>']


With this modified tokenizer, let's perform concatenation-of-tokenization using GPT2

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Since it's causal language modeling, let's turn off `is_mlm`

In [None]:
tdc.set_data_collator(is_mlm=False)

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[100, 100, 100, 100, 100]

Let's apply the collator

In [None]:
out = tdc.data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
out['input_ids'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764, 50256,
           568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804],
        [  718,  1227, 10423,   290,  1312,  1101,   257,  4273,   578,  2546,
           362,   764, 50256,    72,  1842,   374,  3361,   364,   290,   428,
           530,   318,  1107, 13779,   764,  1312, 

In [None]:
out['labels'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764,  -100,
           568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804],
        [  718,  1227, 10423,   290,  1312,  1101,   257,  4273,   578,  2546,
           362,   764,  -100,    72,  1842,   374,  3361,   364,   290,   428,
           530,   318,  1107, 13779,   764,  1312, 

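For CLM, the `labels` are essentially the same as `input_ids`. The only difference above is the `-100` at each `<|endoftext|>` position (token id 50256): because we set `pad_token` to `eos_token`, the collator treats those tokens as padding and excludes them from the loss. From HuggingFace documentation: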
```
`DataCollatorForLanguageModeling` will take care of creating the language model labels — in causal language modeling the inputs serve as labels too (just shifted by one element), and this data collator creates them on the fly during training.
```

## Save and Load TextDataLMController

In [None]:
show_doc(TextDataLMController.save_as_pickles)

---

### TextDataLMController.save_as_pickles

>      TextDataLMController.save_as_pickles (fname, parent='pickle_files')

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |

In [None]:
show_doc(TextDataLMController.from_pickle)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextDataController.from_pickle

>      TextDataController.from_pickle (fname, parent='pickle_files')

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |

A `TextDataLMController` object can be saved and loaded with ease. This is especially useful after text processing and/or tokenization has been done

In [None]:
from datasets import disable_caching

In [None]:
disable_caching()

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.save_as_pickles('my_lm_tdc')

Load back our object

In [None]:
tdc2 = TextDataLMController.from_pickle('my_lm_tdc')

You can still access all its attributes, data, preprocessings, transformations ...

In [None]:
tdc2.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 18111
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 4529
    })
})

In [None]:
tdc2.filter_dict,tdc2.content_tfms

({'Review Text': <function __main__.<lambda>(x)>},
 [<function underthesea.pipeline.text_normalize.text_normalize(text, tokenizer='underthesea')>,
  <method 'lower' of 'str' objects>])

In [None]:
# #| hide
# import nbdev; nbdev.nbdev_export()