# Text Main For Language Model - Streaming

> This module contains the main Python class for the **streaming** version of `TextDataLMController`


- skip_showdoc: true
- skip_exec: true

In [None]:
#| default_exp text_main_lm_streaming

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from datasets import Dataset,IterableDataset
from that_nlp_library.utils import *
from that_nlp_library.text_main import tokenize_function
from that_nlp_library.text_main_streaming import *
from functools import partial
from collections import defaultdict
import warnings
from transformers import DataCollatorForLanguageModeling

In [None]:
import pandas as pd
import numpy as np
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from importlib.machinery import SourceFileLoader
from datasets import load_dataset
import os

## Class TextDataLMControllerStreaming

In [None]:
#| export
class TextDataLMControllerStreaming(TextDataControllerStreaming):
    def __init__(self,
                 inp, # HuggingFace Dataset or DatasetDict
                 main_text:str, # Name of the main text column
                 filter_dict={}, # A dictionary: {feature: filtering_function_for_that_feature}
                 metadatas=[], # Names of the metadata columns
                 process_metas=True, # Whether to do simple text processing on the chosen metadatas
                 content_transformations=[], # A list of text transformations
                 seed=None, # Random seed
                 batch_size=1024, # Transformation + Tokenization batch size
                 num_proc=1, # Number of process for multiprocessing
                 cols_to_keep=None, # Columns to keep after all processings
                 verbose=True, # Whether to print processing information
                ):
        """Set up the streaming language-model controller by forwarding every argument to the parent class."""
        # Nothing is stored here directly: all configuration is handled by
        # TextDataControllerStreaming.__init__, which receives the arguments unchanged.
        parent_kwargs = dict(inp=inp,
                             main_text=main_text,
                             filter_dict=filter_dict,
                             metadatas=metadatas,
                             process_metas=process_metas,
                             content_transformations=content_transformations,
                             seed=seed,
                             batch_size=batch_size,
                             num_proc=num_proc,
                             cols_to_keep=cols_to_keep,
                             verbose=verbose)
        super().__init__(**parent_kwargs)
            
    
    def _do_label_transformation(self):
        raise NotImplementedError("There's no classification/regression label in text processing for Language Model")
        
    def _encode_labels(self):
        raise NotImplementedError("There's no classification/regression label in text processing for Language Model")
    
    def _do_transformation_augmentation_tokenization(self):
        raise NotImplementedError("There's no augmentation in text processing for Language Model")


    def save_as_pickles(self,
                        fname, # Name of the pickle file
                        parent='pickle_files', # Parent folder
                       ):
        """Hand this controller object to `save_to_pickle` under the given file name and parent folder."""
        location = dict(parent=parent)
        save_to_pickle(self, fname, **location)
        
    def _group_texts_with_stride(self,examples):
        max_length = self.max_length
        if max_length is None: 
            max_length = self.tokenizer.model_max_length
        stride = self.stride
        if stride is None: stride=max_length
        else: stride = max_length-stride
        if stride==0: raise ValueError(f'Stride cannot be equal to max length of {max_length}')
            
        # Concatenate all texts.
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        result_all={}
        for k,t in concatenated_examples.items():
            result=[]
            i=0
            while i+max_length<=total_length:
                result.append(t[i:i+max_length])
                i+=stride
            result_all[k]=result
        
        return result_all  
    
    
    def _do_transformation_tokenization(self,dtrain):
        """Apply the content transformations, tokenize, and optionally group tokens into windows.

        Steps, all run through `hf_map_dset`:
        1. every function in `self.content_tfms` is mapped over the main text column;
        2. the main text column is tokenized (with `self.max_length` when
           `self.line_by_line` is set, otherwise with max_length=-1);
        3. when `self.line_by_line` is not set, the `self.cols_to_keep` columns are
           removed and the token lists are regrouped by `_group_texts_with_stride`.

        Returns the processed dataset.
        """
        # Content transformations, one map pass per function
        if len(self.content_tfms):
            for text_tfm in self.content_tfms:
                tfm_mapper = partial(lambda_map_batch,
                                     feature=self.main_text,
                                     func=text_tfm,
                                     is_batched=self.is_batched)
                dtrain = hf_map_dset(dtrain,tfm_mapper,self.is_batched,self.batch_size,self.num_proc)

        concatenate_tokens = not self.line_by_line

        # Tokenization
        tokenize = partial(tokenize_function,
                           tok=self.tokenizer,
                           max_length=-1 if concatenate_tokens else self.max_length,
                           return_special_tokens_mask=True
                          )
        tok_mapper = partial(lambda_map_batch,
                             feature=self.main_text,
                             func=tokenize,
                             output_feature=None,
                             is_batched=self.is_batched)
        dtrain = hf_map_dset(dtrain,tok_mapper,self.is_batched,self.batch_size,self.tok_num_proc)

        if not concatenate_tokens:
            return dtrain

        # Token concatenation: drop the kept columns first, then regroup the token lists
        dtrain = dtrain.remove_columns(self.cols_to_keep)
        return hf_map_dset(dtrain,
                           self._group_texts_with_stride,
                           is_batched=True,
                           batch_size=self.batch_size,
                           num_proc=self.tok_num_proc)
    
    
    def _construct_generator_with_batch(self,dset):
        """Generator that processes `dset` in chunks of `self.batch_size` rows.

        Rows from `dset` are buffered column by column. Each time the buffer holds
        `self.batch_size` rows it is turned into a `Dataset`, passed through
        `_do_transformation_tokenization`, and the resulting rows are yielded one
        by one. Any rows left over at the end are processed the same way.
        """
        def _process(columns):
            # Buffered columns -> Dataset -> transformed and tokenized Dataset
            return self._do_transformation_tokenization(Dataset.from_dict(columns))

        buffer = defaultdict(list)
        for row in dset: # dset is generator
            # each row holds a single item per column
            for name,value in row.items():
                buffer[name].append(value)

            if len(buffer[self.main_text])==self.batch_size:
                # a full batch (self.batch_size) is created
                yield from _process(buffer)
                buffer = defaultdict(list)

        if len(buffer[self.main_text]):
            # hasn't reached batch_size (of last batch)
            yield from _process(buffer)

    def _do_transformation_tokenization_generator(self):
        """Replace the train split with an `IterableDataset` driven by `_construct_generator_with_batch`.

        `self.num_proc` and `self.tok_num_proc` are set to 1 while the iterable
        dataset is created and are restored afterwards, even if creation fails.
        """
        saved_num_proc = self.num_proc
        saved_tok_num_proc = self.tok_num_proc
        self.num_proc = 1
        # The original assigned to a throwaway local (`self_tok_num_proc`), so the
        # tokenization process count was never actually overridden.
        self.tok_num_proc = 1
        try:
            # NOTE(review): the generator body presumably only runs when the dataset is
            # iterated, i.e. after the values below are restored -- confirm whether the
            # single-process override is meant to apply at iteration time as well.
            self.main_ddict['train'] = IterableDataset.from_generator(self._construct_generator_with_batch,
                                                                      gen_kwargs={'dset': self.main_ddict['train']}
                                                                     )
        finally:
            self.num_proc = saved_num_proc
            self.tok_num_proc = saved_tok_num_proc
    
    def _do_transformation_tokenization_generator_fast(self):
        """Replace the train split with an `IterableDataset` that transforms and tokenizes row by row.

        Only meant for line-by-line tokenization with no padding: each row's main
        text goes through all content transformations and is then tokenized with
        max_length=-1. The tokenizer output is merged on top of the original row,
        so the row's other columns are kept.
        """
        # no padding for tokenization
        tokenize = partial(tokenize_function,
                           tok=self.tokenizer,
                           max_length=-1,
                           return_special_tokens_mask=True
                          )

        # Chain all content transformations, or fall back to the identity function
        tfms = self.content_tfms
        if len(tfms):
            transform = partial(func_all,functions=tfms)
        else:
            transform = lambda x: x

        def _row_generator(dset,tok_func,all_tfms):
            for row in dset:
                # row[self.main_text] is a single item
                tokenized = tok_func(all_tfms(row[self.main_text]))
                # tokenizer output is layered over the original row's columns
                yield dict(row,**tokenized)

        self.main_ddict['train'] = IterableDataset.from_generator(_row_generator,
                                                                  gen_kwargs={'dset': self.main_ddict['train'],
                                                                              'tok_func': tokenize,
                                                                              'all_tfms': transform
                                                                             }
                                                                 )

    
    def process_and_tokenize(self,
                             tokenizer, # Tokenizer (preferably from HuggingFace)
                             max_length=None, # pad to model's allowed max length (default is max_sequence_length). Use -1 for no padding at all
                             tok_num_proc=None, # Number of processes for tokenization
                             line_by_line=True, # Whether to tokenize each sentence separately, or concatenate them
                             stride=None, # option to do striding when line_by_line is False
                            ):
        """Run filtering, metadata processing, content transformation and tokenization.

        Every split is filtered and has its metadatas processed, then unused
        columns are dropped. The validation split (if present) is transformed and
        tokenized immediately; the train split is replaced by a lazy generator that
        does the same work on iteration.

        If the dataset was already processed, a warning is issued and the existing
        `self.main_ddict` is returned; otherwise nothing is returned.
        Raises ValueError when `line_by_line` is False and the batch size is 1.
        """
        if self._processed_call:
            warnings.warn('Your dataset has already been processed. Returning the previous processed DatasetDict...')
            return self.main_ddict
        
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.line_by_line = line_by_line
        if not self.line_by_line and self.batch_size==1:
            raise ValueError('Cannot perform token concatenation with batch size of 1')
        self.stride = stride        
        # fall back to the general process count when none is given for tokenization
        self.tok_num_proc = tok_num_proc if tok_num_proc else self.num_proc
        
        # Filtering
        print_msg('Data Filtering',20,verbose=self.verbose)
        for k in self.main_ddict.keys():   
            self.main_ddict[k] = self._do_filtering(self.main_ddict[k])
        self.verboseprint('Done')

        
        # Process metadatas
        print_msg('Metadata Simple Processing & Concatenating to Main Content',verbose=self.verbose)
        for k in self.main_ddict.keys():   
            self.main_ddict[k] = self._process_metadatas(self.main_ddict[k])
        self.verboseprint('Done')

        # Dropping unused columns
        self._simplify_ddict()

        # `is not None` rather than truthiness, so that a seed of 0 is honoured too
        if self.seed is not None:
            seed_everything(self.seed)
            
        # Content transformation + tokenization for validation
        if 'validation' in self.main_ddict.keys():
            print_msg('Performing Content Transformation and Tokenization on Validation Set',verbose=self.verbose)
            self.main_ddict['validation'] = self._do_transformation_tokenization(self.main_ddict['validation'])
            self.verboseprint('Done')
        
        # Content transformation + tokenization for train
        print_msg('Creating a generator for content transformation and tokenization on Train set',verbose=self.verbose)
        if line_by_line and max_length is not None and max_length<0: # line-by-line tokenization with no padding
            self._do_transformation_tokenization_generator_fast()
        else:
            self._do_transformation_tokenization_generator()
        self.verboseprint('Done')
        
        self._processed_call=True
    
    def set_data_collator(self,
                          is_mlm=True, # Is this masked language model (True) or causal language model (False)
                          mlm_prob=0.15, # Mask probability for masked language model
                         ):
        """Create a `DataCollatorForLanguageModeling` and store it in `self.data_collator`.

        The collator is asked to pad to a multiple of 8 only when `self.max_length`
        is negative; otherwise `pad_to_multiple_of` is None.

        Raises ValueError when `self.max_length` has not been set yet.
        """
        if not hasattr(self,'max_length'):
            raise ValueError("Please call `process_and_tokenize' or `do_tokenization` to tokenize your dataset")
            
        # get data collator to pad when tokenizer does not apply padding.
        # max_length may be None (the default of `process_and_tokenize`); comparing
        # None with 0 would raise TypeError, so None is handled explicitly.
        pad_to_multiple_of_8 = self.max_length is not None and self.max_length<0
        self.data_collator = DataCollatorForLanguageModeling(tokenizer=self.tokenizer,
                                                             mlm=is_mlm,
                                                             mlm_probability=mlm_prob,
                                                             pad_to_multiple_of=8 if pad_to_multiple_of_8 else None
                                                            )
                                               
    def prepare_test_dataset_from_raws(self,
                                       content, # Either a single sentence, a list of sentences, or a dictionary mapping column names to lists of values
                                       do_tokenize=False, # Whether to tokenize text
                                      ):
        """Build a `Dataset` from raw content and process it with `prepare_test_dataset`.

        A single string or a list of sentences is placed under the main text
        column; a dictionary is used as given (its values go through
        `val2iterable`). The caller's dictionary is not modified.

        `self.num_proc` and `self.tok_num_proc` are set to 1 during processing and
        restored afterwards, even when processing raises.

        Returns whatever `prepare_test_dataset` returns.
        Raises ValueError when metadatas are configured but `content` is not a dictionary.
        """
        if len(self.metadatas) and not isinstance(content,dict):
            raise ValueError(f'There is/are metadatas in the preprocessing step. Please include a dictionary including these keys for metadatas: {self.metadatas}, and texture content: {self.main_text}')
            
        if isinstance(content,dict):
            raw = content
        elif isinstance(content,str):
            raw = {self.main_text:[content]}
        else:
            # a list (or other iterable) of sentences; previously this path crashed on `.keys()`
            raw = {self.main_text:list(content)}
        # build a fresh dict so that a dictionary passed by the caller is left untouched
        _dic = {k: val2iterable(v) for k,v in raw.items()}
        
        test_dict = Dataset.from_dict(_dic)
        
        # set num_proc to 1 for small data processing
        _tmp1 = self.num_proc
        _tmp2 = self.tok_num_proc
        self.num_proc=1
        self.tok_num_proc=1
        try:
            return self.prepare_test_dataset(test_dict,do_tokenize)
        finally:
            self.num_proc = _tmp1
            self.tok_num_proc=_tmp2
        
    def prepare_test_dataset(self,
                             test_dset, # The HuggingFace Dataset as Test set
                             do_tokenize, # Whether to tokenize text
                            ):
        """Apply the stored preprocessing (and optionally tokenization) to a test dataset.

        Metadata processing and content transformation are applied, every column
        not listed in `self.cols_to_keep` is removed, and, when `do_tokenize` is
        true, the main text column is tokenized.

        Returns the processed dataset.
        Raises ValueError when the test set lacks any column of `self.cols_to_keep`.
        """
        original_cols = set(get_dset_col_names(test_dset))
        absent = set(self.cols_to_keep) - original_cols
        if absent:
            raise ValueError(f'Test set does not have these columns required for preprocessings: {absent}')

        print_msg('Start Test Set Transformation',20,verbose=self.verbose)

        # Process metadatas, then content transformation
        test_dset = self._process_metadatas(test_dset)
        test_dset = self._do_transformation(test_dset)

        # Drop unused columns (judged against the columns the test set came in with)
        unused = original_cols - set(self.cols_to_keep)
        test_dset = test_dset.remove_columns(list(unused))

        if do_tokenize:
            print_msg('Tokenization',20,verbose=self.verbose)
            tokenize = partial(tokenize_function,
                               tok=self.tokenizer,
                               max_length=self.max_length if self.line_by_line else -1,
                               return_special_tokens_mask=True
                              )
            tok_mapper = partial(lambda_map_batch,
                                 feature=self.main_text,
                                 func=tokenize,
                                 output_feature=None,
                                 is_batched=self.is_batched)
            test_dset = hf_map_dset(test_dset,tok_mapper,self.is_batched,self.batch_size,self.tok_num_proc)

        self.verboseprint('Done')
        return test_dset

In [None]:
show_doc(TextDataLMControllerStreaming)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main_lm_streaming.py#L18){target="_blank" style="float:right; font-size:smaller"}

### TextDataLMControllerStreaming

>      TextDataLMControllerStreaming (inp, main_text:str, filter_dict={},
>                                     metadatas=[], process_metas=True,
>                                     content_transformations=[], seed=None,
>                                     batch_size=1024, num_proc=1,
>                                     cols_to_keep=None, verbose=True)

Initialize self.  See help(type(self)) for accurate signature.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| inp |  |  | HuggingFace Dataset or DatasetDict |
| main_text | str |  | Name of the main text column |
| filter_dict | dict | {} | A dictionary: {feature: filtering_function_for_that_feature} |
| metadatas | list | [] | Names of the metadata columns |
| process_metas | bool | True | Whether to do simple text processing on the chosen metadatas |
| content_transformations | list | [] | A list of text transformations |
| seed | NoneType | None | Random seed |
| batch_size | int | 1024 | Transformation + Tokenization batch size |
| num_proc | int | 1 | Number of process for multiprocessing |
| cols_to_keep | NoneType | None | Columns to keep after all processings |
| verbose | bool | True | Whether to print processing information |

## Load data + Basic use case

Dataset source: https://www.kaggle.com/datasets/kavita5/review_ecommerce

With line-by-line tokenization

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

In [None]:
ddict_with_val

DatasetDict({
    train: IterableDataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        n_shards: 1
    })
    validation: Dataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        num_rows: 2349
    })
})

In [None]:
tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    main_text='Review Text',
                                   )

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
show_doc(TextDataLMControllerStreaming.process_and_tokenize)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main_lm_streaming.py#L181){target="_blank" style="float:right; font-size:smaller"}

### TextDataLMControllerStreaming.process_and_tokenize

>      TextDataLMControllerStreaming.process_and_tokenize (tokenizer,
>                                                          max_length=None,
>                                                          tok_num_proc=None,
>                                                          line_by_line=True,
>                                                          stride=None)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| tokenizer |  |  | Tokenizer (preferably from HuggingFace) |
| max_length | NoneType | None | pad to model's allowed max length (default is max_sequence_length). Use -1 for no padding at all |
| tok_num_proc | NoneType | None | Number of processes for tokenization |
| line_by_line | bool | True | Whether to tokenize each sentence separately, or concatenate them |
| stride | NoneType | None | option to do striding when line_by_line is False |

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True)

-------------------- Data Filtering --------------------


Filter:   0%|          | 0/2349 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing Content Transformation and Tokenization on Validation Set -----


Map:   0%|          | 0/2270 [00:00<?, ? examples/s]

Done
----- Creating a generator for content transformation and tokenization on Train set -----
Done


In [None]:
tdc.main_ddict

DatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 1
    })
    validation: Dataset({
        features: ['Review Text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 2270
    })
})

In [None]:
for i,v in enumerate(tdc.main_ddict['validation']):
    if i==1:break
    print(f"Input ids: {v['input_ids']}\nDecoded: {tokenizer.decode(v['input_ids'])}\nAttention Mask: {v['attention_mask']}")
    

Input ids: [0, 243, 21, 657, 23, 78, 6112, 13, 162, 328, 939, 524, 4716, 1459, 195, 108, 176, 59, 12312, 23246, 8, 5, 650, 1006, 157, 13, 162, 4, 939, 2333, 645, 3023, 29, 11, 144, 1964, 11, 6215, 6, 53, 142, 51, 129, 56, 10, 650, 314, 11, 5, 1400, 6, 939, 802, 24, 74, 173, 25, 939, 109, 101, 127, 23734, 10, 410, 2671, 4, 77, 939, 1381, 24, 15, 6, 24, 21, 5, 1969, 1836, 328, 101, 5, 97, 37102, 26, 6, 939, 115, 192, 596, 24, 189, 45, 173, 13, 167, 19, 10, 35682, 7050, 443, 4, 939, 524, 15, 5, 2735, 526, 11, 14, 443, 8, 5, 754, 14, 939, 439, 62, 7, 10, 650, 1386, 9, 3023, 29, 189, 2489, 705, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
for i,v in enumerate(tdc.main_ddict['train']):
    if i==1:break
    print(f"Input ids: {v['input_ids']}\n\nDecoded: {tokenizer.decode(v['input_ids'])}\n\nAttention Mask: {v['attention_mask']}")

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Input ids: [0, 713, 16, 30, 444, 5, 847, 990, 6, 144, 18842, 65, 2125, 939, 33, 655, 1381, 15, 328, 24, 10698, 6683, 8, 16, 34203, 11, 5, 235, 2127, 6, 24, 630, 75, 311, 350, 203, 12479, 25571, 8, 16, 6473, 615, 13, 284, 1061, 4, 3668, 657, 42, 3235, 11, 13504, 4, 938, 75, 11, 657, 19, 5, 1079, 9, 5, 8117, 6, 2818, 51, 283, 66, 11, 55, 2705, 8089, 23090, 1010, 328, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

## Filtering + Metadatas + Content Transformation + Tokenization

Define our tokenization

In [None]:
from transformers import RobertaTokenizer
from underthesea import text_normalize

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
from that_nlp_library.text_main_lm import TextDataLMController

### Option 1: Tokenize our corpus line-by-line

#### With no padding

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

In [None]:
tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    cols_to_keep=['Clothing ID','Review Text'],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=False
                                    )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

In [None]:
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print()
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))

<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s>

<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn't

In [None]:
%%time
for i,v in enumerate(tdc.main_ddict['train']):
    if i%100==0:
        print(i)
    if i==1024-1:
        break
    pass

0
100
200
300
400
500
600
700
800
900
1000
CPU times: user 961 ms, sys: 95 µs, total: 961 ms
Wall time: 954 ms


Compare to non-streamed version

In [None]:
dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']

tdc2 = TextDataLMController(ddict_with_val2,
                            main_text='Review Text',
                            filter_dict={'Review Text': lambda x: x is not None},
                            metadatas=['Title','Division Name'],
                            content_transformations=[text_normalize,str.lower],
                            cols_to_keep=['Clothing ID','Review Text'],
                            seed=42,
                            batch_size=1024,
                            verbose=False
                            )
tdc2.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1,shuffle_trn=False)

In [None]:
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])

In [None]:
iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a['input_ids']==b['input_ids']

In [None]:
print(a)
print('-'*20)
print(b)

{'Clothing ID': 1056, 'Review Text': 'general . perfect pant . I picked these up the other day looking for a good jeans alternative. i love them. they are the perfect fit of slim but not skinny. i went with my normal size (26) and so far after one wear, they are still in good shape. a little bit of stretch, but not too much. the moss color is so crisp and goes with a lot. they will be perfect for transitioning into fall.', 'input_ids': [0, 15841, 479, 1969, 16259, 479, 939, 2738, 209, 62, 5, 97, 183, 546, 13, 10, 205, 10844, 3626, 479, 939, 657, 106, 479, 51, 32, 5, 1969, 2564, 9, 11875, 53, 45, 22877, 479, 939, 439, 19, 127, 2340, 1836, 36, 973, 4839, 8, 98, 444, 71, 65, 3568, 2156, 51, 32, 202, 11, 205, 3989, 479, 10, 410, 828, 9, 4140, 2156, 53, 45, 350, 203, 479, 5, 40711, 3195, 16, 98, 17766, 8, 1411, 19, 10, 319, 479, 51, 40, 28, 1969, 13, 26135, 88, 1136, 479, 2], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# check whether validation set is the same
assert len(list(tdc.main_ddict['validation']))==len(tdc2.main_ddict['validation'])

iter1 = iter(tdc.main_ddict['validation'])
iter2 = iter(tdc2.main_ddict['validation'])
for a,b in zip(iter1,iter2):
    assert a==b

#### With padding 

(set `max_length` to `None` if you want to pad to model's maximum sequence length)

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    main_text='Review Text',
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    metadatas=['Title','Division Name'],
                                    content_transformations=[text_normalize,str.lower],
                                    cols_to_keep=['Clothing ID','Review Text'],
                                    seed=42,
                                    batch_size=1024,
                                    verbose=True
                                    )
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=256,tok_num_proc=1)

-------------------- Data Filtering --------------------
Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing Content Transformation and Tokenization on Validation Set -----
Done
----- Creating a generator for content transformation and tokenization on Train set -----
Done


In [None]:
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print()
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. on

Compare to non-streamed version

In [None]:
dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2['test']
del ddict_with_val2['test']

tdc2 = TextDataLMController(ddict_with_val2,
                            main_text='Review Text',
                            filter_dict={'Review Text': lambda x: x is not None},
                            metadatas=['Title','Division Name'],
                            content_transformations=[text_normalize,str.lower],
                            cols_to_keep=['Clothing ID','Review Text'],
                            seed=42,
                            batch_size=1024,
                            verbose=False
                            )
tdc2.process_and_tokenize(tokenizer,line_by_line=True,max_length=256,shuffle_trn=False,tok_num_proc=1)

In [None]:
# check whether train sets are the same
assert len(list(tdc.main_ddict['train']))==len(tdc2.main_ddict['train'])

In [None]:
iter1 = iter(tdc.main_ddict['train'])
iter2 = iter(tdc2.main_ddict['train'])
for a,b in zip(iter1,iter2):
    assert a==b

In [None]:
# check whether validation set is the same
# Only the train split is streamed; 'validation' is indexed like a regular map-style
# Dataset elsewhere in this notebook, so its length can be taken directly instead of
# materializing every row with list() first.
assert len(tdc.main_ddict['validation'])==len(tdc2.main_ddict['validation'])

# With equal lengths established, zip() pairs every row of both splits in order
for a,b in zip(tdc.main_ddict['validation'],tdc2.main_ddict['validation']):
    assert a==b

### Option 2: Tokenize every text, then concatenate them together before splitting them into smaller parts.


In [None]:
# 90/10 split with a fixed seed; only the train part is turned into a streaming (iterable)
# dataset, while the held-out part is stored as 'validation'
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
ddict_with_val['validation'] = ddict_with_val.pop('test')

tdc = TextDataLMControllerStreaming(
    ddict_with_val,
    main_text='Review Text',
    filter_dict={'Review Text': lambda x: x is not None},
    metadatas=['Title','Division Name'],
    content_transformations=[text_normalize,str.lower],
    seed=42,
    batch_size=1024,
    verbose=False,
)
# line_by_line=False selects the concatenate-then-split tokenization described in this section
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=256,tok_num_proc=1)

In [None]:
# Decode the first tokenized record pulled from the streamed train split...
print(tokenizer.decode(next(iter(tdc.main_ddict['train']))['input_ids']))
print()
# ...and the first record of the validation split, which is accessed by column and index
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))

<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s><s>general. not as short on me ( petite ). i ordered the xxs p as this dress is not a fitted dress, and that was the right size for me. only thing is the length is a bit linger still 9 lower on calf for me ), the straps are almost tight, so i would say the dress is a reversed taper shape. color is beautiful, i ordered green as the other color ( plum ) doesn't have petite available. green is rich, and classy, the fabric i

In [None]:
%%time
# Stream the first 1024 train records, printing the index of every 100th one as progress
for idx,_ in enumerate(tdc.main_ddict['train']):
    if idx%100==0:
        print(idx)
    if idx==1023:
        break

0
100
200
300
400
500
600
700
800
900
1000
CPU times: user 11.8 s, sys: 23.9 ms, total: 11.8 s
Wall time: 11.8 s


Compare to non-streamed version

In [None]:
# Non-streamed baseline: 90/10 split with a fixed seed, the held-out part stored as 'validation'
dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2.pop('test')

tdc2 = TextDataLMController(
    ddict_with_val2,
    main_text='Review Text',
    filter_dict={'Review Text': lambda x: x is not None},
    metadatas=['Title','Division Name'],
    content_transformations=[text_normalize,str.lower],
    seed=42,
    batch_size=1024,
    verbose=False,
)
# shuffle_trn=False: the following cells compare rows pairwise against the streamed
# controller, which presumably requires the train order to be left untouched
tdc2.process_and_tokenize(tokenizer,line_by_line=False,max_length=256,shuffle_trn=False,tok_num_proc=1)

In [None]:
# check whether train sets are the same
# The streamed train split has to be materialized before it can be counted. Keep that
# list and reuse it for the row-by-row comparison, rather than streaming (and therefore
# re-processing) the whole train split a second time.
streamed_train = list(tdc.main_ddict['train'])
assert len(streamed_train)==len(tdc2.main_ddict['train'])

# With equal lengths established, zip() pairs every streamed row with its counterpart
for a,b in zip(streamed_train,tdc2.main_ddict['train']):
    assert a==b

In [None]:
# check whether validation set is the same
# Only the train split is streamed; 'validation' is indexed like a regular map-style
# Dataset elsewhere in this notebook, so its length can be taken directly instead of
# materializing every row with list() first.
assert len(tdc.main_ddict['validation'])==len(tdc2.main_ddict['validation'])

# With equal lengths established, zip() pairs every row of both splits in order
for a,b in zip(tdc.main_ddict['validation'],tdc2.main_ddict['validation']):
    assert a==b

### Striding (For Concatenation of tokens)

If your sentences (or paragraphs) are longer than `max_length`, they will be broken apart after concatenation, leaving your long paragraph incomplete in terms of meaning. **Striding** is a way to partially preserve the sentence's meaning, by carrying part of the previous entry over into the next one. We will demonstrate it with an example, and you can compare it with the previous one (without striding) to see the differences

In [None]:
# 90/10 split with a fixed seed; only the train part is turned into a streaming (iterable)
# dataset, while the held-out part is stored as 'validation'
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
ddict_with_val['validation'] = ddict_with_val.pop('test')

tdc = TextDataLMControllerStreaming(
    ddict_with_val,
    main_text='Review Text',
    filter_dict={'Review Text': lambda x: x is not None},
    metadatas=['Title','Division Name'],
    content_transformations=[text_normalize,str.lower],
    seed=42,
    batch_size=1024,
    verbose=False,
)

In [None]:
# Concatenation-of-tokens mode (line_by_line=False) with entries of 100 tokens
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,stride=20,tok_num_proc=1)
# Stride is 20: each entry starts with the last 20 tokens of the previous entry,
# so consecutive entries overlap by 20 tokens

In [None]:
# Decode the first two streamed train entries, separated by a dashed rule
for idx,entry in enumerate(tdc.main_ddict['train']):
    if idx>=2:
        break
    print(tokenizer.decode(entry['input_ids']))
    print('-'*20)
    

<s>general petite. beautiful top, worth the necessary tailoring. the beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back. however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ",
--------------------
 however, the fabric is beautiful, the fit was perfect ( size 2, 5'4 ", 106 lbs. ), the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si</s><s>general. not as short on me ( petite ). i ordered the xxs p as this dress is not a fitted dress, and that was the right size for me. only thing is the length is a
--------------------


In [None]:
# Decode the first two validation entries, one per line
for idx in (0,1):
    print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][idx]))

<s>general. soft, feminine and fun pockets!. i love this tunic. purchased the dark orange in medium ( i am 5'9 and 140 lbs ). tried the small and almost kept it but i felt seams around my arm pits a tad, so went with the medium and glad i did - this top should be comfortable. feels very fall and perfect for casual get-togethers and running around town. only comment is that it is rayon... and for me anyway rayon doesn
 running around town. only comment is that it is rayon... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs.</s><s>general petite. a new staple!. tried these on out of sheer curiosity -- i've got a long torso & was pleasantly surprised how flattering they are! they manage to look flowing & sleek without shortening the legs. took a size 6 with my 27 " waist, 37 " hips. it's a bit


For the second entry, we can see it starts with the last 20 tokens of the previous entry

Compare to non-streamed version

In [None]:
# Non-streamed baseline: 90/10 split with a fixed seed, the held-out part stored as 'validation'
dset2 = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val2 = dset2.train_test_split(test_size=0.1,seed=42)
ddict_with_val2['validation'] = ddict_with_val2.pop('test')

tdc2 = TextDataLMController(
    ddict_with_val2,
    main_text='Review Text',
    filter_dict={'Review Text': lambda x: x is not None},
    metadatas=['Title','Division Name'],
    content_transformations=[text_normalize,str.lower],
    seed=42,
    batch_size=1024,
    verbose=False,
)
# Same max_length and stride as the streamed controller; shuffle_trn=False because the
# following cells compare rows pairwise, which presumably requires an untouched train order
tdc2.process_and_tokenize(
    tokenizer,
    line_by_line=False,
    max_length=100,
    shuffle_trn=False,
    stride=20,
    tok_num_proc=1,
)

In [None]:
# check whether train sets are the same
# The streamed train split has to be materialized before it can be counted. Keep that
# list and reuse it for the row-by-row comparison, rather than streaming (and therefore
# re-processing) the whole train split a second time.
streamed_train = list(tdc.main_ddict['train'])
assert len(streamed_train)==len(tdc2.main_ddict['train'])

# With equal lengths established, zip() pairs every streamed row with its counterpart
for a,b in zip(streamed_train,tdc2.main_ddict['train']):
    assert a==b

In [None]:
# check whether validation set is the same
# Only the train split is streamed; 'validation' is indexed like a regular map-style
# Dataset elsewhere in this notebook, so its length can be taken directly instead of
# materializing every row with list() first.
assert len(tdc.main_ddict['validation'])==len(tdc2.main_ddict['validation'])

# With equal lengths established, zip() pairs every row of both splits in order
for a,b in zip(tdc.main_ddict['validation'],tdc2.main_ddict['validation']):
    assert a==b

## Data Collator

In [None]:
from underthesea import text_normalize
from transformers import AutoTokenizer

### For masked language model

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Let's define our text controller first

In [None]:
# 90/10 split with a fixed seed; only the train part is turned into a streaming (iterable)
# dataset, while the held-out part is stored as 'validation'
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
ddict_with_val['validation'] = ddict_with_val.pop('test')

tdc = TextDataLMControllerStreaming(
    ddict_with_val,
    main_text='Review Text',
    filter_dict={'Review Text': lambda x: x is not None},
    metadatas=['Title','Division Name'],
    content_transformations=[text_normalize,str.lower],
    cols_to_keep=['Clothing ID','Review Text'],
    seed=42,
    batch_size=1024,
    verbose=False,
)

We will tokenize our corpus line-by-line

In [None]:
# Tokenize each record on its own.
# NOTE(review): max_length=-1 presumably disables padding/truncation, since the resulting
# token lists differ in length -- confirm against process_and_tokenize
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

In [None]:
# Attach a masked-LM data collator with a masking probability of 0.15
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

In [None]:
tdc.data_collator

DataCollatorForLanguageModeling(tokenizer=RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}, mlm=True, mlm_probability=0.15, pad_to_multiple_of=8, tf_exp

Before applying the collator...


In [None]:
# Print the first two tokenized train records together with their input_ids length
for idx,record in enumerate(tdc.main_ddict['train']):
    if idx>=2:
        break
    print(record)
    print(f"Length of input_ids: {len(record['input_ids'])}")
    print('-'*20)

{'Clothing ID': 841, 'Review Text': 'general petite . beautiful top, worth the necessary tailoring . The beautiful bold print drew me to this top and it did not disappoint upon receipt. however, the bottom ruffle belled so far out on each side that it was laughable! the actual fit is nothing like the picture; clearly the model\'s arms are placed in front of all the extra fabric to hold the ruffle back.\r\nhowever, the fabric is beautiful, the fit was perfect (size 2, 5\'4", 106 lbs.), the quality is great and i love the print so i decided to take it to my tailor to "sew away" the "wings" on both si', 'input_ids': [0, 15841, 4716, 1459, 479, 2721, 299, 2156, 966, 5, 2139, 7886, 5137, 479, 5, 2721, 7457, 5780, 4855, 162, 7, 42, 299, 8, 24, 222, 45, 17534, 2115, 18245, 479, 959, 2156, 5, 2576, 910, 15315, 28, 9970, 98, 444, 66, 15, 349, 526, 14, 24, 21, 38677, 27785, 5, 3031, 2564, 16, 1085, 101, 5, 2170, 25606, 2563, 5, 1421, 18, 3701, 32, 2325, 11, 760, 9, 70, 5, 1823, 10199, 7, 946, 5,

We can see that the token lists have different lengths

Let's apply the collator

In [None]:
# extract only the required keys
# Each record is reduced to the model inputs named by the tokenizer; any other
# column in the record is left out of the collator input
inp_keys = tokenizer.model_input_names
iter1 = iter(tdc.main_ddict['train'])
_result = []
while len(_result) < 5:
    _inp = next(iter1)
    _result.append({k: _inp[k] for k in inp_keys})
        

In [None]:
out = tdc.data_collator(_result)

In [None]:
out.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

Now all token lists have the same length, which is a multiple of 8

In [None]:
out['input_ids'].shape

torch.Size([5, 136])

In [None]:
out['input_ids'][:3,:]

tensor([[    0, 15841,  4716,  1459,   479,  2721,   299,  2156,   966, 50264,
          2139,  7886,  5137,   479,     5,  2721,  7457,  5780,  4855,   162,
             7,    42,   299,     8, 50264,   222,    45, 17534,  2115, 50264,
           479, 50264,  2156,     5,  2576,   910, 15315,    28,  9970,    98,
           444,    66,    15,   349,   526,    14,    24,    21, 38677, 27785,
         17138,  3031, 50264,    16,  1085,   101,     5,  2170, 25606,  2563,
             5,  1421,    18,  3701,    32,  2325,    11,   760,     9,    70,
             5,  1823, 10199, 50264, 29261, 50264,   910, 15315,   124,   479,
           959,  2156,     5, 10199,    16,  2721,  2156, 50264,  2564,    21,
          1969,    36,  1836,   132, 50264,   195,   128,   204,    22, 50264,
         13442, 23246,   479, 50264,  2156,     5, 50264,    16, 50264,     8,
           939,   657,     5, 50264,    98,   939,  1276, 50264,   185,    24,
             7,   127, 26090,     7,    22, 35043,  

The `labels` have also been constructed: the positions holding a value other than -100 mark the masked tokens that the model has to predict. To increase the number of masked tokens, increase `mlm_prob`

In [None]:
out['labels'][:3,:]

tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,     5,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,    24,  -100,  -100,  -100,  -100, 18245,
          -100,   959,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
             5,  -100,  2564,  -100,  -100,  -100,  -100,  -100,  -100,  2563,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,     7,   946,     5,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,     5,  -100,  -100,
          -100,  -100,  -100,  -100,  2156,  -100,  -100,  -100,  -100,  2156,
          -100,  -100,  -100,  4839,  -100,  -100,  1318,  -100,   372,  -100,
          -100,  -100,  -100,  5780,  -100,  -100,  -100,     7,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

If you apply padding in the tokenization step (by adjusting the `max_length` argument), no matter whether it's line-by-line tokenization or not, the data collator will skip the padding step

In [None]:
# 90/10 split with a fixed seed; only the train part is turned into a streaming (iterable)
# dataset, while the held-out part is stored as 'validation'
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
ddict_with_val['validation'] = ddict_with_val.pop('test')

tdc = TextDataLMControllerStreaming(
    ddict_with_val,
    main_text='Review Text',
    filter_dict={'Review Text': lambda x: x is not None},
    metadatas=['Title','Division Name'],
    content_transformations=[text_normalize,str.lower],
    cols_to_keep=['Clothing ID','Review Text'],
    seed=42,
    batch_size=1024,
    verbose=False,
)

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

In [None]:
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

Let's apply the collator

In [None]:
# extract only the required keys
# Each record is reduced to the model inputs named by the tokenizer
inp_keys = tokenizer.model_input_names
iter1 = iter(tdc.main_ddict['train'])
_result = []
while len(_result) < 5:
    _inp = next(iter1)
    _result.append({k: _inp[k] for k in inp_keys})

# Collate the five records into a single batch
out = tdc.data_collator(_result)

In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
out['input_ids'][:2,:]

tensor([[    0, 15841,  4716,  1459,   479,  2721,   299,  2156,   966, 50264,
          2139,  7886,  5137,   479,     5,  2721,  7457,  5780,  4855,   162,
             7,    42,   299,     8,    24,   222,    45, 17534,  2115, 50264,
           479, 50264,  2156,     5,  2576,   910, 15315,    28,  9970,    98,
           444,    66,    15,   349,   526,    14,    24,    21, 38677, 27785,
         50264,  3031, 50264,    16,  1085,   101,     5,  2170, 25606, 41316,
             5,  1421,    18,  3701,    32,  2325,    11,   760,     9,    70,
             5,  1823, 10199, 50264, 17204, 50264,   910, 15315,   124,   479,
           959,  2156,     5, 10199,    16,  2721,  2156, 50264,  2564,    21,
          1969,    36,  1836,   132, 50264,   195,   128,   204,    22, 50264],
        [13442, 23246,   479, 50264,  2156,     5, 50264,    16, 23781,     8,
           939,   657,     5,  5780,    98,   939,  1276, 50264,   185,    24,
             7,   127, 26090,     7,    22, 35043, 

In [None]:
out['labels'][:2,:]

tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,     5,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,    24,  -100,  -100,  -100,  -100, 18245,
          -100,   959,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
             5,  -100,  2564,  -100,  -100,  -100,  -100,  -100,  -100,  2563,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,     7,   946,     5,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,     5,  -100,  -100,
          -100,  -100,  -100,  -100,  2156,  -100,  -100,  -100,  -100,  2156],
        [ -100,  -100,  -100,  4839,  -100,  -100,  1318,  -100,   372,  -100,
          -100,  -100,  -100,  5780,  -100,  -100,  -100,     7,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100, 

Since we are using the concatenation-of-tokenization technique, one smart thing that HuggingFace's `DataCollatorForLanguageModeling` (which is the data collator we use) does is to allow masking at every position. This is as opposed to the previous case (with line-by-line tokenization), where there is no masking near the end of each token list, because those end tokens are padding tokens

### For causal language model

In [None]:
from transformers import AutoTokenizer
from tokenizers import processors

Let's define our GPT2 tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

GPT2 does not use start/end-of-sentence tokens:

In [None]:
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))

['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone']


If you want to perform concatenation-of-tokenization, and you want your causal LM to differentiate between sentences, you can add a special token to separate sentences, as follows:

In [None]:
# Make the tokenizer append the EOS token to every encoded text, so that sentence
# boundaries remain visible once the token lists are concatenated
_eos = tokenizer.eos_token
tokenizer._tokenizer.post_processor = processors.TemplateProcessing(
    single=f"$A {_eos}",
    special_tokens=[(_eos, tokenizer.eos_token_id)],
)
# The GPT2 tokenizer lists no pad token among its special tokens, so EOS doubles as padding
tokenizer.pad_token = _eos

In [None]:
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))

['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone', '<|endoftext|>']


With this modified tokenizer, let's perform concatenation-of-tokenization using GPT2

In [None]:
# 90/10 split with a fixed seed; only the train part is turned into a streaming (iterable)
# dataset, while the held-out part is stored as 'validation'
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
ddict_with_val['validation'] = ddict_with_val.pop('test')

tdc = TextDataLMControllerStreaming(
    ddict_with_val,
    main_text='Review Text',
    filter_dict={'Review Text': lambda x: x is not None},
    metadatas=['Title','Division Name'],
    content_transformations=[text_normalize,str.lower],
    seed=42,
    batch_size=1024,
    verbose=False,
)

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

Since it's causal language modeling, let's turn off `is_mlm`

In [None]:
# is_mlm=False: build the collator for causal LM, so no tokens are masked
tdc.set_data_collator(is_mlm=False)

Let's apply the collator

In [None]:
iter1 = iter(tdc.main_ddict['train'])
# Pull the first five streamed records and collate them into one batch
out = tdc.data_collator([next(iter1) for i in range(5)]) # simulation with batch size 5

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
out['input_ids'][:2,:]

tensor([[24622,  4273,   578,   764,  4950,  1353,   837,  2861,   262,  3306,
          7894,  3255,   764,   262,  4950, 10758,  3601,  9859,   502,   284,
           428,  1353,   290,   340,   750,   407,  6703,  2402, 14507,   764,
          2158,   837,   262,  4220,   374, 18137,   307,  3353,   523,  1290,
           503,   319,  1123,  1735,   326,   340,   373, 47623,  5145,   262,
          4036,  4197,   318,  2147,   588,   262,  4286,  2162,  4084,   262,
          2746,   338,  5101,   389,  4624,   287,  2166,   286,   477,   262,
          3131,  9664,   284,  1745,   262,   374, 18137,   736,   764,  2158,
           837,   262,  9664,   318,  4950,   837,   262,  4197,   373,  2818,
           357,  2546,   362,   837,   642,   705,   604,   366,   837, 15696],
        [15785,   764,  1267,   837,   262,  3081,   318,  1049,   290,  1312,
          1842,   262,  3601,   523,  1312,  3066,   284,  1011,   340,   284,
           616, 35280,   284,   366, 34249,  1497, 

In [None]:
out['labels'][:2,:]

tensor([[24622,  4273,   578,   764,  4950,  1353,   837,  2861,   262,  3306,
          7894,  3255,   764,   262,  4950, 10758,  3601,  9859,   502,   284,
           428,  1353,   290,   340,   750,   407,  6703,  2402, 14507,   764,
          2158,   837,   262,  4220,   374, 18137,   307,  3353,   523,  1290,
           503,   319,  1123,  1735,   326,   340,   373, 47623,  5145,   262,
          4036,  4197,   318,  2147,   588,   262,  4286,  2162,  4084,   262,
          2746,   338,  5101,   389,  4624,   287,  2166,   286,   477,   262,
          3131,  9664,   284,  1745,   262,   374, 18137,   736,   764,  2158,
           837,   262,  9664,   318,  4950,   837,   262,  4197,   373,  2818,
           357,  2546,   362,   837,   642,   705,   604,   366,   837, 15696],
        [15785,   764,  1267,   837,   262,  3081,   318,  1049,   290,  1312,
          1842,   262,  3601,   523,  1312,  3066,   284,  1011,   340,   284,
           616, 35280,   284,   366, 34249,  1497, 

For CLM, the `labels` are essentially the same as `input_ids`. From HuggingFace documentation:
```
`DataCollatorForLanguageModeling` will take care of creating the language model labels — in causal language modeling the inputs serve as labels too (just shifted by one element), and this data collator creates them on the fly during training.
```

## Save and Load TextDataLMControllerStreaming

In [None]:
show_doc(TextDataLMControllerStreaming.save_as_pickles)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main_lm_streaming.py#L57){target="_blank" style="float:right; font-size:smaller"}

### TextDataLMControllerStreaming.save_as_pickles

>      TextDataLMControllerStreaming.save_as_pickles (fname,
>                                                     parent='pickle_files')

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |

In [None]:
show_doc(TextDataLMControllerStreaming.from_pickle)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main_streaming.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextDataControllerStreaming.from_pickle

>      TextDataControllerStreaming.from_pickle (fname, parent='pickle_files')

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |

A TextDataLMControllerStreaming object can be saved and loaded with ease. This is especially useful after text processing and/or tokenization have been done

In [None]:
from datasets import disable_caching

In [None]:
disable_caching()

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# 90/10 split with a fixed seed; only the train part is turned into a streaming (iterable)
# dataset, while the held-out part is stored as 'validation'
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1,seed=42)
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
ddict_with_val['validation'] = ddict_with_val.pop('test')

tdc = TextDataLMControllerStreaming(
    ddict_with_val,
    main_text='Review Text',
    filter_dict={'Review Text': lambda x: x is not None},
    metadatas=['Title','Division Name'],
    content_transformations=[text_normalize,str.lower],
    seed=42,
    batch_size=1024,
    verbose=False,
)

In [None]:
# Tokenize each record on its own.
# NOTE(review): max_length=-1 presumably disables padding/truncation -- confirm
# against process_and_tokenize
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

# Attach a masked-LM data collator with a masking probability of 0.15
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

In [None]:
tdc.save_as_pickles('my_lm_tdc')

Load back our object

In [None]:
# Load through the streaming class, the same one that saved the pickle and whose
# from_pickle is documented above
tdc2 = TextDataLMControllerStreaming.from_pickle('my_lm_tdc')

You can still access all its attributes, data, preprocessings, transformations ...

In [None]:
tdc2.main_ddict

DatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 1
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 2253
    })
})

In [None]:
tdc2.filter_dict,tdc2.content_tfms

({'Review Text': <function __main__.<lambda>(x)>},
 [<function underthesea.pipeline.text_normalize.text_normalize(text, tokenizer='underthesea')>,
  <method 'lower' of 'str' objects>])

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()