# Text Main For Language Model - Streaming

> This module contains the main Python class for the **streaming** version of `TextDataLMController`


- skip_showdoc: true
- skip_exec: true

#| default_exp text_main_lm_streaming

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from datasets import DatasetDict,Dataset,IterableDataset
from pathlib import Path
from that_nlp_library.utils import *
from that_nlp_library.text_main import tokenize_function
from that_nlp_library.text_main_streaming import *
from functools import partial
import warnings
from transformers import DataCollatorForLanguageModeling

In [None]:
import pandas as pd
import numpy as np
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from importlib.machinery import SourceFileLoader
from datasets import load_dataset
import os

## Class TextDataLMControllerStreaming

In [None]:
#| export
class TextDataLMControllerStreaming(TextDataControllerStreaming):
    """Streaming text-data controller for language-model (MLM / CLM) training.

    The validation split (if any) is tokenized eagerly through `hf_map_dset`,
    while the train split is wrapped in a lazy `IterableDataset` generator that
    applies the content transformations and tokenization on the fly.
    Label handling, augmentation and test-set preparation inherited from the
    parent class are disabled because they do not apply to language modeling.
    """
    def __init__(self,
                 inp, # HuggingFace Dataset or DatasetDict
                 main_text:str, # Name of the main text column
                 filter_dict={}, # A dictionary: {feature: filtering_function_for_that_feature}
                 metadatas=[], # Names of the metadata columns
                 process_metas=True, # Whether to do simple text processing on the chosen metadatas
                 content_transformations=[], # A list of text transformations
                 seed=None, # Random seed
                 batch_size=1000, # CPU batch size
                 num_proc=1, # Number of process for multiprocessing
                 cols_to_keep=None, # Columns to keep after all processings
                 verbose=True, # Whether to print processing information
                ):
        """Forward every argument, unchanged, to `TextDataControllerStreaming.__init__`."""
        super().__init__(inp=inp,
                         main_text=main_text,
                         filter_dict=filter_dict,
                         metadatas=metadatas,
                         process_metas=process_metas,
                         content_transformations=content_transformations,
                         seed=seed,
                         batch_size=batch_size,
                         num_proc=num_proc,
                         cols_to_keep=cols_to_keep,
                         verbose=verbose
                        )

    def _do_label_transformation(self):
        """Disabled: language modeling has no classification/regression label."""
        raise NotImplementedError("There's no classification/regression label in text processing for Language Model")

    def _encode_labels(self):
        """Disabled: language modeling has no classification/regression label."""
        raise NotImplementedError("There's no classification/regression label in text processing for Language Model")

    def _do_transformation_augmentation_tokenization(self):
        """Disabled: there is no augmentation step for language modeling."""
        raise NotImplementedError("There's no augmentation in text processing for Language Model")

    def save_as_pickles(self,
                        fname, # Name of the pickle file
                        parent='pickle_files', # Parent folder
                       ):
        """Pickle this controller by delegating to `save_to_pickle`."""
        save_to_pickle(self,fname,parent=parent)

    def _group_texts_with_stride(self,examples):
        """Concatenate a batch of tokenized sequences and cut it into fixed-size chunks.

        `examples` maps each feature name to a list of per-example token lists.
        For every feature the lists are concatenated, then sliced into windows of
        `max_length` tokens (falling back to `tokenizer.model_max_length` when
        `self.max_length` is None). Consecutive windows start `max_length - stride`
        tokens apart, so `self.stride` is the number of overlapping tokens; a
        `None` stride means no overlap. A trailing remainder shorter than
        `max_length` is dropped.

        Raises `ValueError` when the window step would be zero or negative, since
        the chunking loop could never advance in that case.
        """
        max_length = self.max_length
        if max_length is None:
            max_length = self.tokenizer.model_max_length

        # Distance between the starts of two consecutive windows
        step = max_length if self.stride is None else max_length-self.stride
        if step==0: raise ValueError(f'Stride cannot be equal to max length of {max_length}')
        if max_length<0 or step<0:
            # A negative step would make the window start move backwards forever
            raise ValueError(f'Stride ({self.stride}) cannot be greater than max length of {max_length}, '
                             'and max length must be positive')

        # Concatenate all texts (linear-time flatten of each feature's token lists)
        concatenated_examples = {k: [tok for seq in examples[k] for tok in seq]
                                 for k in examples.keys()}
        if not concatenated_examples:
            return {}

        # Every feature is cut at the same offsets, based on the first feature's length
        total_length = len(next(iter(concatenated_examples.values())))
        starts = range(0, total_length-max_length+1, step)
        return {k: [t[i:i+max_length] for i in starts]
                for k,t in concatenated_examples.items()}

    def _do_transformation_tokenization(self,dtrain):
        """Apply content transformations and tokenization to a (non-generator) dataset.

        Each transformation in `self.content_tfms` is mapped over the main text
        column, then the text is tokenized. When `line_by_line` is False the
        tokenizer is called with `max_length=-1` and the resulting tokens are
        concatenated and re-chunked by `_group_texts_with_stride`.
        Returns the processed dataset.
        """
        for tfm in self.content_tfms:
            _func = partial(lambda_map_batch,
                            feature=self.main_text,
                            func=tfm,
                            is_batched=self.is_batched)
            dtrain = hf_map_dset(dtrain,_func,self.is_batched,self.batch_size,self.num_proc)

        tok_func = partial(tokenize_function,
                           tok=self.tokenizer,
                           max_length=self.max_length if self.line_by_line else -1,
                           return_special_tokens_mask=True
                          )
        _func = partial(lambda_map_batch,
                        feature=self.main_text,
                        func=tok_func,
                        output_feature=None,
                        is_batched=self.is_batched)

        dtrain = hf_map_dset(dtrain,_func,self.is_batched,self.batch_size,self.num_proc)
        # Drop the columns listed in `cols_to_keep` so only the tokenizer outputs remain
        dtrain = dtrain.remove_columns(self.cols_to_keep)

        if not self.line_by_line: # token concatenation
            dtrain = hf_map_dset(dtrain,
                                 self._group_texts_with_stride,
                                 is_batched=True,
                                 batch_size=self.batch_size if self.batch_size>1 else 1000,
                                 num_proc=self.num_proc)
        return dtrain

    def _construct_generator_with_batch(self,dset,text_name,tok_func,func):
        """Generator that lazily transforms and tokenizes the items of `dset`.

        With `line_by_line` enabled, one tokenized item is yielded per input row.
        Otherwise rows are buffered until `batch_size` texts are collected (1000
        when `batch_size<=1`); the buffer is tokenized, concatenated and chunked
        by `_group_texts_with_stride`, and the whole chunked batch is yielded as
        a single dict. A final partial buffer is flushed the same way.
        """
        batch_size = self.batch_size if self.batch_size>1 else 1000
        str_list=[]
        for inp in dset: # dset is generator
            # inp[text_name] will be a single item
            text = func(inp[text_name])
            if self.line_by_line:
                yield tok_func(text)
                continue

            str_list.append(text)
            if len(str_list)==batch_size:
                # tokenize, then token concatenation
                yield self._group_texts_with_stride(tok_func(str_list))
                str_list=[]

        if (not self.line_by_line) and str_list:
            # str_list length hasn't reached batch_size (last batch)
            yield self._group_texts_with_stride(tok_func(str_list))

    def _do_transformation_tokenization_generator(self):
        """Replace the train split with a lazy generator-backed `IterableDataset`."""
        tok_func = partial(tokenize_function,
                           tok=self.tokenizer,
                           max_length=self.max_length if self.line_by_line else -1,
                           return_special_tokens_mask=True
                          )

        all_tfms = self.content_tfms
        all_tfms = partial(func_all,functions=all_tfms) if len(all_tfms) else lambda x: x
        # `is not None` so that a seed of 0 is honoured as well
        if self.seed is not None:
            seed_everything(self.seed)

        self.main_ddict['train'] = IterableDataset.from_generator(self._construct_generator_with_batch,
                                                   gen_kwargs={'dset': self.main_ddict['train'],
                                                               'text_name':self.main_text,
                                                               'tok_func':tok_func,
                                                               'func': all_tfms
                                                              }
                                                                 )

    def process_and_tokenize(self,
                             tokenizer, # Tokenizer (preferably from HuggingFace)
                             max_length=None, # pad to model's allowed max length (default is max_sequence_length). Use -1 for no padding at all
                             line_by_line=True, # To whether tokenize each sentence separately, or concatenate them
                             stride=None, # option to do striding when line_by_line is False
                            ):
        """Run filtering, metadata processing, content transformation and tokenization.

        Results are stored in `self.main_ddict`. If the controller has already
        been processed, a warning is issued and the existing `main_ddict` is
        returned without doing any work; otherwise nothing is returned.
        """
        if self._processed_call:
            warnings.warn('Your dataset has already been processed. Returning the previous processed DatasetDict...')
            return self.main_ddict

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.line_by_line = line_by_line
        self.stride = stride

        # Filtering
        print_msg('Data Filtering',20,verbose=self.verbose)
        for k in self.main_ddict.keys():
            self.main_ddict[k] = self._do_filtering(self.main_ddict[k])
        self.verboseprint('Done')

        # Process metadatas
        print_msg('Metadata Simple Processing & Concatenating to Main Content',verbose=self.verbose)
        for k in self.main_ddict.keys():
            self.main_ddict[k] = self._process_metadatas(self.main_ddict[k])
        self.verboseprint('Done')

        # Dropping unused columns
        self._simplify_ddict()

        # Content transformation + tokenization for validation
        if 'validation' in self.main_ddict.keys():
            print_msg('Performing Content Transformation and Tokenization on Validation Set',verbose=self.verbose)
            self.main_ddict['validation'] = self._do_transformation_tokenization(self.main_ddict['validation'])
            self.verboseprint('Done')

        # Content transformation + tokenization for train
        print_msg('Creating a generator for content transformation and tokenization on Train set',verbose=self.verbose)
        self._do_transformation_tokenization_generator()
        self.verboseprint('Done')

        self._processed_call=True

    def set_data_collator(self,
                          is_mlm=True, # Is this masked language model (True) or causal language model (False)
                          mlm_prob=0.15, # Mask probability for masked language model
                         ):
        """Create `self.data_collator`, a `DataCollatorForLanguageModeling`.

        The collator is asked to pad (to a multiple of 8) only when tokenization
        was done with a negative `max_length`, i.e. without padding.
        Raises `ValueError` if the dataset has not been tokenized yet.
        """
        if not hasattr(self,'max_length'):
            raise ValueError("Please call `process_and_tokenize' or `do_tokenization` to tokenize your dataset")

        # `max_length` defaults to None in `process_and_tokenize`; comparing None<0 would raise TypeError
        pad_to_multiple_of_8 = self.max_length is not None and self.max_length<0 # get data collator to pad
        self.data_collator = DataCollatorForLanguageModeling(tokenizer=self.tokenizer,
                                                             mlm=is_mlm,
                                                             mlm_probability=mlm_prob,
                                                             pad_to_multiple_of=8 if pad_to_multiple_of_8 else None
                                                            )

    def prepare_test_dataset(self,
                             test_dset, # The HuggingFace Dataset as Test set
                             do_filtering=False, # whether to perform data filtering on this test set
                            ):
        """Disabled: there is no test-set preparation for language modeling."""
        raise NotImplementedError("There's no test set preparation for Language Model")

In [None]:
show_doc(TextDataLMControllerStreaming)

---

### TextDataLMControllerStreaming

>      TextDataLMControllerStreaming (inp, main_text:str, filter_dict={},
>                                     metadatas=[], process_metas=True,
>                                     content_transformations=[], seed=None,
>                                     batch_size=1000, num_proc=1,
>                                     cols_to_keep=None, verbose=True)

Initialize self.  See help(type(self)) for accurate signature.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| inp |  |  | HuggingFace Dataset or DatasetDict |
| main_text | str |  | Name of the main text column |
| filter_dict | dict | {} | A dictionary: {feature: filtering_function_for_that_feature} |
| metadatas | list | [] | Names of the metadata columns |
| process_metas | bool | True | Whether to do simple text processing on the chosen metadatas |
| content_transformations | list | [] | A list of text transformations |
| seed | NoneType | None | Random seed |
| batch_size | int | 1000 | CPU batch size |
| num_proc | int | 1 | Number of process for multiprocessing |
| cols_to_keep | NoneType | None | Columns to keep after all processings |
| verbose | bool | True | Whether to print processing information |

## Load data + Basic use case

Dataset source: https://www.kaggle.com/datasets/kavita5/review_ecommerce

With line-by-line tokenization

In [None]:
# Load the reviews CSV as a single 'train' split
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
# Hold out 10% of the rows; the held-out part is exposed under the 'test' key
ddict_with_val = dset.train_test_split(test_size=0.1)
# Rename 'test' -> 'validation' (the copy is made here, the old key is deleted below)
ddict_with_val['validation'] = ddict_with_val['test']
# Only the train split is turned into a streaming (iterable) dataset
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

In [None]:
ddict_with_val

DatasetDict({
    train: <datasets.iterable_dataset.IterableDataset object>
    validation: Dataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        num_rows: 2349
    })
})

In [None]:
# Build the streaming LM controller; rows whose 'Review Text' is None are filtered out
tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    main_text='Review Text',
                                    batch_size=100
                                   )

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True)

-------------------- Data Filtering --------------------


Filter:   0%|          | 0/2349 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing Content Transformation and Tokenization on Validation Set -----


Map:   0%|          | 0/2258 [00:00<?, ? examples/s]

Done
----- Creating a generator for content transformation and tokenization on Train set -----
Done


In [None]:
tdc.main_ddict

DatasetDict({
    train: <datasets.iterable_dataset.IterableDataset object>
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 2258
    })
})

In [None]:
# Print the first two tokenized validation examples (ids, decoded text, attention mask)
for i,v in enumerate(tdc.main_ddict['validation']):
    if i==2:break
    print(f"Input ids: {v['input_ids']}\nDecoded: {tokenizer.decode(v['input_ids'])}\nAttention Mask: {v['attention_mask']}")
    print('-'*10)
    

Input ids: [0, 19065, 23204, 111, 182, 1256, 8, 3279, 4, 939, 74, 224, 24, 10698, 1528, 7, 1836, 6, 53, 5, 2564, 1495, 73, 5827, 16, 95, 10, 410, 929, 906, 111, 95, 101, 15, 5, 1421, 4, 939, 2333, 185, 41, 3023, 29, 8, 5, 3023, 29, 2564, 372, 4, 50118, 627, 6694, 161, 7, 3841, 2382, 50, 865, 10397, 11, 2569, 4, 939, 460, 10397, 127, 24787, 24043, 11837, 11, 2569, 514, 6, 11, 10, 25916, 3298, 6, 15, 15651, 10397, 8, 33, 393, 56, 10, 936, 4, 50118, 4297, 6, 77, 939, 222, 14, 13, 42, 23204, 6, 24, 28704, 7, 10, 1836, 14, 74, 2564, 10, 204, 76, 793, 4, 5, 10199, 11, 5, 3701, 4889, 561, 6, 101, 23021, 23324, 4, 182, 3222, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# Print the first two items produced by the streaming train generator
for i,v in enumerate(tdc.main_ddict['train']):
    if i==2:break
    print(f"Input ids: {v['input_ids']}\nDecoded: {tokenizer.decode(v['input_ids'])}\nAttention Mask: {v['attention_mask']}")
    print('-'*10)
    

Input ids: [0, 133, 23204, 1468, 21, 14283, 4, 89, 58, 416, 80, 5916, 11, 5, 10199, 77, 24, 2035, 4, 939, 1299, 101, 24, 74, 45, 3568, 50, 10397, 157, 4, 939, 21, 67, 5779, 11, 5, 16383, 50, 141, 24, 10601, 15, 127, 809, 4, 939, 524, 3357, 5, 23204, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

With token concatenation

In [None]:
# Reload and re-split the data so the token-concatenation demo starts from a fresh DatasetDict
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
# Hold out 10% of the rows; the held-out part is exposed under the 'test' key
ddict_with_val = dset.train_test_split(test_size=0.1)
# Rename 'test' -> 'validation' (the copy is made here, the old key is deleted below)
ddict_with_val['validation'] = ddict_with_val['test']
# Only the train split is turned into a streaming (iterable) dataset
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

In [None]:
# Fresh controller for the token-concatenation run; rows whose 'Review Text' is None are filtered out
tdc = TextDataLMControllerStreaming(ddict_with_val,
                                    filter_dict={'Review Text': lambda x: x is not None},
                                    main_text='Review Text',
                                    batch_size=100
                                   )

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False)

-------------------- Data Filtering --------------------


Filter:   0%|          | 0/2349 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing Content Transformation and Tokenization on Validation Set -----


Map:   0%|          | 0/2255 [00:00<?, ? examples/s]

Map:   0%|          | 0/2255 [00:00<?, ? examples/s]

[0, 713, 16, 10, 372, 3588, 111, 2579, 3793, 10199, 8, 5, 1275, 16, 10, 2579, 6, 11577, 20406, 1275, 3195, 4, 1437, 939, 2740, 10, 650, 111, 939, 524, 195, 108, 306, 113, 8, 2333, 3568, 10, 204, 11, 7001, 4, 1437, 42, 3588, 21, 98, 1307, 111, 24, 429, 28, 21230, 3994, 196, 53, 16274, 10, 650, 4, 1437, 939, 40, 33, 7, 671, 24, 111, 45, 190, 127, 27595, 26194, 64, 146, 24, 2564, 328, 2, 0, 19065, 21183, 12778, 9304, 13, 1035, 19, 1256, 1236, 21625, 1120, 5780, 8, 19427, 21288, 4617, 23, 23385, 8, 13977, 1971, 4, 2333, 3568, 10, 231, 11, 6215, 53, 439, 62, 10, 1836, 7, 290, 19, 209, 13, 4600, 19780, 21183, 12778, 2564, 4, 10698, 127, 43159, 15481, 61, 16, 460, 10, 2704, 4, 64, 75, 2067, 7, 3568, 106, 328, 2, 0, 100, 1705, 75, 679, 141, 1769, 42, 3588, 1088, 66, 77, 51, 58, 519, 5, 1136, 6174, 131, 51, 56, 95, 342, 24, 15, 5, 998, 8, 24, 1088, 66, 11, 127, 1836, 11, 10, 948, 9, 722, 328, 1437, 939, 95, 829, 5, 3588, 452, 8, 269, 101, 5, 7708, 1571, 9, 24, 4, 1437, 47, 64, 2773, 3588, 24, 6

[0, 387, 12807, 42, 3588, 13, 127, 1354, 36, 13665, 6988, 43, 8, 24, 16, 1969, 4, 24, 16, 10, 11962, 3588, 6, 1969, 13, 1035, 1799, 6, 780, 7657, 4, 24, 1302, 7, 422, 10, 410, 650, 8, 1437, 14902, 15673, 2773, 6, 53, 81, 70, 6, 79, 8, 939, 258, 32, 1372, 19, 5, 3588, 4, 2, 0, 133, 2564, 6, 5, 5933, 6, 5, 3195, 6, 5, 10199, 734, 1395, 224, 932, 2430, 23, 70, 328, 5, 647, 11904, 23, 5, 1400, 147, 939, 1381, 106, 15, 26, 51, 56, 95, 5335, 106, 11, 6, 8, 79, 115, 45, 2067, 7, 192, 141, 51, 1415, 15, 4, 51, 58, 95, 25, 19267, 196, 25, 939, 21, 328, 5, 9304, 64, 2773, 28, 7001, 62, 50, 159, 7586, 5, 6219, 3195, 16, 98, 6, 98, 6, 2721, 328, 939, 74, 1057, 209, 7, 28, 2330, 5, 425, 13, 215, 10, 239, 1318, 16259, 4, 1437, 939, 1701, 24, 10, 25944, 847, 8, 64, 10610, 13, 2516, 7, 283, 4, 2, 0, 100, 300, 42, 299, 11, 5, 909, 8, 939, 657, 24, 4, 5, 2576, 16, 10, 11052, 4122, 1468, 14, 18, 269, 2721, 6, 8, 5, 847, 995, 1254, 32, 269, 1256, 4, 98, 171, 9, 6215, 18, 24043, 11837, 32, 7934, 6, 53, 42,

[0, 3750, 78, 939, 21, 551, 30, 5, 2721, 21789, 31466, 8, 5, 3493, 804, 2018, 99, 1382, 7, 28, 10, 7082, 53, 14216, 33585, 4, 5, 3493, 531, 33, 57, 10899, 19, 21321, 11, 5, 124, 142, 6, 11, 621, 6, 5, 6399, 10698, 101, 10, 20812, 10178, 4, 396, 38875, 6, 89, 16, 10, 10499, 9, 59, 545, 4877, 4, 50121, 50118, 50121, 50118, 627, 1468, 16, 202, 2721, 4, 8, 142, 9, 14, 6, 939, 2220, 75, 1276, 549, 939, 581, 671, 42, 6399, 50, 860, 3931, 66, 103, 7400, 10199, 7, 1045, 10, 16126, 6, 55, 15898, 3989, 4, 2, 0, 100, 1467, 77, 939, 2162, 42, 5378, 337, 3398, 14, 89, 21, 402, 780, 59, 24, 734, 627, 35727, 5073, 9, 5, 1521, 16, 269, 1256, 25, 16, 5, 2496, 734, 4297, 3640, 111, 358, 881, 86, 939, 3568, 42, 5378, 337, 3398, 66, 36, 6709, 7651, 19, 10, 11962, 299, 50, 3588, 43, 939, 120, 7741, 9, 33391, 31, 70, 9, 127, 964, 4, 939, 393, 802, 10, 5378, 337, 3398, 74, 9049, 98, 203, 1607, 328, 4364, 31, 14, 24, 16, 2422, 18300, 734, 118, 348, 56, 24, 13, 10, 76, 122, 6, 33, 15158, 24, 484, 498, 6, 8, 24

[0, 100, 2740, 42, 11, 5, 18521, 29868, 8, 524, 11, 657, 19, 5, 5780, 8, 3195, 6, 182, 27360, 396, 145, 81, 5, 299, 4, 50121, 50118, 118, 437, 195, 108, 306, 1297, 17072, 29882, 6, 2631, 102, 8, 5, 132, 2564, 1969, 4, 5, 847, 16, 385, 31766, 219, 396, 145, 21592, 101, 8, 939, 5478, 14, 18, 24, 45, 25, 251, 8, 455, 25, 171, 9, 5, 97, 13657, 66, 89, 6, 442, 24, 3013, 7, 3568, 19, 103, 9304, 50, 31296, 4, 24, 190, 34, 410, 11803, 7, 946, 110, 11689, 26123, 11, 317, 6, 1712, 939, 2980, 939, 74, 304, 167, 4, 42, 299, 16, 3901, 13, 143, 1046, 396, 546, 350, 25650, 352, 2, 0, 713, 3235, 16, 157, 156, 6, 8, 16, 98, 33800, 11962, 4, 939, 2333, 3568, 10, 231, 6, 53, 939, 524, 10, 1836, 290, 11, 42, 36, 398, 201, 1836, 6, 1712, 5, 1049, 6694, 16, 28410, 7085, 811, 1836, 4, 51, 2142, 66, 716, 15, 5, 201, 1836, 2740, 322, 5, 129, 696, 939, 33, 16, 14, 939, 33, 10, 251, 12, 1173, 28762, 1118, 7, 127, 1836, 6, 98, 24, 18, 655, 98, 2829, 3229, 11, 5, 42613, 6, 53, 45, 615, 7, 146, 162, 2142, 24, 124, 

[0, 16587, 42, 299, 328, 2, 0, 243, 18, 10, 1256, 299, 6, 53, 24, 1237, 182, 765, 4, 5, 124, 16, 67, 1256, 192, 149, 98, 939, 437, 45, 686, 939, 40, 489, 24, 4, 24, 18, 373, 160, 5, 4793, 53, 24, 16, 98, 3229, 77, 47, 860, 8, 888, 3568, 24, 160, 5, 4793, 4, 372, 1521, 6, 95, 45, 686, 939, 40, 489, 24, 528, 7, 5933, 2, 0, 100, 300, 42, 3588, 13, 127, 3240, 2761, 54, 18, 11, 5, 609, 9, 2086, 2408, 4, 79, 18, 855, 2498, 10, 1836, 316, 19, 5863, 929, 53, 64, 10930, 15020, 88, 2735, 10070, 6122, 15, 5, 1152, 4, 42, 3588, 16, 263, 16771, 6608, 1528, 7, 1836, 4, 23, 78, 18969, 24, 1326, 101, 10, 929, 219, 3588, 53, 5, 8725, 233, 9, 5, 3588, 992, 7418, 62, 588, 3229, 4, 25, 10, 898, 6, 5, 2233, 219, 299, 233, 9, 5, 3588, 630, 75, 356, 350, 380, 8, 16, 11, 754, 182, 34203, 4, 5, 3588, 16, 182, 11052, 4122, 8, 1326, 25, 239, 1318, 25, 63, 425, 4, 2, 0, 16587, 42, 23204, 4, 24, 64, 356, 3588, 219, 50, 95, 235, 19, 10844, 328, 122, 114, 24, 40, 95, 3035, 159, 939, 64, 3568, 24, 328, 2, 0, 713, 358

[0, 30327, 372, 6013, 4, 2564, 1237, 10, 410, 650, 114, 47, 32, 6764, 4, 2, 0, 27331, 10, 9869, 6399, 8, 190, 600, 24, 16, 1104, 6, 12741, 8, 15651, 24, 16, 45, 192, 12, 11672, 98, 47, 40, 45, 33, 7, 4022, 59, 2498, 41, 18983, 28517, 11, 5, 1035, 3778, 4, 9869, 299, 328, 2, 0, 347, 4467, 8, 3473, 328, 939, 2638, 5, 8089, 8, 141, 3793, 5, 6399, 16, 328, 2, 0, 16587, 42, 1198, 12949, 23204, 13, 1136, 4, 129, 4617, 939, 938, 75, 5373, 59, 21, 5, 1844, 3369, 2629, 11, 5, 2380, 6, 24, 4044, 293, 10, 410, 33442, 172, 198, 5, 7524, 53, 45, 615, 9, 10, 17447, 13, 162, 7, 2142, 24, 124, 4, 2, 0, 100, 657, 11642, 2225, 6, 8, 77, 939, 794, 42, 3588, 804, 6, 939, 1467, 14, 939, 56, 7, 33, 24, 4, 42, 3588, 16, 190, 39083, 906, 11, 621, 87, 24, 16, 11, 5, 2170, 4, 24, 18, 1256, 8, 15651, 6, 8, 64, 2773, 28, 7001, 62, 50, 159, 4, 939, 303, 42, 3588, 7, 2564, 1528, 7, 1836, 4, 24, 2299, 34, 14, 22, 29126, 19495, 3588, 113, 2564, 7, 24, 6, 53, 939, 657, 24, 328, 42, 16, 10, 3588, 14, 939, 581, 619, 272

In [None]:
# Print the first two concatenated-and-chunked validation examples
for i,v in enumerate(tdc.main_ddict['validation']):
    if i==2:break
    print(f"Input ids: {v['input_ids']}\nDecoded: {tokenizer.decode(v['input_ids'])}\nAttention Mask: {v['attention_mask']}")
    print('-'*10)
    

Input ids: [0, 100, 2740, 5, 1275, 3195, 169, 6, 8, 5, 5103, 5780, 16, 182, 11962, 4, 14, 26, 6, 24, 16, 847, 10, 12, 1902, 8, 45, 101, 10, 1675, 6148, 159, 4, 149, 939, 303, 24, 326, 1872, 8, 205, 1318, 6, 24, 21, 10941, 87, 939, 74, 33, 6640, 4, 42, 26044, 5, 10, 12, 1902, 3989, 4, 939, 399, 75, 2198, 28101, 24, 6, 53, 939, 399, 75, 657, 24, 1169, 4, 2, 0, 100, 2162, 42, 299, 13, 127, 3795, 13, 29224, 13738, 4, 939, 938, 75, 686, 114, 79, 74, 101, 24, 6, 53, 24, 16, 269, 2579, 11, 621, 8, 79, 2638, 24, 4, 24, 2564, 1528, 7, 1836, 8, 64, 28, 7001, 159, 6, 25, 24, 16, 15, 5, 1421, 6, 50, 7001, 62, 4, 939, 524, 98, 7785, 939, 1276, 7, 120, 42, 299, 4, 2200, 5940, 4, 2, 0, 11475, 42, 2037, 127, 2295, 6, 939, 1705, 75, 905, 24, 213, 4, 939, 1682, 25614, 2838, 24, 734, 22710, 24, 21, 350, 3214, 6, 1153, 24, 17414, 6, 50121, 50118, 15605, 35156, 6, 10, 936, 114, 47, 770, 7, 3529, 402, 4, 50121, 50118, 4297, 5, 3195, 1415, 1423, 22383, 98, 939, 1747, 2740, 24, 17220, 50121, 50118, 463, 939, 

In [None]:
# Pull items from the streaming train generator and stop at the second one;
# `v` stays bound after the loop and is inspected in the following cells.
for i,v in enumerate(tdc.main_ddict['train']):
    if i==1:break
#     print(f"Input ids: {v['input_ids']}")#\nDecoded: {tokenizer.decode(v['input_ids'])}\nAttention Mask: {v['attention_mask']}")
    print('-'*10)
    

[0, 43670, 3215, 42, 11, 1104, 4, 24, 16, 269, 2579, 8, 13350, 53, 182, 809, 31164, 8, 34262, 18116, 4, 2532, 24, 16, 127, 739, 9377, 6, 53, 24, 27401, 10, 828, 350, 203, 13, 127, 25896, 4, 648, 939, 206, 1374, 5, 1521, 16, 182, 2579, 98, 939, 524, 2396, 24, 8, 2818, 939, 64, 2217, 103, 2408, 11, 127, 18093, 4, 2, 0, 20930, 15, 5, 4281, 6173, 59, 2564, 6, 939, 9010, 7, 2229, 454, 939, 115, 860, 24, 15, 11, 5, 1400, 8, 21, 45, 5779, 4, 939, 437, 8020, 114, 89, 32, 430, 34183, 9, 42, 3588, 116, 939, 437, 195, 108, 406, 8, 5, 3588, 2564, 25, 11, 5, 1345, 4, 5, 526, 3369, 2629, 283, 1065, 127, 15145, 53, 45, 350, 239, 4, 3124, 6538, 32, 10, 828, 739, 98, 114, 47, 236, 55, 1953, 42, 189, 45, 28, 13, 47, 4, 5, 1468, 16, 23889, 8, 3793, 4, 5, 6353, 3195, 16, 12058, 4, 1836, 11036, 6, 24, 2299, 1326, 2233, 219, 8, 16377, 37359, 12, 3341, 114, 350, 7082, 4, 939, 524, 201, 257, 1250, 2, 0, 1185, 40, 657, 42, 1886, 6703, 4, 182, 157, 156, 19, 2721, 1254, 4, 5, 21764, 33, 5, 276, 4617, 25, 5, 1900

In [None]:
len(v['input_ids'])

14

In [None]:
len(v['input_ids'][0])

512

## Metadatas concatenation

If we think the metadata can be helpful, we can prepend it to the main text, so that our language model is aware of it.

In this example, let's add 'Title' as our metadata

In [None]:
# Read the reviews CSV into a DataFrame ('utf-8-sig' also strips a leading BOM if present)
df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
# NOTE(review): this uses `TextDataLMController` (not the streaming class defined in this
# notebook), and no explicit import of that name is visible in the cells above — confirm
# it is in scope or whether the streaming controller was intended here.
tdc = TextDataLMController.from_df(df,
                                   main_text='Review Text',
                                   filter_dict={'Review Text': lambda x: x is not None},
                                   metadatas='Title',
                                   process_metas=True, # to preprocess the metadata (currently it's just empty space stripping and lowercasing),
                                   seed=42
                                  )

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title              3810
Review Text         845
Division Name        14
Department Name      14
Class Name           14
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows


In [None]:
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----


Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----


Map (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 0, which is 0.00% of training set
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Done


In [None]:
ddict['train'][:3]

{'Title': ['not flattering on me', '', ''],
 'Review Text': ['not flattering on me . I ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5\'9" about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.',
  " . So unflattering! really disappointed. made me look 6 month pregnant and i'm a petite size 2.",
  ' . This t-shirt does a great job of elevating the basic t-shirt in to one with a touch of flair. i typically wear a medium but luckily read earlier reviews and went with the small.']}

In [None]:
ddict['validation'][:3]

{'Title': ['', '', ''],
 'Review Text': [" . This picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.",
  ' . Easy to wear! cute, comfy...will be a go to for summer.',
  ' . Nice sweater, just did not look good on me. sorry, going back.']}

## Content Transformation

This processing allows you to **alter the text content in your dataset**. You need to define a function that accepts a single string and returns a new, processed string. Note that this transformation will be applied to ALL of your dataset (both train and validation)

Let's say we want to normalize our text, because the text might contain some extra spaces between words, or not follow the "single space after a period" rule

In [None]:
_tmp = "This is a      sentence,which doesn't follow any rule!No single space is provided after period or punctuation marks.    Maybe there are too many spaces!?!   "

In [None]:
from underthesea import text_normalize

In [None]:
text_normalize(_tmp)

"This is a sentence , which doesn't follow any rule ! No single space is provided after period or punctuation marks . Maybe there are too many spaces ! ? !"

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=text_normalize,
                         seed=42
                        )

In [None]:
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----
Done
-------------------- Text Transformation --------------------
----- text_normalize -----
Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 1, which is 0.01% of training set
Filtering leaked data out of training set...


Filter (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Done


In [None]:
ddict['train']['Review Text'][0]

'I ordered this online and was disappointed with the fit when it arrived . i ordered the xs and it was still oversize to the point of being unflattering . i am tall 5 \' 9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape . if you like a loose fit this might be for you . the material is thicker and warm and comfortable . i would suggest ordering down a size .'

In [None]:
ddict['validation']['Review Text'][0]

"This picture doesn't do the skirt justice . i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt . it is really pretty and flattering on ."

You can chain multiple functions. Let's say after text normalizing, I want to lowercase the text

In [None]:
str.lower('tHis IS NoT lowerCASE')

'this is not lowercase'

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42
                        )

In [None]:
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----
Done
-------------------- Text Transformation --------------------
----- text_normalize -----
----- lower -----
Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 1, which is 0.01% of training set
Filtering leaked data out of training set...


Filter (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Done


In [None]:
ddict['train']['Review Text'][0]

'i ordered this online and was disappointed with the fit when it arrived . i ordered the xs and it was still oversize to the point of being unflattering . i am tall 5 \' 9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape . if you like a loose fit this might be for you . the material is thicker and warm and comfortable . i would suggest ordering down a size .'

In [None]:
ddict['validation']['Review Text'][0]

"this picture doesn't do the skirt justice . i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt . it is really pretty and flattering on ."

## Tokenization

Define our tokenization

In [None]:
from transformers import RobertaTokenizer
from underthesea import text_normalize

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

### Option 1: Tokenize our corpus line-by-line

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

With no padding

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 18111
    })
    validation: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 4529
    })
})

In [None]:
print(tokenizer.decode(tdc.main_ddict['train']['input_ids'][0]))
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s>
<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s>


With padding (set `max_length` to `None` if you want to pad to model's maximum sequence length)

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
print(tokenizer.decode(tdc.main_ddict['train']['input_ids'][0]))
print(tokenizer.decode(tdc.main_ddict['validation']['input_ids'][0]))

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


### Option 2: Tokenize every text, then concatenate them together before splitting them in smaller parts.


In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False,
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 13573
    })
    validation: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 3446
    })
})

In [None]:
for i in tdc.main_ddict['train']['input_ids'][:3]:
    print(tokenizer.decode(i))
    print('-'*100)

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made
----------------------------------------------------------------------------------------------------
 me look 6 month pregnant and i'm a petite size 2.</s><s>i love rompers and this one is really cute. i usually wear size 12 but should have got a 10, it runs big. it seems too long, and i'm 5'9 ". the prints cute but a little blah. i paid $ 158 which is too much, since i haven't worn it yet, i should have waited for it to go on sale.</s><s>... the print is so
------------------------------------------------------------------------------------------

In [None]:
for i in tdc.main_ddict['validation']['input_ids'][:3]:
    print(tokenizer.decode(i))
    print('-'*100)

<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s><s>easy to wear! cute, comfy... will be a go to for summer.</s><s>nice sweater, just did not look good on me. sorry, going back.</s><s>this jacket was a little shorter than i had expected, but i still really enjoy the cut and fit of it
----------------------------------------------------------------------------------------------------
.</s><s>i wasn't planning on loving this dress when i tried it on. i loved the the color which is what prompted me to buy it. this dress fit perfectly. it hugs my body without feeling tight. the ruching is perfect. i didn't want to take it off! it's also very comfortable. i'm 5'1 ", 107 lbs and the xs petite fit perfectly. the dress hits me at the same length that is pictured. i think it would
--------------------------------------------------------------------------------------------

### Striding (For Concatenation of tokens)

If your sentences (or paragraphs) are longer than `max_length`, after concatenation, they will be broken apart; your long paragraph will be incomplete in terms of meaning. **Striding** is a way to somewhat preserve the sentence's meaning, by getting part of the sentence back. We will demonstrate it with an example, and you can compare it with the previous one (without striding) to see the differences

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False,
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,stride=20)
# Stride is 20, meaning for the next entry, we go back 20 tokens

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
for i in tdc.main_ddict['train']['input_ids'][:3]:
    print(tokenizer.decode(i))
    print('-'*100)

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made
----------------------------------------------------------------------------------------------------
 comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made me look 6 month pregnant and i'm a petite size 2.</s><s>i love rompers and this one is really cute. i usually wear size 12 but should have got a 10, it runs big. it seems too long, and i'm 5'9 ". the prints cute but a little blah. i paid $ 158 which is too much, since i haven't worn it
----------------------------------------------------------------

For the second entry, we can see it starts with the last 20 tokens of the previous entry: `comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made`

In [None]:
for i in tdc.main_ddict['validation']['input_ids'][:3]:
    print(tokenizer.decode(i))
    print('-'*100)

<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s><s>easy to wear! cute, comfy... will be a go to for summer.</s><s>nice sweater, just did not look good on me. sorry, going back.</s><s>this jacket was a little shorter than i had expected, but i still really enjoy the cut and fit of it
----------------------------------------------------------------------------------------------------
 was a little shorter than i had expected, but i still really enjoy the cut and fit of it.</s><s>i wasn't planning on loving this dress when i tried it on. i loved the the color which is what prompted me to buy it. this dress fit perfectly. it hugs my body without feeling tight. the ruching is perfect. i didn't want to take it off! it's also very comfortable. i'm 5'1 ", 107 lbs and the xs pet
---------------------------------------------------------------------------------------------

## Data Collator

In [None]:
from underthesea import text_normalize
from transformers import AutoTokenizer

### For masked language model

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Let's define our text controller first

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

We will tokenize our corpus line-by-line

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

In [None]:
tdc.data_collator

DataCollatorForLanguageModeling(tokenizer=RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True), mlm=True, mlm_probability=0.15, pad_to_multiple_of=8, tf_experimental_compile=False, return_tensors='pt')

Before applying the collator...


In [None]:
print([tdc.main_ddict['train'][i] for i in range(2)])

[{'input_ids': [0, 118, 2740, 42, 804, 8, 21, 5779, 19, 5, 2564, 77, 24, 2035, 479, 939, 2740, 5, 3023, 29, 8, 24, 21, 202, 81, 10799, 7, 5, 477, 9, 145, 29747, 24203, 479, 939, 524, 6764, 195, 128, 361, 22, 59, 8325, 2697, 8, 33, 10, 5342, 7174, 28762, 8, 356, 275, 11, 21543, 29, 14, 33, 103, 3989, 479, 114, 47, 101, 10, 7082, 2564, 42, 429, 28, 13, 47, 479, 5, 1468, 16, 33997, 8, 3279, 8, 3473, 479, 939, 74, 3608, 12926, 159, 10, 1836, 479, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

We can see that the token lists have different lengths

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[91, 24, 79, 82, 121]

Let's apply the collator

In [None]:
out = tdc.data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
out.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

Now all token lists have the same length, which is 128: a multiple of 8 and larger than the longest list in the batch (which is 121)

In [None]:
out['input_ids'].shape

torch.Size([5, 128])

In [None]:
out['input_ids'][:2,:]

tensor([[    0,  8496,  2740,    42,   804,     8, 50264,  5779,    19,     5,
          2564, 50264,    24, 50264,   479, 50264,  2740,     5, 50264,    29,
             8,    24,    21,   202,    81, 50264,     7,     5, 50264,     9,
           145, 29747, 24203,   479,   939,   524,  6764, 50264,   128,   361,
            22,    59,  8325,  2697,     8,    33,    10, 50264,  7174, 28762,
         42013,   356,   275,    11, 21543,    29,    14,    33,   103,  3989,
           479,   114,    47,   101,    10,  7082, 50264,    42,   429,    28,
            13,    47,   479,     5,  1468,    16, 33997,     8,  3279,     8,
          3473,   479,   939,    74,  3608, 12926, 50264,    10,  1836,   479,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

The `labels` have also been constructed; they mark the "mask" positions (the non -100 values), which are the tokens the model has to predict. To increase the number of masked tokens, increase `mlm_prob`

In [None]:
out['labels'][:2,:]

tensor([[ -100,   118,  -100,  -100,  -100,  -100,    21,  -100,  -100,  -100,
          -100,    77,  -100,  2035,  -100,   939,  -100,  -100,  3023,  -100,
          -100,  -100,  -100,  -100,  -100, 10799,  -100,  -100,   477,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,   195,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  5342,  -100,  -100,
             8,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  2564,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,   159,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

If you apply padding in the tokenization step (by adjusting the `max_length` argument), no matter whether it's line-by-line tokenization or not, the data collator will skip the padding step

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[100, 100, 100, 100, 100]

Let's apply the collator

In [None]:
out = tdc.data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
out['input_ids'][:2,:]

tensor([[    0,   118,  2740,    42, 50264,     8,    21,  5779,    19,     5,
          2564,    77,    24,  2035,   479,   939,  2740,     5,  3023,    29,
             8,    24,    21,   202,    81, 10799,     7,     5, 40728,     9,
         50264, 29747, 24203, 50264,   939,   524, 50264,   195,   128,   361,
            22, 50264,  8325,  2697,     8, 50264,    10,  5342,  7174, 28762,
             8,   356,   275,    11, 21543,    29,    14,    33,   103,  3989,
           479,   114, 50264,   101,    10,  7082,  2564,    42,   429,    28,
         50264,    47,   479,     5,  1468,    16, 33997,     8, 50264,     8,
         50264,   479, 50264,    74,  3608, 12926,   159,    10,  1836,   479,
             2,     0,  2527, 29747, 50264, 27785,   269,  5779,   479,   156],
        [  162,   356,   231,   353,  5283,     8,   939,   437, 50264,  4716,
          1459,  1836,   132,   479,     2,     0,   118,   657,   910,  7474,
           268,     8,    42,    65,    16,   269, 

In [None]:
out['labels'][:2,:]

tensor([[ -100,  -100,  -100,  -100,   804,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,    24,  -100,  -100,  -100,  -100,  -100,  -100,   477,  -100,
           145,  -100,  -100,   479,  -100,  -100,  6764,  -100,  -100,  -100,
          -100,    59,  -100,  -100,  -100,    33,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,    47,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
            13,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  3279,  -100,
          3473,  -100,   939,    74,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100, 24203,  -100,  -100,  -100,  -100,  -100],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,    10,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,    42,  -100,  -100,  -100, 

Since we are using the concatenation-of-tokenization technique, one smart thing that HuggingFace's `DataCollatorForLanguageModeling` (which is the data collator we use) does is to allow masking at every position, as opposed to the previous case (with line-by-line tokenization), where there's no masking near the end of each list because those end tokens are padding tokens

### For causal language model

In [None]:
from transformers import AutoTokenizer
from tokenizers import processors

Let's define our GPT2 tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

GPT2 does not use start/end-of-sentence token:

In [None]:
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))

['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone']


If you want to perform concatenation-of-token, and you want your causal LM to differentiate between sentences, you can add a special token to separate sentences, as follows:

In [None]:
tokenizer._tokenizer.post_processor = processors.TemplateProcessing(
    single="$A " + tokenizer.eos_token,
    special_tokens=[(tokenizer.eos_token, tokenizer.eos_token_id)],
)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))

['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone', '<|endoftext|>']


With this modified tokenizer, let's perform concatenation-of-tokenization using GPT2

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Since it's causal language modeling, let's turn off `is_mlm`

In [None]:
tdc.set_data_collator(is_mlm=False)

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[100, 100, 100, 100, 100]

Let's apply the collator

In [None]:
out = tdc.data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
out['input_ids'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764, 50256,
           568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804],
        [  718,  1227, 10423,   290,  1312,  1101,   257,  4273,   578,  2546,
           362,   764, 50256,    72,  1842,   374,  3361,   364,   290,   428,
           530,   318,  1107, 13779,   764,  1312, 

In [None]:
out['labels'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764,  -100,
           568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804],
        [  718,  1227, 10423,   290,  1312,  1101,   257,  4273,   578,  2546,
           362,   764,  -100,    72,  1842,   374,  3361,   364,   290,   428,
           530,   318,  1107, 13779,   764,  1312, 

For CLM, the `labels` are essentially the same as `input_ids`. From HuggingFace documentation:
```
`DataCollatorForLanguageModeling` will take care of creating the language model labels — in causal language modeling the inputs serve as labels too (just shifted by one element), and this data collator creates them on the fly during training.
```

## Save and Load TextDataController

In [None]:
show_doc(TextDataLMController.save_as_pickles)

---

### TextDataLMController.save_as_pickles

>      TextDataLMController.save_as_pickles (fname, parent='pickle_files')

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |

In [None]:
show_doc(TextDataLMController.from_pickle)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextDataController.from_pickle

>      TextDataController.from_pickle (fname, parent='pickle_files')

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |

TextDataLMController object can be saved and loaded with ease. This is especially useful after text processing and/or tokenization have been done

In [None]:
from datasets import disable_caching

In [None]:
# Turn off the HuggingFace `datasets` caching mechanism for the cells below
disable_caching()

In [None]:
# Tokenizer loaded from the pretrained 'roberta-base' checkpoint
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Read the reviews CSV from the 'sample_data' directory as the 'train' split
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
# Build the LM data controller, using 'Review Text' as the main text column
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None}, # filtering function for 'Review Text': value is not None
                         content_transformations=[text_normalize,str.lower], # text transformations; presumably applied in list order -- confirm
                         seed=42, # random seed
                         verbose=False
                        )

In [None]:
# Run text processing and tokenization with `tokenizer`.
# NOTE(review): line_by_line=True presumably tokenizes each review as its own example,
# and max_length=-1 presumably means "no length limit" -- confirm against process_and_tokenize
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

# Configure the data collator in MLM mode (is_mlm=True) with mlm_prob=0.15
tdc.set_data_collator(is_mlm=True,mlm_prob=0.15)

Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
# Pickle the controller under the name 'my_lm_tdc' (parent folder defaults to 'pickle_files')
tdc.save_as_pickles('my_lm_tdc')

Load the saved object back:

In [None]:
# Load the controller from the 'my_lm_tdc' pickle (parent folder defaults to 'pickle_files')
tdc2 = TextDataLMController.from_pickle('my_lm_tdc')

You can still access all of its attributes, data, preprocessing steps, transformations, and so on.

In [None]:
tdc2.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 18111
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 4529
    })
})

In [None]:
tdc2.filter_dict,tdc2.content_tfms

({'Review Text': <function __main__.<lambda>(x)>},
 [<function underthesea.pipeline.text_normalize.text_normalize(text, tokenizer='underthesea')>,
  <method 'lower' of 'str' objects>])

In [None]:
# #| hide
# import nbdev; nbdev.nbdev_export()