# Text Main

> This module contains the main Python class for data control: `TextDataLMController`

- skip_showdoc: true
- skip_exec: true

####| default_exp text_main

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from datasets import DatasetDict,Dataset,IterableDataset,load_dataset,concatenate_datasets
from pathlib import Path
from that_nlp_library.utils import *
from that_nlp_library.text_main import *
from functools import partial
import warnings

In [None]:
import pandas as pd
import numpy as np
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from importlib.machinery import SourceFileLoader
import os

## Class TextDataLMController

In [None]:
#| export
class TextDataLMController(TextDataController):
    """
    Controller that prepares a text dataset for language-model training.

    Subclass of `TextDataController`: the constructor arguments are forwarded unchanged,
    the label-related steps (label transformation/encoding, upsampling, augmentation and
    test-set preparation) raise `NotImplementedError`, and tokenization can optionally
    concatenate the tokenized texts and cut them into fixed-length chunks.
    """
    def __init__(self,
                 inp, # HuggingFace Dataset or DatasetDict
                 main_text:str, # Name of the main text column
                 filter_dict={}, # A dictionary: {feature: filtering_function_for_that_feature}
                 metadatas=[], # Names of the metadata columns
                 process_metas=True, # Whether to do simple text processing on the chosen metadatas
                 content_transformations=[], # A list of text transformations
                 val_ratio:int|float|None=0.2, # Ratio of data for validation set
                 stratify_cols=[], # Column(s) needed to do stratified shuffle split
                 seed=None, # Random seed
                 batch_size=1000, # CPU batch size
                 num_proc=4, # Number of process for multiprocessing
                 cols_to_keep=None, # Columns to keep after all processings
                 verbose=True, # Whether to print processing information
                ):
        # NOTE: the mutable defaults are kept so the public signature stays unchanged;
        # they are only forwarded to the parent class and never mutated in this class.
        super().__init__(inp=inp,
                         main_text=main_text,
                         filter_dict=filter_dict,
                         metadatas=metadatas,
                         process_metas=process_metas,
                         content_transformations=content_transformations,
                         val_ratio=val_ratio,
                         stratify_cols=stratify_cols,
                         seed=seed,
                         batch_size=batch_size,
                         num_proc=num_proc,
                         cols_to_keep=cols_to_keep,
                         verbose=verbose
                        )

    def _do_label_transformation(self):
        """Not supported: a language-model dataset has no label column."""
        raise NotImplementedError("There's no classification/regression label in text processing for Language Model")

    def _encode_labels(self):
        """Not supported: a language-model dataset has no label column."""
        raise NotImplementedError("There's no classification/regression label in text processing for Language Model")

    def _upsampling(self):
        """Not supported for language-model data."""
        raise NotImplementedError("There's no upsampling in text processing for Language Model")

    def _do_augmentation(self):
        """Not supported for language-model data."""
        raise NotImplementedError("There's no text augmentation in text processing for Language Model")

    def _do_train_shuffling(self):
        """Shuffle the train split with `self.seed`, then flatten its indices."""
        print_msg('Shuffling and flattening train set',20,verbose=self.verbose)
        self.main_ddict['train'] = self.main_ddict['train'].shuffle(seed=self.seed).flatten_indices(num_proc = self.num_proc)
        self.verboseprint('Done')

    def _get_group_step(self):
        """
        Validate `self.max_length` and `self.stride`, and return the distance between
        the starts of two consecutive chunks (`max_length` minus the overlap `stride`).

        Raises `ValueError` when `max_length` is missing or not positive, or when the
        resulting step is not positive (a chunk window that never advances).
        """
        max_length = self.max_length
        if max_length is None or max_length<=0:
            raise ValueError(f'A positive max_length is required to group texts, got {max_length}')
        step = max_length if self.stride is None else max_length-self.stride
        if step<=0:
            # A zero or negative step would never move forward through the tokens
            raise ValueError(f'Stride ({self.stride}) must be smaller than max length of {max_length}')
        return step

    def _group_texts_with_stride(self,examples):
        """
        Concatenate every column of a batch and cut it into chunks of `self.max_length`.

        `examples` maps each column name to a list of lists (one inner list per row).
        Consecutive chunks overlap by `self.stride` items (no overlap when it is None).
        Trailing items that do not fill a whole chunk are dropped. The number of chunks
        is derived from the first column's concatenated length and reused for all columns.
        """
        step = self._get_group_step()
        max_length = self.max_length

        columns = list(examples.keys())
        if not columns: return {}

        # Concatenate all texts. A flat comprehension is linear, unlike sum(list_of_lists, [])
        concatenated_examples = {k: [item for row in examples[k] for item in row] for k in columns}
        total_length = len(concatenated_examples[columns[0]])

        # Start offsets of every chunk that fits entirely inside the concatenated sequence
        starts = range(0,total_length-max_length+1,step)
        return {k: [t[i:i+max_length] for i in starts] for k,t in concatenated_examples.items()}

    def do_all_preprocessing(self,
                             shuffle_trn=True # To shuffle the train set before tokenization
                            ):
        """
        Run filtering, metadata processing, content transformation, train/validation split,
        column simplification and validation-leak check, then optionally shuffle the train set.

        Returns `self.main_ddict`. If preprocessing has already run, a warning is emitted
        and the previously processed DatasetDict is returned unchanged.
        """
        if self._processed_call:
            warnings.warn('Your dataset has already been processed. Returning the previous processed DatasetDict...')
            return self.main_ddict

        print_msg('Start Main Text Processing',20,verbose=self.verbose)

        # Filtering
        self.dset,self.ddict_rest = self._do_filtering(self.dset,self.ddict_rest)

        # Process metadatas
        self.dset,self.ddict_rest = self._process_metadatas(self.dset,self.ddict_rest)

        # Content transformation
        self.dset,self.ddict_rest = self._do_transformation(self.dset,self.ddict_rest)

        # Train Test Split.
        ### self.main_ddict is created here
        self._train_test_split()

        # Dropping unused columns
        self._simplify_ddict()

        # Check validation leaking
        self._check_validation_leaking()

        # Shuffle train
        if shuffle_trn:
            self._do_train_shuffling()

        self._processed_call=True

        return self.main_ddict

    def do_tokenization(self,
                        tokenizer, # Tokenizer (preferably from HuggingFace)
                        max_length=None, # pad to model's allowed max length (default is max_sequence_length). Use -1 for no padding at all
                        line_by_line=True, # To whether tokenize each sentence separately, or concatenate them
                        stride=None, # option to do striding when line_by_line is False
                        trn_size=None, # The number of training data to be tokenized
                       ):
        """
        Tokenize every split of `self.main_ddict` and return it.

        When `line_by_line` is False, the tokenized rows are concatenated and cut into
        chunks of `max_length` (overlapping by `stride`); in that mode `max_length` must be
        a positive integer and `stride` must be smaller than it, otherwise `ValueError` is raised.
        `trn_size` subsamples the train split before tokenization: a float in (0, 1] is treated
        as a fraction, an integer as an approximate number of rows.
        """
        # References
        # https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py
        # https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py

        print_msg('Tokenization',20,verbose=self.verbose)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.line_by_line = line_by_line
        self.stride = stride

        if not line_by_line:
            # Fail fast on unusable grouping parameters, before any dataset is modified
            self._get_group_step()

        tok_func = partial(tokenize_function,tok=self.tokenizer,
                           max_length=max_length if line_by_line else -1,
                           return_special_tokens_mask=True)

        _func = partial(lambda_map_batch,
                        feature=self.main_text,
                        func=tok_func,
                        output_feature=None,
                        is_batched=self.is_batched)

        if trn_size is not None:
            if isinstance(trn_size,float):
                if not 0<trn_size<=1:
                    raise ValueError(f'A float trn_size must be in the range (0, 1], got {trn_size}')
                num_shard = int(1/trn_size)
            else: # int
                if trn_size<=0:
                    raise ValueError(f'An integer trn_size must be positive, got {trn_size}')
                trn_len=len(self.main_ddict['train'])
                num_shard = trn_len//trn_size
            # Asking for more rows than available would give 0 shards; keep the whole train set instead
            num_shard = max(num_shard,1)
            self.main_ddict['train'] = self.main_ddict['train'].shard(num_shard,0)

        for k in self.main_ddict.keys():
            self.main_ddict[k] = hf_map_dset(self.main_ddict[k],_func,self.is_batched,self.batch_size,self.num_proc)
            # Drop the columns listed in cols_to_keep after tokenizing; the grouping step below
            # concatenates every remaining column, so each one must hold lists
            self.main_ddict[k] = self.main_ddict[k].remove_columns(self.cols_to_keep)

        if not line_by_line: # string concatenation
            for k in self.main_ddict.keys():
                self.main_ddict[k] = hf_map_dset(self.main_ddict[k],
                                                 self._group_texts_with_stride,
                                                 is_batched=True,
                                                 batch_size=self.batch_size if self.batch_size>1 else 1000,
                                                 num_proc=self.num_proc)

        self.verboseprint('Done')
        return self.main_ddict

    def process_and_tokenize(self,
                             tokenizer, # Tokenizer (preferably from HuggingFace)
                             max_length=None, # pad to model's allowed max length (default is max_sequence_length)
                             line_by_line=True, # To whether tokenize each sentence separately, or concatenate them and then tokenize
                             stride=None, # option to do striding when line_by_line is False
                             trn_size=None, # The number of training data to be tokenized
                             shuffle_trn=True, # To shuffle the train set before tokenization
                            ):
        """
        This will perform `do_all_preprocessing` then `do_tokenization`.

        Nothing is returned; the result is available as `self.main_ddict`.
        """
        _ = self.do_all_preprocessing(shuffle_trn)
        _ = self.do_tokenization(tokenizer,max_length,line_by_line,stride,trn_size)

    def set_data_collator(self):
        """Placeholder: currently only sets `self.data_collator` to None."""
        # TODO
        self.data_collator = None

    def prepare_test_dataset(self,
                             test_dset, # The HuggingFace Dataset as Test set
                             do_filtering=False, # whether to perform data filtering on this test set
                            ):
        """Not supported for language-model data."""
        raise NotImplementedError("There's no test set preparation for Language Model")

In [None]:
show_doc(TextDataLMController)

---

### TextDataLMController

>      TextDataLMController (inp, main_text:str, filter_dict={}, metadatas=[],
>                            process_metas=True, content_transformations=[],
>                            val_ratio:int|float|None=0.2, stratify_cols=[],
>                            seed=None, batch_size=1000, num_proc=4,
>                            cols_to_keep=None, verbose=True)

Initialize self.  See help(type(self)) for accurate signature.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| inp |  |  | HuggingFace Dataset or DatasetDict |
| main_text | str |  | Name of the main text column |
| filter_dict | dict | {} | A dictionary: {feature: filtering_function_for_that_feature} |
| metadatas | list | [] | Names of the metadata columns |
| process_metas | bool | True | Whether to do simple text processing on the chosen metadatas |
| content_transformations | list | [] | A list of text transformations |
| val_ratio | int \| float \| None | 0.2 | Ratio of data for validation set |
| stratify_cols | list | [] | Column(s) needed to do stratified shuffle split |
| seed | NoneType | None | Random seed |
| batch_size | int | 1000 | CPU batch size |
| num_proc | int | 4 | Number of process for multiprocessing |
| cols_to_keep | NoneType | None | Columns to keep after all processings |
| verbose | bool | True | Whether to print processing information |

## Load data + Basic use case

In [None]:
show_doc(TextDataLMController.from_csv)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextDataController.from_csv

>      TextDataController.from_csv (file_path, **kwargs)

In [None]:
show_doc(TextDataLMController.from_df)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextDataController.from_df

>      TextDataController.from_df (df, validate=True, **kwargs)

You can create a `TextDataLMController` from a csv, pandas DataFrame, or directly from a HuggingFace dataset object. Currently, `TextDataLMController` is designed for processing text in order to train a language model


Dataset source: https://www.kaggle.com/datasets/kavita5/review_ecommerce

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')

In [None]:
df.shape

(23486, 10)

In [None]:
df.sample(5) 

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
214,1020,67,Not as pictured!,The skirt that i received had very little blue...,1,0,17,General Petite,Bottoms,Skirts
12051,1092,53,Sweet and pretty,I found this to be a very feminine and flatter...,5,1,4,General,Dresses,Dresses
7895,866,37,,Love love love this mock tank!!! goes fabulous...,5,1,0,General Petite,Tops,Knits
3158,862,40,Perfect closet staple,This is the best! i have the black and striped...,5,1,0,General,Tops,Knits
19859,881,41,Not so bad!,"I actually like this top, but i am wearing it ...",4,1,5,General,Tops,Knits


You can create a `TextDataLMController` from a dataframe. This also provides a quick input validation check (NaN check and Duplication check)

In [None]:
tdc = TextDataLMController.from_df(df,main_text='Review Text')

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title              3810
Review Text         845
Division Name        14
Department Name      14
Class Name           14
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows


You can also create a `TextDataLMController` directly from the csv file. The good thing about using HuggingFace Dataset as the main backend  is that you can utilize lots of its useful functionality, such as caching

In [None]:
tdc = TextDataLMController.from_csv('sample_data/Womens_Clothing_Reviews.csv',main_text='Review Text')

You can also create a `TextDataLMController` from a HuggingFace Dataset

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
dset

Dataset({
    features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
    num_rows: 23486
})

In [None]:
tdc = TextDataLMController(dset,main_text='Review Text')

In the "Input Validation Precheck" above, we notice that our dataset has missing values in the text field (`Review Text`) and in other columns such as `Department Name`. For now, let's load the data as a Pandas DataFrame, perform some cleaning, and create our `TextDataLMController`

In [None]:
df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')

In [None]:
df = df[(~df['Review Text'].isna()) & (~df['Department Name'].isna())].reset_index(drop=True)

In [None]:
tdc = TextDataLMController.from_df(df,main_text='Review Text')

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title    2966
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 1 rows


At this point you can start performing 2 important steps on your data

1. Text preprocessings + Train/Validation Split
2. Tokenization

In [None]:
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 1, which is 0.01% of training set
Filtering leaked data out of training set...


Filter (num_proc=4):   0%|          | 0/18102 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18101 [00:00<?, ? examples/s]

Done


In [None]:
ddict

DatasetDict({
    train: Dataset({
        features: ['Review Text'],
        num_rows: 18101
    })
    validation: Dataset({
        features: ['Review Text'],
        num_rows: 4526
    })
})

Our DatasetDict now has two splits: train and validation. As the output above shows, both splits are regular `Dataset` objects, so they can be indexed and sliced directly

In [None]:
ddict['train'][:3]

{'Review Text': ['Very feminine, fun and if you like this style i highly recommend. i am a 6/8 and both fit the same so i went with the 6- 36c bust 29 waist-short torso. fabric nice and light and i need no cami! very attractive blouse and very retailer.',
  "For me this is the perfect blouse. alone or with a cardigan it's a winner. not too low cut, armholes are perfect. i would buy other colors if they were available.",
  "Love this blouse. per the previous review, there was no problem with my tailoring. it is exactly as shown. i plan to handwash this and take super good care of it. you can tell from the picture it could be an excellent wardrobe staple and absolutely timeless as well as the kind of sexy and elegant you can get away with in the office.\n\ni do think it runs a tiny bit small. however, it is not short. it's just quite a fitted piece. i would consider sizing up. i love the look of the medium on me, 5'10"]}

In [None]:
ddict['validation'][:3]

{'Review Text': ['The quality of the fabric these are made from is wonderful- they are soft yet structured and i was so bummed they didn\'t fit me. i usually wear a size 28 in premium denim so i ordered a 28 and i could barely pull these up over my hips. once i pulled them on i was about 1-2 inches from being able to zip them closed. the pants fit in the legs but they were very close-fitting, which is not the look i\'m going for. i am 5\'5" and was worried they would look too long since usually pants are for me, but',
  'I bought the one with blue and white top and black bottom. absolutely loved it! i am 5"2 so the dress is part is a bit long for me---ended up wearing my 4.5 inch heels to make it fit. the fabric is more heavy than what it seems in the pictures. but that makes it more formal. it\'s great!',
  'I had been looking for a tee to wear under a kimono i bought last year at retailer. the cut out slub tee was the perfect tee. the slate color goes well with the colors in the kimo

## Filtering

This preprocessing step allows you to filter out certain values of a certain column in your dataset. Let's say I want to filter out any None value in the column 'Review Text'

In [None]:
df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
df[(~df['Review Text'].isna())].isna().sum()

Clothing ID                   0
Age                           0
Title                      2966
Review Text                   0
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                13
Department Name              13
Class Name                   13
dtype: int64

We will provide a dictionary containing the name of the column and the filtering function to apply on that column. Note that **the filtering function will receive an item from the column, and the function should return a boolean**

In [None]:
tdc = TextDataLMController.from_df(df,
                                 main_text='Review Text',
                                 filter_dict={'Review Text': lambda x: x is not None},
                                 seed=42
                                )

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title              3810
Review Text         845
Division Name        14
Department Name      14
Class Name           14
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows


In [None]:
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----


Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 1, which is 0.01% of training set
Filtering leaked data out of training set...


Filter (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Done


In [None]:
ddict

DatasetDict({
    train: Dataset({
        features: ['Review Text'],
        num_rows: 18111
    })
    validation: Dataset({
        features: ['Review Text'],
        num_rows: 4529
    })
})

Let's check if we have filtered out all NaN/None value

In [None]:
for i in ddict['train']['Review Text']:
    assert i is not None
for i in ddict['validation']['Review Text']:
    assert i is not None

We can even add multiple filtering functions. Remember from our precheck, there are also None values in 'Department Name'. While we are at it, let's filter out any rating that is less than 3 (just to showcase what our filtering can do)

In [None]:
df.Rating.value_counts()

Rating
5    13131
4     5077
3     2871
2     1565
1      842
Name: count, dtype: int64

Note that `TextDataLMController` will only keep the text and the metadatas columns; any other column will be dropped. To double-check our result, we need to define the `cols_to_keep` argument

In [None]:
df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
tdc = TextDataLMController.from_df(df,
                                   main_text='Review Text',
                                   filter_dict={'Review Text': lambda x: x is not None,
                                                'Department Name': lambda x: x is not None,
                                                'Rating': lambda x: x>=3
                                               },
                                   cols_to_keep=['Review Text','Rating','Department Name'],
                                   seed=42
                                  )

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title              3810
Review Text         845
Division Name        14
Department Name      14
Class Name           14
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows


In [None]:
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----


Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

----- Do <lambda> on Department Name -----


Filter (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

----- Do <lambda> on Rating -----


Filter (num_proc=4):   0%|          | 0/22628 [00:00<?, ? examples/s]

Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 1, which is 0.01% of training set
Filtering leaked data out of training set...


Filter (num_proc=4):   0%|          | 0/16206 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/16205 [00:00<?, ? examples/s]

Done


In [None]:
# No missing department may remain in either split after filtering
for split in ('train','validation'):
    for i in ddict[split]['Department Name']:
        assert i is not None

# The `Rating >= 3` filter must hold in both splits, not just validation
for split in ('train','validation'):
    for i in ddict[split]['Rating']:
        assert i >= 3

## Metadatas concatenation

If we think metadatas can be helpful, we can concatenate them to the front of our text, so that our language model is aware of them.

In this example, let's add 'Title' as our metadata

In [None]:
df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
tdc = TextDataLMController.from_df(df,
                                   main_text='Review Text',
                                   filter_dict={'Review Text': lambda x: x is not None},
                                   metadatas='Title',
                                   process_metas=True, # to preprocess the metadata (currently it's just empty space stripping and lowercasing),
                                   seed=42
                                  )

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title              3810
Review Text         845
Division Name        14
Department Name      14
Class Name           14
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows


In [None]:
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----


Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----


Map (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 0, which is 0.00% of training set
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Done


In [None]:
ddict['train'][:3]

{'Title': ['not flattering on me', '', ''],
 'Review Text': ['not flattering on me . I ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5\'9" about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.',
  " . So unflattering! really disappointed. made me look 6 month pregnant and i'm a petite size 2.",
  ' . This t-shirt does a great job of elevating the basic t-shirt in to one with a touch of flair. i typically wear a medium but luckily read earlier reviews and went with the small.']}

In [None]:
ddict['validation'][:3]

{'Title': ['', '', ''],
 'Review Text': [" . This picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.",
  ' . Easy to wear! cute, comfy...will be a go to for summer.',
  ' . Nice sweater, just did not look good on me. sorry, going back.']}

## Content Transformation

This processing allows you to **alter the text content in your dataset**. You need to define a function that accepts a single string and returns a new, processed string. Note that this transformation will be applied to ALL of your dataset (both train and validation)

Let's say we want to normalize our text, because the text might contain some extra spaces between words, or not follow the "single space after a period" rule

In [None]:
_tmp = "This is a      sentence,which doesn't follow any rule!No single space is provided after period or punctuation marks.    Maybe there are too many spaces!?!   "

In [None]:
from underthesea import text_normalize

In [None]:
text_normalize(_tmp)

"This is a sentence , which doesn't follow any rule ! No single space is provided after period or punctuation marks . Maybe there are too many spaces ! ? !"

In [None]:
# Load the raw reviews CSV as a single 'train' split
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text', # name of the main text column
                         filter_dict={'Review Text': lambda x: x is not None}, # keep only rows that have a review text
                         content_transformations=text_normalize, # a single function is passed here; a list of functions is also accepted
                         seed=42 # random seed
                        )

In [None]:
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----
Done
-------------------- Text Transformation --------------------
----- text_normalize -----
Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 1, which is 0.01% of training set
Filtering leaked data out of training set...


Filter (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Done


In [None]:
# Index the row first, then the column, so only one example is read
ddict['train'][0]['Review Text']

'I ordered this online and was disappointed with the fit when it arrived . i ordered the xs and it was still oversize to the point of being unflattering . i am tall 5 \' 9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape . if you like a loose fit this might be for you . the material is thicker and warm and comfortable . i would suggest ordering down a size .'

In [None]:
# Index the row first, then the column, so only one example is read
ddict['validation'][0]['Review Text']

"This picture doesn't do the skirt justice . i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt . it is really pretty and flattering on ."

You can chain multiple functions. Let's say after text normalizing, I want to lowercase the text

In [None]:
str.lower('tHis IS NoT lowerCASE')

'this is not lowercase'

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42
                        )

In [None]:
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----
Done
-------------------- Text Transformation --------------------
----- text_normalize -----
----- lower -----
Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 1, which is 0.01% of training set
Filtering leaked data out of training set...


Filter (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Done


In [None]:
# Index the row first, then the column, so only one example is read
ddict['train'][0]['Review Text']

'i ordered this online and was disappointed with the fit when it arrived . i ordered the xs and it was still oversize to the point of being unflattering . i am tall 5 \' 9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape . if you like a loose fit this might be for you . the material is thicker and warm and comfortable . i would suggest ordering down a size .'

In [None]:
# Index the row first, then the column, so only one example is read
ddict['validation'][0]['Review Text']

"this picture doesn't do the skirt justice . i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt . it is really pretty and flattering on ."

## Train/Validation Split

There are several ways to perform a train/validation split with `TextDataLMController`

The first way is when you already have a validation split in your HuggingFace Dataset. Let's use the Dataset's built-in method `train_test_split` to simulate this

In [None]:
# Simulate a dataset that already ships with a validation split: hold out 10% of the rows
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1)
# `train_test_split` names the held-out split 'test'; move it under the 'validation' key
ddict_with_val['validation'] = ddict_with_val.pop('test')

In [None]:
ddict_with_val

DatasetDict({
    train: Dataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        num_rows: 21137
    })
    validation: Dataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'],
        num_rows: 2349
    })
})

In [None]:
tdc = TextDataLMController(ddict_with_val,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         seed=42
                        )
ddict = tdc.do_all_preprocessing(shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----


Filter (num_proc=4):   0%|          | 0/21137 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/2349 [00:00<?, ? examples/s]

Done
-------------------- Train Test Split --------------------
Validation split already exists
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 1, which is 0.00% of training set
Filtering leaked data out of training set...


Filter (num_proc=4):   0%|          | 0/20359 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/20358 [00:00<?, ? examples/s]

Done


In [None]:
ddict

DatasetDict({
    train: Dataset({
        features: ['Review Text'],
        num_rows: 20358
    })
    validation: Dataset({
        features: ['Review Text'],
        num_rows: 2282
    })
})

A second way is to split randomly based on a ratio (a float between 0 and 1), or based on the number of examples to put in your validation set (an integer)

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         val_ratio=0.15,
                         seed=42,
                         verbose=False
                        )
ddict = tdc.do_all_preprocessing(shuffle_trn=True)
ddict

Filter (num_proc=4):   0%|          | 0/19244 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/19243 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Review Text'],
        num_rows: 19243
    })
    validation: Dataset({
        features: ['Review Text'],
        num_rows: 3397
    })
})

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         val_ratio=5000,
                         seed=42,
                         verbose=False
                        )
ddict = tdc.do_all_preprocessing(shuffle_trn=True)
ddict

Filter (num_proc=4):   0%|          | 0/17641 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/17640 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Review Text'],
        num_rows: 17640
    })
    validation: Dataset({
        features: ['Review Text'],
        num_rows: 5000
    })
})

A third way is to do a random stratified split (inspired by [sklearn's `train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)). Let's do a stratified split based on our label 'Department Name'

In [None]:
df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')

In [None]:
df['Department Name'].value_counts(normalize=True)

Department Name
Tops        0.445978
Dresses     0.269214
Bottoms     0.161852
Intimate    0.073918
Jackets     0.043967
Trend       0.005070
Name: proportion, dtype: float64

In [None]:
tdc = TextDataLMController.from_df(df,
                                 main_text='Review Text',
                                 # drop rows that are missing either the review text or the department
                                 filter_dict={'Review Text': lambda x: x is not None,
                                              'Department Name': lambda x: x is not None,
                                             },
                                 val_ratio=0.2, # ratio of data for the validation set
                                 stratify_cols='Department Name', # column used for the stratified shuffle split
                                 cols_to_keep=['Review Text','Department Name'], # columns kept after all processing
                                 seed=42
                                )
ddict = tdc.do_all_preprocessing(shuffle_trn=True)
ddict

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title              3810
Review Text         845
Division Name        14
Department Name      14
Class Name           14
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows
-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----


Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

----- Do <lambda> on Department Name -----


Filter (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio, with stratifying


Map (num_proc=4):   0%|          | 0/22628 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/22628 [00:00<?, ? examples/s]

Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 2, which is 0.01% of training set
Filtering leaked data out of training set...


Filter (num_proc=4):   0%|          | 0/18102 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18100 [00:00<?, ? examples/s]

Done


DatasetDict({
    train: Dataset({
        features: ['Review Text', 'Department Name'],
        num_rows: 18100
    })
    validation: Dataset({
        features: ['Review Text', 'Department Name'],
        num_rows: 4526
    })
})

In [None]:
pd.Series(ddict['train']['Department Name']).value_counts(normalize=True)

Tops        0.444033
Dresses     0.271602
Bottoms     0.161878
Intimate    0.072983
Jackets     0.044309
Trend       0.005193
Name: proportion, dtype: float64

In [None]:
pd.Series(ddict['validation']['Department Name']).value_counts(normalize=True)

Tops        0.444101
Dresses     0.271542
Bottoms     0.161732
Intimate    0.073133
Jackets     0.044189
Trend       0.005303
Name: proportion, dtype: float64

You can also use multiple columns for your stratification

In [None]:
tdc = TextDataLMController.from_df(df,
                                 main_text='Review Text',
                                 filter_dict={'Review Text': lambda x: x is not None,
                                              'Department Name': lambda x: x is not None,
                                             },
                                 val_ratio=0.2,
                                 stratify_cols=['Department Name','Rating'],
                                 cols_to_keep=['Review Text','Department Name','Rating'],
                                 seed=42,
                                 verbose=False
                                )
ddict = tdc.do_all_preprocessing(shuffle_trn=True)
ddict

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title              3810
Review Text         845
Division Name        14
Department Name      14
Class Name           14
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows


Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/22628 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/22628 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/18102 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/18100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Review Text', 'Rating', 'Department Name'],
        num_rows: 18100
    })
    validation: Dataset({
        features: ['Review Text', 'Rating', 'Department Name'],
        num_rows: 4526
    })
})

And finally, you can omit any validation split if you specify `val_ratio` as `None`

In [None]:
tdc = TextDataLMController.from_df(df,
                                 main_text='Review Text',
                                 filter_dict={'Review Text': lambda x: x is not None},
                                 val_ratio=None,
                                 seed=42
                                )
ddict = tdc.do_all_preprocessing(shuffle_trn=True)
ddict

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title              3810
Review Text         845
Division Name        14
Department Name      14
Class Name           14
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 21 rows
-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----


Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

Done
-------------------- Train Test Split --------------------
No validation split defined
Done
-------------------- Dropping unused features --------------------
Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Done


DatasetDict({
    train: Dataset({
        features: ['Review Text'],
        num_rows: 22641
    })
})

## Tokenization

Define our tokenizer

In [None]:
from transformers import RobertaTokenizer
from underthesea import text_normalize

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

### Option 1: Tokenize our corpus line-by-line

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

With no padding

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 18111
    })
    validation: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 4529
    })
})

In [None]:
# Row-first indexing fetches a single example instead of the whole 'input_ids' column
print(tokenizer.decode(tdc.main_ddict['train'][0]['input_ids']))
print(tokenizer.decode(tdc.main_ddict['validation'][0]['input_ids']))

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s>
<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s>


With padding (set `max_length` to `None` if you want to pad to the model's maximum sequence length)

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
# Row-first indexing fetches a single example instead of the whole 'input_ids' column
print(tokenizer.decode(tdc.main_ddict['train'][0]['input_ids']))
print(tokenizer.decode(tdc.main_ddict['validation'][0]['input_ids']))

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


### Option 2: Tokenize every text, then concatenate them together before splitting them into smaller parts.


In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False,
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 13573
    })
    validation: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 3446
    })
})

In [None]:
# Slice the rows first so only the first 3 examples are read, then decode each chunk
for token_ids in tdc.main_ddict['train'][:3]['input_ids']:
    print(tokenizer.decode(token_ids))
    print('-'*100)

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made
----------------------------------------------------------------------------------------------------
 me look 6 month pregnant and i'm a petite size 2.</s><s>i love rompers and this one is really cute. i usually wear size 12 but should have got a 10, it runs big. it seems too long, and i'm 5'9 ". the prints cute but a little blah. i paid $ 158 which is too much, since i haven't worn it yet, i should have waited for it to go on sale.</s><s>... the print is so
------------------------------------------------------------------------------------------

In [None]:
# Slice the rows first so only the first 3 examples are read, then decode each chunk
for token_ids in tdc.main_ddict['validation'][:3]['input_ids']:
    print(tokenizer.decode(token_ids))
    print('-'*100)

<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s><s>easy to wear! cute, comfy... will be a go to for summer.</s><s>nice sweater, just did not look good on me. sorry, going back.</s><s>this jacket was a little shorter than i had expected, but i still really enjoy the cut and fit of it
----------------------------------------------------------------------------------------------------
.</s><s>i wasn't planning on loving this dress when i tried it on. i loved the the color which is what prompted me to buy it. this dress fit perfectly. it hugs my body without feeling tight. the ruching is perfect. i didn't want to take it off! it's also very comfortable. i'm 5'1 ", 107 lbs and the xs petite fit perfectly. the dress hits me at the same length that is pictured. i think it would
--------------------------------------------------------------------------------------------

### Striding (For Concatenation of tokens)

If your sentences (or paragraphs) are longer than `max_length`, they will be broken apart after concatenation, so a long paragraph can be cut off in the middle and lose part of its meaning. **Striding** is a way to somewhat preserve the sentence's meaning, by repeating the end of the previous chunk at the start of the next one. We will demonstrate it with an example, and you can compare it with the previous one (without striding) to see the differences

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False,
                        )

In [None]:
# stride=20: each new entry starts by repeating the last 20 tokens of the previous entry
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,stride=20)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
# Slice the rows first so only the first 3 examples are read, then decode each chunk
for token_ids in tdc.main_ddict['train'][:3]['input_ids']:
    print(tokenizer.decode(token_ids))
    print('-'*100)

<s>i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made
----------------------------------------------------------------------------------------------------
 comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made me look 6 month pregnant and i'm a petite size 2.</s><s>i love rompers and this one is really cute. i usually wear size 12 but should have got a 10, it runs big. it seems too long, and i'm 5'9 ". the prints cute but a little blah. i paid $ 158 which is too much, since i haven't worn it
----------------------------------------------------------------

For the second entry, we can see it starts with the last 20 tokens of the previous entry: `comfortable. i would suggest ordering down a size.</s><s>so unflattering! really disappointed. made`

In [None]:
# Slice the rows first so only the first 3 examples are read, then decode each chunk
for token_ids in tdc.main_ddict['validation'][:3]['input_ids']:
    print(tokenizer.decode(token_ids))
    print('-'*100)

<s>this picture doesn't do the skirt justice. i paired it with a creme colored cashmere cowlneck sweater and a silver jeweled belt. it is really pretty and flattering on.</s><s>easy to wear! cute, comfy... will be a go to for summer.</s><s>nice sweater, just did not look good on me. sorry, going back.</s><s>this jacket was a little shorter than i had expected, but i still really enjoy the cut and fit of it
----------------------------------------------------------------------------------------------------
 was a little shorter than i had expected, but i still really enjoy the cut and fit of it.</s><s>i wasn't planning on loving this dress when i tried it on. i loved the the color which is what prompted me to buy it. this dress fit perfectly. it hugs my body without feeling tight. the ruching is perfect. i didn't want to take it off! it's also very comfortable. i'm 5'1 ", 107 lbs and the xs pet
---------------------------------------------------------------------------------------------

## Data Collator

In [None]:
from transformers import DataCollatorForLanguageModeling

In [None]:
from underthesea import text_normalize

### For masked language model

In [None]:
from transformers import RobertaTokenizer


In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

With line-by-line, no padding

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
# Line-by-line rows were tokenized without padding (variable lengths), so ask the
# collator to round each padded batch length up to a multiple of 8
pad_to_multiple_of_8 = True

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True, # masked language modeling
                                                mlm_probability=0.15, # probability of masking a token
                                                pad_to_multiple_of=8 if pad_to_multiple_of_8 else None, # None: no rounding of the padded length
                                               )

In [None]:
tdc.main_ddict['train']

Dataset({
    features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 18111
})

In [None]:
# Token counts of the first 5 rows (slice the rows before selecting the column)
[len(token_ids) for token_ids in tdc.main_ddict['train'][:5]['input_ids']]

[91, 24, 79, 82, 121]

In [None]:
print([tdc.main_ddict['train'][i] for i in range(2)])

[{'input_ids': [0, 118, 2740, 42, 804, 8, 21, 5779, 19, 5, 2564, 77, 24, 2035, 479, 939, 2740, 5, 3023, 29, 8, 24, 21, 202, 81, 10799, 7, 5, 477, 9, 145, 29747, 24203, 479, 939, 524, 6764, 195, 128, 361, 22, 59, 8325, 2697, 8, 33, 10, 5342, 7174, 28762, 8, 356, 275, 11, 21543, 29, 14, 33, 103, 3989, 479, 114, 47, 101, 10, 7082, 2564, 42, 429, 28, 13, 47, 479, 5, 1468, 16, 33997, 8, 3279, 8, 3473, 479, 939, 74, 3608, 12926, 159, 10, 1836, 479, 2], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
out = data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

In [None]:
out.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
out['input_ids'].shape

torch.Size([5, 128])

In [None]:
out['labels'][:2,:]

tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  2740,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,   361,
          -100,  -100,  -100,  2697,  -100,    33,  -100,  -100,  7174, 28762,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100, 33997,  -100,  -100,     8,
          -100,  -100,   939,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

With line-by-line, padding

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
pad_to_multiple_of_8 = False

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15,
                                                pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
                                               )

In [None]:
# Token counts of the first 5 rows (slice the rows before selecting the column)
[len(token_ids) for token_ids in tdc.main_ddict['train'][:5]['input_ids']]

[100, 100, 100, 100, 100]

In [None]:
print([tdc.main_ddict['train'][i] for i in range(2)])

[{'input_ids': [0, 118, 2740, 42, 804, 8, 21, 5779, 19, 5, 2564, 77, 24, 2035, 479, 939, 2740, 5, 3023, 29, 8, 24, 21, 202, 81, 10799, 7, 5, 477, 9, 145, 29747, 24203, 479, 939, 524, 6764, 195, 128, 361, 22, 59, 8325, 2697, 8, 33, 10, 5342, 7174, 28762, 8, 356, 275, 11, 21543, 29, 14, 33, 103, 3989, 479, 114, 47, 101, 10, 7082, 2564, 42, 429, 28, 13, 47, 479, 5, 1468, 16, 33997, 8, 3279, 8, 3473, 479, 939, 74, 3608, 12926, 159, 10, 1836, 479, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
out = data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

In [None]:
out.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
out['labels'][:2,:]

tensor([[ -100,   118,  2740,  -100,  -100,  -100,    21,  -100,  -100,  -100,
          -100,  -100,  -100,  2035,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100, 10799,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  2697,     8,  -100,  -100,  -100,  -100, 28762,
          -100,  -100,   275,  -100,  -100,    29,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  2564,  -100,  -100,    28,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  3279,  -100,
          -100,  -100,  -100,    74,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [ -100,  2527, 29747, 24203,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,   939,  -100,  -100,  -100,  -100,
          1836,   132,  -100,  -100,  -100,  -100, 

Without line-by-line

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False,
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,stride=20)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
pad_to_multiple_of_8 = False

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15,
                                                pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
                                               )

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[100, 100, 100, 100, 100]

In [None]:
print([tdc.main_ddict['train'][i] for i in range(2)])

[{'input_ids': [0, 118, 2740, 42, 804, 8, 21, 5779, 19, 5, 2564, 77, 24, 2035, 479, 939, 2740, 5, 3023, 29, 8, 24, 21, 202, 81, 10799, 7, 5, 477, 9, 145, 29747, 24203, 479, 939, 524, 6764, 195, 128, 361, 22, 59, 8325, 2697, 8, 33, 10, 5342, 7174, 28762, 8, 356, 275, 11, 21543, 29, 14, 33, 103, 3989, 479, 114, 47, 101, 10, 7082, 2564, 42, 429, 28, 13, 47, 479, 5, 1468, 16, 33997, 8, 3279, 8, 3473, 479, 939, 74, 3608, 12926, 159, 10, 1836, 479, 2, 0, 2527, 29747, 24203, 27785, 269, 5779, 479, 156], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
out = data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

In [None]:
out.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
out['labels'][:2,:]

tensor([[ -100,  -100,  2740,  -100,  -100,  -100,    21,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  2740,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,    81, 10799,  -100,  -100,  -100,  -100,
          -100,  -100, 24203,  -100,  -100,  -100,  -100,  -100,   128,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,   275,  -100, 21543,    29,  -100,  -100,  -100,  3989,
          -100,  -100,    47,   101,    10,  -100,  -100,  -100,   429,  -100,
          -100,    47,  -100,  -100,  -100,  -100,  -100,     8,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100, 24203, 27785,   269,  5779,  -100,  -100],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100, 

### For causal language model

In [None]:
from transformers import AutoTokenizer
from tokenizers import processors

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [None]:
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))

['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone']


In [None]:
# Replace the tokenizer's post-processor so that every single-sequence encoding
# ("$A") is followed by the EOS token; compare the token lists printed before and
# after this cell.
tokenizer._tokenizer.post_processor = processors.TemplateProcessing(
    single="$A " + tokenizer.eos_token,
    special_tokens=[(tokenizer.eos_token, tokenizer.eos_token_id)],
)
# The tokenizer repr above lists no pad token, so reuse EOS for padding.
# NOTE(review): this makes pad and EOS share the same id, so anything that masks
# "pad" positions by token id will also mask real EOS tokens.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
print(tokenizer.convert_ids_to_tokens(tokenizer("this is a text. That is a second text.But there's a third one")['input_ids']))

['this', 'Ġis', 'Ġa', 'Ġtext', '.', 'ĠThat', 'Ġis', 'Ġa', 'Ġsecond', 'Ġtext', '.', 'But', 'Ġthere', "'s", 'Ġa', 'Ġthird', 'Ġone', '<|endoftext|>']


With line-by-line, no padding (Note: GPT2 is not typically trained on line-by-line sentences like this)

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=-1)

Filter (num_proc=4):   0%|          | 0/18112 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
pad_to_multiple_of_8 = True # line-by-line

In [None]:
# Causal-LM collation (mlm=False): no tokens are randomly masked; the collated
# `labels` shown below repeat `input_ids`, with padded positions set to -100.
# Sequences were tokenized without padding here, so the collator pads each batch,
# rounding the padded length up to a multiple of 8 when `pad_to_multiple_of_8` is set.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=False,
                                                pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
                                               )

In [None]:
tdc.main_ddict['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 18111
})

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[90, 23, 78, 81, 120]

In [None]:
print([tdc.main_ddict['train'][i] for i in range(2)])

[{'input_ids': [72, 6149, 428, 2691, 290, 373, 11679, 351, 262, 4197, 618, 340, 5284, 764, 1312, 6149, 262, 2124, 82, 290, 340, 373, 991, 625, 7857, 284, 262, 966, 286, 852, 42880, 16475, 764, 1312, 716, 7331, 642, 705, 860, 366, 546, 11323, 8059, 290, 423, 257, 6547, 7888, 28668, 290, 804, 1266, 287, 16270, 82, 326, 423, 617, 5485, 764, 611, 345, 588, 257, 9155, 4197, 428, 1244, 307, 329, 345, 764, 262, 2587, 318, 29175, 290, 5814, 290, 6792, 764, 1312, 561, 1950, 16216, 866, 257, 2546, 764, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
out = data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

In [None]:
out.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
out['input_ids'].shape

torch.Size([5, 120])

In [None]:
out['input_ids'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
        [  568, 42880, 16475,  5145,  1107, 11679, 

In [None]:
out['labels'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [  568, 42880, 16475,  5145,  1107, 11679, 

With line-by-line, padding

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=True,max_length=100)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
pad_to_multiple_of_8 = False

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=False,
                                                pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
                                               )

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[100, 100, 100, 100, 100]

In [None]:
print([tdc.main_ddict['train'][i] for i in range(2)])

[{'input_ids': [72, 6149, 428, 2691, 290, 373, 11679, 351, 262, 4197, 618, 340, 5284, 764, 1312, 6149, 262, 2124, 82, 290, 340, 373, 991, 625, 7857, 284, 262, 966, 286, 852, 42880, 16475, 764, 1312, 716, 7331, 642, 705, 860, 366, 546, 11323, 8059, 290, 423, 257, 6547, 7888, 28668, 290, 804, 1266, 287, 16270, 82, 326, 423, 617, 5485, 764, 611, 345, 588, 257, 9155, 4197, 428, 1244, 307, 329, 345, 764, 262, 2587, 318, 29175, 290, 5814, 290, 6792, 764, 1312, 561, 1950, 16216, 866, 257, 2546, 764, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'special_tokens_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
out = data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

In [None]:
out.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
out['input_ids'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
        [  568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804,
           718,  1227, 10423,   290,  1312,  1101,   257,  4273,   578,  2546,
           362,   764, 50256, 50256, 50256, 50256, 

In [None]:
out['labels'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [  568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804,
           718,  1227, 10423,   290,  1312,  1101,   257,  4273,   578,  2546,
           362,   764,  -100,  -100,  -100,  -100, 

Without line-by-line

In [None]:
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataLMController(dset,
                         main_text='Review Text',
                         filter_dict={'Review Text': lambda x: x is not None},
                         content_transformations=[text_normalize,str.lower],
                         seed=42,
                         verbose=False,
                        )

In [None]:
tdc.process_and_tokenize(tokenizer,line_by_line=False,max_length=100,stride=20)

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/18111 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

In [None]:
pad_to_multiple_of_8 = False

In [None]:
# Causal-LM collation (mlm=False) for the concatenated (non line-by-line) chunks.
# Every chunk already has length 100, so no padding is added here.
# NOTE(review): pad_token was set to eos_token earlier, and the `labels` output
# below shows -100 at the EOS positions separating concatenated reviews, i.e. the
# loss ignores those EOS tokens. Confirm this is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=False,
                                                pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
                                               )

In [None]:
list(map(len,tdc.main_ddict['train']['input_ids'][:5]))

[100, 100, 100, 100, 100]

In [None]:
print([tdc.main_ddict['train'][i] for i in range(2)])

[{'input_ids': [72, 6149, 428, 2691, 290, 373, 11679, 351, 262, 4197, 618, 340, 5284, 764, 1312, 6149, 262, 2124, 82, 290, 340, 373, 991, 625, 7857, 284, 262, 966, 286, 852, 42880, 16475, 764, 1312, 716, 7331, 642, 705, 860, 366, 546, 11323, 8059, 290, 423, 257, 6547, 7888, 28668, 290, 804, 1266, 287, 16270, 82, 326, 423, 617, 5485, 764, 611, 345, 588, 257, 9155, 4197, 428, 1244, 307, 329, 345, 764, 262, 2587, 318, 29175, 290, 5814, 290, 6792, 764, 1312, 561, 1950, 16216, 866, 257, 2546, 764, 50256, 568, 42880, 16475, 5145, 1107, 11679, 764, 925, 502, 804], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
out = data_collator([tdc.main_ddict['train'][i] for i in range(5)]) # simulation with batch size 5

In [None]:
out.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
out['input_ids'].shape

torch.Size([5, 100])

In [None]:
print(tokenizer.decode(out['input_ids'][0,:]))
print(tokenizer.decode(out['input_ids'][1,:]))

i ordered this online and was disappointed with the fit when it arrived. i ordered the xs and it was still oversize to the point of being unflattering. i am tall 5'9 " about 130 pounds and have a fairly thin torso and look best in cloths that have some shape. if you like a loose fit this might be for you. the material is thicker and warm and comfortable. i would suggest ordering down a size.<|endoftext|>so unflattering! really disappointed. made me look
. i would suggest ordering down a size.<|endoftext|>so unflattering! really disappointed. made me look 6 month pregnant and i'm a petite size 2.<|endoftext|>i love rompers and this one is really cute. i usually wear size 12 but should have got a 10, it runs big. it seems too long, and i'm 5'9 ". the prints cute but a little blah. i paid $ 158 which is too much, since i haven't worn it yet, i


In [None]:
out['input_ids'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764, 50256,
           568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804],
        [  764,  1312,   561,  1950, 16216,   866,   257,  2546,   764, 50256,
           568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804,
           718,  1227, 10423,   290,  1312,  1101, 

In [None]:
out['labels'][:2,:]

tensor([[   72,  6149,   428,  2691,   290,   373, 11679,   351,   262,  4197,
           618,   340,  5284,   764,  1312,  6149,   262,  2124,    82,   290,
           340,   373,   991,   625,  7857,   284,   262,   966,   286,   852,
         42880, 16475,   764,  1312,   716,  7331,   642,   705,   860,   366,
           546, 11323,  8059,   290,   423,   257,  6547,  7888, 28668,   290,
           804,  1266,   287, 16270,    82,   326,   423,   617,  5485,   764,
           611,   345,   588,   257,  9155,  4197,   428,  1244,   307,   329,
           345,   764,   262,  2587,   318, 29175,   290,  5814,   290,  6792,
           764,  1312,   561,  1950, 16216,   866,   257,  2546,   764,  -100,
           568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804],
        [  764,  1312,   561,  1950, 16216,   866,   257,  2546,   764,  -100,
           568, 42880, 16475,  5145,  1107, 11679,   764,   925,   502,   804,
           718,  1227, 10423,   290,  1312,  1101, 

## Save and Load TextDataController

In [None]:
show_doc(TextDataLMController.save_as_pickles)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextDataController.save_as_pickles

>      TextDataController.save_as_pickles (fname, parent='pickle_files',
>                                          drop_attributes=False)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |
| drop_attributes | bool | False | Whether to drop large-size attributes |

In [None]:
show_doc(TextDataLMController.from_pickle)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextDataController.from_pickle

>      TextDataController.from_pickle (fname, parent='pickle_files')

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |

A `TextDataController` object can be saved and loaded with ease. This is especially useful after text processing and/or tokenization has been done.

In [None]:
from datasets import disable_caching

In [None]:
disable_caching() # disable huggingface caching to see data size

In [None]:
from underthesea import text_normalize
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

In [None]:
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Randomly augment a single text, or each text of a batch, with probability `p`.

    Parameters
    ----------
    x : a single text, or a list of texts (a batch)
    aug : augmenter object exposing `augment(data)`, which accepts either one text
        or a list of texts and returns a list of augmented texts
    p : probability that any given text is augmented

    Returns
    -------
    The (possibly augmented) text when `x` is not a list; otherwise a new list of
    the same length as `x`, in the same order, where each selected text has been
    replaced by its augmented version.
    """
    if not isinstance(x,list):
        if random.random()<p: return aug.augment(x)[0]
        return x
    results = list(x)
    # One random draw per text, in order, to decide which positions get augmented
    picked = [i for i,_ in enumerate(results) if random.random()<p]
    # only perform augmentation when needed
    if picked:
        augmented = aug.augment([results[i] for i in picked])
        # Write augmented texts back to their original positions so the batch order
        # (and therefore the alignment with labels/metadata) is preserved
        for i,new_text in zip(picked,augmented):
            results[i] = new_text
    return results

In [None]:
# Word-level contextual augmenter backed by 'roberta-base', configured to
# substitute words (action="substitute").
aug2 = naw.ContextualWordEmbsAug(model_path='roberta-base', 
                                device='cuda:0', # if you don't have gpu, change to 'cpu'
                                action="substitute",
                                top_k=10,
                               aug_p=0.07)

# Bind the augmenter to the stochastic wrapper: each text has a 10% chance (p=0.1)
# of being passed through `aug2`.
contextual_aug_func = partial(nlp_aug_stochastic,aug=aug2,p=0.1)
# Set these two attributes on an augmentation function that runs on GPU.
# NOTE(review): presumably read by TextDataController to run the augmentation in
# batches of `batch_size` without multiprocessing; confirm against the library.
contextual_aug_func.run_on_gpu=True
contextual_aug_func.batch_size=32

In [None]:
# Build a classification controller on the reviews CSV, then run the whole
# pipeline (filtering, metadata concatenation, label encoding, transformations,
# train/validation split, augmentation, tokenization); see the log printed below.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         sup_types='classification',
                         # keep only rows where both the text and the label are present
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         # metadata columns that get concatenated to the main text
                         metadatas=['Title','Division Name'],
                         content_transformations = [text_normalize,str.lower],
                         # GPU-based stochastic augmentation defined in the previous cell
                         content_augmentations = contextual_aug_func, 
                         process_metas=True,
                         seed=42
                        )
tdc.process_and_tokenize(tokenizer,max_length=512,shuffle_trn=True)

-------------------- Start Main Text Processing --------------------
-------------------- Data Filtering --------------------
----- Do <lambda> on Review Text -----


Filter (num_proc=4):   0%|          | 0/23486 [00:00<?, ? examples/s]

----- Do <lambda> on Department Name -----


Filter (num_proc=4):   0%|          | 0/22641 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----


Map (num_proc=4):   0%|          | 0/22628 [00:00<?, ? examples/s]

Done
----- Label Encoding -----


Map (num_proc=4):   0%|          | 0/22628 [00:00<?, ? examples/s]

Done
-------------------- Text Transformation --------------------
----- text_normalize -----


Map (num_proc=4):   0%|          | 0/22628 [00:00<?, ? examples/s]

----- lower -----


Map (num_proc=4):   0%|          | 0/22628 [00:00<?, ? examples/s]

Done
-------------------- Train Test Split --------------------
Validation split based on val_ratio
Done
-------------------- Dropping unused features --------------------
Done
- Number of rows leaked: 0, which is 0.00% of training set
-------------------- Text Augmentation --------------------
----- nlp_aug_stochastic -----


Map:   0%|          | 0/18102 [00:00<?, ? examples/s]

Done
-------------------- Shuffling and flattening train set --------------------


Flattening the indices (num_proc=4):   0%|          | 0/18102 [00:00<?, ? examples/s]

Done
-------------------- Tokenization --------------------


Map (num_proc=4):   0%|          | 0/18102 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4526 [00:00<?, ? examples/s]

Done


In [None]:
tdc.main_ddict

DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18102
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4526
    })
})

In [None]:
tdc.save_as_pickles('my_tdc')

Let's check the file size

In [None]:
# Report the on-disk size of the full pickle, in megabytes
file_stats = Path('pickle_files/my_tdc.pkl').stat()
size_in_mb = file_stats.st_size / (1024 * 1024)
print(f'File Size in MegaBytes is {round(size_in_mb, 3)}')

File Size in MegaBytes is 479.387


Load back our object

In [None]:
tdc2 = TextDataController.from_pickle('my_tdc')

You can still access all of its attributes: the data, the preprocessing settings, the transformation/augmentation functions, and so on.

In [None]:
tdc2.main_ddict

DatasetDict({
    train: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18102
    })
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4526
    })
})

In [None]:
# Preview the first three training rows of the reloaded controller
for _, v in zip(range(3), tdc2.main_ddict['train']):
    text, dept, label_id = v['Review Text'], v['Department Name'], v['label']
    print(f"Text: {text}\nLabel: {dept} => {label_id}")
    print('-'*10)

Text: general petite . meh . this tunic is way over priced for the style and quality . it fit comfortably ( runs a size larger ) but it's not really flattering , it jut kind of hangs there looking ok . it is a little too deep of a v cut for a work top as well . this top does not support the price at all . it felt like something i could find at department store for way less . i will be returning it .
Label: Tops => 4
----------
Text: general petite . love byron lars . this dress , like all byron lars dresses is a work of art . it has true quality of workmanship . and fits like a glove . i always get compliments when i wear any of his dresses and i have 5 ! . this one is somewhere between casual and dressy . perfect for a dinner out on a saturday night ! . order a petite if you are under 5 5 "
Label: Bottoms => 0
----------
Text: general petite . snap neck pullover . i love this top . i ordered it in a large thinking it would be a tight rib but it is not so i reordered it in a small . i 

In [None]:
tdc2.label_lists

[['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']]

In [None]:
tdc2.filter_dict,tdc2.content_tfms,tdc2.aug_tfms

({'Review Text': <function __main__.<lambda>(x)>,
  'Department Name': <function __main__.<lambda>(x)>},
 [<function underthesea.pipeline.text_normalize.text_normalize(text, tokenizer='underthesea')>,
  <method 'lower' of 'str' objects>],
 [functools.partial(<function nlp_aug_stochastic>, aug=<nlpaug.augmenter.word.context_word_embs.ContextualWordEmbsAug object>, p=0.1)])

If you don't want to store the HuggingFace DatasetDict or the augmentation functions in your `TextDataController` (typically when you already have a trained model and only use `TextDataController` to preprocess the test set), you can drop them at the `save_as_pickles` step by passing `drop_attributes=True`.

In [None]:
tdc.save_as_pickles('my_lightweight_tdc',drop_attributes=True)

Let's check the file size

In [None]:
# Report the on-disk size of the lightweight pickle, in megabytes
file_stats = Path('pickle_files/my_lightweight_tdc.pkl').stat()
size_in_mb = file_stats.st_size / (1024 * 1024)
print(f'File Size in MegaBytes is {round(size_in_mb, 3)}')

File Size in MegaBytes is 2.279


Load it back

In [None]:
tdc3 = TextDataController.from_pickle('my_lightweight_tdc')

We will use this object to demonstrate the Test Set Construction in the next section

In [None]:
#| hide
# import nbdev; nbdev.nbdev_export()