# Text Processing Benchmark

> This module contains some benchmarks for `TextDataController`

- skip_showdoc: true
- skip_exec: true

In [None]:
# !conda list | grep 'datasets\|transformers'
# datasets                  2.11.0                   pypi_0    pypi
# transformers              4.28.1                   pypi_0    pypi

datasets                  2.11.0                   pypi_0    pypi
transformers              4.28.1                   pypi_0    pypi


In [None]:
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from importlib.machinery import SourceFileLoader
from datasets import load_dataset,enable_caching,disable_caching
from transformers import RobertaTokenizer
import os
import time
from underthesea import text_normalize
import nlpaug.augmenter.char as nac
from functools import partial
import random
from memory_profiler import memory_usage

In [None]:
# Turn off the Hugging Face `datasets` cache so the timings below are not
# skewed by results cached from an earlier run.
disable_caching() # disable huggingface caching to get a fair benchmark

In [None]:
def benchmarking(tdc,bs,tokenizer,n=10,shuffle_trn=True):
    """Time preprocessing/tokenization, then a (partial) pass over the training split.

    Prints two timings: how long `tdc.process_and_tokenize` takes, and how long
    it takes to iterate over the first `bs*n` items of `tdc.main_ddict['train']`
    (or over every item when `n` is None).

    Args:
        tdc: object exposing `process_and_tokenize(tokenizer, max_length=..., shuffle_trn=...)`
            and, once that has run, an iterable at `main_ddict['train']`.
        bs: batch size; together with `n` it determines how many items are read.
        tokenizer: forwarded unchanged to `process_and_tokenize`.
        n: number of batches' worth of items to read; None reads the whole split.
        shuffle_trn: forwarded unchanged to `process_and_tokenize`.
    """
    from itertools import islice  # local import keeps this cell self-contained

    # perf_counter is monotonic and high-resolution, which suits interval timing
    # better than the wall clock returned by time.time().
    start = time.perf_counter()
    tdc.process_and_tokenize(tokenizer,max_length=512,shuffle_trn=shuffle_trn)
    processed = time.perf_counter()
    print(f'Time it takes to process + tokenize training texts: {(processed-start):.3f} s')

    # islice stops after exactly `limit` items; a manual `break` at index `limit`
    # would already have fetched one extra item from the dataset.
    limit = None if n is None else bs*n
    n_read = 0
    for _ in islice(tdc.main_ddict['train'],limit):
        n_read += 1
    iterated = time.perf_counter()

    if n is not None:
        # Report what was actually read, in case the split is shorter than bs*n.
        print(f'Time it takes to go through {n_read} items: {(iterated-processed):.3f} s')
    else:
        print(f'Time it takes to go through all items: {(iterated-processed):.3f} s')

#     print(f'Total time: {(time3-time1):.3f} s')
def benchmarking_and_memory_usage(tdc,bs,tokenizer,n=10,shuffle_trn=True):
    """Run `benchmarking` under `memory_usage` and print the peak reading.

    All arguments are handed to `benchmarking` positionally, in the same order.
    The timing lines come from `benchmarking` itself; this wrapper only adds
    the 'Maximum memory usage' line.
    """
    # memory_usage takes the target as a (callable, positional-args) pair.
    target = (benchmarking,[tdc,bs,tokenizer,n,shuffle_trn])
    readings = memory_usage(target)
    peak = max(readings)
    print(f'Maximum memory usage: {peak:.3f} MiB')


In [None]:
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Augment `x` with `aug`, keeping each augmented text with probability `p`.

    Args:
        x: a single text, or a list of texts (a batch).
        aug: object with an `augment(x)` method. Required in practice: the
            None default only exists so the function can be used with `partial`.
        p: probability of keeping the augmented version of a text; otherwise
            the original text is kept.

    Returns:
        A single text when `x` is not a list, otherwise a list with one entry
        per input text. If the augmenter gives back nothing usable (an empty
        result, or a batch whose length differs from the input's), `x` is
        returned unchanged instead of raising or silently dropping rows.
    """
    results = aug.augment(x)

    if not isinstance(x,list):
        if isinstance(results,str):
            # Augmenter handed back a bare string: use it whole, not its first character.
            augmented = results
        elif results:
            augmented = results[0]
        else:
            # Nothing came back (e.g. empty input text): fall back to the original.
            return x
        return augmented if random.random()<p else x

    # zip() would silently truncate a mismatched batch, so keep the input as-is.
    if not isinstance(results,(list,tuple)) or len(results)!=len(x):
        return x
    return [a if random.random()<p else b for a,b in zip(results,x)]

# Character-level keyboard augmenter from nlpaug, used by every run below
# that enables content augmentation.
aug = nac.KeyboardAug(
    aug_char_max=3,
    aug_char_p=0.1,
    aug_word_p=0.07,
)
# Pre-bind the augmenter and a 50% keep-probability so the result can be
# called with the text (or batch of texts) alone.
nearby_aug_func = partial(
    nlp_aug_stochastic,
    aug=aug,
    p=0.5,
)

## Benchmark on medium-size dataset (~117k rows)

In [None]:
# Load the same review CSV five times over to get a medium-size dataset,
# then show its row count.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)
len(dset)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


117430

In [None]:
# Tokenizer passed to every benchmark run in this notebook.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
# Default batch size: passed as `batch_size` to TextDataController and used by
# the benchmark helper to decide how many items to iterate over.
bs=128

### Without iterable dataset

With filter

In [None]:
# Non-iterable training set; filtering only.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=False,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, bs, tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map:   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 37.038 s
Time it takes to go through 1280 items: 0.155 s
Maximum memory usage: 762.734 MiB


With filter + metadatas concatenation

In [None]:
# Non-iterable training set; filtering + two metadata columns.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=False,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, bs, tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map:   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 40.147 s
Time it takes to go through 1280 items: 0.181 s
Maximum memory usage: 806.473 MiB


With filter + metadatas concatenation + content transformation + content augmentation

In [None]:
# Non-iterable training set; filtering + metadata columns
# + two content transformations + two content augmentations.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=False,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, bs, tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map:   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 62.309 s
Time it takes to go through 1280 items: 0.183 s
Maximum memory usage: 859.008 MiB


With filter + metadatas concatenation + content transformation + content augmentation + no shuffling

In [None]:
# Non-iterable training set; full preprocessing, training shuffle turned off.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=False,
    verbose=False,
)
# shuffle_trn=False is the only difference from the previous run
benchmarking_and_memory_usage(tdc, bs, tokenizer, shuffle_trn=False)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map:   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 59.452 s
Time it takes to go through 1280 items: 0.184 s
Maximum memory usage: 867.031 MiB


With filter + metadatas concatenation + content transformation + content augmentation + higher batch size

In [None]:
# Non-iterable training set; full preprocessing with a batch size of 2048.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    is_batched=True,
    batch_size=2048,
    seed=42,
    convert_training_to_iterable=False,
    verbose=False,
)
# the same 2048 sizes the iteration loop, so 10 batches = 20480 items
benchmarking_and_memory_usage(tdc, 2048, tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map:   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 61.862 s
Time it takes to go through 20480 items: 3.067 s
Maximum memory usage: 785.172 MiB


### With iterable dataset

With filter

In [None]:
# Iterable training set; filtering only.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, bs, tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 2.850 s
Time it takes to go through 1280 items: 0.464 s
Maximum memory usage: 799.742 MiB


With filter + metadatas concatenation

In [None]:
# Iterable training set; filtering + two metadata columns.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, bs, tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 2.623 s
Time it takes to go through 1280 items: 0.544 s
Maximum memory usage: 838.613 MiB


With filter + metadatas concatenation + content transformation + content augmentation

In [None]:
# Iterable training set; filtering + metadata columns
# + two content transformations + two content augmentations.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, bs, tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 22.310 s
Time it takes to go through 1280 items: 0.562 s
Maximum memory usage: 891.176 MiB


With filter + metadatas concatenation + content transformation + content augmentation + no shuffling

In [None]:
# Iterable training set; full preprocessing, training shuffle turned off.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
# shuffle_trn=False is the only difference from the previous run
benchmarking_and_memory_usage(tdc, bs, tokenizer, shuffle_trn=False)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 22.421 s
Time it takes to go through 1280 items: 0.474 s
Maximum memory usage: 892.000 MiB


With filter + metadatas concatenation + content transformation + content augmentation + higher batch size

In [None]:
# Iterable training set; full preprocessing with a batch size of 2048.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    is_batched=True,
    batch_size=2048,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
# the same 2048 sizes the iteration loop, so 10 batches = 20480 items
benchmarking_and_memory_usage(tdc, 2048, tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 21.645 s
Time it takes to go through 20480 items: 7.809 s
Maximum memory usage: 768.688 MiB


### With streaming

With filter

In [None]:
# Streaming dataset; filtering only.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    # label classes are supplied up front in every streaming run
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, bs, tokenizer)

Time it takes to process + tokenize training texts: 0.002 s
Time it takes to go through 1280 items: 1.244 s
Maximum memory usage: 752.238 MiB


With filter + metadatas concatenation

In [None]:
# Streaming dataset; filtering + two metadata columns.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    # label classes are supplied up front in every streaming run
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    metadatas=['Title', 'Division Name'],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, bs, tokenizer)

Time it takes to process + tokenize training texts: 0.002 s
Time it takes to go through 1280 items: 1.365 s
Maximum memory usage: 829.074 MiB


With filter + metadatas concatenation + content transformation + content augmentation

In [None]:
# Streaming dataset; filtering + metadata columns
# + two content transformations + two content augmentations.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    # label classes are supplied up front in every streaming run
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, bs, tokenizer)

Time it takes to process + tokenize training texts: 0.084 s
Time it takes to go through 1280 items: 95.443 s
Maximum memory usage: 6955.020 MiB


With filter + metadatas concatenation + content transformation + content augmentation + no shuffling

In [None]:
# Streaming dataset; full preprocessing, training shuffle turned off.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    # label classes are supplied up front in every streaming run
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    is_batched=True,
    batch_size=bs,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
# shuffle_trn=False is the only difference from the previous run
benchmarking_and_memory_usage(tdc, bs, tokenizer, shuffle_trn=False)

Time it takes to process + tokenize training texts: 0.080 s
Time it takes to go through 1280 items: 11.529 s
Maximum memory usage: 6841.391 MiB


With filter + metadatas concatenation + content transformation + content augmentation + no shuffling + higher batch size (200)

In [None]:
# Streaming dataset; full preprocessing with a batch size of 200 and the
# training shuffle turned off.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    # keep only rows where both the text and the label are present
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    # label classes are supplied up front in every streaming run
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    is_batched=True,
    batch_size=200,
    seed=42,
    convert_training_to_iterable=True,
    verbose=False,
)
# the same 200 sizes the iteration loop, so 10 batches = 2000 items
benchmarking_and_memory_usage(tdc, 200, tokenizer, shuffle_trn=False)

Time it takes to process + tokenize training texts: 0.079 s
Time it takes to go through 2000 items: 63.094 s
Maximum memory usage: 37318.242 MiB


## Improving processing time with caching

The worst processing time is recorded with a non-iterable training set and the following preprocessing: 2-column filtering, 2-column metadatas, 2 content transformations, and 2 content augmentations; the total preprocessing time is ~62 s for the ~117k-row dataset. However, for this same preprocessing, it also gives the fastest data iteration: 0.183 s to go through 1280 items (vs. 0.562 s with iterable training and 95.443 s with streaming).

With caching, we can significantly reduce the preprocessing time. That is, you only need to run all the preprocessing once; all subsequent calls will take advantage of the cached result.

In [None]:
# Turn the Hugging Face `datasets` cache back on for the cached run below.
enable_caching()

In [None]:
# Same configuration as the slowest non-iterable run (filtering + metadata
# columns + 2 transformations + 2 augmentations), now with caching enabled.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         # keep only rows where both the text and the label are present
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         is_batched=True,
                         batch_size=bs,
                         seed=42,
                         convert_training_to_iterable=False,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,bs,tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-a8e48b2fdcc1675b_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-7f67ed2247bad412_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-8895dee11a0750d6_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec

Time it takes to process + tokenize training texts: 1.471 s
Time it takes to go through 1280 items: 0.176 s
Maximum memory usage: 874.715 MiB


## Conclusion

With CPU batch size of 128, and data iteration of 1280 items (10 batches)

1. Time to process + tokenize. Unit: seconds

|  | Filtering | + 2-column metadatas | + 2 tfms and 2 augs | + no train shuffling |
|------------------------------|-------------------------|-------------------------|-----------------|--------------------|
| no iterable training         | 37.038                  | 40.147                  | 62.309          | 59.452             |
| iterable training            | 2.85                    | 2.623                   | 22.31           | 22.421             |
| streaming                    | 0.002                   | 0.002                   | 0.084           | 0.08               |

2. Time to loop through 1280 items (10 batches). Unit: seconds

|                              | Filtering | + 2-column metadatas | + 2 tfms and 2 augs | + no train shuffling |
|------------------------------|-------------------------|-----------------|--------------------|------------------------------------|
| no iterable training         | 0.155                    | 0.181           | 0.183              | 0.184                              |
| iterable training            | 0.464                    | 0.544           | 0.562              | 0.474                              |
| streaming                    | 1.244                    | 1.365           | 95.443             | 11.529                             |

3. Maximum memory usage. Unit: MiB (mebibytes)

|                              | Filtering | + 2-column metadatas | + 2 tfms and 2 augs | + no train shuffling |
|------------------------------|-------------------------|-----------------|--------------------|------------------------------------|
| no iterable training         | 762.734 | 806.473                  | 859.008         | 867.031            | 
| iterable training            |799.742 | 838.613                  | 891.176         | 892                |
| streaming                    | 752.238 | 829.074                  | 6955.02         | 6841.391           |

## Tips and tricks

- For non-streaming data, the best way to minimize processing and iteration time is:
    - Use non-iterable training (which means don't turn training set into an Iterable Dataset)
    - Turn on dataset caching, and run the processing step once for it to be cached
- If caching is not an option, then use iterable training (turn the training set into an Iterable Dataset)
- The more content transformations and augmentations added, the slower the process + iteration. This is especially true for streaming data
- For streaming data, which might be the slowest option, here are a few things to speed up the whole pipeline:
    - Try to define and create a validation set split in your dataset; don't use the validation split functionality of `TextDataController`
    - Minimize the amount of content transformation and content augmentation
    - Turn off `shuffle_trn`
    - Set a smaller CPU batch size. E.g. on my machine with 64 GB of RAM, with this dataset of 117k rows, I can only set the batch size up to 200 to avoid a memory error
