# Text Processing Benchmark

> This module contains some benchmarks for `TextDataController`

- skip_showdoc: true
- skip_exec: true

In [None]:
# !conda list | grep 'datasets\|transformers'
# datasets                  2.11.0                   pypi_0    pypi
# transformers              4.28.1                   pypi_0    pypi

In [None]:
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from importlib.machinery import SourceFileLoader
from datasets import load_dataset,enable_caching,disable_caching
from transformers import RobertaTokenizer
import os
import time
from underthesea import text_normalize
import nlpaug.augmenter.char as nac
from functools import partial
import random
from memory_profiler import memory_usage

In [None]:
disable_caching() # disable huggingface caching to get a fair benchmark

## Benchmark on medium-size dataset (~117k rows)

In [None]:
# The same CSV is listed five times in data_files, so the loaded 'train' split
# holds five copies of the file; this scales the sample data up for benchmarking.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)
len(dset)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


117430

In [None]:
# Pretrained 'roberta-base' tokenizer; the benchmark cells below pass it to the helpers.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
# Base batch size: 1/100 of the dataset length (integer division).
bs=len(dset)//100
bs

1174

### Non-streaming dataset

In [None]:
def benchmarking(tdc,tokenizer,n=10,shuffle_trn=True):
    """Time processing + tokenization, then a pass over the training split.

    Prints the elapsed time of ``tdc.process_and_tokenize`` and the elapsed
    time of iterating over ``tdc.main_ddict['train']``.

    Args:
        tdc: data controller exposing ``process_and_tokenize``, ``main_ddict``
            and ``batch_size``.
        tokenizer: tokenizer forwarded to ``tdc.process_and_tokenize``.
        n: how many batches' worth of items (``n * tdc.batch_size``) to
            iterate over; ``None`` iterates over the whole training split.
        shuffle_trn: forwarded unchanged to ``tdc.process_and_tokenize``.
    """
    from itertools import islice  # imported before any timer starts

    # perf_counter is monotonic and high-resolution, so the measured intervals
    # are not affected by system clock adjustments.
    start = time.perf_counter()
    tdc.process_and_tokenize(tokenizer,max_length=512,shuffle_trn=shuffle_trn)
    processed = time.perf_counter()
    print(f'Time it takes to process + tokenize training texts: {(processed-start):.3f} s')

    # islice consumes exactly `limit` items (all of them when limit is None),
    # so the reported item count matches what was actually iterated.
    limit = None if n is None else n*tdc.batch_size
    for _ in islice(tdc.main_ddict['train'], limit):
        pass
    iterated = time.perf_counter()

    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {(iterated-processed):.3f} s')
    else:
        print(f'Time it takes to go through all items: {(iterated-processed):.3f} s')

#     print(f'Total time: {(time3-time1):.3f} s')
def benchmarking_and_memory_usage(tdc,tokenizer,n=10,shuffle_trn=True):
    """Run ``benchmarking`` under ``memory_usage`` and print the peak value.

    All four arguments are forwarded positionally to ``benchmarking``; the
    largest value returned by ``memory_usage`` is printed in MiB.
    """
    profiled_call = (benchmarking, [tdc, tokenizer, n, shuffle_trn])
    peak = max(memory_usage(profiled_call))
    print(f'Maximum memory usage: {peak:.3f} MiB')


In [None]:
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Augment ``x`` and keep each augmented text with probability ``p``.

    Args:
        x: a single text, or a list of texts.
        aug: object whose ``augment(x)`` returns a sequence of augmented texts.
        p: probability of keeping the augmented text instead of the original.

    Returns:
        For a list input, a list where each element is either the augmented
        or the original text; otherwise a single text chosen the same way.
    """
    augmented = aug.augment(x)

    if isinstance(x, list):
        chosen = []
        # One random draw per text, in input order.
        for new_text, old_text in zip(augmented, x):
            chosen.append(new_text if random.random() < p else old_text)
        return chosen

    # Single text: the augmenter's first result is the candidate.
    if random.random() < p:
        return augmented[0]
    return x

# Character-level KeyboardAug from nlpaug (see the nlpaug docs for the aug_* parameters).
aug = nac.KeyboardAug(aug_char_max=3,aug_char_p=0.1,aug_word_p=0.07)
# Bind the augmenter and p=0.5 so the result is a one-argument callable over texts.
nearby_aug_func = partial(nlp_aug_stochastic,aug=aug,p=0.5)

With filter

In [None]:
# Baseline run: only the not-None filters on the text and label columns are configured.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
# Helper defaults apply: n=10, shuffle_trn=True.
benchmarking_and_memory_usage(tdc,tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 14.905 s
Time it takes to go through 11740 items: 1.496 s
Maximum memory usage: 763.004 MiB


With filter + metadatas concatenation

In [None]:
# Same as the filter-only run, plus the 'Title' and 'Division Name' columns as metadatas.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 15.518 s
Time it takes to go through 11740 items: 1.755 s
Maximum memory usage: 780.414 MiB


With filter + metadatas concatenation + content transformation + content augmentation

In [None]:
# Adds two content transformations (text_normalize, str.lower) and two content
# augmentations (nearby_aug_func, str.lower) on top of filtering and metadatas.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 35.544 s
Time it takes to go through 11740 items: 1.843 s
Maximum memory usage: 774.332 MiB


With filter + metadatas concatenation + content transformation + content augmentation + no shuffling

In [None]:
# Full pipeline (filter, metadatas, transformations, augmentations); the only
# difference in this run is shuffle_trn=False in the benchmark call.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,shuffle_trn=False)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 34.666 s
Time it takes to go through 11740 items: 1.758 s
Maximum memory usage: 789.406 MiB


With filter + metadatas concatenation + content transformation + content augmentation + higher batch size

In [None]:
# Full pipeline with a batch size three times the base value (bs*3).
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=bs*3,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 35.632 s
Time it takes to go through 35220 items: 5.409 s
Maximum memory usage: 814.371 MiB


With filter + metadatas concatenation + content transformation + content augmentation + iterate the whole dataset (1 epoch)

In [None]:
bs

1174

In [None]:
# Full pipeline with the base batch size; n=None makes the helper iterate over
# every item of the training split instead of a fixed number of batches.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=bs,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 35.284 s
Time it takes to go through all items: 16.821 s
Maximum memory usage: 824.852 MiB


### With streaming

In [None]:
def benchmarking_streaming(tdc,tokenizer,n=10):
    """Time processing + tokenization, then a pass over the training stream.

    Prints the elapsed time of ``tdc.process_and_tokenize`` and the elapsed
    time of iterating over ``tdc.main_ddict['train']``.

    Args:
        tdc: data controller exposing ``process_and_tokenize``, ``main_ddict``
            and ``batch_size``.
        tokenizer: tokenizer forwarded to ``tdc.process_and_tokenize``.
        n: how many batches' worth of items (``n * tdc.batch_size``) to
            iterate over; ``None`` iterates over the whole training split.
    """
    from itertools import islice  # imported before any timer starts

    # perf_counter is monotonic and high-resolution, so the measured intervals
    # are not affected by system clock adjustments.
    start = time.perf_counter()
    tdc.process_and_tokenize(tokenizer,max_length=512)
    processed = time.perf_counter()
    print(f'Time it takes to process + tokenize training texts: {(processed-start):.3f} s')

    # islice consumes exactly `limit` items (all of them when limit is None),
    # so the reported item count matches what was actually iterated.
    limit = None if n is None else n*tdc.batch_size
    for _ in islice(tdc.main_ddict['train'], limit):
        pass
    iterated = time.perf_counter()

    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {(iterated-processed):.3f} s')
    else:
        print(f'Time it takes to go through all items: {(iterated-processed):.3f} s')

def benchmarking_and_memory_usage_streaming(tdc,tokenizer,n=10):
    """Run ``benchmarking_streaming`` under ``memory_usage`` and print the peak.

    Args:
        tdc: streaming data controller forwarded to ``benchmarking_streaming``.
        tokenizer: tokenizer forwarded to ``benchmarking_streaming``.
        n: number of batches' worth of items to iterate over; ``None`` for all.
    """
    # Profile the streaming helper, so no shuffle_trn argument is sent to
    # process_and_tokenize (the non-streaming helper always passes one).
    mem_usage = memory_usage((benchmarking_streaming,[tdc,tokenizer,n]))
    print(f'Maximum memory usage: {max(mem_usage):.3f} MiB')


With filter

In [None]:
# Streaming baseline: the dataset is loaded with streaming=True, only the
# not-None filters are configured, and class names are supplied up front.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                         batch_size=bs,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage_streaming(tdc,tokenizer)

Time it takes to process + tokenize training texts: 0.842 s
Time it takes to go through 11740 items: 4.709 s
Maximum memory usage: 767.066 MiB


With filter + metadatas concatenation

In [None]:
# Streaming run with filtering plus the 'Title' and 'Division Name' metadatas.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                         metadatas=['Title','Division Name'],
                         batch_size=bs,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage_streaming(tdc,tokenizer)

Time it takes to process + tokenize training texts: 0.886 s
Time it takes to go through 11740 items: 4.889 s
Maximum memory usage: 790.480 MiB


With filter + metadatas concatenation + content transformation + content augmentation

In [None]:
# Streaming run with the full pipeline: filtering, metadatas, two content
# transformations and two content augmentations.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower],
                         batch_size=bs,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage_streaming(tdc,tokenizer)

Time it takes to process + tokenize training texts: 0.914 s
Time it takes to go through 11740 items: 12.926 s
Maximum memory usage: 804.312 MiB


With filter + metadatas concatenation + content transformation + content augmentation + higher batch size

In [None]:
# Streaming run with the full pipeline and a batch size three times the base value (bs*3).
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower],
                         batch_size=bs*3,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage_streaming(tdc,tokenizer)

Time it takes to process + tokenize training texts: 0.952 s
Time it takes to go through 35220 items: 38.854 s
Maximum memory usage: 993.641 MiB


With filter + metadatas concatenation + content transformation + content augmentation + iterate the whole dataset (1 epoch)

In [None]:
# Streaming run with the full pipeline; n=None makes the helper iterate over
# every item of the training split.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=True)

tdc = TextDataControllerStreaming(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower],
                         batch_size=bs,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage_streaming(tdc,tokenizer,n=None)

Time it takes to process + tokenize training texts: 0.825 s
Time it takes to go through all items: 121.332 s
Maximum memory usage: 771.711 MiB


### Test the effect of batch size and num_proc

For non-streaming datasets, text processing + tokenization are the most time-consuming tasks, so we will check how different batch sizes and num_proc values affect the running time of these tasks.

In [None]:
# Batch size / num_proc experiment: batch_size=128, num_proc=4, whole training split (n=None).
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=128,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 36.516 s
Time it takes to go through all items: 16.576 s
Maximum memory usage: 997.777 MiB


In [None]:
# Batch size / num_proc experiment: batch_size=1000, num_proc=4, whole training split (n=None).
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=4):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 35.601 s
Time it takes to go through all items: 16.904 s
Maximum memory usage: 987.270 MiB


In [None]:
# Batch size / num_proc experiment: batch_size=1200 (24*50), num_proc=24, whole training split (n=None).
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=24*50, #1200
                         num_proc=24,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=24):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=24):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 29.681 s
Time it takes to go through all items: 16.793 s
Maximum memory usage: 989.348 MiB


In [None]:
# Batch size / num_proc experiment: batch_size=9600 (24*400), num_proc=24, whole training split (n=None).
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=24*400, #9600
                         num_proc=24,
                         seed=42,
                         verbose=False
                        )
benchmarking_and_memory_usage(tdc,tokenizer,n=None)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Filter (num_proc=24):   0%|          | 0/117430 [00:00<?, ? examples/s]

Filter (num_proc=24):   0%|          | 0/113205 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/113140 [00:00<?, ? examples/s]

Time it takes to process + tokenize training texts: 29.924 s
Time it takes to go through all items: 17.506 s
Maximum memory usage: 994.566 MiB


Conclusion: increasing BOTH batch size and num_proc can help decrease the processing + tokenization time, but the relationship between batch size, num_proc and running time is not linear.

## Improving processing time with caching

The worst processing time is recorded with the non-streaming training set, with the following preprocessing: 2-column filtering, 2-column metadatas, 2 content transformations, and 2 content augmentations.

With caching, we can significantly reduce the preprocessing time. That means you only need to do all the preprocessing once; all subsequent calls will take advantage of the cached result.

In [None]:
# Turn Hugging Face datasets caching back on (it was disabled for the benchmarks above).
enable_caching()

In [None]:
# First run with caching enabled: process and tokenize once, without the timing helper.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
tdc.process_and_tokenize(tokenizer,max_length=512)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0aed8574c094e4fd_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-b781a4a73d06caf5_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0f85d6db4165d6ef_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

In [None]:
# Reload the dataset exactly as in the previous cell: the same CSV listed
# 5 times, as a single non-streaming 'train' split.
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv' for i in range(5)],
                    split='train',
                    streaming=False)

# Same controller configuration as the previous cell.
# NOTE(review): the arguments are deliberately left unchanged; presumably any
# difference in filters/transformations/augmentations would prevent the cached
# results from being reused -- verify against the datasets fingerprinting rules.
tdc = TextDataController(dset,
                         main_text='Review Text',
                         label_names='Department Name',
                         filter_dict={'Review Text': lambda x: x is not None,
                                      'Department Name': lambda x: x is not None,
                                     },
                         metadatas=['Title','Division Name'],
                         content_transformations=[text_normalize,str.lower],
                         content_augmentations= [nearby_aug_func,str.lower], 
                         val_ratio=None,
                         batch_size=1000,
                         num_proc=4,
                         seed=42,
                         verbose=False
                        )
# Timed run: the output below reports process + tokenize time, the time to go
# through all items, and maximum memory usage. The "Loading cached processed
# dataset" lines in the output show the processed data being read from cache.
# n=None appears to mean "iterate over every item" (see the "all items" line).
benchmarking_and_memory_usage(tdc,tokenizer,n=None)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0aed8574c094e4fd_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-b781a4a73d06caf5_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0f85d6db4165d6ef_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec

Time it takes to process + tokenize training texts: 0.979 s
Time it takes to go through all items: 16.824 s
Maximum memory usage: 823.531 MiB


With caching enabled, loading the processed data back from the cache takes only 0.979s, instead of waiting 35.284s to run the whole processing step all over again.

## Conclusion

| Process + Tokenize Time (117k records), batchsize = 1174 | Filter, train shuffling | And 2 metadatas | And 2 tfms + 2 augs | Batchsize=1174 x 3 | Batchsize=1174 |
|----------------------------------------------------------|-------------------------|-----------------|---------------------|--------------------|----------------|
| Non-streaming training                                   | 14.94                   | 15.518          | 35.544              | 35.632             | 35.284         |
| Streaming                                                | 0.842                    | 0.886           | 0.914               | 0.952              | 0.825          |
| Ratio Non-streaming/Streaming                            | 17.74                    | 17.51           | 38.89               | 37.43              | 42.77          |


| Run 10 batches time (1174*10) | Filter, train shuffling | And 2 metadatas | And 2 tfms + 2 augs | Total items iterated: 1174 * 10 * 3 | 1 epoch iterated: 117430 items |
|-------------------------------|-------------------------|-----------------|---------------------|---------------------------------|--------------------------------|
| Non-streaming training        | 1.496                    | 1.755           | 1.843               | 5.409                            | 16.821                         |
| Streaming                     | 4.709                    | 4.889           | 12.926              | 38.854                           | 121.332                        |
| Ratio Streaming/Non-Streaming | 3.15                     | 2.79            | 7.01                | 7.18                             | 7.21                           |


| Total Time (Process+Tokenize+Iterate) | Filter, train shuffling | And 2 metadatas | And 2 tfms + 2 augs | Total items iterated: 1174 * 10 * 3 | 1 epoch iterated: 117430 items |
|---------------------------------------|-------------------------|-----------------|---------------------|---------------------------------|--------------------------------|
| Non-streaming training                | 16.436                   | 17.273          | 37.387              | 41.041                           | 52.105                         |
| Streaming                             | 5.551                    | 5.775           | 13.84               | 39.806                           | 122.157                        |
| Ratio Non-streaming/Streaming         | 2.96                     | 2.99            | 2.7                 | 1.03                             | 0.43                           |


| Maximum memory usage (MiB)          | Filter, train shuffling | And 2 metadatas | And 2 tfms + 2 augs | Total items iterated: 1174 * 10 * 3 | 1 epoch iterated: 117430 items |
|-------------------------------------|-------------------------|-----------------|---------------------|---------------------------------|--------------------------------|
| Non-streaming training              | 763                      | 780             | 774                 | 814                              | 825                            |
| Streaming                           | 767                      | 790             | 804                 | 993                              | 772                            |
| Ratio Streaming/Non-Streaming       | 1.01                     | 1.01            | 1.04                | 1.22                             | 0.94                           |


## Tips and tricks

- For non-streaming data, the best way to minimize processing and iteration time is:
    - Turn on dataset caching, and run the processing step once so that its results are cached
- The more content transformations and augmentations you add, the slower processing and iteration become. This is especially true for streaming data

