# Text Processing Benchmark

> This module contains some benchmarks for `TextDataController`

- skip_showdoc: true
- skip_exec: true

In [None]:
# !conda list | grep 'datasets\|transformers\|torch'
# datasets                  2.14.4                   pypi_0    pypi
# pytorch-ignite            0.4.11                   pypi_0    pypi
# pytorch-lightning         2.0.1.post0              pypi_0    pypi
# torch                     2.0.1+cu118              pypi_0    pypi
# torchaudio                2.0.2+cu118              pypi_0    pypi
# torchmetrics              1.1.1                    pypi_0    pypi
# torchvision               0.15.2+cu118             pypi_0    pypi
# transformers              4.31.0                   pypi_0    pypi

In [None]:
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *
from that_nlp_library.text_main_streaming import *
from datasets import load_dataset,enable_caching,disable_caching
from transformers import RobertaTokenizer
import os
import time
from underthesea import text_normalize
import nlpaug.augmenter.char as nac
from functools import partial
import random
from memory_profiler import memory_usage

In [None]:
# Disable Hugging Face `datasets` caching so the timings below are a fair
# benchmark and are not shortened by results cached in an earlier run.
disable_caching() # disable huggingface caching to get a fair benchmark

## Benchmark on medium-size dataset (~117k rows)

In [None]:
# Load the same CSV five times so the benchmark dataset is five copies of it.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)
len(dset)

117430

In [None]:
# Load the pretrained 'roberta-base' tokenizer that is passed to the benchmark calls below.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
# Base batch size: 1% of the dataset length (integer division).
bs=len(dset)//100
bs

1174

### Non-streaming dataset

In [None]:
def benchmarking(tdc,tokenizer,n=10,shuffle_trn=True):
    """Time the process+tokenize step, then a pass over the training split.

    Args:
        tdc: controller exposing `process_and_tokenize`, `main_ddict['train']`
            and `batch_size`.
        tokenizer: tokenizer forwarded to `tdc.process_and_tokenize`.
        n: number of batches' worth of items to iterate over; `None` iterates
            over the whole training split.
        shuffle_trn: forwarded to `tdc.process_and_tokenize`.

    Raises:
        ValueError: if `n` is negative (rejected by `itertools.islice`).
    """
    from itertools import islice

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    tdc.process_and_tokenize(tokenizer, max_length=512, shuffle_trn=shuffle_trn)
    processed = time.perf_counter()
    print(f'Time it takes to process + tokenize training texts: {(processed-start):.3f} s')

    train_items = tdc.main_ddict['train']
    if n is not None:
        # Pull exactly n*batch_size items. Breaking on an enumerate index would
        # fetch (and pay for) one extra item before the loop stops.
        train_items = islice(train_items, n * tdc.batch_size)
    for _ in train_items:
        pass
    iterated = time.perf_counter()

    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {(iterated-processed):.3f} s')
    else:
        print(f'Time it takes to go through all items: {(iterated-processed):.3f} s')

#     print(f'Total time: {(time3-time1):.3f} s')
def benchmarking_and_memory_usage(tdc,tokenizer,n=10,shuffle_trn=True):
    """Run `benchmarking` under `memory_usage` and print the largest sample."""
    # memory_usage takes the target as a (callable, positional-args) pair.
    target = (benchmarking, [tdc, tokenizer, n, shuffle_trn])
    peak_mib = max(memory_usage(target))
    print(f'Maximum memory usage: {peak_mib:.3f} MiB')


In [None]:
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Return the augmented form of `x` with probability `p`, else `x` itself.

    Args:
        x: a single text, or a list of texts.
        aug: object whose `augment(x)` returns a sequence of augmented texts.
        p: probability of keeping the augmented version; drawn independently
            for each element when `x` is a list.
    """
    augmented = aug.augment(x)
    if isinstance(x, list):
        # One independent draw per (augmented, original) pair.
        return [new if random.random() < p else old
                for new, old in zip(augmented, x)]
    if random.random() < p:
        return augmented[0]
    return x

# nlpaug `KeyboardAug` character augmenter, wrapped so that each text keeps its
# augmented version with probability 0.5.
aug = nac.KeyboardAug(aug_char_max=3, aug_char_p=0.1, aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic, aug=aug, p=0.5)

#### With filter

In [None]:
# Non-streaming benchmark: row filtering only.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    val_ratio=None,
    batch_size=bs,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer)

Time it takes to process + tokenize training texts: 15.248 s
Time it takes to go through 11740 items: 1.428 s
Maximum memory usage: 763.223 MiB


#### With filter + metadatas concatenation

In [None]:
# Non-streaming benchmark: row filtering + two metadata columns.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    val_ratio=None,
    batch_size=bs,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer)

Time it takes to process + tokenize training texts: 15.586 s
Time it takes to go through 11740 items: 1.639 s
Maximum memory usage: 772.113 MiB


#### With filter + metadatas concatenation + content transformation + content augmentation

In [None]:
# Non-streaming benchmark: filtering + metadatas + content transformations
# + content augmentations.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=bs,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer)

Time it takes to process + tokenize training texts: 35.383 s
Time it takes to go through 11740 items: 1.695 s
Maximum memory usage: 779.855 MiB


#### With filter + metadatas concatenation + content transformation + content augmentation + no shuffling

In [None]:
# Non-streaming benchmark: full preprocessing pipeline, training shuffle off.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=bs,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer, shuffle_trn=False)

Time it takes to process + tokenize training texts: 34.722 s
Time it takes to go through 11740 items: 1.708 s
Maximum memory usage: 780.215 MiB


#### With filter + metadatas concatenation + content transformation + content augmentation + higher batch size

In [None]:
# Non-streaming benchmark: full preprocessing pipeline, batch size tripled.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=bs * 3,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer)

Time it takes to process + tokenize training texts: 35.651 s
Time it takes to go through 35220 items: 5.246 s
Maximum memory usage: 774.805 MiB


#### With filter + metadatas concatenation + content transformation + content augmentation + iterate the whole dataset (1 epoch)

In [None]:
# Show the base batch size again for reference.
bs

1174

In [None]:
# Non-streaming benchmark: full preprocessing pipeline, iterating over the
# whole training split (n=None).
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=bs,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer, n=None)

Time it takes to process + tokenize training texts: 35.532 s
Time it takes to go through all items: 16.601 s
Maximum memory usage: 862.285 MiB


### With streaming

In [None]:
def benchmarking_streaming(tdc,tokenizer,n=10):
    """Time the process+tokenize step, then a pass over the streaming training split.

    Args:
        tdc: controller exposing `process_and_tokenize`, `main_ddict['train']`
            and `batch_size`.
        tokenizer: tokenizer forwarded to `tdc.process_and_tokenize`.
        n: number of batches' worth of items to iterate over; `None` iterates
            over the whole training split.

    Raises:
        ValueError: if `n` is negative (rejected by `itertools.islice`).
    """
    from itertools import islice

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    tdc.process_and_tokenize(tokenizer, line_by_line=True)
    processed = time.perf_counter()
    print(f'Time it takes to process + tokenize training texts: {(processed-start):.3f} s')

    train_items = tdc.main_ddict['train']
    if n is not None:
        # Pull exactly n*batch_size items. Breaking on an enumerate index would
        # fetch one extra item before the loop stops, and that extra fetch
        # would be included in the measured time.
        train_items = islice(train_items, n * tdc.batch_size)
    for _ in train_items:
        pass
    iterated = time.perf_counter()

    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {(iterated-processed):.3f} s')
    else:
        print(f'Time it takes to go through all items: {(iterated-processed):.3f} s')

def benchmarking_and_memory_usage_streaming(tdc,tokenizer,n=10):
    """Run `benchmarking_streaming` under `memory_usage` and print the largest sample."""
    # memory_usage takes the target as a (callable, positional-args) pair.
    target = (benchmarking_streaming, [tdc, tokenizer, n])
    peak_mib = max(memory_usage(target))
    print(f'Maximum memory usage: {peak_mib:.3f} MiB')


In [None]:
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Return the augmented form of `x` with probability `p`, else `x` itself.

    Args:
        x: a single text, or a list of texts.
        aug: object whose `augment(x)` returns a sequence of augmented texts.
        p: probability of keeping the augmented version; drawn independently
            for each element when `x` is a list.
    """
    augmented = aug.augment(x)
    if isinstance(x, list):
        # One independent draw per (augmented, original) pair.
        return [new if random.random() < p else old
                for new, old in zip(augmented, x)]
    if random.random() < p:
        return augmented[0]
    return x

# nlpaug `KeyboardAug` character augmenter, wrapped so that each text keeps its
# augmented version with probability 0.5.
aug = nac.KeyboardAug(aug_char_max=3, aug_char_p=0.1, aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic, aug=aug, p=0.5)

#### With filter

In [None]:
# Streaming benchmark: row filtering only.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataControllerStreaming(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    batch_size=bs,
    num_proc=4,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage_streaming(tdc, tokenizer)

Time it takes to process + tokenize training texts: 0.768 s
Time it takes to go through 11740 items: 4.768 s
Maximum memory usage: 678.414 MiB


#### With filter + metadatas concatenation

In [None]:
# Streaming benchmark: row filtering + two metadata columns.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataControllerStreaming(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    metadatas=['Title', 'Division Name'],
    batch_size=bs,
    num_proc=4,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage_streaming(tdc, tokenizer)

Time it takes to process + tokenize training texts: 0.802 s
Time it takes to go through 11740 items: 4.692 s
Maximum memory usage: 700.844 MiB


#### With filter + metadatas concatenation + content transformation + content augmentation

In [None]:
# Streaming benchmark: filtering + metadatas + content transformations
# + content augmentations.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataControllerStreaming(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    batch_size=bs,
    num_proc=4,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage_streaming(tdc, tokenizer)

Time it takes to process + tokenize training texts: 0.821 s
Time it takes to go through 11740 items: 12.967 s
Maximum memory usage: 724.578 MiB


#### With filter + metadatas concatenation + content transformation + content augmentation + higher batch size

In [None]:
# Streaming benchmark: full preprocessing pipeline, batch size tripled.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataControllerStreaming(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    batch_size=bs * 3,
    num_proc=4,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage_streaming(tdc, tokenizer)

Time it takes to process + tokenize training texts: 0.894 s
Time it takes to go through 35220 items: 38.426 s
Maximum memory usage: 861.527 MiB


#### With filter + metadatas concatenation + content transformation + content augmentation + iterate the whole dataset (1 epoch)

In [None]:
# Streaming benchmark: full preprocessing pipeline, iterating over the whole
# training split (n=None).
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=True,
)

tdc = TextDataControllerStreaming(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    batch_size=bs,
    num_proc=4,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage_streaming(tdc, tokenizer, n=None)

Time it takes to process + tokenize training texts: 0.966 s
Time it takes to go through all items: 120.615 s
Maximum memory usage: 747.059 MiB


### Test the effect of batch size and num_proc

In [None]:
def benchmarking(tdc,tokenizer,n=10,shuffle_trn=True):
    """Time the process+tokenize step, then a pass over the training split.

    Args:
        tdc: controller exposing `process_and_tokenize`, `main_ddict['train']`
            and `batch_size`.
        tokenizer: tokenizer forwarded to `tdc.process_and_tokenize`.
        n: number of batches' worth of items to iterate over; `None` iterates
            over the whole training split.
        shuffle_trn: forwarded to `tdc.process_and_tokenize`.

    Raises:
        ValueError: if `n` is negative (rejected by `itertools.islice`).
    """
    from itertools import islice

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    tdc.process_and_tokenize(tokenizer, max_length=512, shuffle_trn=shuffle_trn)
    processed = time.perf_counter()
    print(f'Time it takes to process + tokenize training texts: {(processed-start):.3f} s')

    train_items = tdc.main_ddict['train']
    if n is not None:
        # Pull exactly n*batch_size items. Breaking on an enumerate index would
        # fetch (and pay for) one extra item before the loop stops.
        train_items = islice(train_items, n * tdc.batch_size)
    for _ in train_items:
        pass
    iterated = time.perf_counter()

    if n is not None:
        print(f'Time it takes to go through {n*tdc.batch_size} items: {(iterated-processed):.3f} s')
    else:
        print(f'Time it takes to go through all items: {(iterated-processed):.3f} s')

#     print(f'Total time: {(time3-time1):.3f} s')
def benchmarking_and_memory_usage(tdc,tokenizer,n=10,shuffle_trn=True):
    """Run `benchmarking` under `memory_usage` and print the largest sample."""
    # memory_usage takes the target as a (callable, positional-args) pair.
    target = (benchmarking, [tdc, tokenizer, n, shuffle_trn])
    peak_mib = max(memory_usage(target))
    print(f'Maximum memory usage: {peak_mib:.3f} MiB')


def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Return the augmented form of `x` with probability `p`, else `x` itself.

    Args:
        x: a single text, or a list of texts.
        aug: object whose `augment(x)` returns a sequence of augmented texts.
        p: probability of keeping the augmented version; drawn independently
            for each element when `x` is a list.
    """
    augmented = aug.augment(x)
    if isinstance(x, list):
        # One independent draw per (augmented, original) pair.
        return [new if random.random() < p else old
                for new, old in zip(augmented, x)]
    if random.random() < p:
        return augmented[0]
    return x

# nlpaug `KeyboardAug` character augmenter, wrapped so that each text keeps its
# augmented version with probability 0.5.
aug = nac.KeyboardAug(aug_char_max=3, aug_char_p=0.1, aug_word_p=0.07)
nearby_aug_func = partial(nlp_aug_stochastic, aug=aug, p=0.5)

For a non-streaming dataset, text processing and tokenization are the most time-consuming tasks, so we check how different values of batch size and `num_proc` affect their running time.

In [None]:
# Full preprocessing pipeline over one epoch: batch_size=128, num_proc=4.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=128,
    num_proc=4,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer, n=None)

Time it takes to process + tokenize training texts: 37.585 s
Time it takes to go through all items: 16.613 s
Maximum memory usage: 872.035 MiB


In [None]:
# Full preprocessing pipeline over one epoch: batch_size=1000, num_proc=4.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=1000,
    num_proc=4,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer, n=None)

Time it takes to process + tokenize training texts: 36.803 s
Time it takes to go through all items: 16.858 s
Maximum memory usage: 888.293 MiB


In [None]:
# Full preprocessing pipeline over one epoch: batch_size=1200, num_proc=24.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=24 * 50,  # 1200
    num_proc=24,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer, n=None)

Time it takes to process + tokenize training texts: 36.704 s
Time it takes to go through all items: 17.148 s
Maximum memory usage: 893.254 MiB


In [None]:
# Full preprocessing pipeline over one epoch: batch_size=9600, num_proc=24.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=24 * 400,  # 9600
    num_proc=24,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer, n=None)

Time it takes to process + tokenize training texts: 37.287 s
Time it takes to go through all items: 17.501 s
Maximum memory usage: 914.801 MiB


## Improving processing time with caching

The worst processing time is recorded for the non-streaming training set with the following preprocessing: 2-column filtering, 2-column metadatas, 2 content transformations and 2 content augmentations.

With caching, we can significantly reduce the preprocessing time. That means you only need to run all the preprocessing once; all subsequent calls will take advantage of the cached result.

In [None]:
# Turn Hugging Face `datasets` caching back on for the runs below.
enable_caching()

In [None]:
# With caching enabled, run the full preprocessing + tokenization once.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=1000,
    num_proc=4,
    seed=42,
    verbose=False,
)
tdc.process_and_tokenize(tokenizer, max_length=512)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0aed8574c094e4fd_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-b781a4a73d06caf5_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0f85d6db4165d6ef_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Flattening the indices (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/113140 [00:00<?, ? examples/s]

In [None]:
# Same configuration again, benchmarked with caching enabled.
dset = load_dataset(
    'sample_data',
    data_files=['Womens_Clothing_Reviews.csv'] * 5,
    split='train',
    streaming=False,
)

tdc = TextDataController(
    dset,
    main_text='Review Text',
    label_names='Department Name',
    sup_types='classification',
    # Drop rows where either the text or the label is missing.
    filter_dict={
        'Review Text': lambda x: x is not None,
        'Department Name': lambda x: x is not None,
    },
    metadatas=['Title', 'Division Name'],
    content_transformations=[text_normalize, str.lower],
    content_augmentations=[nearby_aug_func, str.lower],
    val_ratio=None,
    batch_size=1000,
    num_proc=4,
    seed=42,
    verbose=False,
)
benchmarking_and_memory_usage(tdc, tokenizer, n=None)

Found cached dataset csv (/home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0aed8574c094e4fd_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-b781a4a73d06caf5_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0f85d6db4165d6ef_*_of_00004.arrow
Loading cached processed dataset at /home/quan/.cache/huggingface/datasets/csv/sample_data-b5f53892a1b938ad/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec

Time it takes to process + tokenize training texts: 0.979 s
Time it takes to go through all items: 16.824 s
Maximum memory usage: 823.531 MiB


With caching enabled, loading the processed data back from the cache takes only 0.979 s, instead of the 36.803 s needed to redo the whole preprocessing (same settings, caching disabled).

## Conclusion

| Process + Tokenize Time (117k records), batchsize = 1174 | Filter, train shuffling | And 2 metadatas | And 2 tfms + 2 augs | Batchsize=1174 x 3 | Processing 1 epoch (Batchsize=1174) |
|----------------------------------------------------------|-------------------------|-----------------|---------------------|--------------------|----------------|
| Non-streaming training                                   | 15.248                   | 15.586          | 35.383              | 35.651             | 35.532         |
| Streaming                                                | 0.768                    | 0.802           | 0.821               | 0.894              | 0.966          |
| Ratio Non-streaming/Streaming                            | 19.85                    | 19.43           | 43.1               | 39.88              | 36.78          |


| Run 10 batches time (1174*10) | Filter, train shuffling | And 2 metadatas | And 2 tfms + 2 augs | Total items iterated: 1174 * 10 * 3 | 1 epoch iterated (117430 items) |
|-------------------------------|-------------------------|-----------------|---------------------|---------------------------------|--------------------------------|
| Non-streaming training        | 1.428                    | 1.639           | 1.695               | 5.246                            | 16.601                         |
| Streaming                     | 4.768                    | 4.692           | 12.967              | 38.426                           | 120.615                        |
| Ratio Streaming/Non-Streaming | 3.34                     | 2.86            | 7.65                | 7.32                             | 7.27                           |


| Total Time (Process+Tokenize+Iterate) | Filter, train shuffling | And 2 metadatas | And 2 tfms + 2 augs | Total items iterated: 1174 * 10 * 3 | 1 epoch iterated: 117430 items |
|---------------------------------------|-------------------------|-----------------|---------------------|---------------------------------|--------------------------------|
| Non-streaming training                | 16.676                   | 17.225          | 37.078              | 40.897                           | 52.133                         |
| Streaming                             | 5.536                    | 5.494           | 13.788               | 39.32                           | 121.581                        |
| Ratio Non-streaming/Streaming         | 3.01                     | 3.14            | 2.69                 | 1.04                             | 0.43                           |


| Peak memory usage (MiB)             | Filter, train shuffling | And 2 metadatas | And 2 tfms + 2 augs | Total items iterated: 1174 * 10 * 3 | 1 epoch iterated: 117430 items |
|-------------------------------------|-------------------------|-----------------|---------------------|---------------------------------|--------------------------------|
| Non-streaming training              | 763                      | 772             | 780                 | 775                              | 862                            |
| Streaming                           | 678                      | 701             | 725                 | 862                              | 747                            |
| Ratio Streaming/Non-Streaming       | 0.89                     | 0.91            | 0.93                | 1.11                             | 0.87                           |


## Tips and tricks

- For non-streaming data, the best way to minimize processing and iteration time is:
    - Turn on dataset caching, and run the processing step once for it to be cached
- The more content transformations and augmentations you add, the slower processing and iteration become. This is especially true for streaming data
- For streaming, be aware of the pros and cons of batch-process and line-by-line process

