# Text Main Streaming

> This module contains the main Python class for data control for streaming data: `TextDataControllerStreaming`

- skip_showdoc: true
- skip_exec: true

In [None]:
#| default_exp text_main_streaming

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import DatasetDict,Dataset,IterableDataset,load_dataset,Value
from pathlib import Path
from that_nlp_library.utils import *
from that_nlp_library.text_main import tokenize_function,concat_metadatas
from functools import partial
import warnings

In [None]:
import pandas as pd
import numpy as np
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from importlib.machinery import SourceFileLoader
import os
import random

## Class TextDataControllerStreaming

In [None]:
#| export
class TextDataControllerStreaming():
    def __init__(self,
                 inp, # HuggingFace Dataset or DatasetDict
                 main_text:str, # Name of the main text column
                 label_names=[], # Names of the label (dependent variable) columns
                 sup_types=[], # Type of supervised learning for each label name ('classification' or 'regression')
                 class_names_predefined=[], # List of names associated with the labels (same index order)
                 filter_dict={}, # A dictionary: {feature: filtering_function_based_on_the_feature}
                 label_tfm_dict={}, # A dictionary: {label_name: transform_function_for_that_label}
                 metadatas=[], # Names of the metadata columns
                 process_metas=True, # Whether to do simple text processing on the chosen metadatas
                 content_transformations=[], # A list of text transformations
                 content_augmentations=[], # A list of text augmentations
                 seed=None, # Random seed
                 batch_size=100, # CPU batch size
                 num_proc=1, # Number of processes for multiprocessing. This will be applied on non-streamed validation set
                 cols_to_keep=None, # Columns to keep after all processings
                 verbose=True, # Whether to print processing information
                ):
        """Store the processing configuration and register the train (and optional validation) split.

        Raises `ValueError` when a dict-like `inp` has no "train" split or has more than
        one of the 'val'/'validation'/'valid' splits, and when a 'classification' label
        is declared without `class_names_predefined`. Raises `Exception` when the train
        split is not an `IterableDataset`.
        """
        # --- column configuration ---
        self.main_text = main_text
        self.label_names = val2iterable(label_names)
        self.sup_types = val2iterable(sup_types)
        self._check_sup_types()
        self.label_lists = class_names_predefined
        self.metadatas = val2iterable(metadatas)
        self.process_metas = process_metas
        self.cols_to_keep = cols_to_keep

        # --- row / label / text processing hooks ---
        self.filter_dict = filter_dict
        self.label_tfm_dict = label_tfm_dict
        self.content_tfms = val2iterable(content_transformations)
        self.aug_tfms = val2iterable(content_augmentations)

        # --- execution settings ---
        self.seed = seed
        self.batch_size = batch_size
        self.is_batched = batch_size>1
        self.num_proc = num_proc
        self.set_verbose(verbose)

        # --- split registration ---
        self.main_ddict = DatasetDict()
        if not hasattr(inp,'keys'): # a single dataset is taken as the train split
            self.main_ddict['train'] = inp
        else: # dict-like input: a train split is mandatory, a validation split is optional
            if 'train' not in inp.keys():
                raise ValueError('The given DatasetDict has no "train" split')
            self.main_ddict['train'] = inp['train']
            val_key = {'val','validation','valid'}.intersection(inp.keys())
            if len(val_key)>1: raise ValueError('Your DatasetDict has more than 1 validation split')
            if len(val_key)==1:
                # whichever alias was used, the split is stored under 'validation'
                self.main_ddict['validation'] = inp[next(iter(val_key))]

        train_split = self.main_ddict['train']
        if not isinstance(train_split,IterableDataset):
            raise Exception('This Text Data Controller only handles streamed dataset')

        self.all_cols = get_dset_col_names(train_split)

        # The train split is known to be streamed here, so class names must be supplied up front
        if ('classification' in self.sup_types) and len(self.label_lists)==0:
            raise ValueError('All classification labels must be provided when streaming')

        self._determine_multihead_multilabel()
        self._convert_regression_to_float()
        self._processed_call=False
        
            
    @classmethod
    def from_pickle(cls,
                    fname, # Name of the pickle file
                    parent='pickle_files' # Parent folder
                   ):
        """Return whatever `load_pickle` yields for `fname` under the `parent` folder.

        NOTE(review): presumably this unpickles an object written by `save_as_pickles`;
        if so, only load files from trusted sources -- confirm against `load_pickle`.
        """
        loaded = load_pickle(fname,parent=parent)
        return loaded
    
    def set_verbose(self,verbose):
        self.verbose = verbose
        self.verboseprint = print if verbose else lambda *a, **k: None
    
    def _convert_regression_to_float(self):
        if len(self.sup_types)==0: return
        # convert regression labels to float64
        reg_idxs = [i for i,v in enumerate(self.sup_types) if v=='regression']
        for i in reg_idxs:
            self.main_ddict['train'] = self.main_ddict['train'].cast_column(self.label_names[i],Value("float64"))
            if 'validation' in self.main_ddict.keys():
                self.main_ddict['validation'] = self.main_ddict['validation'].cast_column(self.label_names[i],Value("float64"))
        
    def _check_sup_types(self):
        assert len(self.label_names)==len(self.sup_types), "The number of supervised learning declaration must equal to the number of label"
        assert len(set(self.sup_types) - set(['classification','regression']))==0, 'Accepted inputs for `sup_types` are `classification` and `regression`'
        
    def _determine_multihead_multilabel(self):
        self.is_multilabel=False
        self.is_multihead=False
        if len(self.label_names)==0: return
        
        if len(self.label_names)>1:
            self.is_multihead=True
        # get label of first row
        first_label = next(iter(self.main_ddict['train']))[self.label_names[0]]
        if isinstance(first_label,(list,set,tuple)):
            # This is multi-label. Ignore self.label_names[1:]
            self.label_names = [self.label_names[0]]
            self.is_multihead=False
            self.is_multilabel=True
                     
    
    def save_as_pickles(self,
                        fname, # Name of the pickle file
                        parent='pickle_files', # Parent folder
                        drop_attributes=False # Whether to drop large-size attributes
                       ):
        """Hand this controller to `save_to_pickle` under `parent`/`fname`.

        With `drop_attributes=True` the attributes `main_ddict`, `ddict_rest` and
        `aug_tfms` are deleted from this very instance before saving, so the live object
        loses them as well.
        """
        if drop_attributes:
            for attr_name in ('main_ddict','ddict_rest','aug_tfms'):
                if hasattr(self, attr_name):
                    delattr(self, attr_name)
        save_to_pickle(self,fname,parent=parent)
    
    def _process_metadatas(self,dtrain):
        if len(self.metadatas):
            map_func = partial(concat_metadatas,
                               main_text=self.main_text,
                               metadatas=self.metadatas,
                               process_metas=self.process_metas,
                               is_batched=self.is_batched)
            dtrain = hf_map_dset(dtrain,map_func,self.is_batched,self.batch_size,self.num_proc)
        return dtrain
    
    def _do_label_transformation(self):
        if len(self.label_names)==0 or len(self.label_tfm_dict)==0: return
        print_msg('Label Transformation',20,verbose=self.verbose)
        for f,tfm in self.label_tfm_dict.items():
            if f in self.label_names:
                _func = partial(lambda_map_batch,
                                feature=f,
                                func=tfm,
                                is_batched=self.is_batched
                               )                
                self.main_ddict['train'] = hf_map_dset(self.main_ddict['train'],_func,self.is_batched,self.batch_size,self.num_proc)
                if 'validation' in self.main_ddict.keys():
                    self.main_ddict['validation'] = hf_map_dset(self.main_ddict['validation'],
                                                                _func,
                                                                self.is_batched,
                                                                self.batch_size,
                                                                self.num_proc)
        self.verboseprint('Done')
                    
                      
                
    def _create_label_mapping_func(self,encoder_classes):
        if self.is_multihead:
            label2idxs = [{v:i for i,v in enumerate(l_classes)} for l_classes in encoder_classes]
            _func = lambda inp: {'label': [[label2idxs[i][v] if len(label2idxs[i]) else v for i,v in enumerate(vs)] \
                                           for vs in zip(*[inp[l] for l in self.label_names])] if self.is_batched \
                                 else [label2idxs[i][v] if len(label2idxs[i]) else v for i,v in enumerate([inp[l] for l in self.label_names])]
                                }
            
        else: # single-head
            if self.sup_types[0]=='regression':
                _func1 = lambda x: x
            else:
                label2idx = {v:i for i,v in enumerate(encoder_classes[0])}
                _func1 = lambda x: label2idx[x]
                
            _func = partial(lambda_map_batch,
                           feature=self.label_names[0],
                           func=_func1,
                           output_feature='label',
                           is_batched=self.is_batched)
        return _func
        
    def _encode_labels(self):
        if len(self.label_names)==0: return
        print_msg('Label Encoding',verbose=self.verbose)
        
        if len(self.label_lists) and not isinstance(self.label_lists[0],list):
            self.label_lists = [self.label_lists]
                    
        encoder_classes=[]
        if not self.is_multilabel:
            for idx,l in enumerate(self.label_names):
                if self.sup_types[idx]=='regression':
                    l_classes=[]
                else: # classification
                    l_classes = sorted(list(self.label_lists[idx]))
                encoder_classes.append(l_classes)
                
            _func = self._create_label_mapping_func(encoder_classes)
            
            self.main_ddict['train'] = hf_map_dset(self.main_ddict['train'],_func,self.is_batched,self.batch_size,self.num_proc)
            if 'validation' in self.main_ddict.keys():
                self.main_ddict['validation'] = hf_map_dset(self.main_ddict['validation'],_func,self.is_batched,self.batch_size,self.num_proc)
                    
        else:
            # For MultiLabel, we transform the label itself to one-hot (or actually, few-hot)
            l_classes = sorted(list(self.label_lists[0]))   
            encoder_classes.append(l_classes)
            
            l_encoder = MultiLabelBinarizer(classes=encoder_classes[0])
            _ = l_encoder.fit(None)
            _func = partial(lambda_map_batch,
                            feature=self.label_names[0],
                            func=lambda x: l_encoder.transform(x),
                            output_feature='label',
                            is_batched=self.is_batched,
                            is_func_batched=True)
            self.main_ddict['train'] = hf_map_dset(self.main_ddict['train'],_func,self.is_batched,self.batch_size,self.num_proc)
            if 'validation' in self.main_ddict.keys():
                self.main_ddict['validation'] = hf_map_dset(self.main_ddict['validation'],_func,self.is_batched,self.batch_size,self.num_proc)
            
        self.label_lists = encoder_classes
        self.verboseprint('Done')
        
            
            
    def _simplify_ddict(self):
        """Remove from every registered split the original columns that are not in `cols_to_keep`.

        When `cols_to_keep` was not given it defaults to the main text column, the
        metadata columns and the label columns. Only columns listed in `all_cols` are
        candidates for removal.
        """
        print_msg('Dropping unused features',20,verbose=self.verbose)
        if self.cols_to_keep is None:
            self.cols_to_keep = [self.main_text] + self.metadatas + self.label_names
        cols_to_remove = set(self.all_cols).difference(self.cols_to_keep)

        splits = ['train']
        if 'validation' in self.main_ddict.keys():
            splits.append('validation')
        for split in splits:
            self.main_ddict[split] = self.main_ddict[split].remove_columns(list(cols_to_remove))
        self.verboseprint('Done')

    def _do_filtering(self,dtrain):
        if len(self.filter_dict):
            col_names = get_dset_col_names(dtrain)
            for f,tfm in self.filter_dict.items():
                if f in col_names:
                    _func = partial(lambda_batch,
                                    feature=f,
                                    func=tfm,
                                    is_batched=self.is_batched)
                    dtrain = hf_filter_dset(dtrain,_func,self.is_batched,self.batch_size,self.num_proc)
        return dtrain
        

    def _do_transformation_tokenization(self,dtrain,tokenizer,max_length,):
        """Apply the content transformations, in list order, to the main text column and
        then tokenize it; returns the mapped dataset."""
        tok_func = partial(tokenize_function,tok=tokenizer,max_length=max_length)
        map_args = (self.is_batched,self.batch_size,self.num_proc)

        # One map pass per transformation, each targeting the main text column
        for tfm in self.content_tfms:
            text_mapper = partial(lambda_map_batch,
                                  feature=self.main_text,
                                  func=tfm,
                                  is_batched=self.is_batched)
            dtrain = hf_map_dset(dtrain,text_mapper,*map_args)

        # Final pass: tokenization, with no single output feature named
        tok_mapper = partial(lambda_map_batch,
                             feature=self.main_text,
                             func=tok_func,
                             output_feature=None,
                             is_batched=self.is_batched)
        return hf_map_dset(dtrain,tok_mapper,*map_args)
 
    def _do_transformation_augmentation_tokenization(self,tokenizer,max_length):
        """Replace the train split with a generator-backed `IterableDataset`.

        The generator is `aug_and_tok_stream_generator`, fed the current train split, the
        main text column name, the tokenization function, and the content transformations
        followed by the augmentations (or `None` when there are neither). Random state is
        seeded with `self.seed` beforehand.
        """
        tok_func = partial(tokenize_function,tok=tokenizer,max_length=max_length)

        # transformations first, augmentations after
        text_pipeline = self.content_tfms + self.aug_tfms
        text_func = partial(func_all,functions=text_pipeline) if len(text_pipeline) else None

        seed_everything(self.seed)

        generator_kwargs = {'dset': self.main_ddict['train'],
                            'text_name':self.main_text,
                            'tok_func':tok_func,
                            'func': text_func
                           }
        self.main_ddict['train'] = IterableDataset.from_generator(aug_and_tok_stream_generator,
                                                                  gen_kwargs=generator_kwargs)
        
        
    def process_and_tokenize(self,
                             tokenizer, # Tokenizer (preferably from HuggingFace)
                             max_length=None, # pad to model's allowed max length (default is max_sequence_length)
                            ):
        if self._processed_call:
            warnings.warn('Your dataset has already been processed. Returning the previous processed DatasetDict...')
            return self.main_ddict
        
        self.tokenizer = tokenizer
        self.max_length = max_length
                             
        # Filtering
        print_msg('Data Filtering',20,verbose=self.verbose)
        for k in self.main_ddict.keys():   
            self.main_ddict[k] = self._do_filtering(self.main_ddict[k])
        self.verboseprint('Done')

        
        # Process metadatas
        print_msg('Metadata Simple Processing & Concatenating to Main Content',verbose=self.verbose)
        for k in self.main_ddict.keys():   
            self.main_ddict[k] = self._process_metadatas(self.main_ddict[k])
        self.verboseprint('Done')
        
        # Label transformation
        self._do_label_transformation()
        
        # Process labels
        self._encode_labels()

        # Dropping unused columns
        self._simplify_ddict()

        
        # Content transformation + tokenization for validation
        if 'validation' in self.main_ddict.keys():
            print_msg('Performing content transformation and tokenization on validation set',verbose=self.verbose)
            self.main_ddict['validation'] = self._do_transformation_tokenization(self.main_ddict['validation'],
                                                                                 tokenizer,
                                                                                 max_length,
                                                                                )
            self.verboseprint('Done')
 
        # Content transformation + augmentation + tokenization for train
        print_msg('Creating a generator for content transformation, augmentation and tokenization on train set',verbose=self.verbose)
        self._do_transformation_augmentation_tokenization(tokenizer,max_length)
        self.verboseprint('Done')
        
        self._processed_call=True
    
        
    
    def set_data_collator(self,data_collator):
        self.data_collator = data_collator
        
    
    def prepare_test_dataset_from_csv(self,
                                      file_path, # path to csv file
                                      do_filtering=False # whether to perform data filtering on this test set
                                     ):
        """Load the file at `file_path` with `load_dataset` and run it through `prepare_test_dataset`."""
        csv_path = Path(file_path)
        # The parent folder is given as the dataset location and the file name as its only data file
        folder, file_name = str(csv_path.parent), csv_path.name
        ds = load_dataset(folder,
                          data_files=file_name,
                          split='train')
        return self.prepare_test_dataset(ds,do_filtering)
    
    def prepare_test_dataset_from_df(self,
                                     df, # Pandas Dataframe
                                     validate=True, # whether to perform input data validation
                                     do_filtering=False # whether to perform data filtering on this test set 
                                    ):
        """Convert `df` to a HuggingFace `Dataset` and run it through `prepare_test_dataset`.

        `check_input_validation` is called on `df` first unless `validate` is falsy.
        """
        if validate: check_input_validation(df)
        test_dset = Dataset.from_pandas(df)
        return self.prepare_test_dataset(test_dset,do_filtering)
    
    def prepare_test_dataset_from_raws(self,
                                       content, # Either a single sentence, list of sentence or a dictionary with keys are metadata columns and values are list
                                      ):
        if len(self.metadatas)!=0 and not isinstance(content,dict):
            raise ValueError(f'There is/are metadatas in the preprocessing step. Please include a dictionary including these keys for metadatas: {self.metadatas}, and texture content: {self.main_text}')
            
        _dic = {self.main_text:[content]} if isinstance(content,str) else content
        for k in _dic.keys():
            _dic[k] = val2iterable(_dic[k])
        
        test_dict = Dataset.from_dict(_dic)
        
        # set num_proc to 1 for small data processing
        _tmp = self.num_proc
        self.num_proc=1
        results = self.prepare_test_dataset(test_dict,do_filtering=False)
        self.num_proc = _tmp
        return results
    
    def prepare_test_dataset(self,
                             test_dset, # The HuggingFace Dataset as Test set
                             do_filtering=False, # whether to perform data filtering on this test set
                            ):
        """Put a test dataset through the non-label part of the pipeline and return it.

        Steps: optional filtering, metadata processing, dropping non-label columns that
        are not in `cols_to_keep`, then content transformation and tokenization with the
        tokenizer and max length stored by `process_and_tokenize`. Label columns are not
        required to be present.

        Raises `ValueError` when a non-label column from `cols_to_keep` is missing.
        """
        label_cols = set(self.label_names)
        # Labels are excluded from both sides of the check: a test set may come unlabeled
        feature_cols = set(get_dset_col_names(test_dset)) - label_cols
        required_cols = set(self.cols_to_keep) - label_cols
        missing_cols = required_cols - feature_cols
        if len(missing_cols):
            raise ValueError(f'Test set does not have these columns required for preprocessings: {missing_cols}')

        print_msg('Start Test Set Transformation',20,verbose=self.verbose)

        # Filtering
        if do_filtering:
            print_msg('Data Filtering',20,verbose=self.verbose)
            test_dset = self._do_filtering(test_dset)
            self.verboseprint('Done')

        # Process metadatas
        print_msg('Metadata Simple Processing & Concatenating to Main Content',verbose=self.verbose)
        test_dset = self._process_metadatas(test_dset)
        self.verboseprint('Done')

        # Drop unused columns
        print_msg('Dropping unused features',20,verbose=self.verbose)
        unused_cols = feature_cols - set(self.cols_to_keep)
        test_dset = test_dset.remove_columns(list(unused_cols))
        self.verboseprint('Done')

        # Content transformation and tokenization
        print_msg('Performing content transformation and tokenization on test set',verbose=self.verbose)
        test_dset = self._do_transformation_tokenization(test_dset,self.tokenizer,self.max_length)
        self.verboseprint('Done')

        return test_dset


In [None]:
show_doc(TextDataControllerStreaming)

---

### TextDataControllerStreaming

>      TextDataControllerStreaming (inp, main_text:str, label_names=[],
>                                   sup_types=[], class_names_predefined=[],
>                                   filter_dict={}, label_tfm_dict={},
>                                   metadatas=[], process_metas=True,
>                                   content_transformations=[],
>                                   content_augmentations=[], seed=None,
>                                   batch_size=100, num_proc=1,
>                                   cols_to_keep=None, verbose=True)

Initialize self.  See help(type(self)) for accurate signature.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| inp |  |  | HuggingFace Dataset or DatasetDict |
| main_text | str |  | Name of the main text column |
| label_names | list | [] | Names of the label (dependent variable) columns |
| sup_types | list | [] | Type of supervised learning for each label name ('classification' or 'regression') |
| class_names_predefined | list | [] | List of names associated with the labels (same index order) |
| filter_dict | dict | {} | A dictionary: {feature: filtering_function_based_on_the_feature} |
| label_tfm_dict | dict | {} | A dictionary: {label_name: transform_function_for_that_label} |
| metadatas | list | [] | Names of the metadata columns |
| process_metas | bool | True | Whether to do simple text processing on the chosen metadatas |
| content_transformations | list | [] | A list of text transformations |
| content_augmentations | list | [] | A list of text augmentations |
| seed | NoneType | None | Random seed |
| batch_size | int | 100 | CPU batch size |
| num_proc | int | 1 | Number of processes for multiprocessing. This will be applied on non-streamed validation set |
| cols_to_keep | NoneType | None | Columns to keep after all processings |
| verbose | bool | True | Whether to print processing information |

In [None]:
show_doc(TextDataControllerStreaming.process_and_tokenize)

---

### TextDataControllerStreaming.process_and_tokenize

>      TextDataControllerStreaming.process_and_tokenize (tokenizer,
>                                                        max_length=None)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| tokenizer |  |  | Tokenizer (preferably from HuggingFace) |
| max_length | NoneType | None | pad to model's allowed max length (default is max_sequence_length) |

## Streaming Capability

The majority of streaming capability of `TextDataControllerStreaming` is adapted from [HuggingFace's stream](https://huggingface.co/docs/datasets/stream)

Streaming lets you work with a dataset without storing it on your hard drive. This is especially helpful when the dataset size exceeds the amount of disk space available on your machine.

Here are a few things to be aware of when using `TextDataControllerStreaming` streaming functionality (versus `TextDataController`)

- The list of label names must be available beforehand (except for regression label)
- To avoid out-of-memory error, reduce batch_size argument.
- There will NOT be any validation split functionality. If you want to include a validation set, provide a `validation` split in your HuggingFace DatasetDict beforehand
- There is no upsampling, and the training set is not shuffled
	

**To stream, you must provide a streamed HuggingFace dataset.**

Let's repeat a few examples mentioned in [this tutorial](https://anhquan0412.github.io/that-nlp-library/text_main.html), but with a streaming dataset

In [None]:
from transformers import RobertaTokenizer

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

### Filtering + Metadatas + Label Transformation +  Content Transformation + Content Augmentation (for Single Head)

In [None]:
from underthesea import text_normalize
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw


In [None]:
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Augment `x` with probability `p`, using `aug.augment`.

    `x` is either a single item or a list of items. For a list, each item is
    independently selected with probability `p`; the selected items are augmented in a
    single `aug.augment` call and written back to their original positions, so the
    output has the same length and order as the input (and stays aligned with any
    labels that travel alongside it). The input list is not modified.

    Assumes `aug.augment` returns a list with one result per input, in input order
    -- TODO confirm for the augmenter in use.
    """
    if not isinstance(x,list): 
        if random.random()<p: return aug.augment(x)[0]
        return x
    # One random draw per item, in order
    chosen = [i for i in range(len(x)) if random.random()<p]
    results = list(x)
    # only perform augmentation when needed
    if chosen:
        augmented = aug.augment([x[i] for i in chosen])
        for i,new_item in zip(chosen,augmented):
            results[i] = new_item
    return results

In [None]:
# Word-level augmenter from nlpaug configured with the 'roberta-base' model and the "substitute" action
aug2 = naw.ContextualWordEmbsAug(model_path='roberta-base', 
                                device='cuda:0', # if you don't have gpu, change to 'cpu'
                                action="substitute",
                                top_k=10,
                               aug_p=0.07)

# Bind the augmenter and p=0.5 so the result can be called with the text alone
contextual_aug_func = partial(nlp_aug_stochastic,aug=aug2,p=0.5)

In [None]:
# Load the sample CSV, hold out 10% of it, and expose that hold-out as the 'validation' split
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1)
ddict_with_val['validation'] = ddict_with_val['test']
# Only the train split is converted to an iterable dataset; validation stays a regular Dataset
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

In [None]:
# Single-head classification on 'Department Name'.
# The class names are passed explicitly; the filters drop rows whose text or label is None,
# and the label transformation renames 'Trend' to 'Trending' so it matches the predefined classes.
tdc = TextDataControllerStreaming(ddict_with_val,
                                  main_text='Review Text',
                                  label_names='Department Name',
                                  sup_types='classification',
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trending'],
                                  filter_dict={'Review Text': lambda x: x is not None,
                                              'Department Name': lambda x: x is not None,
                                              },
                                  label_tfm_dict={'Department Name': lambda x: x if x!='Trend' else 'Trending'},
                                  metadatas=['Title','Division Name'],
                                  content_transformations=[text_normalize,str.lower],
                                  content_augmentations= contextual_aug_func, 
                                  process_metas=True,
                                  batch_size=1000,
                                  num_proc=4,
                                  seed=42
                                 )
# Runs the whole pipeline; the processed splits are then available as `tdc.main_ddict`
tdc.process_and_tokenize(tokenizer,max_length=512)

-------------------- Data Filtering --------------------
Done
----- Metadata Simple Processing & Concatenating to Main Content -----


Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

Done
-------------------- Label Transformation --------------------


Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

Done
----- Label Encoding -----


Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

Done
-------------------- Dropping unused features --------------------
Done
----- Performing content transformation and tokenization on validation set -----


Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2253 [00:00<?, ? examples/s]

Done
----- Creating a generator for content transformation, augmentation and tokenization on train set -----
Done


In [None]:
tdc.main_ddict

DatasetDict({
    train: <datasets.iterable_dataset.IterableDataset object>
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2253
    })
})

In [None]:
%%time
# Iterate over roughly the first 1000 examples of the train split, printing progress every 100,
# so the cell timing reflects the cost of producing examples from the generator
for i,v in enumerate(tdc.main_ddict['train']):
    if i%100==0:
        print(i)
    if i==1000:
        break
    pass
    

0
100
200
300
400
500
600
700
800
900
1000
CPU times: user 21.6 s, sys: 900 ms, total: 22.5 s
Wall time: 22.5 s


In [None]:
# Show the first 10 train examples: processed text, original label, and its encoded 'label' value
for i,v in enumerate(tdc.main_ddict['train']):
    if i==10:break
    print(f"Text: {v['Review Text']}\nLabel: {v['Department Name']} => {v['label']}")
    print('-'*10)

Text: general petite . beautiful top , worth the necessary tailoring . the beautiful bold print drew me to this top and it did not disappoint upon receipt . however , the bottom ruffle belled so far out on each side that it was laughable ! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back . however , the fabric is beautiful , the fit was perfect ( size 2 , 5 ' 4 " , 106 lbs . ) , the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si
Label: Tops => 4
----------
Text: general . not as short on me ( petite ) . i ordered the xxs p as this dress is not a fitted dress , and that was the right size for me . only thing is the length is a bit linger still 9 lower on calf for me ) , the straps are almost tight , so i would say the dress is a reversed taper shape . color is beautiful , i ordered green as the other color ( plum ) doesn't have pet

In [None]:
pd.Series(tdc.main_ddict['validation']['Department Name']).value_counts()

Tops        956
Dresses     627
Bottoms     375
Intimate    187
Jackets      97
Trending     11
Name: count, dtype: int64

In [None]:
# Show the first 5 validation examples: processed text, original label, and its encoded 'label' value
for i in range(5):
    print(f"Text: {tdc.main_ddict['validation']['Review Text'][i]}")
    print(f"Label: {tdc.main_ddict['validation']['Department Name'][i]} => {tdc.main_ddict['validation']['label'][i]}")
    print('-'*10)

Text: general . soft , feminine and fun pockets ! . i love this tunic . purchased the dark orange in medium ( i am 5 ' 9 and 140 lbs ) . tried the small and almost kept it but i felt seams around my arm pits a tad , so went with the medium and glad i did - this top should be comfortable . feels very fall and perfect for casual get-togethers and running around town . only comment is that it is rayon ... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs .
Label: Tops => 4
----------
Text: general petite . a new staple ! . tried these on out of sheer curiosity -- i've got a long torso & was pleasantly surprised how flattering they are ! they manage to look flowing & sleek without shortening the legs . took a size 6 with my 27 " waist , 37 " hips . it's a bit of a generous fit , especially around the waist , but they're extremely comfortable & have room to tuck tops into . i have the cowled sweater tank in gray & it looks fantastic over these ! couldn't res

### Filtering + Metadatas + Label Transformation +  Content Transformation + Content Augmentation (for Multi Head: Classification + Regression + Classification)

In [None]:
# Same augmenter setup as in the single-head example: nlpaug word augmenter with 'roberta-base' and "substitute"
aug2 = naw.ContextualWordEmbsAug(model_path='roberta-base', 
                                device='cuda:0', # if you don't have gpu, change to 'cpu'
                                action="substitute",
                                top_k=10,
                               aug_p=0.07)

# Bind the augmenter and p=0.5 so the result can be called with the text alone
contextual_aug_func = partial(nlp_aug_stochastic,aug=aug2,p=0.5)

In [None]:
# Load the reviews CSV and hold out 10% of it
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.1)
# Rename the held-out 'test' split to 'validation'
ddict_with_val['validation'] = ddict_with_val.pop('test')
# Only the training split is streamed; validation stays a regular Dataset
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()

In [None]:
# Multi-head setup: `label_names`, `sup_types` and `class_names_predefined` are parallel lists,
# one entry per head (classification, regression, classification).
# NOTE(review): 'Initmates' appears to be the dataset's own spelling (it shows up as
# "initmates" in the processed texts) -- confirm against the CSV before "fixing" it.
tdc = TextDataControllerStreaming(ddict_with_val,
                                  main_text='Review Text',
                                  label_names=['Division Name','Rating','Department Name'],
                                  sup_types=['classification','regression','classification'],
                                  class_names_predefined=[['General', 'General Petite', 'Initmates'],
                                                          [], # empty list for regression
                                                          ['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trending']],
                                  # keep only rows where both the text and the department are present
                                  filter_dict={'Review Text': lambda x: x is not None,
                                               'Department Name': lambda x: x is not None,
                                              },
                                  metadatas=['Title'],
                                  # rename the raw 'Trend' department to 'Trending' to match the predefined class names
                                  label_tfm_dict={'Department Name': lambda x: x if x!='Trend' else 'Trending'},
                                  content_transformations=[text_normalize,str.lower],
                                  content_augmentations=contextual_aug_func,
                                  process_metas=True,
                                  batch_size=1000,
                                  num_proc=4,
                                  seed=42
                                 )
# Run the processing pipeline and tokenize (`tokenizer` is defined earlier in the notebook)
tdc.process_and_tokenize(tokenizer,max_length=512)

Casting the dataset:   0%|          | 0/2349 [00:00<?, ? examples/s]

-------------------- Data Filtering --------------------


Filter:   0%|          | 0/2349 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2253 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----


Map:   0%|          | 0/2253 [00:00<?, ? examples/s]

Done
-------------------- Label Transformation --------------------


Map:   0%|          | 0/2253 [00:00<?, ? examples/s]

Done
----- Label Encoding -----


Map:   0%|          | 0/2253 [00:00<?, ? examples/s]

Done
-------------------- Dropping unused features --------------------
Done
----- Performing content transformation and tokenization on validation set -----


Map:   0%|          | 0/2253 [00:00<?, ? examples/s]

Map:   0%|          | 0/2253 [00:00<?, ? examples/s]

Map:   0%|          | 0/2253 [00:00<?, ? examples/s]

Done
----- Creating a generator for content transformation, augmentation and tokenization on train set -----
Done


In [None]:
# Peek at the first ten streamed training samples: text, raw labels and encoded label
for idx,sample in enumerate(tdc.main_ddict['train']):
    if idx>=10:
        break
    raw_labels = (sample['Division Name'],sample['Rating'],sample['Department Name'])
    print(f"Text: {sample['Review Text']}\nLabel: {raw_labels} => {sample['label']}")
    print('-'*10)

Text: beautiful top , worth the necessary tailoring . the beautiful bold print drew me to this top and it did not disappoint upon receipt . however , the bottom ruffle belled so far out on each side that it was laughable ! the actual fit is nothing like the picture ; clearly the model's arms are placed in front of all the extra fabric to hold the ruffle back . however , the fabric is beautiful , the fit was perfect ( size 2 , 5 ' 4 " , 106 lbs . ) , the quality is great and i love the print so i decided to take it to my tailor to " sew away " the " wings " on both si
Label: ('General Petite', 4.0, 'Tops') => [1, 4.0, 4]
----------
Text: not as short on me ( petite ). i ordered the big p, this dress is also a fitted dress, and that was the right size on me. main thing is the skirt is a bit linger still 9 lower on calf for me ), the straps are almost tight, so i would say the dress is a reversed taper shape. color is bright, i ordered green as the other color ( plum ) doesn't have petite

In [None]:
# Preview the first five validation rows: text, the three raw labels and the encoded label list
val_split = tdc.main_ddict['validation']
val_texts = val_split['Review Text']
val_divisions = val_split['Division Name']
val_ratings = val_split['Rating']
val_departments = val_split['Department Name']
val_labels = val_split['label']
for idx in range(5):
    print(f"Text: {val_texts[idx]}")
    print(f"Label: {val_divisions[idx],val_ratings[idx],val_departments[idx]} => {val_labels[idx]}")
    print('-'*10)

Text: soft , feminine and fun pockets ! . i love this tunic . purchased the dark orange in medium ( i am 5 ' 9 and 140 lbs ) . tried the small and almost kept it but i felt seams around my arm pits a tad , so went with the medium and glad i did - this top should be comfortable . feels very fall and perfect for casual get-togethers and running around town . only comment is that it is rayon ... and for me anyway rayon doesn't wash too well - so we shall see how this one fairs .
Label: ('General', 5.0, 'Tops') => [0.0, 5.0, 4.0]
----------
Text: a new staple ! . tried these on out of sheer curiosity -- i've got a long torso & was pleasantly surprised how flattering they are ! they manage to look flowing & sleek without shortening the legs . took a size 6 with my 27 " waist , 37 " hips . it's a bit of a generous fit , especially around the waist , but they're extremely comfortable & have room to tuck tops into . i have the cowled sweater tank in gray & it looks fantastic over these ! could

### Filtering + Metadatas + Content Transformation + Content Augmentation (for Multi Label)

In [None]:
aug2 = naw.ContextualWordEmbsAug(model_path='roberta-base', 
                                device='cuda:0', # if you don't have gpu, change to 'cpu'
                                action="substitute",
                                top_k=10,
                               aug_p=0.07)

contextual_aug_func = partial(nlp_aug_stochastic,aug=aug2,p=0.5)

In [None]:
# Build a synthetic multi-label column for the multi-label demo.
df = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
# Candidate labels are the non-missing departments. Dropping NaN explicitly avoids
# relying on NaN being the last value returned by `unique()` (which is ordered by
# first appearance), and computing the array once avoids redoing it for every row.
departments = df['Department Name'].dropna().unique()
# Each row gets between 2 and 5 distinct departments, drawn at random
df['Fake Label'] = [np.random.choice(departments,size=np.random.randint(2,6),replace=False) for _ in range(len(df))]

In [None]:
dset = Dataset.from_pandas(df)

In [None]:
# Hold out 10% of the data and expose it under the 'validation' key
ddict_with_val = dset.train_test_split(test_size=0.1)
ddict_with_val['validation'] = ddict_with_val.pop('test')
# Only the training split is streamed; validation stays a regular Dataset
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()

In [None]:
ddict_with_val

DatasetDict({
    train: <datasets.iterable_dataset.IterableDataset object>
    validation: Dataset({
        features: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name', 'Fake Label'],
        num_rows: 2349
    })
})

In [None]:
# Multi-label setup: each 'Fake Label' value is a collection of departments, and the
# resulting `label` is one 0/1 flag per predefined class name (see the printed samples below).
tdc = TextDataControllerStreaming(ddict_with_val,
                                  main_text='Review Text',
                                  label_names='Fake Label',
                                  sup_types='classification',
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                  # keep only rows that actually have a review text
                                  filter_dict={'Review Text': lambda x: x is not None},
                                  metadatas=['Title','Division Name'],
                                  content_transformations=[text_normalize,str.lower],
                                  content_augmentations= contextual_aug_func, 
                                  process_metas=True,
                                  batch_size=1000,
                                  num_proc=4,
                                  seed=42
                                 )
# Run the processing pipeline and tokenize (`tokenizer` is defined earlier in the notebook)
tdc.process_and_tokenize(tokenizer,max_length=512)

-------------------- Data Filtering --------------------


Filter (num_proc=4):   0%|          | 0/2349 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----


Map (num_proc=4):   0%|          | 0/2276 [00:00<?, ? examples/s]

Done
----- Label Encoding -----


Map (num_proc=4):   0%|          | 0/2276 [00:00<?, ? examples/s]

Done
-------------------- Dropping unused features --------------------
Done
----- Performing content transformation and tokenization on validation set -----


Map (num_proc=4):   0%|          | 0/2276 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2276 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2276 [00:00<?, ? examples/s]

Done
----- Creating a generator for content transformation, augmentation and tokenization on train set -----
Done


In [None]:
# Peek at the first ten streamed training samples: text, raw label set and encoded label
for idx,sample in enumerate(tdc.main_ddict['train']):
    if idx>=10:
        break
    print(f"Text: {sample['Review Text']}\nLabel: {sample['Fake Label']} => {sample['label']}")
    print('-'*10)

Text: initmates . so cute , but not for me . this suit is so adorable ! i love the retro style and the two patterns and i really wish it worked for me . the suit is comfy , seems well made and fit nicely . but , although it fit technically , i don't like the way it cuts me off at the thigh . it's not flattering for me at all . the suit is kind of a neoprene material , so it's thicker than a normal bathing suit , so i think even though it isn't lined , it might not be see-through , but i didn't get it wet to test it out . in terms of fit , i think it
Label: ['Jackets', 'Tops', 'Intimate'] => [0 0 1 1 1 0]
----------
Text: general. beautiful, stunning, cozy top!. i read the first article on this and wanted both a small and a medium as i thought this top run small! i have to strongly disagree with the reviewer! i find that this top runs true to size or even generous! the sky color is so pretty and this top can be dressed up with some white heels and a necklace or it can be comfy casual! i

In [None]:
# Preview the first five validation rows: text, raw label set and encoded label
val_split = tdc.main_ddict['validation']
val_texts = val_split['Review Text']
val_fake_labels = val_split['Fake Label']
val_labels = val_split['label']
for idx in range(5):
    print(f"Text: {val_texts[idx]}")
    print(f"Label: {val_fake_labels[idx]} => {val_labels[idx]}")
    print('-'*10)

Text: general petite . . this top has great detailing and color . does run a little big , but adds to the style and movement of the tank . the stitching around the bottom makes it cute for layering .
Label: ['Dresses', 'Intimate', 'Trend', 'Tops', 'Bottoms'] => [1, 1, 1, 0, 1, 1]
----------
Text: general . . i love this top . i got it on sale and am so glad that i did . it is a short too but still super flattering . it isn't too boxy on me .
Label: ['Intimate', 'Trend', 'Jackets', 'Dresses', 'Tops'] => [0, 1, 1, 1, 1, 1]
----------
Text: general . beautiful idea ... . i ordered my normal size in this dress . i am 6 foot tall , but the regular sizes were too large and too long ( mid-calf ) . i returned the dress for a size smaller in petite for a more flattering hemline . the dress is lovely , especially on the models in the pictures , but didn't quite work out for me . also , it feels like there are hundreds of closure hooks that make putting on / taking off the dress seem to take an u

In [None]:
tdc.label_lists

[['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']]

## Save and Load TextDataControllerStreaming

In [None]:
show_doc(TextDataControllerStreaming.save_as_pickles)

---

### TextDataControllerStreaming.save_as_pickles

>      TextDataControllerStreaming.save_as_pickles (fname,
>                                                   parent='pickle_files',
>                                                   drop_attributes=False)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |
| drop_attributes | bool | False | Whether to drop large-size attributes |

In [None]:
show_doc(TextDataControllerStreaming.from_pickle)

---

### TextDataControllerStreaming.from_pickle

>      TextDataControllerStreaming.from_pickle (fname, parent='pickle_files')

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |

A `TextDataControllerStreaming` object can be saved and loaded with ease. This is especially useful after text processing and/or tokenization has been done.

In [None]:
from datasets import disable_caching

In [None]:
disable_caching() # disable huggingface caching to see data size

In [None]:
from underthesea import text_normalize
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

In [None]:
def nlp_aug_stochastic(x,aug=None,p=0.5):
    """Augment each text with probability `p`, preserving the input order.

    x: a single text, or a list of texts (a batch).
    aug: augmenter exposing `augment(data)` that returns a list of augmented texts.
         It is required in practice despite the `None` default.
    p: probability that any given text is augmented.

    Returns a single text when `x` is not a list; otherwise a new list with the same
    length and order as `x`, where the selected texts are replaced by their augmented
    versions.
    """
    if not isinstance(x,list): 
        if random.random()<p: return aug.augment(x)[0]
        return x
    # One random draw per text, in order; remember which positions were selected
    picked = [idx for idx in range(len(x)) if random.random()<p]
    results = list(x)
    # only perform augmentation when needed, in a single batched call
    if picked:
        augmented = aug.augment([x[idx] for idx in picked])
        # Write each augmented text back to its original slot, so the batch stays
        # aligned with the labels/metadata of the same rows
        for idx,new_text in zip(picked,augmented):
            results[idx] = new_text
    return results

In [None]:
# Word-level contextual augmenter from nlpaug, backed by the `roberta-base` model
# and configured to substitute words.
aug2 = naw.ContextualWordEmbsAug(model_path='roberta-base', 
                                device='cuda:0', # if you don't have gpu, change to 'cpu'
                                action="substitute",
                                top_k=10,
                               aug_p=0.07)

# Bind the augmenter into `nlp_aug_stochastic`; with p=0.1 each text is augmented
# with probability 0.1.
contextual_aug_func = partial(nlp_aug_stochastic,aug=aug2,p=0.1)

In [None]:
# Load the reviews CSV, hold out 20% as validation and stream the training split
dset = load_dataset('sample_data',data_files=['Womens_Clothing_Reviews.csv'],split='train')
ddict_with_val = dset.train_test_split(test_size=0.2)
ddict_with_val['validation'] = ddict_with_val['test']
ddict_with_val['train'] = ddict_with_val['train'].to_iterable_dataset()
del ddict_with_val['test']

# Single-head classification on 'Department Name'; this is the controller that gets
# pickled and reloaded in the cells below.
tdc = TextDataControllerStreaming(ddict_with_val,
                                  main_text='Review Text',
                                  label_names='Department Name',
                                  sup_types='classification',
                                  class_names_predefined=['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
                                  # keep only rows where both the text and the label are present
                                  filter_dict={'Review Text': lambda x: x is not None,
                                               'Department Name': lambda x: x is not None,
                                              },
                                  metadatas=['Title','Division Name'],
                                  content_transformations=[text_normalize,str.lower],
                                  content_augmentations= contextual_aug_func,
                                  process_metas=True,
                                  batch_size=100,
                                  num_proc=4,
                                  seed=42
                                 )
# Run the processing pipeline and tokenize (`tokenizer` is defined earlier in the notebook)
tdc.process_and_tokenize(tokenizer,max_length=512)

-------------------- Data Filtering --------------------


Filter (num_proc=4):   0%|          | 0/4698 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/4530 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----


Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Done
----- Label Encoding -----


Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Done
-------------------- Dropping unused features --------------------
Done
----- Performing content transformation and tokenization on validation set -----


Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4529 [00:00<?, ? examples/s]

Done
----- Creating a generator for content transformation, augmentation and tokenization on train set -----
Done


In [None]:
tdc.main_ddict

DatasetDict({
    train: <datasets.iterable_dataset.IterableDataset object>
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4529
    })
})

In [None]:
tdc.save_as_pickles('my_tdc_stream')

Let's check the file size

In [None]:
# Report the on-disk size of the full pickle
file_stats = Path('pickle_files/my_tdc_stream.pkl').stat()
size_in_mb = file_stats.st_size / (1024 * 1024)
print(f'File Size in MegaBytes is {round(size_in_mb, 3)}')

File Size in MegaBytes is 479.023


Load back our object

In [None]:
tdc2 = TextDataControllerStreaming.from_pickle('my_tdc_stream')

You can still access all its attributes, data, preprocessings, transformation/augmentation ...

In [None]:
tdc2.main_ddict

DatasetDict({
    train: <datasets.iterable_dataset.IterableDataset object>
    validation: Dataset({
        features: ['Title', 'Review Text', 'Division Name', 'Department Name', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4529
    })
})

In [None]:
# The reloaded controller can still stream processed training samples
for idx,sample in enumerate(tdc2.main_ddict['train']):
    if idx>=3:
        break
    print(f"Text: {sample['Review Text']}\nLabel: {sample['Department Name']} => {sample['label']}")
    print('-'*10)

Text: general . eye spy a great vest . i purchased this in my usual small ( size 4-6 ) and it fits just the way it shows in the photo . it is very flowy which is th point . i wore it over a black romper and it looked great . i can also wear with jeans and a simple black tank . keep the vest open or use the hook and eye and close it up . seeing the black through the sheer white is simply dreamy ! glad i purchased it during the sale on sale promotion . it is a classic piece for sure .
Label: Jackets => 3
----------
Text: general petite.. i love this soft, colorful, flowy beauty! it's the new color palette! i'm 5'5 ", 34 d, size 6 and a small fit me with room to spare. don't wait!
Label: Dresses => 1
----------
Text: general petite . cool top . impecable workmanship ( overseas ) . i usually wear a petite 2 but ordered this in a regular size 0 and glad that i did . since it curves up on this side , it barely overs the waistband on my jeans . ordered this in pink but it's more of pale coral

In [None]:
tdc2.label_lists

[['Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']]

In [None]:
tdc2.filter_dict,tdc2.content_tfms,tdc2.aug_tfms

({'Review Text': <function __main__.<lambda>(x)>,
  'Department Name': <function __main__.<lambda>(x)>},
 [<function underthesea.pipeline.text_normalize.text_normalize(text, tokenizer='underthesea')>,
  <method 'lower' of 'str' objects>],
 [functools.partial(<function nlp_aug_stochastic>, aug=<nlpaug.augmenter.word.context_word_embs.ContextualWordEmbsAug object>, p=0.1)])

If you don't want to store the HuggingFace DatasetDict or the augmentation functions in your `TextDataControllerStreaming` (typically when you already have a trained model and only use `TextDataControllerStreaming` to preprocess the test set), you can drop them in the `save_as_pickles` step by setting `drop_attributes=True`

In [None]:
tdc.save_as_pickles('my_lightweight_tdc_stream',drop_attributes=True)

Let's check the file size

In [None]:
# Report the on-disk size of the lightweight pickle
file_stats = Path('pickle_files/my_lightweight_tdc_stream.pkl').stat()
size_in_mb = file_stats.st_size / (1024 * 1024)
print(f'File Size in MegaBytes is {round(size_in_mb, 3)}')

File Size in MegaBytes is 1.911


Load it back

In [None]:
tdc3 = TextDataControllerStreaming.from_pickle('my_lightweight_tdc_stream')

We will use this object to demonstrate the Test Set Construction in the next section

### Construct a Test Dataset

In [None]:
show_doc(TextDataControllerStreaming.prepare_test_dataset)

---

### TextDataControllerStreaming.prepare_test_dataset

>      TextDataControllerStreaming.prepare_test_dataset (test_dset,
>                                                        do_filtering=False)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| test_dset |  |  | The HuggingFace Dataset as Test set |
| do_filtering | bool | False | whether to perform data filtering on this test set |

In [None]:
show_doc(TextDataControllerStreaming.prepare_test_dataset_from_csv)

---

### TextDataControllerStreaming.prepare_test_dataset_from_csv

>      TextDataControllerStreaming.prepare_test_dataset_from_csv (file_path,
>                                                                 do_filtering=F
>                                                                 alse)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| file_path |  |  | path to csv file |
| do_filtering | bool | False | whether to perform data filtering on this test set |

In [None]:
show_doc(TextDataControllerStreaming.prepare_test_dataset_from_df)

---

### TextDataControllerStreaming.prepare_test_dataset_from_df

>      TextDataControllerStreaming.prepare_test_dataset_from_df (df,
>                                                                validate=True, 
>                                                                do_filtering=Fa
>                                                                lse)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df |  |  | Pandas Dataframe |
| validate | bool | True | whether to perform input data validation |
| do_filtering | bool | False | whether to perform data filtering on this test set |

In [None]:
show_doc(TextDataControllerStreaming.prepare_test_dataset_from_raws)

---

### TextDataControllerStreaming.prepare_test_dataset_from_raws

>      TextDataControllerStreaming.prepare_test_dataset_from_raws (content)

|    | **Details** |
| -- | ----------- |
| content | Either a single sentence, list of sentence or a dictionary with keys are metadata columns and values are list |

Let's say you have done your preprocessing and tokenization on your training set, and have a nicely trained model, ready to do inference on new data. Here is how you can use `TextDataControllerStreaming` to apply all the necessary preprocessing steps to your new data

We will reuse the lightweight tdc object we created in the previous section (since we don't really need all the training data just to construct new data). Also, we will take a small sample of our training data and pretend it is our test data

In [None]:
tdc = TextDataControllerStreaming.from_pickle('my_lightweight_tdc_stream')

In [None]:
# Take a reproducible 20% sample of the reviews and pretend it is unseen test data
df_test = pd.read_csv('sample_data/Womens_Clothing_Reviews.csv',encoding='utf-8-sig')
df_test = df_test.sample(frac=0.2,random_state=1)
# drop NaN values in the label column
df_test = df_test.dropna(subset=['Department Name']).reset_index(drop=True)
df_test.shape

(4692, 10)

In [None]:
df_test.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,872,42,Perfect for work and play,This shirt works for both going out and going ...,5,1,0,General,Tops,Knits
1,1033,40,,I don't know why i had the opposite problem mo...,4,1,0,General Petite,Bottoms,Jeans
2,1037,45,Great pants,These cords are great--lightweight for fl wint...,5,1,1,General Petite,Bottoms,Jeans
3,829,35,Surprisingly comfy for a button down,I am a 10 m and got the 10. it fits perfectly ...,5,1,1,General Petite,Tops,Blouses
4,872,29,Short and small,The shirt is mostly a thick sweatshirt materia...,3,0,15,General Petite,Tops,Knits


In [None]:
test_dset = tdc.prepare_test_dataset_from_df(df_test,validate=True,do_filtering=True)

- Input Validation Precheck -
Data contains missing values!
-----> List of columns and the number of missing values for each
Title          758
Review Text    164
dtype: int64
Data contains duplicated values!
-----> Number of duplications: 2 rows
-------------------- Start Test Set Transformation --------------------
-------------------- Data Filtering --------------------


Filter (num_proc=4):   0%|          | 0/4692 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/4528 [00:00<?, ? examples/s]

Done
----- Metadata Simple Processing & Concatenating to Main Content -----


Map (num_proc=4):   0%|          | 0/4528 [00:00<?, ? examples/s]

Done
-------------------- Dropping unused features --------------------
Done
----- Performing content transformation and tokenization on test set -----


Map (num_proc=4):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4528 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4528 [00:00<?, ? examples/s]

Done


In [None]:
# Preview the first three processed test rows: text and token ids
test_texts = test_dset['Review Text']
test_input_ids = test_dset['input_ids']
for idx in range(3):
    print(f"Text: {test_texts[idx]}")
    print(f"Input_ids: {test_input_ids[idx]}")
    print('-'*10)

Text: general . perfect for work and play . this shirt works for both going out and going to work , and i can wear it with everything . fits perfect , tucked and untucked , tied and untied . i love it .
Input_ids: [0, 15841, 479, 1969, 13, 173, 8, 310, 479, 42, 6399, 1364, 13, 258, 164, 66, 8, 164, 7, 173, 2156, 8, 939, 64, 3568, 24, 19, 960, 479, 10698, 1969, 2156, 21222, 8, 7587, 23289, 2156, 3016, 8, 7587, 2550, 479, 939, 657, 24, 479, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
----------
Text: general petite . . i don't know why i had the opposite problem most reviewers had with these ..... i tried on the regular length in the store and found that they were just a bit too short with heels . ( i'm 5 ' 5 ) . i had them ordere

Let's make our test data streamed as well

In [None]:
test_dset_raw = Dataset.from_pandas(df_test).to_iterable_dataset()

This test dataset might have some NaN values in the text field (`Review Text`), so we will turn on the filtering option to get rid of these NaNs, as this is what we did for the training set. If your test dataset doesn't need any filtering, turn off this option

In [None]:
test_dset = tdc.prepare_test_dataset(test_dset_raw,do_filtering=True)

-------------------- Start Test Set Transformation --------------------
-------------------- Data Filtering --------------------
Done
----- Metadata Simple Processing & Concatenating to Main Content -----
Done
-------------------- Dropping unused features --------------------
Done
----- Performing content transformation and tokenization on test set -----
Done


In [None]:
# Peek at the first three streamed test samples: processed text, token ids and attention mask
for i,v in enumerate(test_dset):
    if i==3:break
    # "\n" before Input_ids puts it on its own line; the previous "\I" was an invalid
    # escape sequence that printed a literal backslash
    print(f"Text: {v['Review Text']}\nInput_ids: {v['input_ids']}\nAttention mask: {v['attention_mask']}")
    print('-'*10)

Text: general . perfect for work and play . this shirt works for both going out and going to work , and i can wear it with everything . fits perfect , tucked and untucked , tied and untied . i love it .\Input_ids: [0, 15841, 479, 1969, 13, 173, 8, 310, 479, 42, 6399, 1364, 13, 258, 164, 66, 8, 164, 7, 173, 2156, 8, 939, 64, 3568, 24, 19, 960, 479, 10698, 1969, 2156, 21222, 8, 7587, 23289, 2156, 3016, 8, 7587, 2550, 479, 939, 657, 24, 479, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()