# Text Main

> This module contains the main Python class for data control: `TextDataMain`

In [None]:
#| default_exp text_main

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer
from datasets import DatasetDict,Dataset
from pathlib import Path
from tqdm import tqdm
from that_nlp_library.utils import *
from functools import partial

In [None]:
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from importlib.machinery import SourceFileLoader
import os

## Text transformation and Tokenizer Explanation

In [None]:
#| export
def tokenizer_explain(inp, # Input sentence
                      tokenizer, # Tokenizer (preferably from HuggingFace)
                      split_word=False # Is input `inp` split into list or not
                     ):
    "Print the input, the tokenizer's raw output, the tokens behind the encoded ids, and the decoded text"
    def _section(title, produce):
        # The title is printed before `produce` runs, so a failing tokenizer call
        # still leaves the section header on screen.
        print(title)
        print(produce())
        print()

    print('----- Tokenizer Explained -----')
    _section('--- Input ---', lambda: inp)
    _section('--- Tokenized results --- ',
             lambda: tokenizer(inp,is_split_into_words=split_word))
    # The same encoded ids feed both the token view and the decoded view
    token_ids = tokenizer.encode(inp,is_split_into_words=split_word)
    _section('--- Results from tokenizer.convert_ids_to_tokens ---',
             lambda: tokenizer.convert_ids_to_tokens(token_ids))
    _section('--- Results from tokenizer.decode --- ',
             lambda: tokenizer.decode(token_ids))

In [None]:
show_doc(tokenizer_explain)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#L20){target="_blank" style="float:right; font-size:smaller"}

### tokenizer_explain

>      tokenizer_explain (inp, tokenizer, split_word=False)

Display results from tokenizer

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| inp |  |  | Input sentence |
| tokenizer |  |  | Tokenizer (preferably from HuggingFace) |
| split_word | bool | False | Is input `inp` split into list or not |

Let's load a tokenizer from the EnviBert model. Uncomment the commands below to download the files needed to build this tokenizer.

In [None]:
# !pip install gdown

In [None]:
# !gdown 14X9fGijA7kdNfe4dM_8gqfxIWtj1Q-hb -O ./envibert_cache --folder

In [None]:
cache_dir=Path('./envibert_tokenizer')
tokenizer = SourceFileLoader("envibert.tokenizer", 
                             str(cache_dir/'envibert_tokenizer.py')).load_module().RobertaTokenizer(cache_dir)

Note that the Envibert tokenizer does not require the input to be tokenized using `word_tokenize` from the UnderTheSea library

In [None]:
inp = 'hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức'
tokenizer_explain(inp,tokenizer)

----- Tokenizer Explained -----
--- Input ---
hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức

--- Tokenized results --- 
{'input_ids': [0, 227, 1033, 191, 664, 1033, 7366, 2615, 13, 664, 1033, 671, 1355, 2294, 993, 413, 2900, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

--- Results from tokenizer.convert_ids_to_tokens ---
['<s>', '▁hội', '▁cư', '▁dân', '▁chung', '▁cư', '▁sen', '▁hồng', '▁-', '▁chung', '▁cư', '▁lot', 'us', '▁sóng', '▁thần', '▁thủ', '▁đức', '</s>']

--- Results from tokenizer.decode --- 
<s> ▁hội ▁cư ▁dân ▁chung ▁cư ▁sen ▁hồng ▁- ▁chung ▁cư ▁lot us ▁sóng ▁thần ▁thủ ▁đức </s>



In [None]:
inp = ['hội', 'cư', 'dân', 'chung', 'cư', 'sen', 'hồng', '-', 'chung', 'cư', 'lotus', 'sóng', 'thần', 'thủ', 'đức']
tokenizer_explain(inp,tokenizer,split_word=True)

----- Tokenizer Explained -----
--- Input ---
['hội', 'cư', 'dân', 'chung', 'cư', 'sen', 'hồng', '-', 'chung', 'cư', 'lotus', 'sóng', 'thần', 'thủ', 'đức']

--- Tokenized results --- 
{'input_ids': [0, 227, 1033, 191, 664, 1033, 7366, 2615, 13, 664, 1033, 671, 1355, 2294, 993, 413, 2900, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

--- Results from tokenizer.convert_ids_to_tokens ---
['<s>', '▁hội', '▁cư', '▁dân', '▁chung', '▁cư', '▁sen', '▁hồng', '▁-', '▁chung', '▁cư', '▁lot', 'us', '▁sóng', '▁thần', '▁thủ', '▁đức', '</s>']

--- Results from tokenizer.decode --- 
<s> ▁hội ▁cư ▁dân ▁chung ▁cư ▁sen ▁hồng ▁- ▁chung ▁cư ▁lot us ▁sóng ▁thần ▁thủ ▁đức </s>



Now let's try PhoBert tokenizer. PhoBert tokenizer, unlike Envibert tokenizer, requires input to be word tokenized (using UnderTheSea library)

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
inp = apply_word_tokenize('hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức')
print(inp)

hội cư_dân chung_cư sen hồng - chung_cư lotus sóng_thần thủ_đức


In [None]:
tokenizer_explain(inp,tokenizer)

----- Tokenizer Explained -----
--- Input ---
hội cư_dân chung_cư sen hồng - chung_cư lotus sóng_thần thủ_đức

--- Tokenized results --- 
{'input_ids': [0, 1093, 1838, 1574, 3330, 2025, 31, 1574, 2029, 4885, 8554, 25625, 7344, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

--- Results from tokenizer.convert_ids_to_tokens ---
['<s>', 'hội', 'cư_dân', 'chung_cư', 'sen', 'hồng', '-', 'chung_cư', 'lo@@', 'tus', 'sóng_thần', 'thủ_@@', 'đức', '</s>']

--- Results from tokenizer.decode --- 
<s> hội cư_dân chung_cư sen hồng - chung_cư lotus sóng_thần thủ_đức </s>



In [None]:
#| export
def two_steps_tokenization_explain(inp, # Input sentence
                                   tokenizer, # Tokenizer (preferably from HuggingFace)
                                   split_word=False, # Is input `inp` split into list or not
                                   content_tfms=[] # A list of text transformations
                                  ):
    "Display results from each content transformation, then display results from tokenizer"
    print('----- Text Transformation Explained -----')
    print('--- Raw sentence ---')
    current = inp
    print(current)
    # Transformations are chained: each one receives the output of the previous one
    for step in content_tfms:
        print_msg(callable_name(step),3)
        current = step(current)
        print(current)
    print()
    # The fully transformed text is what gets tokenized
    tokenizer_explain(current,tokenizer,split_word)

In [None]:
show_doc(two_steps_tokenization_explain)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#L41){target="_blank" style="float:right; font-size:smaller"}

### two_steps_tokenization_explain

>      two_steps_tokenization_explain (inp, tokenizer, split_word=False,
>                                      content_tfms=[])

Display results form each content transformation, then display results from tokenizer

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| inp |  |  | Input sentence |
| tokenizer |  |  | Tokenizer (preferably from HuggingFace) |
| split_word | bool | False | Is input `inp` split into list or not |
| content_tfms | list | [] | A list of text transformations |

Let's load Phobert tokenizer one more time to test out this function

In [None]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from underthesea import text_normalize

In [None]:
inp = 'Hội cư dân   chung cư sen hồng- chung cư    lotus sóng thần thủ đức'
two_steps_tokenization_explain(inp,tokenizer,content_tfms=[text_normalize,apply_word_tokenize])

----- Text Transformation Explained -----
--- Raw sentence ---
Hội cư dân   chung cư sen hồng- chung cư    lotus sóng thần thủ đức
--- text_normalize ---
Hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức
--- apply_word_tokenize ---
Hội cư_dân chung_cư sen hồng - chung_cư lotus sóng_thần thủ_đức

----- Tokenizer Explained -----
--- Input ---
Hội cư_dân chung_cư sen hồng - chung_cư lotus sóng_thần thủ_đức

--- Tokenized results --- 
{'input_ids': [0, 792, 1838, 1574, 3330, 2025, 31, 1574, 2029, 4885, 8554, 25625, 7344, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

--- Results from tokenizer.convert_ids_to_tokens ---
['<s>', 'Hội', 'cư_dân', 'chung_cư', 'sen', 'hồng', '-', 'chung_cư', 'lo@@', 'tus', 'sóng_thần', 'thủ_@@', 'đức', '</s>']

--- Results from tokenizer.decode --- 
<s> Hội cư_dân chung_cư sen hồng - chung_cư lotus sóng_thần thủ_đức </s>



This is a bit redundant, as `apply_word_tokenize` also has an option to normalize text. Let's shorten the code:

In [None]:
from functools import partial

In [None]:
inp = 'Hội cư dân   chung cư sen hồng- chung cư    lotus sóng thần thủ đức'
two_steps_tokenization_explain(inp,tokenizer,content_tfms=[partial(apply_word_tokenize,normalize_text=True)])

----- Text Transformation Explained -----
--- Raw sentence ---
Hội cư dân   chung cư sen hồng- chung cư    lotus sóng thần thủ đức
--- apply_word_tokenize ---
Hội cư_dân chung_cư sen hồng - chung_cư lotus sóng_thần thủ_đức

----- Tokenizer Explained -----
--- Input ---
Hội cư_dân chung_cư sen hồng - chung_cư lotus sóng_thần thủ_đức

--- Tokenized results --- 
{'input_ids': [0, 792, 1838, 1574, 3330, 2025, 31, 1574, 2029, 4885, 8554, 25625, 7344, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

--- Results from tokenizer.convert_ids_to_tokens ---
['<s>', 'Hội', 'cư_dân', 'chung_cư', 'sen', 'hồng', '-', 'chung_cư', 'lo@@', 'tus', 'sóng_thần', 'thủ_@@', 'đức', '</s>']

--- Results from tokenizer.decode --- 
<s> Hội cư_dân chung_cư sen hồng - chung_cư lotus sóng_thần thủ_đức </s>



## DatasetDict

In [None]:
#| export
def tokenize_function(examples:dict,
                      tok,
                      max_length=None,
                      is_split_into_words=True):
    "Tokenize `examples['text']` with `tok`; `max_length` selects the padding strategy (truncation is always enabled)"
    # Options shared by every strategy
    tok_kwargs = dict(truncation=True,is_split_into_words=is_split_into_words)
    if max_length is None:
        # pad to model's default max sequence length
        tok_kwargs['padding'] = "max_length"
    elif isinstance(max_length,int) and max_length>0:
        # pad to max length of the current batch, and start truncating at max_length
        tok_kwargs.update(padding=True,max_length=max_length)
    # any other `max_length` value: no padding (still truncate at model's default max sequence length)
    return tok(examples["text"],**tok_kwargs)

In [None]:
#| export
def datasetdictize_given_idxs(kv_pairs:dict, # Dictionary; keys can be content, label, metadata. Values are list each.
                              trn_idx=None, # Training indices
                              val_idx=None, # Validation indices
                              tokenizer=None, # HuggingFace tokenizer
                              is_split_into_words=False, # Is text (content) split into list or not
                              max_length=None # pad to model's allowed max length (default is max_sequence_length)
                             ):
    "Build a tokenized HuggingFace DatasetDict (`train`, plus `validation` when `val_idx` is given) from `kv_pairs`"
    if 'text' not in kv_pairs:
        raise ValueError('Dictionary must have `text` (which contains texture contents) as key')

    full_ds = Dataset.from_dict(kv_pairs)
    splits = DatasetDict()
    # Without explicit training indices, every row goes to the train split
    splits['train'] = full_ds if trn_idx is None else full_ds.select(trn_idx)
    if val_idx is not None:
        splits['validation'] = full_ds.select(val_idx)

    print_msg("Map Tokenize Function",20)
    tokenize = partial(tokenize_function,
                       tok=tokenizer,
                       is_split_into_words=is_split_into_words,
                       max_length=max_length)
    return splits.map(tokenize,batched=True)

In [None]:
show_doc(datasetdictize_given_idxs)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#L67){target="_blank" style="float:right; font-size:smaller"}

### datasetdictize_given_idxs

>      datasetdictize_given_idxs (kv_pairs:dict, trn_idx=None, val_idx=None,
>                                 tokenizer=None, is_split_into_words=False,
>                                 max_length=None)

Create a HuggingFace DatasetDict with given arguments

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| kv_pairs | dict |  | Dictionary; keys can be content, label, metadata. Values are list each. |
| trn_idx | NoneType | None | Training indices |
| val_idx | NoneType | None | Validation indices |
| tokenizer | NoneType | None | HuggingFace tokenizer |
| is_split_into_words | bool | False | Is text (content) split into list or not |
| max_length | NoneType | None | pad to model's allowed max length (default is max_sequence_length) |

Example

In [None]:
cache_dir=Path('./envibert_tokenizer')
tokenizer = SourceFileLoader("envibert.tokenizer", 
                             str(cache_dir/'envibert_tokenizer.py')).load_module().RobertaTokenizer(cache_dir)

In [None]:
kv_pairs={
    'text':[
         'hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức',
         'This is the recommended way to make a Python package importable from anywhere',
         'hội cần mở thẻ tín dụng tại hà nội, đà nẵng, tp. hồ chí minh',
         "biti's cao lãnh - đồng tháp",
         'chợ phòng trọ + việc làm...khu lĩnh nam - vĩnh hưng - mai động (hoàng mai)'
          ],
    'label': [0,1,0,0,1]
}

ddict = datasetdictize_given_idxs(kv_pairs,
                                  trn_idx=[0,1,3],
                                  val_idx=[2,4],
                                  tokenizer=tokenizer,
                                  max_length=512)

-------------------- Map Tokenize Function --------------------


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
ddict.keys()

dict_keys(['train', 'validation'])

In [None]:
print(ddict['train']['input_ids'])

[[0, 227, 1033, 191, 664, 1033, 7366, 2615, 13, 664, 1033, 671, 1355, 2294, 993, 413, 2900, 2], [0, 116, 14, 6, 3169, 270, 9, 364, 10, 23963, 5360, 15930, 2003, 51, 5906, 2, 1, 1], [0, 880, 592, 427, 162, 171, 906, 13, 122, 6553, 2, 1, 1, 1, 1, 1, 1, 1]]


You can change `max_length` (which truncates any sentence longer than `max_length`)

In [None]:
ddict = datasetdictize_given_idxs(kv_pairs,
                                  trn_idx=[0,1,3],
                                  val_idx=[2,4],
                                  tokenizer=tokenizer,
                                  max_length=5)

-------------------- Map Tokenize Function --------------------


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
print(ddict['train']['input_ids'])

[[0, 227, 1033, 191, 2], [0, 116, 14, 6, 2], [0, 880, 592, 427, 2]]


If we omit the `trn_idx` argument, the full dataset is mapped to the `train` split of the DatasetDict

In [None]:
ddict = datasetdictize_given_idxs(kv_pairs,
                                  tokenizer=tokenizer,
                                  max_length=512)

-------------------- Map Tokenize Function --------------------


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:
print(ddict['train']['input_ids'])

[[0, 227, 1033, 191, 664, 1033, 7366, 2615, 13, 664, 1033, 671, 1355, 2294, 993, 413, 2900, 2, 1, 1, 1, 1, 1, 1, 1], [0, 116, 14, 6, 3169, 270, 9, 364, 10, 23963, 5360, 15930, 2003, 51, 5906, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 227, 256, 778, 2600, 1074, 144, 76, 5489, 613, 57339, 4820, 27666, 57339, 21422, 244, 872, 635, 841, 2, 1, 1, 1, 1, 1], [0, 880, 592, 427, 162, 171, 906, 13, 122, 6553, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 2299, 315, 5995, 1349, 99, 83, 55025, 244, 6356, 1114, 1213, 1163, 13, 8233, 11051, 13, 3335, 109, 28, 11695, 13377, 3335, 3, 2]]


## Class TextDataMain

In [None]:
#| export
class TextDataMain():
    "Hold a text dataframe and run metadata merging, label encoding, text transformation, train/validation split and augmentation on it"
    def __init__(self,
                 df: pd.DataFrame, # The main dataframe
                 main_content:str, # Name of the text column
                 metadatas=[], # Names of the metadata columns
                 label_names=None, # Names of the label (dependent variable) columns
                 class_names_predefined=None, # (Optional) List of names associated with the labels (same index order)
                 val_ratio:list|float|None=0.2, # Ratio of data for validation set. If given a list, validation set will be chosen based on indices in this list
                 split_cols:list|str=None, # Column(s) needed to do stratified shuffle split
                 content_tfms=[], # A list of text transformations
                 aug_tfms=[], # A list of text augmentations
                 process_metadatas=True, # Whether to do simple text processing on the chosen metadatas
                 seed=None, # Random seed
                 cols_to_keep=None, # Columns to keep after all processings
                 shuffle_trn=True # Whether to shuffle the train set
                ):
        # Work on a copy so the caller's dataframe is never modified
        self.df = df.copy()
        self.main_content = main_content
        self.metadatas = metadatas
        self.label_names = label_names
        self.label_lists = class_names_predefined
        self.content_tfms = content_tfms
        self.aug_tfms = aug_tfms
        self.process_metadatas = process_metadatas
        self.val_ratio=val_ratio
        self.split_cols=split_cols
        self.seed = seed
        self.cols_to_keep = cols_to_keep
        self.shuffle_trn=shuffle_trn  
        # Set to True once `_main_text_processing` has run, so processing happens only once
        self._main_called=False
        # Both flags are (re)computed in `_encode_labels`
        self.is_multilabel=False
        self.is_multihead=False
        check_input_validation(self.df)
        
    @classmethod
    def from_csv(cls,path,return_df=False,encoding='utf-8-sig',**kwargs):
        "Read a csv file; return the raw dataframe if `return_df`, otherwise an instance built with `kwargs`"
        df = pd.read_csv(path,encoding=encoding,engine='pyarrow')
        if return_df:
            # Only the raw frame is wanted. An instance is still built (and discarded) so that
            # the constructor's input validation runs exactly as before.
            cls(df,main_content=None)
            return df
        # `cls` rather than a hard-coded class name, so subclasses get instances of themselves
        return cls(df,**kwargs)
    
    @classmethod
    def from_pickle(cls,
                    fname, # Name of the pickle file
                    parent='pickle_files' # Parent folder
                   ):
        "Load a previously saved object via `load_pickle`"
        # NOTE(review): `load_pickle` presumably unpickles the file; only load files you trust.
        return load_pickle(fname,parent=parent)
    
    @classmethod
    def from_gsheet(cls,gs_id,return_df=False,**kwargs):
        "Placeholder: not implemented, currently returns None"
        pass

    
    def save_as_pickles(self,
                        fname, # Name of the pickle file
                        parent='pickle_files', # Parent folder
                        drop_data_attributes=False # Whether to drop all large-size data attributes
                       ):
        "Save this object via `save_to_pickle`, optionally dropping `df` and `main_ddict` first"
        if drop_data_attributes:
            # The attributes are deleted from this live object, not only from the saved file
            if hasattr(self, 'df'):
                del self.df
            if hasattr(self, 'main_ddict'):
                del self.main_ddict
        save_to_pickle(self,fname,parent=parent)

        
    def _check_validation_leaking(self,trn_idxs,val_idxs):
        "Run `check_text_leaking` between train and validation texts and return the (possibly adjusted) indices"
        if self.val_ratio is None:
            return trn_idxs,None
        
        df_trn = self.df.loc[trn_idxs]
        df_val = self.df.loc[val_idxs]
        
        #sanity check
        assert df_trn.shape[0]+df_val.shape[0]==self.df.shape[0],"Train + Validation != Total Data"

        
        print(f'Previous Validation Percentage: {round(100*len(val_idxs)/self.df.shape[0],3)}%')
        # NOTE(review): `check_text_leaking` presumably returns the validation texts that do not
        # also occur in train; its index is used as the new validation indices - confirm.
        val_content_series = check_text_leaking(df_trn[self.main_content],df_val[self.main_content])
        val_idxs2 = val_content_series.index.values
        # Everything that is not kept in validation becomes training data
        trn_idxs2 = self.df[~self.df.index.isin(val_idxs2)].index.values
        print(f'Current Validation Percentage: {round(100*len(val_idxs2)/self.df.shape[0],3)}%')
        if len(val_idxs2)!=len(val_idxs):
            return trn_idxs2,val_idxs2
        # Nothing was removed: keep the original indices (and their original order)
        return trn_idxs,val_idxs
    
    def _train_test_split(self):
        "Return `(trn_idxs, val_idxs)` according to `val_ratio` and `split_cols`; `val_idxs` is None when `val_ratio` is None"
        print_msg('Train Test Split',20)
        rng = np.random.default_rng(self.seed)
        if self.val_ratio is None: # no train/val split
            trn_idxs = rng.permutation(self.df.shape[0])
            return trn_idxs,None
        if isinstance(self.val_ratio,(list,np.ndarray)):
            # Explicit validation indices: train is every remaining index value.
            # `np.array(set(...))` would produce a 0-d object array wrapping the set,
            # so the difference is computed with `np.setdiff1d` (sorted 1-D array).
            val_idxs = np.array(self.val_ratio)
            trn_idxs = np.setdiff1d(self.df.index.values,val_idxs)
            return trn_idxs,val_idxs
        if isinstance(self.val_ratio,float) and self.split_cols is None:
            # Plain random split: the first `_cutoff` shuffled row positions form the validation set
            _idxs = rng.permutation(self.df.shape[0])
            _cutoff = int(self.val_ratio*self.df.shape[0]) 
            val_idxs = _idxs[:_cutoff]
            trn_idxs = _idxs[_cutoff:]
            return trn_idxs,val_idxs
        
        self.split_cols = val2iterable(self.split_cols)
        if self.is_multilabel and self.label_names[0] in self.split_cols:
            raise ValueError('For MultiLabel classification, you cannot choose the label as your shuffle-split column')
        
        if len(self.split_cols)>0:
            # Stratify on the first split column, or on the '_'-joined string of all split columns
            _y = self.df[self.split_cols[0]]
            if len(self.split_cols)>1:
                for c in self.split_cols[1:]:
                    _y= _y.astype(str) + '_' + self.df[c].astype(str)
            sss = StratifiedShuffleSplit(n_splits=1, test_size=self.val_ratio, 
                                         random_state=self.seed)
            trn_idxs,val_idxs = list(sss.split(self.df,_y))[0]
            return trn_idxs,val_idxs
        
        raise ValueError('No valid keyword arguments for train validation split!')

                         
    def _encode_labels(self):
        "Encode label columns to integer ids (single/multi-head) and store the class names in `self.label_lists`"
        print_msg('Label Encoding')
        if self.label_names is None: 
            raise ValueError('Missing label columns!')
        self.label_names = val2iterable(self.label_names)
        if len(self.label_names)>1:
            self.is_multihead=True
        
        # Normalize predefined class names to a list of lists (one list per label column)
        if self.label_lists is not None and not isinstance(self.label_lists[0],list):
            self.label_lists = [self.label_lists]
        
        if isinstance(self.df[self.label_names[0]].iloc[0],list):
            # This is multi-label. Ignore self.label_names[1:]
            self.label_names = [self.label_names[0]]
            self.is_multihead=False
            self.is_multilabel=True
            
        encoder_classes=[]
        if not self.is_multilabel:
            for idx,l in enumerate(self.label_names):
                if self.label_lists is None:
                    # Infer the classes from the data
                    train_label = self.df[l].values
                    l_encoder = LabelEncoder()
                    self.df[l] = l_encoder.fit_transform(train_label)
                    encoder_classes.append(list(l_encoder.classes_))
                else:
                    # Predefined classes are sorted; a label's id is its position in the sorted list
                    l_classes = sorted(list(self.label_lists[idx]))
                    label2idx = {v:i for i,v in enumerate(l_classes)}
                    self.df[l] = self.df[l].map(label2idx).values
                    encoder_classes.append(l_classes)
        else:
            # For MultiLabel, we only save the encoder classes without transforming the label itself to one-hot (or actually, few-hot)
            if self.label_lists is None:
                l_encoder = MultiLabelBinarizer()
                _ = l_encoder.fit(self.df[self.label_names[0]])
                encoder_classes.append(list(l_encoder.classes_))
            else:
                l_classes = sorted(list(self.label_lists[0]))
                encoder_classes.append(l_classes)
                
        self.label_lists = encoder_classes
            
    def _process_metadatas(self,df,override_dict=True):
        "Optionally clean each metadata column, then prepend it to the main content; returns the modified `df`"
        print_msg('Metadata Simple Processing & Concatenating to Main Content')
        self.metadatas = val2iterable(self.metadatas)
            
        for s in self.metadatas:
            if self.process_metadatas:
                # just strip and lowercase
                df[s] = df[s].astype(str).str.strip().str.lower()
            # simple concatenation with ' - ' (metadata comes first)
            df[self.main_content] = df[s] + ' - ' + df[self.main_content]
                
        if override_dict:        
            # Remember the sorted unique values of every metadata column
            self.metadata_dict={}
            for s in self.metadatas:
                self.metadata_dict[s]=sorted(set(df[s].values))
        return df
    
    def _simplify_df(self):
        "Keep only `cols_to_keep` (default: main content + metadatas + labels) in `self.df`"
        if self.cols_to_keep is None:
            self.cols_to_keep= [self.main_content] + self.metadatas + self.label_names
        self.df = self.df[self.cols_to_keep].copy()
    
    def _do_transformation(self,df):
        "Apply every transformation in `content_tfms`, in order, to each text of the main content column"
        print_msg('Text Transformation',20)
        for tfm in self.content_tfms:
            print_msg(callable_name(tfm))
            df[self.main_content] = [tfm(s) for s in tqdm(df[self.main_content].values)]
        return df
    
    def _do_augmentation(self,df_trn_org):
        "Append the rows produced by every augmentation in `aug_tfms` to a copy of the training dataframe"
        df_trn_all = df_trn_org.copy()
        print_msg('Text Augmentation',20)
        print(f'Train data size before augmentation: {len(df_trn_all)}')
        for tfm in self.aug_tfms:
            print_msg(callable_name(tfm))
            # Each augmentation must expose `keywords['apply_to_all']` (e.g. a `functools.partial`).
            # True: augment everything accumulated so far; False: augment only the original rows.
            # The first column is passed as content, all other columns as `others`.
            if tfm.keywords['apply_to_all']:
                new_content,new_others = tfm(content=df_trn_all[self.main_content].values,others=df_trn_all.iloc[:,1:])
            else:
                new_content,new_others = tfm(content=df_trn_org[self.main_content].values,others=df_trn_org.iloc[:,1:])
            
            # Normalize both outputs to 2-D arrays so they can be concatenated column-wise.
            # `others` is converted to an array exactly once: previously a 1-D result was
            # rebound to an ndarray and then `.values` was accessed on it, which failed.
            new_content = np.asarray(new_content)
            if new_content.ndim==1:
                new_content = new_content[:,None]
            others_arr = np.asarray(getattr(new_others,'values',new_others))
            if others_arr.ndim==1:
                others_arr = others_arr[:,None]
                
            df_tmp = pd.DataFrame(np.concatenate((new_content,others_arr),axis=1),columns=df_trn_org.columns.values)
            df_trn_all = pd.concat((df_trn_all,df_tmp),axis=0).reset_index(drop=True)
            print(f'Train data size after THIS augmentation: {len(df_trn_all)}')       
        print(f'Train data size after ALL augmentation: {len(df_trn_all)}')
        return df_trn_all
    
    def _main_text_processing(self):
        "Run the full pipeline and store the result (train rows first, then validation rows, flagged by `is_valid`) in `self.df`"
        print_msg('Start Main Text Processing',20)
        
        # Process metadatas
        self.df = self._process_metadatas(self.df)
        
        # Process labels
        self._encode_labels()
        
        # Content transformation
        self.df = self._do_transformation(self.df)
        
        # Train Test Split
        trn_idxs,val_idxs = self._train_test_split()
        self._simplify_df()
        trn_idxs,val_idxs = self._check_validation_leaking(trn_idxs,val_idxs)
        if self.val_ratio is not None:
            df_val = self.df.loc[val_idxs].reset_index(drop=True)
        
        # Augmentation (train rows only)
        df_trn_org = self.df.loc[trn_idxs].reset_index(drop=True)
        df_trn_all = self._do_augmentation(df_trn_org)
        df_trn_all['is_valid']=False
        
        # Shuffle train
        if self.shuffle_trn:
            df_trn_all = df_trn_all.sample(frac=1.,random_state=self.seed)
            
        # Combine augmented train and val
        if self.val_ratio is not None:
            df_val['is_valid']=True
            df_trn_all = pd.concat((df_trn_all,df_val),axis=0)
        
        self._main_called=True
        self.df = df_trn_all.reset_index(drop=True)        
    
    def set_data_collator(self,data_collator):
        "Store `data_collator` on the instance"
        self.data_collator = data_collator
        
    def tokenizer_explain_single(self,tokenizer):
        "Run `tokenizer_explain` on one randomly sampled training text (requires processed data)"
        inp = self.df[~self.df['is_valid']][self.main_content].sample(1).values[0]
        tokenizer_explain(inp,tokenizer)
        
    def to_df(self): 
        "To execute all the defined processings and return a dataframe"
        if not self._main_called:
            self._main_text_processing()
        return self.df
       
    def save_train_data_after_processing(self,output_path,encoding='utf-8-sig'):
        "Write the processed dataframe to csv; only prints a warning if processing has not been run yet"
        if not self._main_called:
            print_msg('WARNING')
            print('Please process training data (using to_df or to_datasetdict)')
            return
        self.df.to_csv(Path(output_path),encoding=encoding,index=False)
    
    def to_datasetdict(self,
                       tokenizer, # Tokenizer (preferably from HuggingFace)
                       is_split_into_words=False, # Is text split into list or not
                       max_length=None, # pad to model's allowed max length (default is max_sequence_length)
                       trn_ratio=1., # Portion of training data to be converted to datasetdict. Useful for sample experiments
                       seed=42 # Random seed
                      ):
        "Run the processing if needed, then return (and store in `main_ddict`) a tokenized DatasetDict"
        if not self._main_called:
            self._main_text_processing()
        val_idx = self.df[self.df['is_valid']].index.values if self.val_ratio is not None else None
        trn_idx = self.df[~self.df['is_valid']].index.values
        if trn_ratio<1. and trn_ratio>0.:
            # Randomly subsample the training indices. The permutation indexes INTO `trn_idx`
            # (it used to be returned directly as row indices), and the `seed` argument of this
            # method now drives the sampling (it used to be ignored in favour of `self.seed`).
            rng = np.random.default_rng(seed)
            _idxs = rng.permutation(len(trn_idx))
            _cutoff = int(trn_ratio*len(trn_idx)) 
            trn_idx = trn_idx[_idxs[:_cutoff]]
            
        _label = self.df[self.label_names].values.tolist()
        if not self.is_multilabel:
            if len(self.label_names)==1:
                _label = np.array(_label).flatten().tolist() # (n,)
        else:
            # For MultiLabel, this is where the actual label transformation happens
            mlb = MultiLabelBinarizer(classes=self.label_lists[0])
            _label = self.df[self.label_names[0]].values.tolist()
            _label = mlb.fit_transform(_label).tolist() # few-hotted
        
        kv_pairs = {'text':self.df[self.main_content].tolist(),
                    'label':_label,
                   }
        # Carry over every kept column that is neither a label nor the main content
        for c in self.cols_to_keep:
            if c not in self.label_names+[self.main_content]: kv_pairs[c] = self.df[c].tolist()
        
        # Remembered so the test-set helpers tokenize with the same settings
        self.tokenizer = tokenizer
        self.is_split_into_words= is_split_into_words
        self.max_length = max_length
        
        ddict = datasetdictize_given_idxs(kv_pairs,trn_idx,val_idx,self.tokenizer,
                                         is_split_into_words=is_split_into_words,max_length=max_length)
        self.main_ddict = ddict
        return ddict
    
    def get_test_datasetdict_from_csv(self,path,encoding='utf-8-sig'):
        "Read a csv file and build a tokenized test DatasetDict from it"
        df_test = pd.read_csv(path,encoding=encoding,engine='pyarrow')
        return self.get_test_datasetdict_from_df(df_test)

    def get_test_datasetdict_from_dict(self,content):
        "Build a tokenized test DatasetDict from a single text, or from a dictionary of column name -> value(s)"
        if len(self.metadatas)!=0 and not isinstance(content,dict):
            raise ValueError(f'There is/are metadatas in the preprocessing step. Please include a dictionary including these keys for metadatas: {self.metadatas}, and texture content: {self.main_content}')
            
        _dic = {self.main_content:[content]} if isinstance(content,str) else content
        # Build a new dictionary so the caller's dictionary is not modified in place
        _dic = {k:val2iterable(v) for k,v in _dic.items()}
        
        df_test = pd.DataFrame.from_dict(_dic)
        return self.get_test_datasetdict_from_df(df_test)
    
    def get_test_datasetdict_from_df(self,df_test):
        "Apply the metadata and text processing to `df_test` and return a tokenized DatasetDict with a `test` split"
        # Relies on `cols_to_keep`, `tokenizer`, `is_split_into_words` and `max_length`,
        # which are set by the processing step and by `to_datasetdict`.
        print_msg('Getting Test Set',20)
        check_input_validation(df_test)
        
        cols_to_keep = [c for c in self.cols_to_keep if c not in self.label_names]
        df_test = df_test[cols_to_keep].copy()
        
        print_msg('Start Test Set Transformation',20)
        # override_dict=False: keep the metadata values recorded from the training data
        df_test = self._process_metadatas(df_test,override_dict=False)
        df_test = self._do_transformation(df_test)
        
        if hasattr(self,'df'):
            # `df` may have been dropped by `save_as_pickles(drop_data_attributes=True)`
            print_msg('Test Leak Checking',20)
            _ = check_text_leaking(self.df[self.main_content],df_test[self.main_content])
        
        print_msg('Construct DatasetDict',20)
        test_text = df_test[self.main_content].values
        
        kv_pairs ={'text':test_text}
        for c in self.cols_to_keep:
            if c not in self.label_names+[self.main_content]: kv_pairs[c] = df_test[c].tolist()
        
        test_dataset = Dataset.from_dict(kv_pairs)
        test_ddict = DatasetDict()
        test_ddict['test'] = test_dataset
        test_ddict_tokenized = test_ddict.map(partial(tokenize_function,tok=self.tokenizer,
                                                      is_split_into_words=self.is_split_into_words,
                                                      max_length=self.max_length),batched=True)
        
        return test_ddict_tokenized

In [None]:
show_doc(TextDataMain)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#L96){target="_blank" style="float:right; font-size:smaller"}

### TextDataMain

>      TextDataMain (df:pandas.core.frame.DataFrame, main_content:str,
>                    metadatas=[], label_names=None,
>                    class_names_predefined=None, val_ratio:list|float|None=0.2,
>                    split_cols:list|str=None, content_tfms=[], aug_tfms=[],
>                    process_metadatas=True, seed=None, cols_to_keep=None,
>                    shuffle_trn=True)

Initialize self.  See help(type(self)) for accurate signature.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df | pd.DataFrame |  | The main dataframe |
| main_content | str |  | Name of the text column |
| metadatas | list | [] | Names of the metadata columns |
| label_names | NoneType | None | Names of the label (dependent variable) columns |
| class_names_predefined | NoneType | None | (Optional) List of names associated with the labels (same index order) |
| val_ratio | list \| float \| None | 0.2 | Ratio of data for validation set. If given a list, validation set will be chosen based on indices in this list |
| split_cols | list \| str | None | Column(s) needed to do stratified shuffle split |
| content_tfms | list | [] | A list of text transformations |
| aug_tfms | list | [] | A list of text augmentations |
| process_metadatas | bool | True | Whether to do simple text processing on the chosen metadatas |
| seed | NoneType | None | Random seed |
| cols_to_keep | NoneType | None | Columns to keep after all processings |
| shuffle_trn | bool | True | Whether to shuffle the train set |

Let's start a step-by-step walkthrough on how to use this class

In [None]:
DATA_PATH = Path('sample_data')

### Constructor/ Class Method calls

If you just want to get the dataframe from the csv path, set ```return_df=True```. You still have the input validation precheck functionality.

In [None]:
df = TextDataMain.from_csv(DATA_PATH/'sample_large.csv',
                            return_df=True)

----- Input Validation Precheck -----
DataFrame contains duplicated values!
-----> Number of duplications: 16 rows


The ```Input Validation Precheck``` will check for missing values and duplicate rows in the csv file. For our sample dataset, it reports 16 duplicated rows (and no missing values).

In [None]:
df.sample(5)

Unnamed: 0,Source,Content,L1,L2
556,Non Owned,"Xinh quá mấy chị,e còn dư 50 sét thôi! #35k 1s...",Others,Cannot defined
1318,iOS,"k sử dụng dc mã giảm giá, nhập vào rồi kêu lỗi...",Feature,Apply Voucher
1852,Google Play,Cách shopee giải quyết vấn đề quá là TỆ,Services,Contact Agent
901,Google Play,"Lỗi không vào được ứng dụng, nhà điều hành có ...",Feature,App performance
64,iOS,Cảm thấy tệ,Others,Cannot defined


In [None]:
df.Source.value_counts()

Source
Google Play    1434
Non Owned       499
Owned           139
iOS             124
HC search        73
Name: count, dtype: int64

Let's say you are happy with this dataframe (after doing some other preprocessing); you can then start creating a `TextDataMain` object

For this dataframe, I want to:
- Build a text classification model, with the main text in the ```Content``` column, ```Source``` as metadata, and ```L1``` as the label
- Perform `apply_word_tokenize` with text normalization (this is "text transformation")
- For augmentation: oversample the ```Owned```, ```Non Owned``` and ```HC search``` values of column ```Source```, then add Vietnamese no-accent versions of the texts. Note that all of these are called "text augmentation"

Let's define these transformations

> For Text Transformation

In [None]:
# Word tokenization with text normalization switched on
awt_tfm = partial(apply_word_tokenize,normalize_text=True)
# You can also set a __name__ on your transformation function.
# This name is printed in the processing log (see the output of `to_df` further down),
# so you get meaningful text messages as outputs
awt_tfm.__name__='UTS Word Tokenization With Normalization'

# List of text transformations, passed to `content_tfms` when creating the TextDataMain object
txt_tfms=[awt_tfm]

> For Text Augmentation

In [None]:
# apply_to_all=True means I will apply this augmentation to all the data
# (including the original data and the augmented data/transformed data from previous augmentation/transformation).
# It is False for the three oversampling steps and True for the no-accent step.
#
# NOTE: the queries use lowercase `Source` values; presumably the metadata processing step
# lowercases them before augmentation runs (see `tdm.metadata_dict` further down) - confirm if you change it.
# Judging from the `to_df` log below, `frac` is the amount of matching train rows added
# relative to the number of matching rows (0.5 -> +50%, 2 -> +200%, 2.5 -> +250%).
over_nonown_tfm = partial(sampling_with_condition,query='Source=="non owned"',frac=0.5,seed=42,apply_to_all=False)
over_nonown_tfm.__name__ = 'Oversampling Non Owned'

over_own_tfm = partial(sampling_with_condition,query='Source=="owned"',frac=2,seed=42,apply_to_all=False)
over_own_tfm.__name__ = 'Oversampling Owned'

over_hc_tfm = partial(sampling_with_condition,query='Source=="hc search"',frac=2.5,seed=42,apply_to_all=False)
over_hc_tfm.__name__ = 'Oversampling HC search'

# With frac=1 the train set doubles in the log below (2390 -> 4780 rows)
remove_accent_tfm = partial(remove_vnmese_accent,frac=1,seed=42,apply_to_all=True)
remove_accent_tfm.__name__ = 'Add No-Accent Text'

# Augmentations appear in the `to_df` log in this list order
aug_tfms = [over_nonown_tfm,over_own_tfm,over_hc_tfm,remove_accent_tfm]


In [None]:
tdm = TextDataMain(df,
                    main_content='Content',
                    metadatas='Source', # You can put a list of multiple metadatas
                    label_names='L1', # You can put a list of multiple labels
                    val_ratio=0.2,
                    split_cols='L1', # You can even put a list of multiple columns to be used for validation splitting
                    content_tfms = txt_tfms, # You can add multiple content transformation functions ...
                    aug_tfms = aug_tfms, # ... as well as augmentation functions
                    process_metadatas=True,
                    seed=42,
                    shuffle_trn=True)

----- Input Validation Precheck -----
DataFrame contains duplicated values!
-----> Number of duplications: 16 rows


If we want to directly create a ```TextDataMain``` object from our csv file, we can instead use this:

In [None]:
tdm = TextDataMain.from_csv(DATA_PATH/'sample_large.csv',
                            return_df=False,
                            main_content='Content',
                            metadatas='Source',
                            label_names='L1',
                            val_ratio=0.2,
                            split_cols='L1',
                            content_tfms = txt_tfms,
                            aug_tfms = aug_tfms,
                            process_metadatas=True,
                            seed=42,
                            shuffle_trn=True)

----- Input Validation Precheck -----
DataFrame contains duplicated values!
-----> Number of duplications: 16 rows


In [None]:
show_doc(TextDataMain.to_df)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#L354){target="_blank" style="float:right; font-size:smaller"}

### TextDataMain.to_df

>      TextDataMain.to_df ()

To execute all the defined processings and return a dataframe

Note that all the previous constructor calls do not do any heavy processing yet.

To actually run all the processes, one can call `TextDataMain.to_df()`

In [None]:
df_processed = tdm.to_df()

-------------------- Start Main Text Processing --------------------
----- Metadata Simple Processing & Concatenating to Main Content -----
----- Label Encoding -----
-------------------- Text Transformation --------------------
----- UTS Word Tokenization With Normalization -----


100%|██████████████████████████████████████████████████████████████████████████████| 2269/2269 [00:03<00:00, 599.84it/s]


-------------------- Train Test Split --------------------
Previous Validation Percentage: 20.009%
- Before leak check
Size: 454
- After leak check
Size: 447
- Number of rows leaked: 7, or 1.54% of the original validation (or test) data
Current Validation Percentage: 19.7%
-------------------- Text Augmentation --------------------
Train data size before augmentation: 1822
----- Oversampling Non Owned -----
Train data size after THIS augmentation: 2020
----- Oversampling Owned -----
Train data size after THIS augmentation: 2248
----- Oversampling HC search -----
Train data size after THIS augmentation: 2390
----- Add No-Accent Text -----


100%|████████████████████████████████████████████████████████████████████████████| 2390/2390 [00:00<00:00, 19530.97it/s]

Train data size after THIS augmentation: 4780
Train data size after ALL augmentation: 4780





Notice this?
```
Previous Validation Percentage: 20.009%
- Before leak check
Size: 454
- After leak check
Size: 447
- Number of rows leaked: 7, or 1.54% of the original validation (or test) data
Current Validation Percentage: 19.7%
```
After performing the train/test split, the ```TextDataMain``` object also performs a "leak check": after `text_transformation` is performed, it compares the ```Content``` text in the validation set to the ```Content``` text in the train set. Any duplicates (texts that belong to both sets) will be removed from the validation set.

In [None]:
df_processed.sample(5)

Unnamed: 0,Content,Source,L1,is_valid
5090,non owned - https://shopee.vn/maybeaty like và...,non owned,5,True
2280,owned -_Địa chị này dám giao ko a . 😂,owned,1,False
1258,google play - ble,google play,5,False
3743,google play - Ko dừng tự_động cập_nhật được,google play,3,False
4624,owned - Như này là sao ạ,owned,5,False


Note that, since we have metadatas, the metadatas are concatenated to the front of the text content

In [None]:
df_processed.Content.sample(5).values

array(['owned - 🚀 *_* ĐÓN_CHÀO ƯU_ĐÃI VÀNG VÀO THỨ_BA VỚI PHIÊN CHỢ VOUCHER_*_*_🚀_*_🎁 Miễn_phí tối_đa 50K cho 1 giờ khi chọn dịch_vụ dọn_dẹp nhà từ bTaskee *_🤗_Giảm ngay 20 % gói học Toán 12 tháng từ VioEdu_*_🤑_ShopBack tặng ngay 25K tiền thưởng cho đơn hàng từ 150K_*_😍_Giảm 70K cho hóa đơn từ 300K , áp_dụng cho tất_cả dịch_vụ làm đẹp tại Lamia 🎉 ️ 🎉_Chỉ từ 1000 xu , SỐ_LƯỢNG CÓ_HẠN ➡_[_http://shopee.vn/ShopeeDoiXu ] ( http://shopee.vn/ShopeeDoiXu ) ___________________________💥_10.10_SIÊU_SALE CHÍNH HÃNG - VẪN CÒN SALE_🎊_▶_️ https://shopee.vn/1010-Sieu-Sale-Chinh-Hang Săn thêm ưu_đãi Xtra duy_nhất 11.10 : 🎁 Miễn_phí vận_chuyển 0 Đ_🎁_Thương_hiệu hoàn xu tới 50 % 🎁 Thu_thập voucher , nhận đến 1.2 triệu 🎁 4 Khung giờ săn sale đậm : 0H - 9H - 12H - 21H_#_Shopee1010SieuSaleChinhHang',
       'ios - Dat_hang cu bi tu hoan hang ve rat la buc :)',
       'owned - Các bạn từng mua gì đáng đồng_tiền nhất trên Sốp_Pi ?',
       'google play - Được',
       'google play - Chon thanh_toan ma cu the

We now have a new dataframe with only the necessary columns (the processed text column, metadatas, label, and ```is_valid```, which tells you which rows belong to the validation set). Notice that our class has also encoded our labels for us

Our TextDataMain object also stores other useful attributes, such as:

In [None]:
# The entire processed dataframe, similar to the df_processed above
tdm.df.head()

Unnamed: 0,Content,Source,L1,is_valid
0,hc search - làm_sao để hết lỗi m02,hc search,3,False
1,hc search - xin chao,hc search,5,False
2,"owned - 8 NGAY DUA DON , TRUNG VOUCHER 8 THA...",owned,1,False
3,google play - 🤬_😡_🤬_😡,google play,5,False
4,hc search - mua hàng quốc_tế như thế_nào,hc search,4,False


In [None]:
# class names (This will be a list of list, as this class can handle multi-label classification)
tdm.label_lists

[['Buyer complained seller',
  'Commercial',
  'Delivery',
  'Feature',
  'Order/Item',
  'Others',
  'Payment',
  'Return/Refund',
  'Services',
  'Shopee account']]

In [None]:
# a dictionary storing unique value for each provided metadata
tdm.metadata_dict

{'Source': ['google play', 'hc search', 'ios', 'non owned', 'owned']}

If we want to see how a HuggingFace tokenizer works on our processed text:

In [None]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# this will pick a random text from train set to show
tdm.tokenizer_explain_single(tokenizer)

----- Tokenizer Explained -----
--- Input ---
google play - Mang ro manh nhung lai dang nhap khong duoc : <

--- Tokenized results --- 
{'input_ids': [0, 38970, 14015, 31, 8283, 7135, 24136, 12088, 5135, 19058, 2008, 18679, 3014, 2662, 6190, 22899, 27, 8452, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

--- Results from tokenizer.convert_ids_to_tokens ---
['<s>', 'google', 'play', '-', 'Mang', 'ro', 'manh', 'nhung', 'lai', 'dang', 'nh@@', 'ap', 'kh@@', 'ong', 'du@@', 'oc', ':', '<', '</s>']

--- Results from tokenizer.decode --- 
<s> google play - Mang ro manh nhung lai dang nhap khong duoc : < </s>



By doing this, we can see how the tokenizer interacts with our text.

In [None]:
show_doc(TextDataMain.to_datasetdict)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#L367){target="_blank" style="float:right; font-size:smaller"}

### TextDataMain.to_datasetdict

>      TextDataMain.to_datasetdict (tokenizer, is_split_into_words=False,
>                                   max_length=None, trn_ratio=1.0, seed=42)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| tokenizer |  |  | Tokenizer (preferably from HuggingFace) |
| is_split_into_words | bool | False | Is text split into list or not |
| max_length | NoneType | None | pad to model's allowed max length (default is max_sequence_length) |
| trn_ratio | float | 1.0 | Portion of training data to be converted to datasetdict. Useful for sample experiments |
| seed | int | 42 | Random seed |

Since we need to convert our data to HuggingFace's DatasetDict format in order to utilize HuggingFace's model well, we can directly export datasetdict using `TextDataMain.to_datasetdict`

In [None]:
ddict_sample = tdm.to_datasetdict(tokenizer)

-------------------- Map Tokenize Function --------------------


Map:   0%|          | 0/4780 [00:00<?, ? examples/s]

Map:   0%|          | 0/447 [00:00<?, ? examples/s]

In [None]:
ddict_sample

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'Source', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4780
    })
    validation: Dataset({
        features: ['text', 'label', 'Source', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 447
    })
})

In [None]:
ddict_sample['train']['text'][0]

'hc search - làm_sao để hết lỗi m02'

In [None]:
print(ddict_sample['train']['input_ids'][0])

[0, 1340, 1894, 51139, 31, 2407, 24, 351, 1210, 1387, 3974, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Note that PhoBert will auto-pad our sentence to its model max_sequence_length, which is 256

In [None]:
len(ddict_sample['train']['input_ids'][0])

256

In [None]:
ddict_sample['train']['label'][0]

3

In [None]:
show_doc(TextDataMain.save_as_pickles)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_main.py#L150){target="_blank" style="float:right; font-size:smaller"}

### TextDataMain.save_as_pickles

>      TextDataMain.save_as_pickles (fname, parent='pickle_files',
>                                    drop_data_attributes=False)

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| fname |  |  | Name of the pickle file |
| parent | str | pickle_files | Parent folder |
| drop_data_attributes | bool | False | Whether to drop all large-size data attributes |

As the transformations/augmentations can take time for large datasets, we want to save our TextDataMain object. We can use `TextDataMain.save_as_pickles` to export a pickle file

In [None]:
tdm.save_as_pickles('my_tdm')

Then you can load it with

In [None]:
tdm2 = TextDataMain.from_pickle('my_tdm')

... and access all the attributes

In [None]:
tdm2.df.head()

Unnamed: 0,Content,Source,L1,is_valid
0,hc search - làm_sao để hết lỗi m02,hc search,3,False
1,hc search - xin chao,hc search,5,False
2,"owned - 8 NGAY DUA DON , TRUNG VOUCHER 8 THA...",owned,1,False
3,google play - 🤬_😡_🤬_😡,google play,5,False
4,hc search - mua hàng quốc_tế như thế_nào,hc search,4,False


In [None]:
tdm2.label_lists[0]

['Buyer complained seller',
 'Commercial',
 'Delivery',
 'Feature',
 'Order/Item',
 'Others',
 'Payment',
 'Return/Refund',
 'Services',
 'Shopee account']

In [None]:
tdm2.metadata_dict

{'Source': ['google play', 'hc search', 'ios', 'non owned', 'owned']}

Let's check the file size

In [None]:
file_stats = os.stat(Path('pickle_files/my_tdm.pkl'))
print(f'File Size in MegaBytes is {file_stats.st_size / (1024 * 1024)}')

File Size in MegaBytes is 13.537714004516602


As it saves the entire processed dataframe (and datasetdict if you call ```to_datasetdict```), the pickle size can be large. In some scenarios you don't need to store these data attributes (e.g. at inference time, or in production). Thus one can save a lighter pickle file by setting ```drop_data_attributes``` to ```True```

In [None]:
tdm.save_as_pickles('my_lightweight_tdm',drop_data_attributes=True)

In [None]:
file_stats = os.stat(Path('pickle_files/my_lightweight_tdm.pkl'))
print(f'File Size in MegaBytes is {file_stats.st_size / (1024 * 1024)}')

File Size in MegaBytes is 3.1605300903320312


We will see a bigger file size reduction when we work with a much larger dataset

In [None]:
tdm_light = TextDataMain.from_pickle('my_lightweight_tdm')

You can still access some important attributes (except for any data attributes, such as ```df``` or ```main_ddict```)

In [None]:
tdm_light.label_lists[0]

['Buyer complained seller',
 'Commercial',
 'Delivery',
 'Feature',
 'Order/Item',
 'Others',
 'Payment',
 'Return/Refund',
 'Services',
 'Shopee account']

In [None]:
tdm_light.metadata_dict

{'Source': ['google play', 'hc search', 'ios', 'non owned', 'owned']}

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()