# Text Augmentation

> Text augmentation utilities: Vietnamese accent removal and query-based oversampling/undersampling of text data.

- skip_showdoc: true
- skip_exec: true

In [None]:

from nbdev.showdoc import *

In [None]:

from __future__ import annotations
from functools import partial, wraps
import unidecode
import numpy as np
from tqdm import tqdm
from that_nlp_library.utils import val2iterable
import pandas as pd

In [None]:

def _remove_kwargs(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        kwargs.pop("apply_to_all", True)
        return func(*args, **kwargs)
    return wrapper

def _sampling_content(content,frac=1,seed=42,others=None):
    replace=frac>1
    rng = np.random.default_rng(seed)
    _len = len(content)
    idxs = rng.choice(list(range(_len)),int(frac*_len),replace=replace)
    content = content[idxs]
    if others is not None:
        others = others.iloc[idxs]
    return content,others

In [None]:

@_remove_kwargs
def remove_vnmese_accent(content:np.ndarray|list, # A list or Numpy array of strings
                         frac=1, # Fraction of the content to augment (values above 1 oversample)
                         seed=42, # Random seed used for sampling
                         others=None # Metadata associated with the content
                        ):
    "Sample `content` and remove Vietnamese accents from the sampled texts"
    texts = val2iterable(content,t='nparray')
    if isinstance(texts,list):
        texts = np.array(texts)
    sampled,sampled_others = _sampling_content(texts,frac=frac,seed=seed,others=others)
    # Strip the accents from each sampled string, showing a progress bar while doing so
    unaccented = np.array(list(map(unidecode.unidecode,tqdm(sampled))))
    # The sampled metadata is only returned when the caller supplied metadata
    return unaccented if sampled_others is None else (unaccented,sampled_others)

In [None]:
# Render the generated API documentation for `remove_vnmese_accent`
show_doc(remove_vnmese_accent)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_augmentation.py#L35){target="_blank" style="float:right; font-size:smaller"}

### remove_vnmese_accent

>      remove_vnmese_accent (content:numpy.ndarray|list, frac=1, seed=42,
>                            others=None)

Perform Vietnamese accent removal

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| content | np.ndarray \| list |  | A list or Numpy array of string |
| frac | int | 1 | Fraction of the content to perform augmentation |
| seed | int | 42 | Random seed |
| others | NoneType | None | Metadata associating with the content |

In [None]:
# A single string is accepted as well; the result is a one-element array
remove_vnmese_accent('hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức')

100%|████████████████████████████████████████| 1/1 [00:00<00:00, 1934.64it/s]


array(['hoi cu dan chung cu sen hong - chung cu lotus song than thu duc'],
      dtype='<U63')

In [None]:
# Sample inputs: four Vietnamese strings and one English string (which has no accents to remove)
texts=[
     'hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức',
     'This is the recommended way to make a Python package importable from anywhere',
     'hội cần mở thẻ tín dụng tại hà nội, đà nẵng, tp. hồ chí minh',
     "biti's cao lãnh - đồng tháp",
     'chợ phòng trọ + việc làm...khu lĩnh nam - vĩnh hưng - mai động (hoàng mai)'
 ]
# With the default frac=1 all five texts are returned, but in shuffled order:
# the sampling step draws indices at random without replacement
remove_vnmese_accent(texts)

100%|███████████████████████████████████████| 5/5 [00:00<00:00, 45003.26it/s]


array(['cho phong tro + viec lam...khu linh nam - vinh hung - mai dong (hoang mai)',
       'This is the recommended way to make a Python package importable from anywhere',
       'hoi cu dan chung cu sen hong - chung cu lotus song than thu duc',
       "biti's cao lanh - dong thap",
       'hoi can mo the tin dung tai ha noi, da nang, tp. ho chi minh'],
      dtype='<U77')

In [None]:
# frac=0.5 keeps int(0.5*5) = 2 of the 5 texts
remove_vnmese_accent(texts,frac=0.5)

100%|███████████████████████████████████████| 2/2 [00:00<00:00, 28532.68it/s]


array(['hoi cu dan chung cu sen hong - chung cu lotus song than thu duc',
       "biti's cao lanh - dong thap"], dtype='<U63')

In [None]:

@_remove_kwargs
def sampling_with_condition(content:np.ndarray|list, # Numpy array of string
                              query:str, # Pandas query string for query method
                              frac=1, # Fraction of the content to perform augmentation
                              seed=42, # Random seed
                              others:pd.DataFrame=None, # Metadata (as dataframe) that you can query on
                           ):
    """
    Can perform oversampling/undersampling based on dataframe query

    Rows of `others` that match `query` are selected by position, so `others` must be
    row-aligned with `content` but may carry any index (non-default or duplicated labels).
    Returns the sampled content (Numpy array) and the matching rows of `others`.
    Raises ValueError if `others` is None or its length differs from `content`.

    For more information about dataframe query: https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-query
    """
    if others is None:
        raise ValueError("`others` must be a dataframe to query on, got None")
    content = np.asarray(content)
    if len(content) != len(others):
        raise ValueError(f"`content` ({len(content)} items) and `others` ({len(others)} rows) must have the same length")
    # Tag each row with its position before querying: index labels returned by `query`
    # are not valid positions into `content` unless the index is the default 0..n-1 range
    pos_col = '__row_position__'
    while pos_col in others.columns:
        pos_col = '_' + pos_col
    positions = others.assign(**{pos_col: np.arange(len(others))}).query(query)[pos_col].to_numpy()
    content_to_sample = content[positions]
    others_to_sample = others.iloc[positions].copy()
    if len(positions) == 0:
        # Nothing matched the query: there is nothing to sample from
        return content_to_sample,others_to_sample
    return _sampling_content(content_to_sample,frac,seed,others=others_to_sample)

In [None]:
# Render the generated API documentation for `sampling_with_condition`
show_doc(sampling_with_condition)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_augmentation.py#L52){target="_blank" style="float:right; font-size:smaller"}

### sampling_with_condition

>      sampling_with_condition (content:numpy.ndarray|list, query:str, frac=1,
>                               seed=42,
>                               others:pandas.core.frame.DataFrame=None)

Can perform oversampling/undersampling based on dataframe query

For more information about dataframe query: https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-query

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| content | np.ndarray \| list |  | Numpy array of string |
| query | str |  | Pandas query string for query method |
| frac | int | 1 | Fraction of the content to perform augmentation |
| seed | int | 42 | Random seed |
| others | pd.DataFrame | None | Metadata (as dataframe) that you can query on |

In [None]:
# Sample texts for the sampling demos
texts=[
     'hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức',
     'This is the recommended way to make a Python package importable from anywhere',
     'hội cần mở thẻ tín dụng tại hà nội, đà nẵng, tp. hồ chí minh',
     "biti's cao lãnh - đồng tháp",
     'chợ phòng trọ + việc làm...khu lĩnh nam - vĩnh hưng - mai động (hoàng mai)'
      ]
# Pair each text with two metadata columns to query on.
# 'value 1' contains a space, so query strings must wrap it in backticks.
df = pd.DataFrame({'text':texts,
                  'value 1': [1,2,1,3,4],
                  'value_2': ['vnm','eng','vnm','vnm','vnm']
                  })

In [None]:
# Preview the texts together with their metadata columns
df

Unnamed: 0,text,value 1,value_2
0,hội cư dân chung cư sen hồng - chung cư lotus ...,1,vnm
1,This is the recommended way to make a Python p...,2,eng
2,"hội cần mở thẻ tín dụng tại hà nội, đà nẵng, t...",1,vnm
3,biti's cao lãnh - đồng tháp,3,vnm
4,chợ phòng trọ + việc làm...khu lĩnh nam - vĩnh...,4,vnm


In [None]:
# Select the rows whose `value 1` equals 1; frac=1 keeps every matching row
df_new,others = sampling_with_condition(df['text'].values,
                                        query='`value 1` == 1',
                                        frac=1,
                                        others=df[['value 1','value_2']]
                                       )

In [None]:
# `df_new` holds the sampled texts (a Numpy array, despite its name);
# `others` holds the matching metadata rows
print(df_new)
display(others)

['hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức'
 'hội cần mở thẻ tín dụng tại hà nội, đà nẵng, tp. hồ chí minh']


Unnamed: 0,value 1,value_2
0,1,vnm
2,1,vnm


In [None]:
# Oversampling: only one row has 2 < `value 1` < 4, and frac=2 draws it
# int(2*1) = 2 times with replacement
df_new,others = sampling_with_condition(df['text'].values,
                                        query='`value 1`>2 and `value 1`<4',
                                        frac=2,
                                        others=df[['value 1','value_2']]
                                       )

In [None]:
# The single matching text appears twice, along with its duplicated metadata row
print(df_new)
display(others)

["biti's cao lãnh - đồng tháp" "biti's cao lãnh - đồng tháp"]


Unnamed: 0,value 1,value_2
3,3,vnm
3,3,vnm


In [None]:
# Undersampling: four rows have value_2 == "vnm", and frac=0.5 keeps int(0.5*4) = 2 of them
df_new,others = sampling_with_condition(df['text'].values,
                                        query='value_2=="vnm"',
                                        frac=0.5,
                                        others=df[['value 1','value_2']]
                                       )

In [None]:
# Two of the four "vnm" texts remain, each with its metadata row
print(df_new)
display(others)

['hội cư dân chung cư sen hồng - chung cư lotus sóng thần thủ đức'
 'chợ phòng trọ + việc làm...khu lĩnh nam - vĩnh hưng - mai động (hoàng mai)']


Unnamed: 0,value 1,value_2
0,1,vnm
4,4,vnm
