# Text Transformation

> This contains some text transformation functionality

In [None]:
#| default_exp text_transformation

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from __future__ import annotations
from underthesea import word_tokenize, sent_tokenize, text_normalize

In [None]:
#| export
def apply_vnmese_word_tokenize(sentence:str, # Input sentence
                        normalize_text:bool=False, # To 'normalize' the text before tokenization
                        fixed_words:list=None, # Phrases to keep together as a single token (forwarded to `word_tokenize`); `None` means no fixed words
                       )->str: # The tokenized sentences, joined back together with single spaces
    "Applying UnderTheSea Vietnamese word tokenization"
    # `None` is the "no fixed words" sentinel. A literal `[]` default would be a
    # single list object shared by every call and handed to third-party code.
    if fixed_words is None:
        fixed_words = []
    if normalize_text:
        sentence = text_normalize(sentence)
    # Split into sentences first, tokenize each one on its own, then stitch the
    # per-sentence results back into one string.
    return ' '.join(
        word_tokenize(sen, format='text', fixed_words=fixed_words)
        for sen in sent_tokenize(sentence)
    )

In [None]:
show_doc(apply_vnmese_word_tokenize)

---

[source](https://github.com/anhquan0412/that-nlp-library/blob/main/that_nlp_library/text_transformation.py#L11){target="_blank" style="float:right; font-size:smaller"}

### apply_vnmese_word_tokenize

>      apply_vnmese_word_tokenize (sentence:str, normalize_text=False,
>                                  fixed_words=[])

Applying UnderTheSea Vietnamese word tokenization

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| sentence | str |  | Input sentence |
| normalize_text | bool | False | To 'normalize' the text before tokenization |
| fixed_words | list | [] |  |

For non-Vietnamese text, the results are hit-or-miss, since UnderTheSea works best on Vietnamese sentences.

In [None]:
text = 'This is a cat. New York city. San Francisco. New York and San Francisco Bay area. George Bush, Barrack Obama'
apply_vnmese_word_tokenize(text)

'This is a_cat . New_York city . San_Francisco . New_York and_San_Francisco Bay area . George Bush , Barrack Obama'

Here's an example on a clean Vietnamese sentence

In [None]:
# A clean, single-sentence Vietnamese input
text = 'Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò'
apply_vnmese_word_tokenize(text)

'Chàng trai 9X Quảng_Trị khởi_nghiệp từ nấm sò'

What if the sentence is not cleaned?

In [None]:
# Deliberately messy input: missing spaces after punctuation, repeated spaces, emoji, emoticons and a hashtag
text = "Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò.Anh ấy không nuôi   nấm😊. nhưng anh này nuôi. Chị ấy lại không nuôi?(ai biết tại sao 😊😊? )Rồi? Rồi sao?rồi ?Rồi ủa...chứ chị ấy nuôi gì, #mộthaiba cũng không rõ =)) 😊. Haha :) 😊 hehe 😊."

In [None]:
apply_vnmese_word_tokenize(text)

'Chàng trai 9X Quảng_Trị khởi_nghiệp từ nấm sò . Anh ấy không nuôi nấm 😊 . nhưng anh này nuôi . Chị ấy lại không nuôi ? ( ai biết tại_sao 😊_😊 ? ) Rồi ? Rồi sao ? rồi ? Rồi ủa ... chứ chị ấy nuôi gì , #_mộthaiba cũng không rõ =))_😊 . Haha :) 😊 hehe 😊 .'

We need to normalize the text

In [None]:
apply_vnmese_word_tokenize(text,normalize_text=True)

'Chàng trai 9X Quảng_Trị khởi_nghiệp từ nấm sò . Anh ấy không nuôi nấm 😊 . nhưng anh này nuôi . Chị ấy lại không nuôi ? ( ai biết tại_sao 😊_😊 ? ) Rồi ? Rồi sao ? rồi ? Rồi ủa ... chứ chị ấy nuôi gì , #_mộthaiba cũng không rõ =))_😊 . Haha :) 😊 hehe 😊 .'

We can also pass a list of specific multi-word phrases (`fixed_words`) that should each be kept together as a single token

In [None]:
# Baseline: tokenize without any fixed words
text = "Viện Nghiên Cứu chiến lược quốc gia về học máy"
apply_vnmese_word_tokenize(text)

'Viện Nghiên_Cứu chiến_lược quốc_gia về học máy'

In [None]:
# Same sentence, now forcing two phrases to each stay together as one token
apply_vnmese_word_tokenize(text,fixed_words=["Viện Nghiên Cứu", "học máy"])

'Viện_Nghiên_Cứu chiến_lược quốc_gia về học_máy'

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()