# Tokenization

## Tokenizing French and English texts

In [1]:
from nautilus_nlp.utils.tokenizer import tokenize, untokenize

[nltk_data] Downloading package punkt to /Users/hugo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Sample sentences reused by the tokenization examples below.
fr_txt = "Ceci est un texte français, j'adore 1 !"
eng_txt = "Let's play together!"

In [3]:
%%time
# Tokenize the French sample with the "fr_spacy" module.
tokenize(fr_txt, lang_module="fr_spacy")

CPU times: user 23.5 ms, sys: 3.68 ms, total: 27.2 ms
Wall time: 30.2 ms


[Ceci, est, un, texte, français, ,, j', adore, 1, !]

In [4]:
%%time
# Same sentence with the "fr_moses" module; the result is kept so the
# untokenize example further down can reuse it.
# In the saved run this cell took ~20 s of wall time versus ~30 ms for "fr_spacy".
tokenized_fr_txt = tokenize(fr_txt, lang_module="fr_moses")

CPU times: user 19.5 s, sys: 157 ms, total: 19.6 s
Wall time: 20.2 s


In [5]:
%%time
tokenized_eng_txt = tokenize(eng_txt, lang_module="en_spacy")
tokenized_eng_txt

CPU times: user 24.3 ms, sys: 3.99 ms, total: 28.3 ms
Wall time: 26.2 ms


In [6]:
%%time
# Tokenize the English sample with the "en_nltk" module; `tokenized_eng_txt`
# is the variable read by the untokenize example below.
tokenized_eng_txt = tokenize(eng_txt, lang_module="en_nltk")
tokenized_eng_txt

CPU times: user 17.3 ms, sys: 3.71 ms, total: 21 ms
Wall time: 26.9 ms


## You can also untokenize your text

In [13]:
# Rebuild the sentence from its English tokens.
untokenize(tokenized_eng_txt, lang='en')

"Let's play together!"

In [15]:
# Known limitation: the French elision is not re-attached, so "j'adore"
# comes back as "j' adore" (see the output below).
untokenize(tokenized_fr_txt, lang='fr')

"Ceci est un texte français, j' adore !"

# Stemming 

In [124]:
from nautilus_nlp.utils.stemmer import stem_tokens

In [125]:
# Stem a list of English tokens (the output is lowercased).
stem_tokens(['I', 'survived', 'these', 'dogs'], lang='english')

['i', 'surviv', 'these', 'dog']

In [128]:
# Tokenize a French sentence, then stem the resulting tokens.
# NOTE(review): tokenize() is called without lang_module, so it relies on the
# function's default tokenizer — confirm that default is suitable for French.
stem_tokens(
    tokenize("je mangerai dans les cuisines du château"),
    lang='french',
)

['je', 'mang', 'dan', 'le', 'cuisin', 'du', 'château']

# Lemmatization

## French 

In [28]:
from nautilus_nlp.utils.lemmatizer import lemmatize_french_tokens

In [29]:
# Already-tokenized French sentence used as input for the lemmatization examples.
# NOTE(review): despite its name, this variable holds tokens, not raw text.
txt_to_tokenize = [
    'Ceci', 'est', 'un', 'texte', 'français', ',', "j'",
    'adore', 'tes', 'frites', 'bien', 'grasses', 'YOLO', '!',
]
print(txt_to_tokenize)

['Ceci', 'est', 'un', 'texte', 'français', ',', "j'", 'adore', 'tes', 'frites', 'bien', 'grasses', 'YOLO', '!']


In [30]:
%%time
# Here "frites" is lemmatized to "frire" because, by default, the function applies the verb replacement last.
# If that does not fit your needs, build your own prioritization rule with the FrenchLefffLemmatizer library.
lemmatize_french_tokens(txt_to_tokenize, module='french_leff_v')

CPU times: user 2.46 s, sys: 286 ms, total: 2.74 s
Wall time: 2.82 s


['Ceci',
 'être',
 'un',
 'texte',
 'français',
 ',',
 "j'",
 'adorer',
 'tes',
 'frire',
 'bien',
 'gras',
 'YOLO',
 '!']

In [31]:
%%time
# Ici on ne remplace que les noms. Frites deviens frite. 
lemmatize_french_tokens(txt_to_tokenize,load_only_pos='n', module='french_leff_v')

CPU times: user 2.26 s, sys: 131 ms, total: 2.39 s
Wall time: 2.44 s


['Ceci',
 'est',
 'un',
 'texte',
 'français',
 ',',
 "j'",
 'adore',
 'tes',
 'frite',
 'bien',
 'grasses',
 'YOLO',
 '!']

In [32]:
%%time
# Same tokens with module='spacy'. In the saved output the lemmas are
# lowercased and "j'" is split into 'j' and "'".
lemmatize_french_tokens(txt_to_tokenize, module='spacy')

CPU times: user 37.9 ms, sys: 51.9 ms, total: 89.8 ms
Wall time: 65.3 ms


['ceci',
 'être',
 'un',
 'texte',
 'français',
 ',',
 'j',
 "'",
 'adorer',
 't',
 'frite',
 'bien',
 'gras',
 'yolo',
 '!']

## English

In [1]:
from nautilus_nlp.utils.lemmatizer import lemmatize_english_tokens

[nltk_data] Downloading package wordnet to /Users/hugo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# English tokens used as input for the lemmatization examples below.
to_lemmatize = ['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']

In [3]:
%%time
# module='spacy': in the saved output tokens are lowercased and 'their' is returned as '-PRON-'.
lemmatize_english_tokens(to_lemmatize, module='spacy')

CPU times: user 27.2 ms, sys: 4.4 ms, total: 31.6 ms
Wall time: 31.3 ms


['the', 'strip', 'bat', 'be', 'hang', 'on', '-PRON-', 'foot', 'for', 'good']

In [4]:
%%time
# module='nltk': in the saved output 'The', 'their' and 'best' are left unchanged.
lemmatize_english_tokens(to_lemmatize, module='nltk')

CPU times: user 3.07 s, sys: 217 ms, total: 3.28 s
Wall time: 3.45 s


['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']