# Text processing tools benchmark

In [1]:
from urllib import request

## Load french text

In [2]:
url = "https://www.gutenberg.org/cache/epub/5711/pg5711.txt"

In [3]:
response = request.urlopen(url)
rawfr = response.read().decode('utf8')

In [4]:
print(rawfr[:500])

﻿The Project Gutenberg EBook of Germinal, by Emile Zola
(#8 in our series by Emile Zola)

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file.  Please do not remove it.  Do not change or edit the
header without written permission.

Please read the "legal small print," and ot


In [5]:
type(rawfr)

str

In [6]:
len(rawfr)

1046377

## Load english text

In [7]:
url = "http://www.gutenberg.org/files/2554/2554-0.txt"

In [8]:
response = request.urlopen(url)
raw = response.read().decode('utf8')

In [9]:
print(raw[:500])

﻿The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org


Title: Crime and Punishment

Author: Fyodor Dostoevsky

Release Date: March 28, 2006 [EBook #2554]
Last Updated: October 27, 2016

Language: English

Charac


In [10]:
type(raw)

str

In [11]:
len(raw)

1176967

## Tokenization

Several tokenizers are available. As you will see bellow, spaCy is much faster than the other implementations (Moses, NLTK) and often return better results. 

In [12]:
from nautilus_nlp.preprocessing.tokenizer import tokenize, untokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### French spaCy

In [13]:
%%time
tokenized = tokenize(rawfr[:1000000], lang_module="fr_spacy")

CPU times: user 1.84 s, sys: 84.1 ms, total: 1.93 s
Wall time: 1.93 s


In [14]:
%%timeit
tokenized = tokenize(rawfr[:1000000], lang_module="fr_spacy")

768 ms ± 12.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
tokenized = tokenize(rawfr[:1000000], lang_module="fr_spacy")

### French Moses

In [15]:
%%time
tokenized_moses = tokenize(rawfr[:1000000], lang_module="fr_moses")

CPU times: user 15.6 s, sys: 18.9 ms, total: 15.6 s
Wall time: 15.6 s


In [19]:
%%timeit
tokenized_moses = tokenize(rawfr[:1000000], lang_module="fr_moses")

1.78 s ± 45.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
tokenized_moses = tokenize(rawfr[:1000000], lang_module="fr_moses")

In [23]:
print('spacy: {} tokens\nmoses: {} tokens'.format(len(tokenized),len(tokenized_moses)))

spacy: 227009 tokens
moses: 202475 tokens


In [24]:
print(tokenized[1000:1200])

['se', 'diriger', 'vers', 'les', '\r\n', 'bâtiments', ',', 'il', 'se', 'risqua', 'enfin', 'à', 'gravir', 'le', 'terri', 'sur', 'lequel', 'brûlaient', '\r\n', 'les', 'trois', 'feux', 'de', 'houille', ',', 'dans', 'des', 'corbeilles', 'de', 'fonte', ',', 'pour', 'éclairer', '\r\n', 'et', 'réchauffer', 'la', 'besogne', '.', ' ', 'Les', 'ouvriers', 'de', 'la', 'coupe', 'à', 'terre', 'avaient', 'dû', '\r\n', 'travailler', 'tard', ',', 'on', 'sortait', 'encore', 'les', 'débris', 'inutiles', '.', ' ', 'Maintenant', ',', '\r\n', 'il', 'entendait', 'les', 'moulineurs', 'pousser', 'les', 'trains', 'sur', 'les', 'tréteaux', ',', 'il', '\r\n', 'distinguait', 'des', 'ombres', 'vivantes', 'culbutant', 'les', 'berlines', ',', 'près', 'de', 'chaque', '\r\n', 'feu', '.', '\r\n\r\n', '--Bonjour', ',', 'dit', '-', 'il', 'en', "s'", 'approchant', "d'", 'une', 'des', 'corbeilles', '.', '\r\n\r\n', 'Tournant', 'le', 'dos', 'au', 'brasier', ',', 'le', 'charretier', 'était', 'debout', ',', 'un', 'vieillard', 

In [25]:
print(tokenized_moses[1000:1200])

['le', 'dos', 'au', 'brasier', ',', 'le', 'charretier', 'était', 'debout', ',', 'un', 'vieillard', 'vêtu', "d'", 'un', 'tricot', 'de', 'laine', 'violette', ',', 'coiffé', "d'", 'une', 'casquette', 'en', 'poil', 'de', 'lapin', ';', 'pendant', 'que', 'son', 'cheval', ',', 'un', 'gros', 'cheval', 'jaune', ',', 'attendait', ',', 'dans', 'une', 'immobilité', 'de', 'pierre', ',', "qu'", 'on', 'eût', 'vidé', 'les', 'six', 'berlines', 'montées', 'par', 'lui', '.', 'Le', 'manoeuvre', 'employé', 'au', 'culbuteur', ',', 'un', 'gaillard', 'roux', 'et', 'efflanqué', ',', 'ne', 'se', 'pressait', 'guère', ',', 'pesait', 'sur', 'le', 'levier', "d'", 'une', 'main', 'endormie', '.', 'Et', ',', 'là-haut', ',', 'le', 'vent', 'redoublait', ',', 'une', 'bise', 'glaciale', ',', 'dont', 'les', 'grandes', 'haleines', 'régulières', 'passaient', 'comme', 'des', 'coups', 'de', 'faux', '.', '--Bonjour', ',', 'répondit', 'le', 'vieux', '.', 'Un', 'silence', 'se', 'fit', '.', "L'", 'homme', ',', 'qui', 'se', 'sentai

### English spaCy

In [None]:
%%time
tokenized_eng = tokenize(raw[:1000000], lang_module="en_spacy")

In [33]:
tokenized_eng = tokenize(raw[:1000000], lang_module="en_spacy")

In [34]:
%%timeit
tokenize(raw[:1000000], lang_module="en_spacy")

260 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
tokenized_eng[:10]

['\ufeffThe',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by']

### English NLTK 

In [30]:
%%time
tokenized_eng_nltk = tokenize(raw[:1000000], lang_module="en_nltk")

CPU times: user 1.56 s, sys: 28 ms, total: 1.58 s
Wall time: 1.58 s


In [36]:
tokenized_eng_nltk = tokenize(raw[:1000000], lang_module="en_nltk")

In [31]:
%%timeit
tokenize(raw[:1000000], lang_module="en_nltk")

1.54 s ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
tokenized_eng_nltk[:10]

['\ufeffThe',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by']

# Stemming 

In [38]:
from nautilus_nlp.preprocessing.stemming import stem_tokens

In [39]:
%%time
stem = stem_tokens(tokenized,lang='french')

CPU times: user 4.19 s, sys: 20 ms, total: 4.21 s
Wall time: 4.21 s


In [40]:
%%timeit
stem_tokens(tokenized,lang='french')

4.25 s ± 61.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Lemmatization

## French 

In [41]:
from nautilus_nlp.preprocessing.lemmatization import lemmatize_french_tokens

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [42]:
from nautilus_nlp.preprocessing.preprocess import remove_tokens_with_nonletters

In [43]:
tokenized = remove_tokens_with_nonletters(tokenized)

In [46]:
%%time
lemmatized_tokens = lemmatize_french_tokens(tokenized, module='spacy')

CPU times: user 21.9 s, sys: 8.62 s, total: 30.6 s
Wall time: 30.6 s


In [47]:
lemmatized_tokens = lemmatize_french_tokens(tokenized, module='spacy')

In [48]:
print(lemmatized_tokens[1000:1200])

['de', 'puits', 'le', 'vaste', 'chambre', 'de', 'le', 'machine', 'extraction', 'le', 'tourelle', 'de', 'le', 'pompe', 'ce', 'fosse', 'au', 'fondre', 'un', 'creux', 'avec', 'son', 'construction', 'trapu', 'de', 'brique', 'dresser', 'son', 'comme', 'un', 'corne', 'luire', 'sembler', 'avoir', 'un', 'air', 'mauver', 'de', 'goulu', 'accroupie', 'pour', 'manger', 'le', 'monde', 'tout', 'en', 'examiner', 'il', 'songer', 'luire', 'son', 'existence', 'de', 'vagabond', 'depuis', 'huit', 'jour', 'il', 'chercher', 'un', 'place', 'il', 'se', 'revoir', 'dans', 'son', 'atelier', 'de', 'chemin', 'de', 'fer', 'gifler', 'son', 'chef', 'de', 'Lille', 'de', 'partout', 'le', 'samedi', 'il', 'Marchiennes', 'on', 'dire', 'il', 'y', 'avoir', 'de', 'travail', 'aux', 'Forges', 'et', 'rien', 'ni', 'aux', 'Forges', 'ni', 'chez', 'Sonneville', 'il', 'avoir', 'passer', 'le', 'dimanche', 'sou', 'le', 'bois', 'un', 'chantier', 'de', 'charronnage', 'dont', 'le', 'surveillant', 'venir', 'de', 'expulser', 'deux', 'heure

## English

In [49]:
from nautilus_nlp.preprocessing.lemmatization import lemmatize_english_tokens

In [50]:
tokenized_eng = remove_tokens_with_nonletters(tokenized_eng)

In [51]:
%%time
lemmatized_eng = lemmatize_english_tokens(tokenized_eng, module='spacy')

CPU times: user 24.3 s, sys: 11.3 s, total: 35.6 s
Wall time: 35.6 s


In [52]:
lemmatized_eng = lemmatize_english_tokens(tokenized_eng, module='spacy')

In [53]:
%%time
lemmatized_eng_nltk = lemmatize_english_tokens(tokenized_eng, module='nltk')

CPU times: user 20.5 s, sys: 706 ms, total: 21.2 s
Wall time: 21.1 s


In [54]:
lemmatized_eng_nltk = lemmatize_english_tokens(tokenized_eng, module='nltk')

In [55]:
print(tokenized_eng[1000:1200])

['feel', 'ashamed', 'He', 'was', 'hopelessly', 'in', 'debt', 'to', 'his', 'landlady', 'and', 'was', 'afraid', 'of', 'meeting', 'her', 'This', 'was', 'not', 'because', 'he', 'was', 'cowardly', 'and', 'abject', 'quite', 'the', 'contrary', 'but', 'for', 'some', 'time', 'past', 'he', 'had', 'been', 'in', 'an', 'overstrained', 'irritable', 'condition', 'verging', 'on', 'hypochondria', 'He', 'had', 'become', 'so', 'completely', 'absorbed', 'in', 'himself', 'and', 'isolated', 'from', 'his', 'fellows', 'that', 'he', 'dreaded', 'meeting', 'not', 'only', 'his', 'landlady', 'but', 'anyone', 'at', 'all', 'He', 'was', 'crushed', 'by', 'poverty', 'but', 'the', 'anxieties', 'of', 'his', 'position', 'had', 'of', 'late', 'ceased', 'to', 'weigh', 'upon', 'him', 'He', 'had', 'given', 'up', 'attending', 'to', 'matters', 'of', 'practical', 'importance', 'he', 'had', 'lost', 'all', 'desire', 'to', 'do', 'so', 'Nothing', 'that', 'any', 'landlady', 'could', 'do', 'had', 'a', 'real', 'terror', 'for', 'him', 'B

In [56]:
print(lemmatized_eng[1000:1200])

['feel', 'ashamed', '-PRON-', 'be', 'hopelessly', 'in', 'debt', 'to', '-PRON-', 'landlady', 'and', 'be', 'afraid', 'of', 'meet', '-PRON-', 'This', 'be', 'not', 'because', '-PRON-', 'be', 'cowardly', 'and', 'abject', 'quite', 'the', 'contrary', 'but', 'for', 'some', 'time', 'past', '-PRON-', 'have', 'be', 'in', 'an', 'overstrained', 'irritable', 'condition', 'verge', 'on', 'hypochondria', '-PRON-', 'have', 'become', 'so', 'completely', 'absorb', 'in', '-PRON-', 'and', 'isolate', 'from', '-PRON-', 'fellow', 'that', '-PRON-', 'dread', 'meeting', 'not', 'only', '-PRON-', 'landlady', 'but', 'anyone', 'at', 'all', '-PRON-', 'be', 'crush', 'by', 'poverty', 'but', 'the', 'anxiety', 'of', '-PRON-', 'position', 'have', 'of', 'late', 'cease', 'to', 'weigh', 'upon', '-PRON-', '-PRON-', 'have', 'give', 'up', 'attend', 'to', 'matter', 'of', 'practical', 'importance', '-PRON-', 'have', 'lose', 'all', 'desire', 'to', 'do', 'so', 'Nothing', 'that', 'any', 'landlady', 'could', 'do', 'have', 'a', 'real',

In [57]:
print(lemmatized_eng_nltk[1000:1200])

['feel', 'ashamed', 'He', 'be', 'hopelessly', 'in', 'debt', 'to', 'his', 'landlady', 'and', 'be', 'afraid', 'of', 'meeting', 'her', 'This', 'be', 'not', 'because', 'he', 'be', 'cowardly', 'and', 'abject', 'quite', 'the', 'contrary', 'but', 'for', 'some', 'time', 'past', 'he', 'have', 'be', 'in', 'an', 'overstrain', 'irritable', 'condition', 'verge', 'on', 'hypochondria', 'He', 'have', 'become', 'so', 'completely', 'absorbed', 'in', 'himself', 'and', 'isolated', 'from', 'his', 'fellow', 'that', 'he', 'dread', 'meeting', 'not', 'only', 'his', 'landlady', 'but', 'anyone', 'at', 'all', 'He', 'be', 'crush', 'by', 'poverty', 'but', 'the', 'anxiety', 'of', 'his', 'position', 'have', 'of', 'late', 'cease', 'to', 'weigh', 'upon', 'him', 'He', 'have', 'give', 'up', 'attend', 'to', 'matter', 'of', 'practical', 'importance', 'he', 'have', 'lose', 'all', 'desire', 'to', 'do', 'so', 'Nothing', 'that', 'any', 'landlady', 'could', 'do', 'have', 'a', 'real', 'terror', 'for', 'him', 'But', 'to', 'be', '