# Compare Spacy and NLTK tokenizers

In [1]:
import pandas as pd

# Sample passage covering the tricky cases: a currency amount, a double
# space, embedded newlines (including a blank line) and contractions.
text = (
    "Good muffins cost $3.88\n"
    "in New York.  Please buy me two of them.\n"
    "\n"
    "Thanks.\n"
    "At eight o'clock on Thursday morning, Arthur didn't feel very good."
)
print(text)

# Benchmark corpus: the same passage repeated 10,000 times.
test_data = pd.Series([text for _ in range(10_000)])

Good muffins cost $3.88
in New York.  Please buy me two of them.

Thanks.
At eight o'clock on Thursday morning, Arthur didn't feel very good.


## Spacy

In [2]:
import spacy

# Load the small English model and run it on the sample passage.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Keep the text of every token except whitespace-only ones.
words = [
    tok.text
    for tok in doc
    if not tok.is_space
]

print(words)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', 'At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']


In [3]:
%%time

# Benchmark: push the whole corpus through the full `nlp` pipeline.
docs = nlp.pipe(test_data)

# One list of non-whitespace token texts per document.
words = pd.Series([[token.text for token in doc if not token.is_space] for doc in docs])

# Spot-check one document from the middle of the corpus.
print(words[1000])

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', 'At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
Wall time: 1min


To improve spacy performance, we can disable the pipeline processing steps that we do not need for word tokenization.

In [4]:
# Load the same model with the tagger, parser and NER components disabled;
# word tokenization does not need them.
nlp_slim = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

# Run the slim pipeline here (not the full `nlp`), so this cell actually
# checks that the tokens are unchanged when those components are disabled.
doc = nlp_slim(text)

# Keep the text of every token except whitespace-only ones.
words = [token.text for token in doc if not token.is_space]

print(words)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', 'At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']


In [5]:
%%time

# Benchmark: push the whole corpus through the slimmed-down pipeline.
docs = nlp_slim.pipe(test_data)

# One list of non-whitespace token texts per document.
words = pd.Series([[token.text for token in doc if not token.is_space] for doc in docs])

# Spot-check one document.
print(words[1])

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', 'At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
Wall time: 1.26 s


## NLTK tokenizers

In [6]:
from nltk.tokenize import PunktSentenceTokenizer, TreebankWordTokenizer, WordPunctTokenizer, word_tokenize

### Using recommended word_tokenize. Accurate but not fast

In [7]:
%%time

# Benchmark: call NLTK's word_tokenize once per document in the corpus.
words = test_data.apply(word_tokenize)

# Show the first result for comparison with the spacy token list.
print(words[0])

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', 'At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
Wall time: 5.14 s


word_tokenize tokenizes words as accurately as spacy, but its performance is worse: it takes 4-5 times longer than the slimmed-down spacy pipeline to do the same work.

### PunktSentenceTokenizer + TreebankWordTokenizer == word_tokenize

In [8]:
# Build both tokenizers once so every call to tokenize() reuses them.
sentence_tokenizer = PunktSentenceTokenizer()
word_tokenizer = TreebankWordTokenizer()

def tokenize(text):
    """Split ``text`` into sentences, then split each sentence into words.

    Returns a single flat list holding the word tokens of all sentences,
    in their original order.
    """
    tokens = []
    for sentence in sentence_tokenizer.tokenize(text):
        tokens.extend(word_tokenizer.tokenize(sentence))
    return tokens

print(tokenize(text))

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', 'At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']


In [9]:
%%time

# Benchmark: run the hand-assembled tokenize() once per document.
words = test_data.apply(tokenize)

# Show the first result for comparison with the word_tokenize output.
print(words[0])

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', 'At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
Wall time: 4.83 s


As expected, our implementation of tokenize() shows basically the same quality and performance as word_tokenize.

### WordPunctTokenizer fast but not accurate

In [10]:
# Single shared tokenizer instance, reused by every wp_tokenize() call.
wp_tokenizer = WordPunctTokenizer()

def wp_tokenize(text):
    """Return the list of tokens WordPunctTokenizer produces for ``text``."""
    tokens = wp_tokenizer.tokenize(text)
    return tokens

print(wp_tokenize(text))

['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', 'At', 'eight', 'o', "'", 'clock', 'on', 'Thursday', 'morning', ',', 'Arthur', 'didn', "'", 't', 'feel', 'very', 'good', '.']


In [11]:
%%time

# Benchmark: run the WordPunctTokenizer wrapper once per document.
words = test_data.apply(wp_tokenize)

# Show the first result; compare with the spacy token list.
print(words[0])

['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.', 'At', 'eight', 'o', "'", 'clock', 'on', 'Thursday', 'morning', ',', 'Arthur', 'didn', "'", 't', 'feel', 'very', 'good', '.']
Wall time: 155 ms


WordPunctTokenizer is faster than spacy but much less accurate

### Using TreebankWordTokenizer alone without PunktSentenceTokenizer

In [12]:
# Treebank word tokenizer applied to raw text, with no sentence splitting.
w_tokenizer = TreebankWordTokenizer()

def w_tokenize(text):
    """Return the tokens TreebankWordTokenizer produces for the whole ``text``."""
    tokens = w_tokenizer.tokenize(text)
    return tokens

print(w_tokenize(text))

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.', 'At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']


In [13]:
%%time

# Benchmark: run the Treebank-only wrapper once per document.
words = test_data.apply(w_tokenize)

# Show the first result; compare with the spacy token list.
print(words[0])

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.', 'At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
Wall time: 1.69 s


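As expected, TreebankWordTokenizer on its own is not able to handle sentence boundaries correctly: sentence-final periods stay attached to the preceding word (e.g. 'York.', 'them.', 'Thanks.'). It is much less accurate than spacy, with slightly worse performance.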

## Conclusion

Spacy with all pipeline steps enabled demonstrates terrible performance. However, after disabling the unnecessary steps, it cannot be beaten by NLTK without a loss of quality.