# Tokenization

Tokenization is the process of dividing text into smaller units (tokens), such as sentences or words.

In [1]:
import nltk

In [2]:
# Read the whole story into memory. The `with` block closes the file handle
# as soon as the read finishes instead of leaving it open for the life of
# the kernel.
filename = './data/sherlock_holmes_1.txt'
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

In [3]:
# Turn every line break into a single space so the text becomes one
# continuous string.
# NOTE(review): presumably the source file is hard-wrapped and the line
# breaks carry no meaning for word tokenization -- confirm.
text = ' '.join(text.split('\n'))

In [5]:
# Split the text into word-level tokens; punctuation marks come out as
# tokens of their own.
words = nltk.word_tokenize(text)
print(words)

['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', '_the_', 'woman', '.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', '.', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', '.', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', '.', 'All', 'emotions', ',', 'and', 'that', 'one', 'particularly', ',', 'were', 'abhorrent', 'to', 'his', 'cold', ',', 'precise', 'but', 'admirably', 'balanced', 'mind', '.', 'He', 'was', ',', 'I', 'take', 'it', ',', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen', ',', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position', '.', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', ',', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer', '.', 'They', 'were', 'admirable', 'things', 'for', 'the', 'observe

In [6]:
# Contractions are split in two: "don't" comes out as "do" followed by "n't".
nltk.word_tokenize("I don't have time")

['I', 'do', "n't", 'have', 'time']

## Tokenizing tweets

For tweets, NLTK's casual tokenizer can optionally strip Twitter user handles and shorten runs of a repeated character to at most three in a row.

In [7]:
# Tokenize a tweet with NLTK's casual (Twitter-aware) tokenizer.
tweet = "@EmpireStateBldg Central Park Tower is reaaaaaaaly hiiiigh"
words = nltk.tokenize.casual.casual_tokenize(
    tweet,
    preserve_case=True,   # keep the original capitalisation
    # `reduce_len` is an on/off flag, not a length: when enabled, runs of a
    # repeated character are shortened to three (see the output below).
    # Passing 3 only worked because it is truthy.
    reduce_len=True,
    strip_handles=True,   # drop @user handles such as "@EmpireStateBldg"
)
print(words)

['Central', 'Park', 'Tower', 'is', 'reaaaly', 'hiiigh']


## Tokenization with spaCy

In [8]:
import spacy
# Re-read the story from disk so the spaCy section starts from the raw text.
# The `with` block closes the file handle as soon as the read finishes.
filename = './data/sherlock_holmes_1.txt'
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

In [9]:
# Turn every line break into a single space before tokenizing.
# NOTE(review): presumably this keeps newline characters from showing up
# in the spaCy token list -- confirm.
text = ' '.join(text.split('\n'))

In [12]:
# Load the 'en_core_web_sm' pipeline and run it over the whole text.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
# Collect the string form of every token in the processed document.
words = list(tok.text for tok in doc)
print(words)

['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', '_', 'the', '_', 'woman', '.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', '.', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', '.', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', '.', 'All', 'emotions', ',', 'and', 'that', 'one', 'particularly', ',', 'were', 'abhorrent', 'to', 'his', 'cold', ',', 'precise', 'but', 'admirably', 'balanced', 'mind', '.', 'He', 'was', ',', 'I', 'take', 'it', ',', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen', ',', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position', '.', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', ',', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer', '.', 'They', 'were', 'admirable', 'things', 'for', 'the', 