# Tokenization in NLTK

In [1]:
from nltk.tokenize import word_tokenize
# text1 is reused by the spaCy cells further down.
text1 = "The chicken danced because she loved disco."
# Split the sentence into a list of token strings with NLTK and show it.
tokens = word_tokenize(text1)
print(tokens)

['The', 'chicken', 'danced', 'because', 'she', 'loved', 'disco', '.']


In [2]:
# notice that 'Mr.' is one token but 'tacology.' is two tokens.
# text2 is reused by the spaCy cells further down.
text2 = "Mr. Smith loves tacos. He has a Ph.D. in tacology."
# Same NLTK tokenizer as above, now on a text containing abbreviations.
tokens = word_tokenize(text2)
print(tokens)

['Mr.', 'Smith', 'loves', 'tacos', '.', 'He', 'has', 'a', 'Ph.D.', 'in', 'tacology', '.']


# Tokenization in spaCy

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and no recorded
# output -- presumably it was never run successfully; confirm that the
# `spacy.en` import path exists in the installed spaCy version.
from spacy.en import English
parser = English()
# Run the parser on the raw text1 string.
tokens = parser(text1)
# Keep each token's text (orth_), skipping whitespace-only tokens.
tokens = [token.orth_ for token in tokens if not token.orth_.isspace()]
print(tokens)

In [5]:
# A lot of times you will get text from some unknown encoding.
# UTF-8 is the most common representation.
# If you need to catch errors, you can remove errors='ignore'
def convert_unicode(text):
    """Return ``text`` as unicode text, decoding UTF-8 byte strings.

    Byte strings (``str`` on Python 2, ``bytes`` on Python 3) are decoded as
    UTF-8; bytes that cannot be decoded are silently dropped because of
    ``errors='ignore'``. Any other value, including text that is already
    unicode, is returned unchanged.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this check keeps the
    # Python 2 behaviour while no longer calling the non-existent
    # ``str.decode`` on Python 3.
    if isinstance(text, bytes):
        return text.decode('utf-8', errors='ignore')
    return text

In [6]:
import spacy
# Load the 'en' model; `parser` is reused by the following cells.
parser = spacy.load('en')
# The text is passed through convert_unicode before being handed to the parser.
tokens = parser(convert_unicode(text1))
# Keep each token's text (orth_), skipping whitespace-only tokens.
tokens = [token.orth_ for token in tokens if not token.orth_.isspace()]
print(tokens)

[u'The', u'chicken', u'danced', u'because', u'she', u'loved', u'disco', u'.']


In [8]:
# Here we see why spaCy made the sentence segmentation error in the previous lesson
# Tokenize text2 with the already-loaded `parser`.
tokens = parser(convert_unicode(text2))
# Keep each token's text (orth_), skipping whitespace-only tokens.
tokens = [token.orth_ for token in tokens if not token.orth_.isspace()]
print(tokens)

[u'Mr.', u'Smith', u'loves', u'tacos', u'.', u'He', u'has', u'a', u'Ph.D.', u'in', u'tacology', u'.']


In [5]:
# It can be fixed with a little effort https://github.com/explosion/spaCy/issues/592
import spacy
# NOTE(review): `text` is assigned here but the cell tokenizes `text2` below -- confirm.
text = "He has a Ph.D. in tacology."
# Register "Ph.D." as a tokenizer exception on the English defaults before a
# new English() object is created.
spacy.en.English.Defaults.tokenizer_exceptions["Ph.D."] = [{"F": "Ph.D."}]
# `English` is not imported in this cell; it relies on the
# `from spacy.en import English` in an earlier cell.
parser = English()
tokens = parser(text2)
# Keep each token's text (orth_), skipping whitespace-only tokens.
tokens = [token.orth_ for token in tokens if not token.orth_.isspace()]
print(tokens)

['Mr.', 'Smith', 'loves', 'tacos', '.', 'He', 'has', 'a', 'Ph.D.', 'in', 'tacology', '.']


In [22]:
# It can be fixed with a little effort https://github.com/explosion/spaCy/issues/592
import spacy
text = "He has a Ph.D. in tacology."
nlp = spacy.load('en')
# Register "Ph.D." as a tokenizer exception on the loaded pipeline's Defaults.
nlp.Defaults.tokenizer_exceptions["Ph.D."] = [{"F": "Ph.D."}]
# NOTE(review): the text is still tokenized with `parser` from an earlier
# cell, not with the `nlp` object configured above, and `text` is never used
# in this cell -- confirm that this is intended.
tokens = parser(convert_unicode(text2))
# Keep each token's text (orth_), skipping whitespace-only tokens.
tokens = [token.orth_ for token in tokens if not token.orth_.isspace()]
# Function-call form of print, matching the other cells; with a single
# parenthesised argument it prints the same thing under Python 2.
print(tokens)

[u'Mr.', u'Smith', u'loves', u'tacos', u'.', u'He', u'has', u'a', u'Ph.D.', u'in', u'tacology', u'.']


In [15]:
# Load the 'en' model and register "Ph.D." as a tokenizer exception on its Defaults.
# NOTE(review): the other visible cells tokenize with `parser`, not `nlp` --
# confirm that this registration reaches the tokenizer that is actually used.
nlp = spacy.load('en')
nlp.Defaults.tokenizer_exceptions["Ph.D."] = [{"F": "Ph.D."}]

In [23]:
# tokens in spaCy have a lot of information
# NOTE(review): at this point `tokens` appears to be the list of token text
# strings built by the preceding tokenization cell, so this lists string
# attributes rather than spaCy token attributes -- confirm, and call dir() on
# an item of the parser's result if token attributes are wanted.
dir(tokens[0])

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__getslice__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmod__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_formatter_field_name_split',
 '_formatter_parser',
 'capitalize',
 'center',
 'count',
 'decode',
 'encode',
 'endswith',
 'expandtabs',
 'find',
 'format',
 'index',
 'isalnum',
 'isalpha',
 'isdecimal',
 'isdigit',
 'islower',
 'isnumeric',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'zfill']

In [24]:
def print_token(token):
    """Print a separator line followed by the token's text, lemma and shape."""
    print("==========================")
    # Label / attribute pairs, reported in this order:
    #   orth_  - the token text
    #   lemma_ - lemma is the root of a word
    #   shape_ - shape is capitalization and punctuation
    report = (
        ("value:", "orth_"),
        ("lemma:", "lemma_"),
        ("shape:", "shape_"),
    )
    for label, attribute in report:
        print(label, getattr(token, attribute))

In [26]:
# Note the lemma for "ran" and "was"
text3 = "He ran to the store because he was king of the apes."
# The parser's result is iterated directly (not reduced to text strings as in
# the earlier cells) because print_token reads orth_, lemma_ and shape_ from
# each item.
tokens = parser(convert_unicode(text3))
for token in tokens:
    print_token(token)

('value:', u'He')
('lemma:', u'-PRON-')
('shape:', u'Xx')
('value:', u'ran')
('lemma:', u'run')
('shape:', u'xxx')
('value:', u'to')
('lemma:', u'to')
('shape:', u'xx')
('value:', u'the')
('lemma:', u'the')
('shape:', u'xxx')
('value:', u'store')
('lemma:', u'store')
('shape:', u'xxxx')
('value:', u'because')
('lemma:', u'because')
('shape:', u'xxxx')
('value:', u'he')
('lemma:', u'-PRON-')
('shape:', u'xx')
('value:', u'was')
('lemma:', u'be')
('shape:', u'xxx')
('value:', u'king')
('lemma:', u'king')
('shape:', u'xxxx')
('value:', u'of')
('lemma:', u'of')
('shape:', u'xx')
('value:', u'the')
('lemma:', u'the')
('shape:', u'xxx')
('value:', u'apes')
('lemma:', u'ape')
('shape:', u'xxxx')
('value:', u'.')
('lemma:', u'.')
('shape:', u'.')
