Word Tokenisation
============

In [1]:
text = "The quick brown fox jumped over the lazy dog"

tokens = text.split()
print(tokens)

['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


In [2]:
text = "The quick brown fox, and an Oxford comma"

tokens = text.split()
print(tokens)

['The', 'quick', 'brown', 'fox,', 'and', 'an', 'Oxford', 'comma']


In [3]:
from nltk.tokenize import word_tokenize

text = "The quick brown fox, and an Oxford comma"

tokens = word_tokenize(text)
print(tokens)

['The', 'quick', 'brown', 'fox', ',', 'and', 'an', 'Oxford', 'comma']


In [4]:
from nltk.tokenize import word_tokenize

text = "Tweet about #NLProc @PyConUK :)"

tokens = word_tokenize(text)
print(tokens)

['Tweet', 'about', '#', 'NLProc', '@', 'PyConUK', ':', ')']


In [5]:
from nltk.tokenize import TweetTokenizer

text = "Tweet about #NLProc @PyConUK :)"

tokenizer = TweetTokenizer()
tokens = tokenizer.tokenize(text)
print(tokens)

['Tweet', 'about', '#NLProc', '@PyConUK', ':)']


In [6]:
from nltk.tokenize import TweetTokenizer
text = "Tweet about #NLProc @PyConUK :)"

tokenizer = TweetTokenizer(strip_handles=True)
tokens = tokenizer.tokenize(text)
print(tokens)

['Tweet', 'about', '#NLProc', ':)']


In [7]:
from nltk.tokenize import word_tokenize
text = "How about currencies (like £100,000.00) and dates (like 19th September)"

tokens = word_tokenize(text)
print(tokens)

['How', 'about', 'currencies', '(', 'like', '£100,000.00', ')', 'and', 'dates', '(', 'like', '19th', 'September', ')']


Stemming
=====

In [8]:
from nltk.stem import PorterStemmer

s = PorterStemmer()
print(s.stem('Having'))
print(s.stem('Have'))
print(s.stem('Had'))

print(s.stem('Fishing'))
print(s.stem('Fish'))
print(s.stem('Fisher'))
print(s.stem('Fishes'))
print(s.stem('Fished'))

Have
Have
Had
Fish
Fish
Fisher
Fish
Fish


Lemmatisation
=====

Lemmatisation is similar to stemming, as it produces a normalised version of the input word.

The output is a lemma, i.e. a proper word (different from stemming)

The input word is lemmatised according to its Part-of-Speech (POS) tag, i.e. verb, noun, etc.

In [9]:
# You'll need the "wordnet" package from NLTK data
# python -m nltk.downloader wordnet
from nltk.stem import WordNetLemmatizer

s = WordNetLemmatizer()
print(s.lemmatize('having', pos='v'))
print(s.lemmatize('have', pos='v'))
print(s.lemmatize('had', pos='v'))

print(s.lemmatize('fishing', pos='v'))
print(s.lemmatize('fish', pos='v'))
print(s.lemmatize('fisher', pos='n'))
print(s.lemmatize('fishes', pos='v'))
print(s.lemmatize('fished', pos='v'))

print(s.lemmatize('am', pos='v'))
print(s.lemmatize('is', pos='v'))
print(s.lemmatize('was', pos='v'))

LookupError: 
**********************************************************************
  Resource 'corpora/wordnet' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/Users/marcob/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************