# Introduction to NLP with Python's NLTK

* "NLTK is a leading platform for building Python programs to work with human language data." -- NLTK website
* https://www.nltk.org/

In [1]:
import nltk

In [2]:
text = '''
Call me Ishmael. Some years ago—never mind how long precisely—having little
or no money in my purse, and nothing particular to interest me on shore, 
I thought I would sail about a little and see the watery part of the world.
'''

In [3]:
print(text)


Call me Ishmael. Some years ago—never mind how long precisely—having little
or no money in my purse, and nothing particular to interest me on shore, 
I thought I would sail about a little and see the watery part of the world.



## Tokenization

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
sent = sent_tokenize(text)
print(sent)

['\nCall me Ishmael.', 'Some years ago—never mind how long precisely—having little\nor no money in my purse, and nothing particular to interest me on shore, \nI thought I would sail about a little and see the watery part of the world.']


In [8]:
print(word_tokenize(sent[1]))

['Some', 'years', 'ago—never', 'mind', 'how', 'long', 'precisely—having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', '.']


In [9]:
words = []
for s in sent:
    for w in word_tokenize(s):
        words.append(w)

In [10]:
print(words)

['Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago—never', 'mind', 'how', 'long', 'precisely—having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', '.']


## Stopword removal

In [11]:
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
print(list(punctuation))

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [15]:
myStopWords = list(punctuation) + stopwords.words('english')

In [16]:
print(words)

['Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago—never', 'mind', 'how', 'long', 'precisely—having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', '.']


In [17]:
wordsNoStop = []
for i in words:
    if i not in myStopWords:
        wordsNoStop.append(i)
print(words)
print(wordsNoStop)

['Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago—never', 'mind', 'how', 'long', 'precisely—having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', '.']
['Call', 'Ishmael', 'Some', 'years', 'ago—never', 'mind', 'long', 'precisely—having', 'little', 'money', 'purse', 'nothing', 'particular', 'interest', 'shore', 'I', 'thought', 'I', 'would', 'sail', 'little', 'see', 'watery', 'part', 'world']


In [18]:
wordsNoStopComp = [w for w in words if w not in myStopWords]
print(wordsNoStopComp)

['Call', 'Ishmael', 'Some', 'years', 'ago—never', 'mind', 'long', 'precisely—having', 'little', 'money', 'purse', 'nothing', 'particular', 'interest', 'shore', 'I', 'thought', 'I', 'would', 'sail', 'little', 'see', 'watery', 'part', 'world']


## N-grams

In [19]:
from nltk.collocations import *

In [20]:
finder = BigramCollocationFinder.from_words(wordsNoStop)

In [21]:
finder

<nltk.collocations.BigramCollocationFinder at 0x7f07ff8e4d90>

In [22]:
finder.ngram_fd

FreqDist({('Call', 'Ishmael'): 1, ('Ishmael', 'Some'): 1, ('Some', 'years'): 1, ('years', 'ago—never'): 1, ('ago—never', 'mind'): 1, ('mind', 'long'): 1, ('long', 'precisely—having'): 1, ('precisely—having', 'little'): 1, ('little', 'money'): 1, ('money', 'purse'): 1, ...})

In [23]:
finder.ngram_fd.items()

dict_items([(('Call', 'Ishmael'), 1), (('Ishmael', 'Some'), 1), (('Some', 'years'), 1), (('years', 'ago—never'), 1), (('ago—never', 'mind'), 1), (('mind', 'long'), 1), (('long', 'precisely—having'), 1), (('precisely—having', 'little'), 1), (('little', 'money'), 1), (('money', 'purse'), 1), (('purse', 'nothing'), 1), (('nothing', 'particular'), 1), (('particular', 'interest'), 1), (('interest', 'shore'), 1), (('shore', 'I'), 1), (('I', 'thought'), 1), (('thought', 'I'), 1), (('I', 'would'), 1), (('would', 'sail'), 1), (('sail', 'little'), 1), (('little', 'see'), 1), (('see', 'watery'), 1), (('watery', 'part'), 1), (('part', 'world'), 1)])

In [24]:
sorted(finder.ngram_fd.items())

[(('Call', 'Ishmael'), 1),
 (('I', 'thought'), 1),
 (('I', 'would'), 1),
 (('Ishmael', 'Some'), 1),
 (('Some', 'years'), 1),
 (('ago—never', 'mind'), 1),
 (('interest', 'shore'), 1),
 (('little', 'money'), 1),
 (('little', 'see'), 1),
 (('long', 'precisely—having'), 1),
 (('mind', 'long'), 1),
 (('money', 'purse'), 1),
 (('nothing', 'particular'), 1),
 (('part', 'world'), 1),
 (('particular', 'interest'), 1),
 (('precisely—having', 'little'), 1),
 (('purse', 'nothing'), 1),
 (('sail', 'little'), 1),
 (('see', 'watery'), 1),
 (('shore', 'I'), 1),
 (('thought', 'I'), 1),
 (('watery', 'part'), 1),
 (('would', 'sail'), 1),
 (('years', 'ago—never'), 1)]

## Stemming and Tagging

In [25]:
text2 = 'Ishmael sailed because sailing and wanting to sail was in his blood.'

In [26]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

In [27]:
words = word_tokenize(text2)

In [28]:
print(words)

['Ishmael', 'sailed', 'because', 'sailing', 'and', 'wanting', 'to', 'sail', 'was', 'in', 'his', 'blood', '.']


In [29]:
wordLancasterStems = [LancasterStemmer().stem(w) for w in words]
wordPorterStems = [PorterStemmer().stem(w) for w in words]

In [30]:
print(wordLancasterStems)
print(wordPorterStems)

['ishmael', 'sail', 'becaus', 'sail', 'and', 'want', 'to', 'sail', 'was', 'in', 'his', 'blood', '.']
['ishmael', 'sail', 'becaus', 'sail', 'and', 'want', 'to', 'sail', 'wa', 'in', 'hi', 'blood', '.']


In [32]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [33]:
nltk.pos_tag(words)

[('Ishmael', 'NNP'),
 ('sailed', 'VBD'),
 ('because', 'IN'),
 ('sailing', 'NN'),
 ('and', 'CC'),
 ('wanting', 'VBG'),
 ('to', 'TO'),
 ('sail', 'VB'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('his', 'PRP$'),
 ('blood', 'NN'),
 ('.', '.')]

In [34]:
nltk.pos_tag(word_tokenize('Once upon a time there was a cat.  It was black and fluffy.'))

[('Once', 'RB'),
 ('upon', 'IN'),
 ('a', 'DT'),
 ('time', 'NN'),
 ('there', 'EX'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('cat', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ('was', 'VBD'),
 ('black', 'JJ'),
 ('and', 'CC'),
 ('fluffy', 'JJ'),
 ('.', '.')]

Check out the [Penn Treebank Project list](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)

## Word sense disambiguation

In [38]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [39]:
from nltk.corpus import wordnet
for ss in wordnet.synsets('sail'):
    print(ss, ss.definition())

Synset('sail.n.01') a large piece of fabric (usually canvas fabric) by means of which wind is used to propel a sailing vessel
Synset('cruise.n.01') an ocean trip taken for pleasure
Synset('sail.n.03') any structure that resembles a sail
Synset('sail.v.01') traverse or travel on (a body of water)
Synset('sweep.v.02') move with sweeping, effortless, gliding motions
Synset('sail.v.03') travel on water propelled by wind
Synset('voyage.v.01') travel on water propelled by wind or by other means


In [40]:
from nltk.wsd import lesk

In [41]:
print(words)

['Ishmael', 'sailed', 'because', 'sailing', 'and', 'wanting', 'to', 'sail', 'was', 'in', 'his', 'blood', '.']


In [42]:
wordSense = lesk(words, 'sail')

In [43]:
print(wordSense, wordSense.definition())

Synset('sail.n.01') a large piece of fabric (usually canvas fabric) by means of which wind is used to propel a sailing vessel


In [44]:
wordSense = lesk(words, 'sailed')

In [45]:
print(wordSense, wordSense.definition())

Synset('voyage.v.01') travel on water propelled by wind or by other means


In [46]:
wordSense = lesk(words, 'wanting to sail')

In [47]:
print(wordSense, wordSense.definition())

AttributeError: 'NoneType' object has no attribute 'definition'

In [48]:
t = 'I will sail to Mexico each winter.'
s = lesk(word_tokenize(t), 'sail')
print(s, s.definition())

Synset('sail.n.03') any structure that resembles a sail
