# Introduction to NLP with Python's NLTK

* "NLTK is a leading platform for building Python programs to work with human language data." -- NLTK website
* https://www.nltk.org/

In [None]:
import nltk

In [None]:
text = '''
Call me Ishmael. Some years ago—never mind how long precisely—having little
or no money in my purse, and nothing particular to interest me on shore, 
I thought I would sail about a little and see the watery part of the world.
'''

In [None]:
print(text)

## Tokenization

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# You'll see an error here if you have not yet downloaded the relevant resources from NLTK
sent = sent_tokenize(text)
print(sent)

NLTK requires you to download corpora and other resources (such as tokenizer and tagger models) before the functions that depend on them will work.

In [None]:
# Resources this notebook needs: the sentence/word tokenizer models, the
# part-of-speech tagger model, WordNet, and the stopword lists.
#
# Recent NLTK releases (3.9 and later) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' in place of the older 'punkt' and
# 'averaged_perceptron_tagger' packages, so both generations are requested
# to keep the notebook working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
sent = sent_tokenize(text)
print(sent)

In [None]:
print(word_tokenize(sent[1]))

In [None]:
# Flatten the per-sentence token lists into one list of word tokens,
# preserving the order in which the tokens appear in the text.
words = []
for sentence in sent:
    words.extend(word_tokenize(sentence))

In [None]:
print(words)

## Stopword removal

In [None]:
from nltk.corpus import stopwords
from string import punctuation

In [None]:
print(punctuation)

In [None]:
print(stopwords.words('english'))

In [None]:
print(list(punctuation))

In [None]:
myStopWords = list(punctuation) + stopwords.words('english')

In [None]:
print(words)

In [None]:
# Build a copy of the token list without punctuation marks and stopwords.
wordsNoStop = []
for token in words:
    if token in myStopWords:
        continue  # skip anything listed in myStopWords
    wordsNoStop.append(token)
# Show the tokens before and after filtering for comparison.
print(words)
print(wordsNoStop)

We'll use list comprehension to streamline this process.

In [None]:
# Example list comprehension
[i for i in [1,2,3,4]]

In [None]:
[a for a in range(5)]

In [None]:
[x for x in [2,3,6,5,7,8,4] if x > 5]

In [None]:
wordsNoStopComp = [w for w in words if w not in myStopWords]
print(wordsNoStopComp)

## N-grams

In [None]:
from nltk.collocations import *

In [None]:
finder = BigramCollocationFinder.from_words(wordsNoStop)

In [None]:
finder

In [None]:
finder.ngram_fd

In [None]:
finder.ngram_fd.items()

In [None]:
sorted(finder.ngram_fd.items())

## Stemming and Tagging

In [None]:
text2 = 'Ishmael sailed because sailing and wanting to sail was in his blood.'

In [None]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

In [None]:
words = word_tokenize(text2)

In [None]:
print(words)

In [None]:
# Create each stemmer once and reuse it for every token, rather than
# constructing a brand-new stemmer object for each word in the list.
lancaster = LancasterStemmer()
porter = PorterStemmer()

# Stem every token with both algorithms so the results can be compared.
wordLancasterStems = [lancaster.stem(w) for w in words]
wordPorterStems = [porter.stem(w) for w in words]

In [None]:
print(wordLancasterStems)
print(wordPorterStems)

In [None]:
nltk.pos_tag(words)

In [None]:
nltk.pos_tag(words)

In [None]:
nltk.pos_tag(word_tokenize('Once upon a time there was a cat.  It was black and fluffy.'))

To look up what each part-of-speech tag means, check out the [Penn Treebank Project list](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html).

## Word sense disambiguation

In [None]:
from nltk.corpus import wordnet
for ss in wordnet.synsets('sail'):
    print(ss, ss.definition())

In [None]:
from nltk.wsd import lesk

In [None]:
print(words)

In [None]:
wordSense = lesk(words, 'sail')

In [None]:
print(wordSense, wordSense.definition())

In [None]:
wordSense = lesk(words, 'sailed')

In [None]:
print(wordSense, wordSense.definition())

In [None]:
wordSense = lesk(words, 'wanting to sail')

In [None]:
# This cell is expected to raise an error: 'wanting to sail' is a phrase,
# not a single word, so the lesk() call above presumably found no sense
# for it (wordSense is likely None) and there is no definition to print.
# Only pass single words to lesk() when you want a definition back.
print(wordSense, wordSense.definition())

In [None]:
t = 'I will sail to Mexico each winter.'
s = lesk(word_tokenize(t), 'sail')
print(s, s.definition())

In [None]:
# One can help guide disambiguation by 
# further including the part of speech,
# though note that the POS specifications
# are those specific to synsets

for ss in wordnet.synsets('sail'):
    print(ss, ss.pos())

In [None]:
t = 'I will sail to Mexico each winter.'
s = lesk(word_tokenize(t), 'sail', pos='v')
print(s, s.definition())