# Introduction to NLP with Python's NLTK

* "NLTK is a leading platform for building Python programs to work with human language data." -- NLTK website
* https://www.nltk.org/

In [1]:
import nltk

We'll use the first lines of Moby Dick to explore some NLP basics:

In [2]:
text = '''
Call me Ishmael. Some years ago—never mind how long precisely—having little
or no money in my purse, and nothing particular to interest me on shore, 
I thought I would sail about a little and see the watery part of the world.'
'''

In [3]:
print(text)


Call me Ishmael. Some years ago—never mind how long precisely—having little
or no money in my purse, and nothing particular to interest me on shore, 
I thought I would sail about a little and see the watery part of the world.'



## Tokenization

**Tokenization** breaks the raw text into smaller pieces like sentences and words.

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize

* `sent_tokenize` takes a string and breaks it down into a list of sentences.
* `word_tokenize` takes a string and breaks it down into a list of words.

In [5]:
sentences = sent_tokenize(text)
print(sentences)

['\nCall me Ishmael.', "Some years ago—never mind how long precisely—having little\nor no money in my purse, and nothing particular to interest me on shore, \nI thought I would sail about a little and see the watery part of the world.'"]


In [6]:
print(word_tokenize(sentences[1]))

['Some', 'years', 'ago—never', 'mind', 'how', 'long', 'precisely—having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', '.', "'"]


In [7]:
words = word_tokenize(text)
print(words)

['Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago—never', 'mind', 'how', 'long', 'precisely—having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', '.', "'"]


In [8]:
words2 = []
for s in sentences:
    for w in word_tokenize(s):
        words2.append(w)

In [9]:
print(words2)

['Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago—never', 'mind', 'how', 'long', 'precisely—having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', '.', "'"]


## Stopword removal

Usually in language analysis we don't want our analysis to be skewed by very common words like 'a', 'the', 'and', etc.  These are stopwords and can be removed before commencing a more detailed analysis.  We often may not want to analyse punctuation marks either when analysing language use.

In [10]:
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\chief\.conda\envs\geoprj\lib\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [11]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [12]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [13]:
print(list(punctuation))

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [14]:
myStopWords = list(punctuation) + stopwords.words('english')

In [15]:
print(words)

['Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago—never', 'mind', 'how', 'long', 'precisely—having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', '.', "'"]


In [16]:
wordsNoStop = []
for i in words:
    if i not in myStopWords:
        wordsNoStop.append(i)
print(words)
print(wordsNoStop)

['Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago—never', 'mind', 'how', 'long', 'precisely—having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', '.', "'"]
['Call', 'Ishmael', 'Some', 'years', 'ago—never', 'mind', 'long', 'precisely—having', 'little', 'money', 'purse', 'nothing', 'particular', 'interest', 'shore', 'I', 'thought', 'I', 'would', 'sail', 'little', 'see', 'watery', 'part', 'world']


We'll use list comprehension to streamline this process.

In [17]:
# Example list comprehension
[i for i in [1,2,3,4]]

[1, 2, 3, 4]

In [18]:
[a for a in range(5)]

[0, 1, 2, 3, 4]

In [19]:
[x for x in [2,3,6,5,7,8,4] if x > 5]

[6, 7, 8]

In [20]:
wordsNoStopComp = [w for w in words if w not in myStopWords]
print(wordsNoStopComp)

['Call', 'Ishmael', 'Some', 'years', 'ago—never', 'mind', 'long', 'precisely—having', 'little', 'money', 'purse', 'nothing', 'particular', 'interest', 'shore', 'I', 'thought', 'I', 'would', 'sail', 'little', 'see', 'watery', 'part', 'world']


## N-grams

Words that are near to each other can allow us to draw deeper conclusions about a given text. We can split a text into pairs of co-located words (bi-grams), triplets (tri-grams), and generally into n-tuplets (n-grams).

In [21]:
from nltk.collocations import *

In [22]:
finder = BigramCollocationFinder.from_words(wordsNoStop)

In [23]:
finder

<nltk.collocations.BigramCollocationFinder at 0x2d1213549a0>

In [24]:
finder.ngram_fd

FreqDist({('Call', 'Ishmael'): 1, ('Ishmael', 'Some'): 1, ('Some', 'years'): 1, ('years', 'ago—never'): 1, ('ago—never', 'mind'): 1, ('mind', 'long'): 1, ('long', 'precisely—having'): 1, ('precisely—having', 'little'): 1, ('little', 'money'): 1, ('money', 'purse'): 1, ...})

In [25]:
finder.ngram_fd.items()

dict_items([(('Call', 'Ishmael'), 1), (('Ishmael', 'Some'), 1), (('Some', 'years'), 1), (('years', 'ago—never'), 1), (('ago—never', 'mind'), 1), (('mind', 'long'), 1), (('long', 'precisely—having'), 1), (('precisely—having', 'little'), 1), (('little', 'money'), 1), (('money', 'purse'), 1), (('purse', 'nothing'), 1), (('nothing', 'particular'), 1), (('particular', 'interest'), 1), (('interest', 'shore'), 1), (('shore', 'I'), 1), (('I', 'thought'), 1), (('thought', 'I'), 1), (('I', 'would'), 1), (('would', 'sail'), 1), (('sail', 'little'), 1), (('little', 'see'), 1), (('see', 'watery'), 1), (('watery', 'part'), 1), (('part', 'world'), 1)])

In [26]:
sorted(finder.ngram_fd.items())

[(('Call', 'Ishmael'), 1),
 (('I', 'thought'), 1),
 (('I', 'would'), 1),
 (('Ishmael', 'Some'), 1),
 (('Some', 'years'), 1),
 (('ago—never', 'mind'), 1),
 (('interest', 'shore'), 1),
 (('little', 'money'), 1),
 (('little', 'see'), 1),
 (('long', 'precisely—having'), 1),
 (('mind', 'long'), 1),
 (('money', 'purse'), 1),
 (('nothing', 'particular'), 1),
 (('part', 'world'), 1),
 (('particular', 'interest'), 1),
 (('precisely—having', 'little'), 1),
 (('purse', 'nothing'), 1),
 (('sail', 'little'), 1),
 (('see', 'watery'), 1),
 (('shore', 'I'), 1),
 (('thought', 'I'), 1),
 (('watery', 'part'), 1),
 (('would', 'sail'), 1),
 (('years', 'ago—never'), 1)]

## Stemming and Tagging

Stemming allows us to improve our estimate of word frequency by combining the counts of similar forms of words (e.g. counting sail, sailing, and sailed as representative of the common stem "sail").

Tagging helps us to disambiguate words by identifying their part-of-speech.

In [27]:
text2 = 'Ishmael sailed because sailing and wanting to sail was in his blood.'

In [28]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

In [29]:
words = word_tokenize(text2)

In [30]:
print(words)

['Ishmael', 'sailed', 'because', 'sailing', 'and', 'wanting', 'to', 'sail', 'was', 'in', 'his', 'blood', '.']


In [31]:
wordLancasterStems = [LancasterStemmer().stem(w) for w in words]
wordPorterStems = [PorterStemmer().stem(w) for w in words]

In [32]:
print(wordLancasterStems)
print(wordPorterStems)

['ishmael', 'sail', 'becaus', 'sail', 'and', 'want', 'to', 'sail', 'was', 'in', 'his', 'blood', '.']
['ishmael', 'sail', 'becaus', 'sail', 'and', 'want', 'to', 'sail', 'wa', 'in', 'hi', 'blood', '.']


In [33]:
nltk.pos_tag(words)

[('Ishmael', 'NNP'),
 ('sailed', 'VBD'),
 ('because', 'IN'),
 ('sailing', 'NN'),
 ('and', 'CC'),
 ('wanting', 'VBG'),
 ('to', 'TO'),
 ('sail', 'VB'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('his', 'PRP$'),
 ('blood', 'NN'),
 ('.', '.')]

In [34]:
nltk.pos_tag(word_tokenize('Once upon a time there was a cat.  It was black and fluffy.'))

[('Once', 'RB'),
 ('upon', 'IN'),
 ('a', 'DT'),
 ('time', 'NN'),
 ('there', 'EX'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('cat', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ('was', 'VBD'),
 ('black', 'JJ'),
 ('and', 'CC'),
 ('fluffy', 'JJ'),
 ('.', '.')]

Check out the [Penn Treebank Project list](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)

## Word sense disambiguation

We can further disambiguate words by looking at their synsets.  Synsets are groupings of synonymous words that are conceptually similar.

In [36]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [37]:
from nltk.corpus import wordnet
for ss in wordnet.synsets('sail'):
    print(ss, ss.definition())

Synset('sail.n.01') a large piece of fabric (usually canvas fabric) by means of which wind is used to propel a sailing vessel
Synset('cruise.n.01') an ocean trip taken for pleasure
Synset('sail.n.03') any structure that resembles a sail
Synset('sail.v.01') traverse or travel on (a body of water)
Synset('sweep.v.02') move with sweeping, effortless, gliding motions
Synset('sail.v.03') travel on water propelled by wind
Synset('voyage.v.01') travel on water propelled by wind or by other means


One algorithm for disambiguating a word is the Lesk algorithm, which loosely speaking looks at the definitions of neighboring words to that word and selects the definition that has the highest overlap with these neighboring definitions.

In [38]:
from nltk.wsd import lesk

In [39]:
print(words)

['Ishmael', 'sailed', 'because', 'sailing', 'and', 'wanting', 'to', 'sail', 'was', 'in', 'his', 'blood', '.']


In [40]:
wordSense = lesk(words, 'sail')

In [41]:
print(wordSense, wordSense.definition())

Synset('sail.n.01') a large piece of fabric (usually canvas fabric) by means of which wind is used to propel a sailing vessel


In [42]:
wordSense = lesk(words, 'sailed')

In [43]:
print(wordSense, wordSense.definition())

Synset('voyage.v.01') travel on water propelled by wind or by other means


In [44]:
wordSense = lesk(words, 'wanting to sail')

In [45]:
print(wordSense, wordSense.definition())

AttributeError: 'NoneType' object has no attribute 'definition'

In [46]:
t = 'I sailed to Mexico on a boat each winter.'
s = lesk(word_tokenize(t), 'sailed')
print(s, s.definition())

Synset('voyage.v.01') travel on water propelled by wind or by other means
