# NLP with NLTK (.. and a little sklearn) 

Natural Language Processing with the Natural Language Toolkit

[nltk](http://www.nltk.org/) is a Python package for NLP.

In [1]:
from __future__ import print_function

In [2]:
# pip install nltk
import nltk
import pandas as pd

Much of NLTK depends on additional data which you'll have to download. Use `nltk.download()` to get at least the following:

 * averaged_perceptron_tagger (in models)
 * maxent_treebank_pos_tagger (in models)
 * punkt (in models)
 * maxent_ne_chunk (in models)
 * words (in corpora)

You can install these and continue without restarting your kernel.

In [None]:
nltk.download()

### Sentence tokenization

In [3]:
from nltk.tokenize import sent_tokenize

text = """Hello. How are you, dear Mr. Sir? Are you well?
          Here: drink this! It will make you feel better.
          I mean, it won't make you feel worse!"""

sentences = sent_tokenize(text)
print(sentences)


['Hello.', 'How are you, dear Mr. Sir?', 'Are you well?', 'Here: drink this!', 'It will make you feel better.', "I mean, it won't make you feel worse!"]


### Word tokenization

In [4]:
# TreebankWordTokenizer assumes that our input has already been segmented into sentences..


from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentences[5])

['I', 'mean', ',', 'it', 'wo', "n't", 'make', 'you', 'feel', 'worse', '!']

In [5]:
from nltk.tokenize import word_tokenize
words = word_tokenize(sentences[5])
words

['I', 'mean', ',', 'it', 'wo', "n't", 'make', 'you', 'feel', 'worse', '!']

In [6]:
from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize(sentences[5])

['I', 'mean', ',', 'it', 'won', "'", 't', 'make', 'you', 'feel', 'worse', '!']

Demo of different tokenizers: http://text-processing.com/demo/tokenize/

### Part of speech tagging

In [7]:
from nltk.tag import pos_tag
words=pos_tag(word_tokenize("Who's going to that thing today?"))
words

[('Who', 'WP'),
 ("'s", 'VBZ'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('that', 'DT'),
 ('thing', 'NN'),
 ('today', 'NN'),
 ('?', '.')]

##### Some of POS tags: 
WP: wh-pronoun ("who", "what")  
VBZ: verb, 3rd person sing. present ("takes")  
VBG: verb, gerund/present participle ("taking")  
TO: to ("to go", "to him")   
DT: determiner ("the", "this")  
NN: noun, singular or mass ("door")  
.: Punctuation (".", "?")  

All tags: http://www.monlp.com/2011/11/08/part-of-speech-tags/

### Chunking
Extracting phrases

In [10]:
## the 'named entity' chunker!  ne_chunk utilizes 

from nltk.chunk import ne_chunk
words = word_tokenize("""I'm Julia and I'm here to say
                         I love NLTK in a major way.""")
tags = pos_tag(words)
tree = ne_chunk(tags)
print(tags)
print(tree)

[('I', 'PRP'), ("'m", 'VBP'), ('Julia', 'JJ'), ('and', 'CC'), ('I', 'PRP'), ("'m", 'VBP'), ('here', 'RB'), ('to', 'TO'), ('say', 'VB'), ('I', 'PRP'), ('love', 'VBP'), ('NLTK', 'NNP'), ('in', 'IN'), ('a', 'DT'), ('major', 'JJ'), ('way', 'NN'), ('.', '.')]
(S
  I/PRP
  'm/VBP
  (PERSON Julia/JJ)
  and/CC
  I/PRP
  'm/VBP
  here/RB
  to/TO
  say/VB
  I/PRP
  love/VBP
  (ORGANIZATION NLTK/NNP)
  in/IN
  a/DT
  major/JJ
  way/NN
  ./.)


In [13]:
tree

nltk.tree.Tree

In [16]:
tree.draw()

In [14]:
words2 = word_tokenize("First National Bank announced earnings!")
tags2=pos_tag(words2)
tree2=ne_chunk(tags2)

In [15]:
tree2.draw()

### Included text corpora

Also install these!

 * movie_reviews: Imdb reviews characterized as pos & neg  
 * treebank: tagged and parsed Wall Street Journal text  
 * brown: tagged & categorized English text (news, fiction, etc)  

(There are over 60 others.)

In [1]:
nltk.download()

NameError: name 'nltk' is not defined

In [4]:
# A chunked corpora reader..

from nltk.corpus import treebank_chunk
treebank_chunk.tagged_sents()[0]

  return [tok for tok in self._regexp.split(text) if tok]


[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [5]:
treebank_chunk.chunked_sents()[0].draw()

  return [tok for tok in self._regexp.split(text) if tok]


In [None]:
tree=treebank_chunk.chunked_sents()[0]
' '.join([w for w, t in tree.leaves()])

# TextBlob

In [4]:
# pip install textblob
from textblob import TextBlob

GATSBY_TEXT = """In my younger and more vulnerable years my father
                 gave me some advice that I've been turning over
                 in my mind ever since. "Whenever you feel like
                 criticizing any one," he told me, "blah blah blah."""

gatsby = TextBlob(GATSBY_TEXT)

In [5]:
gatsby.tags

[('In', 'IN'),
 ('my', 'PRP$'),
 ('younger', 'JJR'),
 ('and', 'CC'),
 ('more', 'RBR'),
 ('vulnerable', 'JJ'),
 ('years', 'NNS'),
 ('my', 'PRP$'),
 ('father', 'NN'),
 ('gave', 'VBD'),
 ('me', 'PRP'),
 ('some', 'DT'),
 ('advice', 'NN'),
 ('that', 'IN'),
 ('I', 'PRP'),
 ("'ve", 'VBP'),
 ('been', 'VBN'),
 ('turning', 'VBG'),
 ('over', 'IN'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('mind', 'NN'),
 ('ever', 'RB'),
 ('since', 'IN'),
 ('Whenever', 'IN'),
 ('you', 'PRP'),
 ('feel', 'VBP'),
 ('like', 'IN'),
 ('criticizing', 'VBG'),
 ('any', 'DT'),
 ('one', 'CD'),
 ('he', 'PRP'),
 ('told', 'VBD'),
 ('me', 'PRP'),
 ('blah', 'NN'),
 ('blah', 'NN'),
 ('blah', 'NN')]

In [6]:
gatsby.noun_phrases

WordList(['vulnerable years', 'whenever', 'blah blah blah'])

###  How do you really feel?    TextBlob:  Sentiment Analysis

In [7]:
TextBlob("Oh my god I love this bootcamp, it's so awesome.").sentiment

Sentiment(polarity=0.75, subjectivity=0.8)

In [8]:
TextBlob("it's so awesome").sentiment

Sentiment(polarity=1.0, subjectivity=1.0)

In [9]:
TextBlob("Oh my god.").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [10]:
TextBlob("I love this bootcamp.").sentiment

Sentiment(polarity=0.5, subjectivity=0.6)

In [13]:
TextBlob("Oh my god I love this bootcamp.").sentiment

Sentiment(polarity=0.5, subjectivity=0.6)

In [11]:
TextBlob("it's so awesome.").sentiment

Sentiment(polarity=1.0, subjectivity=1.0)

In [12]:
print(TextBlob("I hate cupcakes.").sentiment)

Sentiment(polarity=-0.8, subjectivity=0.9)


In [14]:
gatsby.sentences

[Sentence("In my younger and more vulnerable years my father
                  gave me some advice that I've been turning over
                  in my mind ever since."), Sentence(""Whenever you feel like
                  criticizing any one," he told me, "blah blah blah.")]

In [None]:
gatsby.words

In [15]:
gatsby.sentences[0].words

WordList(['In', 'my', 'younger', 'and', 'more', 'vulnerable', 'years', 'my', 'father', 'gave', 'me', 'some', 'advice', 'that', 'I', "'ve", 'been', 'turning', 'over', 'in', 'my', 'mind', 'ever', 'since'])

#### Stemming

In [16]:
stemmer = nltk.stem.porter.PorterStemmer()
for word in TextBlob("Are you running in two marathons?").words:
    print(stemmer.stem(word))

are
you
run
in
two
marathon


To see different nltk stemmers in effect:
http://text-processing.com/demo/stem/

In [17]:
for word, count in gatsby.word_counts.items():
    print("%15s %i" % (word, count))

             in 2
             my 3
        younger 1
            and 1
           more 1
     vulnerable 1
          years 1
         father 1
           gave 1
             me 2
           some 1
         advice 1
           that 1
              i 1
             ve 1
           been 1
        turning 1
           over 1
           mind 1
           ever 1
          since 1
       whenever 1
            you 1
           feel 1
           like 1
    criticizing 1
            any 1
            one 1
             he 1
           told 1
           blah 3


In [18]:
def get_count(item):
    return item[1]

for word, count in sorted(gatsby.word_counts.items(), key=get_count, reverse=True):
    print("%15s %i" % (word, count))

             my 3
           blah 3
             in 2
             me 2
        younger 1
            and 1
           more 1
     vulnerable 1
          years 1
         father 1
           gave 1
           some 1
         advice 1
           that 1
              i 1
             ve 1
           been 1
        turning 1
           over 1
           mind 1
           ever 1
          since 1
       whenever 1
            you 1
           feel 1
           like 1
    criticizing 1
            any 1
            one 1
             he 1
           told 1


### Movie Reviews 
(without stopwords!)

In [None]:
nltk.download()

In [19]:
import nltk
from textblob import TextBlob
from nltk.corpus import movie_reviews

fileids = movie_reviews.fileids()[:100]
doc_words = [movie_reviews.words(fileid) for fileid in fileids]
documents = [' '.join(words) for words in doc_words]
print(documents[0:1])

['plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what \' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn \' t snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly . so what are the problems with the movie ? well , its main problem is that it \' s simply too jumbled . it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no

##### Top bigrams in reviews

In [20]:
from nltk.util import ngrams

from collections import Counter
from operator import itemgetter

from nltk.corpus import stopwords
#stopwords are frequent words that bring little meaning to a text
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"']
stop = set(stop)

counter = Counter()

n = 2
for doc in documents:
    words = TextBlob(doc).words
    words = [w for w in words if w not in stop]
    bigrams = ngrams(words, n)
    counter += Counter(bigrams)

for phrase, count in counter.most_common(30):
    print('%20s %i' % (" ".join(phrase), count))

     special effects 20
         ghosts mars 18
         first movie 14
           prinze jr 12
         monkey bone 12
         even though 11
           hong kong 11
        fight scenes 11
            want see 10
           van damme 10
         jackie chan 10
         every scene 10
         movies like 9
          romeo must 9
            must die 9
            big john 9
              sci fi 9
           years ago 8
         sounds like 8
         screen time 8
      john carpenter 8
           two hours 8
            year old 8
         action film 8
         big gorilla 8
            one best 8
      freddie prinze 8
           dr moreau 8
               ho ho 8
         spice girls 8


### Using Sklearn algorithms with text data

CountVectorizer:  Convert a collection of text documents to a matrix of token counts
This implementation produces a sparse representation.


['above all', 'above all to', 'all to', 'all to thine', 'be true', 'come to', 'come to this', 'in the', 'in the state', 'is rotten', 'is rotten in', 'is should', 'is should come', 'of denmark', 'own self', 'own self be', 'rotten in', 'rotten in the', 'self be', 'self be true', 'should come', 'should come to', 'something is', 'something is rotten', 'state of', 'state of denmark', 'that is', 'that is should', 'the state', 'the state of', 'thine own', 'thine own self', 'this above', 'this above all', 'to thine', 'to thine own', 'to this']


In [42]:
from sklearn.feature_extraction.text import CountVectorizer

text = ['That is should come to this!', 'This above all: to thine own self be true.', 'Something is rotten in the state of Denmark.']

# CountVectorizer is a class; so `vectorizer` below represents an instance of that object.
vectorizer = CountVectorizer(ngram_range=(2,3))

# call `fit` to build the vocabulary
vectorizer.fit(text)

# then, use `get_feature_names` to return the tokens
print(vectorizer.get_feature_names())

# finally, call `transform` to convert text to a bag of words
x = vectorizer.transform(text)

print('Sparse Matrix')
# A compressed version; the "sparse" matrix.
print(type(x))
print(x)

print ('Matrix')
x_back = x.toarray()
print(type(x_back))
print(x_back)

pd.DataFrame(x_back, columns=vectorizer.get_feature_names())

['above all', 'above all to', 'all to', 'all to thine', 'be true', 'come to', 'come to this', 'in the', 'in the state', 'is rotten', 'is rotten in', 'is should', 'is should come', 'of denmark', 'own self', 'own self be', 'rotten in', 'rotten in the', 'self be', 'self be true', 'should come', 'should come to', 'something is', 'something is rotten', 'state of', 'state of denmark', 'that is', 'that is should', 'the state', 'the state of', 'thine own', 'thine own self', 'this above', 'this above all', 'to thine', 'to thine own', 'to this']
Sparse Matrix
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 5)	1
  (0, 6)	1
  (0, 11)	1
  (0, 12)	1
  (0, 20)	1
  (0, 21)	1
  (0, 26)	1
  (0, 27)	1
  (0, 36)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 4)	1
  (1, 14)	1
  (1, 15)	1
  (1, 18)	1
  (1, 19)	1
  (1, 30)	1
  (1, 31)	1
  (1, 32)	1
  (1, 33)	1
  (1, 34)	1
  (1, 35)	1
  (2, 7)	1
  (2, 8)	1
  (2, 9)	1
  (2, 10)	1
  (2, 13)	1
  (2, 16)	1
  (2, 17)	1
  (2, 22)	1
  (2, 23)	1
  (2, 24)	1
  (2, 25

Unnamed: 0,above all,above all to,all to,all to thine,be true,come to,come to this,in the,in the state,is rotten,...,that is should,the state,the state of,thine own,thine own self,this above,this above all,to thine,to thine own,to this
0,0,0,0,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,1,1,1,1,1,0,0,0,0,0,...,0,0,0,1,1,1,1,1,1,0
2,0,0,0,0,0,0,0,1,1,1,...,0,1,1,0,0,0,0,0,0,0


# COOL STUFF HAPPENING HERE

In [44]:
import nltk
from textblob import TextBlob
from nltk.corpus import movie_reviews

fileids = movie_reviews.fileids()[:100]
doc_words = [movie_reviews.words(fileid) for fileid in fileids]
documents = [' '.join(words) for words in doc_words]
print(documents[0:1])

['plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what \' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn \' t snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly . so what are the problems with the movie ? well , its main problem is that it \' s simply too jumbled . it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no

In [33]:
#### TF: frequency in this document
#### IDF: inverse frequency in the corpus

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(2,3))
doc_vectors = vectorizer.fit_transform(documents)

classes = np.array(['pos']*50 + ['neg']*50)


model = MultinomialNB().fit(doc_vectors, classes)

In [43]:
classes

array(['pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg'], 
      dtype='<U3')

In [34]:
print(GATSBY_TEXT)

In my younger and more vulnerable years my father
                 gave me some advice that I've been turning over
                 in my mind ever since. "Whenever you feel like
                 criticizing any one," he told me, "blah blah blah.


In [39]:
gatsby_vector = vectorizer.transform([GATSBY_TEXT])
model.predict(gatsby_vector)

array(['pos'], 
      dtype='<U3')

array(['pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos',
       'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg'], 
      dtype='<U3')

In [41]:
len(GATSBY_TEXT.split())

36