In [1]:
import nltk
#nltk.download()

### HMM POS TAGGING

In [2]:
# HMM POS tagger: model definition for the four-word example sentence

# hidden states (POS tags), numbered 0..3; `start` is the pseudo-state used
# to look up the probability of each tag for the first word
start = -1
VB, TO, NN, PPSS = range(4)
stateCount = 4
stateNames = ["VB", "TO", "NN", "PPSS"]

# observable outputs (words), numbered 0..3
I, WANT, To, RACE = range(4)

timeSteps = 4

# state transition probabilities, keyed by (fromState, toState);
# one row per source state, columns ordered VB, TO, NN, PPSS
trans = {
    (start, VB): .19,   (start, TO): .0043,  (start, NN): .041,   (start, PPSS): .067,
    (VB, VB): .0038,    (VB, TO): .035,      (VB, NN): .047,      (VB, PPSS): .0070,
    (TO, VB): .83,      (TO, TO): 0,         (TO, NN): .00047,    (TO, PPSS): 0,
    (NN, VB): .0040,    (NN, TO): .016,      (NN, NN): .087,      (NN, PPSS): .0045,
    (PPSS, VB): .23,    (PPSS, TO): .00079,  (PPSS, NN): .0012,   (PPSS, PPSS): .00014,
}

# state output (emission) probabilities, keyed by (state, word);
# one row per state, columns ordered I, WANT, To, RACE
output = {
    (VB, I): 0,      (VB, WANT): .0093,     (VB, To): 0,      (VB, RACE): .00012,
    (TO, I): 0,      (TO, WANT): 0,         (TO, To): .99,    (TO, RACE): 0,
    (NN, I): 0,      (NN, WANT): .000054,   (NN, To): 0,      (NN, RACE): .00057,
    (PPSS, I): .37,  (PPSS, WANT): 0,       (PPSS, To): 0,    (PPSS, RACE): 0,
}

# the observation sequence to tag, as output indices and as printable words
sentence = [I, WANT, To, RACE]
words = ["I", "WANT", "TO", "RACE"]

# shared tables filled in by the tagger: cell values keyed by (t, state)
# and back pointers keyed by (t, state)
cells = {}
backStates = {}

def computeMaxPrev(t, sNext):
    """Find the best predecessor of state ``sNext`` among the cells at time ``t``.

    Every state ``s`` in ``range(stateCount)`` is scored as
    ``cells[t, s] * trans[(s, sNext)]`` using the module-level tables.

    Returns a ``(score, state)`` tuple for the highest-scoring state; when
    several states tie, the lowest-numbered one is returned.  With no states
    at all the result is ``(0, 0)``.
    """
    scores = [cells[t, prev] * trans[(prev, sNext)] for prev in range(stateCount)]
    if not scores:
        return (0, 0)

    # max() keeps the first of several equal maxima, i.e. the lowest state index
    bestState = max(range(len(scores)), key=scores.__getitem__)
    return (scores[bestState], bestState)
    
def viterbi(trans, output, sentence):
    """Return the most probable state sequence for ``sentence`` (Viterbi decoding).

    Parameters:
        trans:    dict mapping (fromState, toState) to a transition probability;
                  the probabilities for the first time step are stored under
                  (start, state).
        output:   dict mapping (state, symbol) to an output probability.
        sentence: sequence of observed symbols, one per time step.

    Returns:
        A list with one state index per element of ``sentence``; an empty list
        when ``sentence`` is empty.  Ties are broken in favour of the
        lowest-numbered state.

    Side effects:
        The module-level ``cells`` and ``backStates`` tables are emptied and
        refilled with the values and back pointers of this run.
    """
    steps = len(sentence)

    # start from empty tables so they describe this run only
    cells.clear()
    backStates.clear()

    if steps == 0:
        return []

    # special handling for t=0, which has no prior states
    for s in range(stateCount):
        cells[(0, s)] = trans[(start, s)] * output[(s, sentence[0])]

    # handle the rest of the time steps
    for t in range(1, steps):
        for s in range(stateCount):
            # Score every predecessor against the `trans` argument.  This is
            # done inline because the module-level computeMaxPrev helper reads
            # the global transition table rather than the one passed in here.
            scores = [cells[(t - 1, prev)] * trans[(prev, s)] for prev in range(stateCount)]
            maxState = max(range(stateCount), key=scores.__getitem__)
            backStates[(t, s)] = maxState
            cells[(t, s)] = scores[maxState] * output[(s, sentence[t])]

    # Pick the best final state, then follow the back pointers to time 0.
    # Taking an independent argmax in every column instead can stitch together
    # states that do not lie on one best path.
    finalScores = [cells[(steps - 1, s)] for s in range(stateCount)]
    state = max(range(stateCount), key=finalScores.__getitem__)

    path = [state]
    for t in range(steps - 1, 0, -1):
        state = backStates[(t, state)]
        path.append(state)

    path.reverse()
    return path

# run the tagger on the example sentence and show the tag chosen for each word
path = viterbi(trans, output, sentence)

print("Tagged Sentence:")
for i, state in enumerate(path):
    print("  word=", words[i], "\ttag=", stateNames[state])
    

Tagged Sentence:
  word= I 	tag= PPSS
  word= WANT 	tag= VB
  word= TO 	tag= TO
  word= RACE 	tag= VB


### Classical NLP Applications

In [7]:
# Name -> gender classifier: a small end-to-end example of feature
# extraction and model building.

# labelled examples from the NLTK names corpus: all male names first, then all female names
names = [
    (name, gender)
    for gender, corpusFile in (('male', "male.txt"), ('female', "female.txt"))
    for name in nltk.corpus.names.words(corpusFile)
]

def extract_gender_features(name):
    """Build the feature dict used to classify ``name`` by gender.

    The name is lowercased, then described by its endings and beginnings:

    * ``suffix``, ``suffix2``, ``suffix3`` - the last 1, 2 and 3 characters
    * ``prefix``, ``prefix2`` ... ``prefix5`` - the first 1 to 5 characters

    When the name is shorter than the requested width, the feature falls back
    to the first character of the name.  An empty name yields ``""`` for every
    feature instead of raising ``IndexError``.

    Returns a dict mapping feature name to a string.
    """
    name = name.lower()

    # name[:1] equals name[0] for non-empty names and is "" for an empty one
    first = name[:1]

    features = {"suffix": name[-1:]}
    for width in (2, 3):
        features["suffix" + str(width)] = name[-width:] if len(name) >= width else first

    features["prefix"] = first
    for width in (2, 3, 4, 5):
        features["prefix" + str(width)] = name[:width] if len(name) >= width else first

    # Left disabled, as in the original cell: suffix4-suffix6, wordLen and
    # per-letter "<letter>-count" features.
    return features

# one (feature dict, gender) pair per name
data = [(extract_gender_features(name), gender) for (name,gender) in names]

import random

# `names` lists every male name before the female ones, so shuffle before
# splitting.  A dedicated, seeded generator keeps the split - and therefore
# the accuracies reported below - the same on every run.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)

# 80% of the examples for training, the remaining 20% for testing
dataCount = len(data)
trainCount = int(.8*dataCount)

trainData = data[:trainCount]
testData = data[trainCount:]
bayes = nltk.NaiveBayesClassifier.train(trainData)

def classify(name):
    """Print the gender label that the ``bayes`` classifier assigns to ``name``."""
    features = extract_gender_features(name)
    print("name=", name, "classifed as=", bayes.classify(features))

# accuracy on the examples the model was trained on, then on the held-out ones
for caption, subset in (("trainData accuracy=", trainData), ("testData accuracy=", testData)):
    print(caption, nltk.classify.accuracy(bayes, subset))

# spot-check a few individual names
for sample in ('raj', 'nancy', 'alex'):
    classify(sample)

bayes.show_most_informative_features(25)

trainData accuracy= 0.9211644374508261
testData accuracy= 0.842039018250472
name= raj classifed as= male
name= nancy classifed as= female
name= alex classifed as= female
Most Informative Features
                 suffix2 = 'na'           female : male   =     89.2 : 1.0
                 suffix2 = 'ta'           female : male   =     39.2 : 1.0
                 suffix2 = 'ld'             male : female =     36.8 : 1.0
                  suffix = 'k'              male : female =     36.7 : 1.0
                 suffix2 = 'ia'           female : male   =     33.9 : 1.0
                  suffix = 'a'            female : male   =     33.5 : 1.0
                 suffix2 = 'sa'           female : male   =     31.3 : 1.0
                 suffix2 = 'rt'             male : female =     30.0 : 1.0
                 suffix2 = 'rd'             male : female =     29.0 : 1.0
                 suffix2 = 'do'             male : female =     25.8 : 1.0
                 suffix3 = 'ard'            male : fem

In [4]:
# movie reviews / sentiment analysis - part #1
from nltk.corpus import movie_reviews as reviews
import random

# one (token list, category) pair per review file
docs = [(list(reviews.words(fileid)), cat) for cat in reviews.categories() for fileid in reviews.fileids(cat)]

# The list is built one category at a time, so shuffle before any split.
# A dedicated, seeded generator keeps the order the same on every run.
DOCS_SEED = 42
random.Random(DOCS_SEED).shuffle(docs)

# vocabulary: the 2000 most frequent lowercased words across the whole corpus
fd = nltk.FreqDist(word.lower() for word in reviews.words())
topKeys = [key for (key, _count) in fd.most_common(2000)]


In [5]:
# movie reviews sentiment analysis - part #2
import nltk


def review_features(doc):
    """Map a tokenised review to bag-of-words presence features.

    ``doc`` is an iterable of word strings.  The result has one entry per word
    in the module-level ``topKeys`` vocabulary, set to ``True`` when that word
    occurs in ``doc`` and ``False`` otherwise.

    Tokens are lowercased before the lookup because the vocabulary itself is
    built from lowercased words; without this a token such as "Great" would
    never match the vocabulary entry "great".
    """
    docSet = {word.lower() for word in doc}
    return {word: (word in docSet) for word in topKeys}

#review_features(reviews.words("pos/cv957_8737.txt"))

# one (feature dict, label) pair per review, in the order of `docs`
data = [(review_features(tokens), sentiment) for tokens, sentiment in docs]

# first 80% of the reviews for training, the remaining 20% for testing
dataCount = len(data)
trainCount = int(.8*dataCount)
trainData, testData = data[:trainCount], data[trainCount:]

bayes2 = nltk.NaiveBayesClassifier.train(trainData)

for caption, subset in (("train accuracy=", trainData), ("test accuracy=", testData)):
    print(caption, nltk.classify.accuracy(bayes2, subset))

bayes2.show_most_informative_features(20)


train accuracy= 0.868125
test accuracy= 0.81
Most Informative Features
                   mulan = True              pos : neg    =      8.3 : 1.0
             outstanding = True              pos : neg    =      8.1 : 1.0
             wonderfully = True              pos : neg    =      7.5 : 1.0
                  wasted = True              neg : pos    =      6.8 : 1.0
                   awful = True              neg : pos    =      6.3 : 1.0
                   damon = True              pos : neg    =      6.1 : 1.0
                  allows = True              pos : neg    =      5.1 : 1.0
                   bland = True              neg : pos    =      5.0 : 1.0
                   worst = True              neg : pos    =      4.8 : 1.0
                  poorly = True              neg : pos    =      4.7 : 1.0
                 sandler = True              neg : pos    =      4.6 : 1.0
                   waste = True              neg : pos    =      4.6 : 1.0
               portrayed = Tr