# Chapter 6

In [82]:
import nltk

### Gender Classification by Feature from Name

In [2]:
def gender_features(word):
    """Return the feature set used to guess the gender of a first name.

    The only feature is the name's final character, stored under the key
    ``'last_letter'``.

    Slicing (``word[-1:]``) is used rather than indexing so that an empty
    string produces ``''`` instead of raising ``IndexError``.
    """
    return {'last_letter': word[-1:]}  # feature set

gender_features('Jack')

{'last_letter': 'k'}

In [3]:
from nltk.corpus import names
import random
# list of (name, gender) pairs: every male name first, then every female name
labeled_names = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')]
)
# Shuffle with a dedicated, seeded generator so the positional train/test
# slices taken later (and the accuracies computed from them) are the same on
# every run, without touching the global random state.
random.Random(42).shuffle(labeled_names)

In [4]:
# Turn every labelled name into a (features, label) pair.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
# The first 500 examples are held out for testing; the rest are used to train.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [5]:
classifier.classify(gender_features('Neo'))

'male'

In [6]:
classifier.classify(gender_features('Trinity'))

'female'

In [7]:
# evaluate performance
nltk.classify.accuracy(classifier, test_set)

0.766

In [8]:
classifier.show_most_informative_features()

Most Informative Features
             last_letter = 'a'            female : male   =     38.3 : 1.0
             last_letter = 'k'              male : female =     31.9 : 1.0
             last_letter = 'f'              male : female =     16.0 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     11.3 : 1.0
             last_letter = 'm'              male : female =      9.4 : 1.0
             last_letter = 'd'              male : female =      9.4 : 1.0
             last_letter = 'o'              male : female =      8.4 : 1.0
             last_letter = 'r'              male : female =      6.7 : 1.0
             last_letter = 'g'              male : female =      5.3 : 1.0


In [9]:
def gender_features2(name):
    """Return a deliberately large feature set for a first name.

    The result contains the lower-cased first and last letters of ``name``
    and, for each letter a-z, how often it occurs (``count(x)``) and whether
    it occurs at all (``has(x)``). The notebook uses this feature set to
    illustrate overfitting.

    An empty ``name`` yields ``''`` for both letter features instead of
    raising ``IndexError``.
    """
    lowered = name.lower()  # normalise once rather than on every loop pass
    features = {}
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        features["has(%s)" % letter] = occurrences > 0
    return features

In [10]:
gender_features2('John') 

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'n'}

In [11]:
# Same pipeline as before, but using the much larger gender_features2 set.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)  # overfitted classifier

In [12]:
# test-set accuracy of the classifier trained on the larger gender_features2 feature set (prone to overfitting)
nltk.classify.accuracy(classifier, test_set)

0.792

In [21]:
# Three disjoint slices of the shuffled names:
#   test    -> first 500
#   devtest -> next 1000 (used for debugging and inspecting classifier errors)
#   train   -> everything after that
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

In [22]:
# Build a feature set for each split, train on the training split and
# display the accuracy measured on the dev-test split.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
test_set = [(gender_features(name), label) for name, label in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)

0.777

In [23]:
# Collect every dev-test name the classifier gets wrong as
# (correct label, predicted label, name).
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # correct prediction, nothing to record
    errors.append((tag, guess, name))

In [25]:
# Print the misclassified names in sorted order, one aligned row each.
row_template = 'correct=%-8s guess=%-8s name=%-30s'
for (tag, guess, name) in sorted(errors):
    print(row_template % (tag, guess, name))

correct=female   guess=male     name=Abigail                       
correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Adrian                        
correct=female   guess=male     name=Allys                         
correct=female   guess=male     name=Allyson                       
correct=female   guess=male     name=Astrid                        
correct=female   guess=male     name=Bab                           
correct=female   guess=male     name=Babs                          
correct=female   guess=male     name=Beau                          
correct=female   guess=male     name=Beilul                        
correct=female   guess=male     name=Bel                           
correct=female   guess=male     name=Bliss                         
correct=female   guess=male     name=Brier                         
correct=female   guess=male     name=Brook                         
correct=female   guess=male     name=Carol      

In [26]:
def gender_features(word):
    """Describe a name by its final one and final two characters.

    Slices are used, so a name shorter than the requested suffix contributes
    the whole name (and an empty name contributes ``''``).
    """
    last_one = word[-1:]
    last_two = word[-2:]
    return dict(suffix1=last_one, suffix2=last_two)

In [27]:
# suffix classifier: rebuild the train and dev-test feature sets with the
# current gender_features, retrain, and display the dev-test accuracy
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, devtest_set)

0.786

In [28]:
classifier.show_most_informative_features()

Most Informative Features
                 suffix2 = 'na'           female : male   =    146.6 : 1.0
                 suffix2 = 'la'           female : male   =     65.1 : 1.0
                 suffix1 = 'a'            female : male   =     39.8 : 1.0
                 suffix1 = 'k'              male : female =     39.5 : 1.0
                 suffix2 = 'ia'           female : male   =     33.8 : 1.0
                 suffix2 = 'ld'             male : female =     32.5 : 1.0
                 suffix2 = 'us'             male : female =     31.4 : 1.0
                 suffix2 = 'ra'           female : male   =     31.1 : 1.0
                 suffix2 = 'rt'             male : female =     30.1 : 1.0
                 suffix2 = 'ta'           female : male   =     29.0 : 1.0


## Document Classification

In [30]:
from nltk.corpus import movie_reviews

In [31]:
# One (token list, category) pair per review file, grouped by category.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

In [32]:
# Shuffle with a seeded, private generator so the document order (and the
# train/test split sliced from it) is identical on every run.
random.Random(42).shuffle(documents)

In [52]:
# Vocabulary for the document classifier: the 2000 most frequent lower-cased
# tokens in the corpus, in descending order of frequency (not a random sample).
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Ask for the top 2000 directly instead of sorting the whole vocabulary and slicing.
word_features = [word for word, _count in all_words.most_common(2000)]

In [53]:
word_features

[',',
 'the',
 '.',
 'a',
 'and',
 'of',
 'to',
 "'",
 'is',
 'in',
 's',
 '"',
 'it',
 'that',
 '-',
 ')',
 '(',
 'as',
 'with',
 'for',
 'his',
 'this',
 'film',
 'i',
 'he',
 'but',
 'on',
 'are',
 't',
 'by',
 'be',
 'one',
 'movie',
 'an',
 'who',
 'not',
 'you',
 'from',
 'at',
 'was',
 'have',
 'they',
 'has',
 'her',
 'all',
 '?',
 'there',
 'like',
 'so',
 'out',
 'about',
 'up',
 'more',
 'what',
 'when',
 'which',
 'or',
 'she',
 'their',
 ':',
 'some',
 'just',
 'can',
 'if',
 'we',
 'him',
 'into',
 'even',
 'only',
 'than',
 'no',
 'time',
 'good',
 'most',
 'its',
 'will',
 'story',
 'would',
 'been',
 'much',
 'character',
 'also',
 'get',
 'other',
 'do',
 'two',
 'well',
 'them',
 'very',
 'characters',
 ';',
 'first',
 '--',
 'after',
 'see',
 '!',
 'way',
 'because',
 'make',
 'life',
 'off',
 'too',
 'any',
 'does',
 'really',
 'had',
 'while',
 'films',
 'how',
 'plot',
 'little',
 'where',
 'people',
 'over',
 'could',
 'then',
 'me',
 'scene',
 'man',
 'bad',
 '

In [47]:
def document_features(document, vocabulary=None):
    """Return boolean ``contains(word)`` features for one document.

    Parameters
    ----------
    document : iterable of str
        The document's tokens.
    vocabulary : iterable of str, optional
        Words to test for. When omitted, the notebook-level ``word_features``
        list is used, which is what the original one-argument call did.

    Returns
    -------
    dict
        Maps ``'contains(<word>)'`` to ``True`` when ``<word>`` is one of the
        document's tokens, ``False`` otherwise. Matching is exact, so it is
        case-sensitive.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set avoids rescanning the token list for every vocabulary word.
    document_words = set(document)
    return {'contains({0})'.format(word): (word in document_words)
            for word in vocabulary}

In [48]:
document_features(movie_reviews.words('pos/cv957_8737.txt'))

{'contains(thrive)': False,
 'contains(sunny)': False,
 'contains(available)': False,
 'contains(accents)': False,
 'contains(preening)': False,
 'contains(multiples)': False,
 'contains(jumps)': False,
 'contains(procreation)': False,
 'contains(crappy)': False,
 'contains(homerian)': False,
 'contains(pastoral)': False,
 'contains(heretofore)': False,
 'contains(overlays)': False,
 'contains(eradicate)': False,
 'contains(forecast)': False,
 'contains(aggravating)': False,
 'contains(entire)': False,
 'contains(disheartened)': False,
 'contains(idea)': True,
 'contains(schillings)': False,
 'contains(unrequited)': False,
 'contains(shoddier)': False,
 'contains(artwork)': False,
 'contains(governed)': False,
 'contains(depletion)': False,
 'contains(afire)': False,
 'contains(boiled)': False,
 'contains(facinelli)': False,
 'contains(humanity)': False,
 'contains(assumption)': False,
 'contains(unravel)': False,
 'contains(littlest)': False,
 'contains(cantonese)': False,
 'contains(

In [54]:
# Label each document's feature dictionary with its category, hold out the
# first 100 documents for testing and train on the remainder.
featuresets = [(document_features(tokens), category) for tokens, category in documents]
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [55]:
print(nltk.classify.accuracy(classifier, test_set))

0.83


In [57]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.9 : 1.0
        contains(seagal) = True              neg : pos    =      8.3 : 1.0
         contains(mulan) = True              pos : neg    =      7.6 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.4 : 1.0
         contains(damon) = True              pos : neg    =      6.1 : 1.0


### Suffix-Based POS Tagging with Decision Tree and Naive Bayes Classifiers

In [58]:
from nltk.corpus import brown

# Frequency of the final 1-, 2- and 3-character endings over all Brown words.
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    # A word shorter than the slice width yields the whole word, so short
    # tokens increment the same key more than once.
    for width in (1, 2, 3):
        suffix_fdist[word[-width:]] += 1

In [59]:
# Keep only the 100 most frequent endings (any mix of 1-, 2- and 3-letter
# suffixes), dropping their counts.
common_suffixes = [ending for ending, _freq in suffix_fdist.most_common(100)]

In [60]:
def pos_features(word, suffixes=None):
    """Return boolean suffix features for a single word.

    For every suffix in ``suffixes`` the result maps ``'endswith(<suffix>)'``
    to whether the lower-cased ``word`` ends with that suffix. The values are
    booleans, not counts.

    ``suffixes`` defaults to the notebook-level ``common_suffixes`` list, so
    existing one-argument calls behave exactly as before.
    """
    if suffixes is None:
        suffixes = common_suffixes
    lowered = word.lower()  # loop-invariant: lower-case once, not per suffix
    return {'endswith(%s)' % suffix: lowered.endswith(suffix)
            for suffix in suffixes}

In [61]:
# Feature set for the decision tree classifier: one (suffix features, tag)
# pair per tagged word in the Brown "news" category.
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(token), tag) for token, tag in tagged_words]

In [62]:
# The first 10% of the examples form the test set; the remaining 90% train.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]

In [64]:
# create rule based classifier which uses common suffixes to guess on POS tag
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [73]:
classifier.classify(pos_features('cats'))

'NNS'

In [68]:
# print out the rules that choose the POS tag by suffix
print(classifier.pseudocode(depth=6))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: 
        if endswith(of) == False: 
          if endswith(and) == False: return '.'
          if endswith(and) == True: return 'CC'
        if endswith(of) == True: return 'IN'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: 
        if endswith(was) == False: 
          if endswith(as) == False: return 'PP$'
          if endswith(as) == True: return 'CS'
        if endswith(was) == True: return 'BEDZ'
      if endswith(is) == True: 
        if endswith(his) == False: return 'BEZ'
        if endswith(his) == True: return 'PP$'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [74]:
# Using More Context of a Sentence
def pos_features(sentence, i):
    """Return features for the word at index ``i`` of ``sentence``.

    The features are the word's last one, two and three characters plus the
    word that precedes it. The first word of a sentence gets the placeholder
    ``"<START>"`` as its predecessor.
    """
    current = sentence[i]
    previous = "<START>" if i == 0 else sentence[i - 1]
    return {
        "suffix(1)": current[-1:],
        "suffix(2)": current[-2:],
        "suffix(3)": current[-3:],
        "prev-word": previous,
    }
pos_features(brown.sents()[0], 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [75]:
tagged_sents = brown.tagged_sents(categories='news')

In [76]:
# build the feature sets: one (features, tag) pair per token, with the
# features computed from the sentence after its tags have been stripped
featuresets = []

for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    featuresets.extend(
        (pos_features(untagged_sent, position), tag)
        for position, (_word, tag) in enumerate(tagged_sent)
    )

In [77]:
# The first 10% of the examples are held out for testing; a Naive Bayes
# classifier is trained on the remaining 90%.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [78]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [79]:
classifier.show_most_informative_features()

Most Informative Features
               suffix(1) = '.'                 . : NN     =   6950.8 : 1.0
               suffix(2) = 'he'               AT : NN     =   3296.2 : 1.0
               suffix(2) = 'ho'              WPS : NN     =   2982.4 : 1.0
               suffix(1) = 'r'               JJR : NNS    =   2252.6 : 1.0
               suffix(2) = 'to'               TO : JJ     =   2180.6 : 1.0
               suffix(1) = 'h'               ABX : NNS    =   2013.7 : 1.0
               suffix(2) = 'es'              NNS : IN     =   1676.3 : 1.0
               suffix(3) = 'hat'              CS : NN     =   1576.4 : 1.0
               suffix(1) = "'"                '' : JJ     =   1502.2 : 1.0
               suffix(2) = 'ng'              VBG : VBN    =   1241.0 : 1.0
