# Learning to Classify Text

https://www.nltk.org/book/ch06.html

In [28]:
import nltk
from nltk import *

In [29]:
def gender_features(word):
    """Return the feature dict for a name: its final character.

    Args:
        word: The name to featurise (a string).

    Returns:
        A dict with the single key ``'last_letter'`` mapped to the last
        character of ``word``, or to ``''`` when ``word`` is empty.
    """
    # Slice instead of index so an empty string yields '' rather than
    # raising IndexError; for non-empty strings the result is identical.
    return {'last_letter': word[-1:]}
gender_features('Shrek')

{'last_letter': 'k'}

In [30]:
import random

from nltk.corpus import names

# Tag every name with the gender of the corpus file it was read from.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
    [(name, 'female') for name in names.words('female.txt')])

# Shuffle with a dedicated, seeded generator so the train/test splits (and
# therefore every accuracy figure computed from them) are the same on each
# Restart & Run All. Using a private Random instance leaves the global
# random state untouched.
random.Random(42).shuffle(labeled_names)

In [31]:
# Pair each name's feature dict with its gender label.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
# Hold out the first 500 examples for testing; train on everything after them.
test_set = featuresets[:500]
train_set = featuresets[500:]


In [33]:
train_set[0]

({'last_letter': 'a'}, 'female')

In [34]:
# Fit a Naive Bayes classifier on the (feature dict, label) pairs in train_set.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [35]:
classifier.classify(gender_features('Neo'))

'male'

In [36]:
classifier.classify(gender_features('Trinity'))

'female'

In [37]:
print(nltk.classify.accuracy(classifier, test_set))

0.774


In [38]:
classifier.show_most_informative_features(10)

Most Informative Features
             last_letter = 'a'            female : male   =     36.9 : 1.0
             last_letter = 'k'              male : female =     31.8 : 1.0
             last_letter = 'f'              male : female =     16.7 : 1.0
             last_letter = 'p'              male : female =     11.2 : 1.0
             last_letter = 'v'              male : female =     10.5 : 1.0
             last_letter = 'd'              male : female =      9.6 : 1.0
             last_letter = 'm'              male : female =      8.3 : 1.0
             last_letter = 'o'              male : female =      8.1 : 1.0
             last_letter = 'r'              male : female =      6.5 : 1.0
             last_letter = 'g'              male : female =      6.5 : 1.0


In [39]:
from nltk.classify import apply_features
# Build the same 500-example test / remainder train split as before, but let
# NLTK's apply_features helper call gender_features on each name instead of
# building the list with a comprehension.
# NOTE(review): per the NLTK documentation this returns a lazy, list-like
# object rather than a fully materialised list — confirm for the installed version.
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

In [40]:
def gender_features2(name):
    """Return a richer feature dict for a name.

    Args:
        name: The name to featurise (a string).

    Returns:
        A dict containing:
          * ``'first_letter'`` / ``'last_letter'``: the lower-cased first and
            last characters of ``name`` (``''`` when ``name`` is empty);
          * ``'count(x)'`` for each letter a-z: how many times that letter
            occurs in the lower-cased name;
          * ``'has(x)'`` for each letter a-z: whether it occurs at all.
    """
    # Lower-case once up front instead of twice per letter inside the loop.
    lowered = name.lower()
    features = {}
    # Slice rather than index so an empty name gives '' instead of IndexError.
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count({})".format(letter)] = occurrences
        features["has({})".format(letter)] = occurrences > 0
    return features

In [41]:
gender_features2('John') 

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [42]:
# Repeat the experiment with the richer gender_features2 extractor:
# rebuild the feature sets, redo the 500-example hold-out, retrain, and score.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.786


In [43]:
# Three-way split of the labelled names:
#   first 500  -> final test set
#   next 1000  -> dev-test set used for error analysis
#   remainder  -> training set
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

In [44]:
# Featurise each split with gender_features, fit on the training portion,
# and report accuracy on the dev-test portion (test_set is only built here).
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
test_set = [(gender_features(name), label) for name, label in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.773


In [45]:
# Collect every dev-test name the classifier gets wrong as a
# (correct tag, predicted tag, name) tuple.
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        # Correct prediction: nothing to record.
        continue
    errors.append((tag, guess, name))

In [46]:
# List the misclassified dev-test names. Each entry is a (tag, guess, name)
# tuple, so sorted() orders them by correct tag, then guess, then name.
for (tag, guess, name) in sorted(errors):
    # Left-align each field in a fixed-width column (8, 8 and 30 characters).
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Adrian                        
correct=female   guess=male     name=Agnes                         
correct=female   guess=male     name=Aimil                         
correct=female   guess=male     name=Alisun                        
correct=female   guess=male     name=Allyson                       
correct=female   guess=male     name=Ariel                         
correct=female   guess=male     name=Barb                          
correct=female   guess=male     name=Bev                           
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Brett                         
correct=female   guess=male     name=Britt                         
correct=female   guess=male     name=Carmel                        
correct=female   guess=male     name=Caro                          
correct=female   guess=male     name=Carolin    

In [26]:
def gender_features(word):
    """Return the one- and two-character suffixes of a name as features.

    NOTE(review): this reuses the name of the single-letter extractor defined
    earlier in the notebook and replaces it from the point this cell is run.

    Args:
        word: The name to featurise (a string).

    Returns:
        A dict with ``'suffix1'`` (last character) and ``'suffix2'`` (last two
        characters); slicing means shorter inputs simply give shorter suffixes.
    """
    last_one = word[-1:]
    last_two = word[-2:]
    return {'suffix1': last_one, 'suffix2': last_two}

In [27]:
# Rebuild the training and dev-test feature sets with the currently defined
# gender_features, retrain, and report dev-test accuracy.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.771


In [47]:
# Small hand-written set of sports-related sentences; these are turned into
# the "positive" feature sets further down.
sports_sentences = [ 'The team dominated the game',
                    'They lost the ball',
                    'The game was intense',
                    'The goalkeeper catched the ball',
                    'The other team controlled the ball' ]

In [48]:
def features(sentence):
    """Turn a sentence into bag-of-words presence features.

    Args:
        sentence: The text to featurise (a string).

    Returns:
        A dict mapping ``'contains(<word>)'`` to ``True`` for every distinct
        lower-cased, whitespace-separated token in ``sentence``.
    """
    tokens = sentence.lower().split()
    return {'contains(%s)' % token: True for token in tokens}

In [49]:
# Build a real list of feature dicts. In Python 3 `map` returns a one-shot
# iterator, so displaying it with list(...) would exhaust it and leave
# `positive_featuresets` empty for any later use (e.g. training a classifier).
positive_featuresets = [features(sentence) for sentence in sports_sentences]

In [51]:
list(positive_featuresets)

[{'contains(the)': True,
  'contains(team)': True,
  'contains(dominated)': True,
  'contains(game)': True},
 {'contains(they)': True,
  'contains(lost)': True,
  'contains(the)': True,
  'contains(ball)': True},
 {'contains(the)': True,
  'contains(game)': True,
  'contains(was)': True,
  'contains(intense)': True},
 {'contains(the)': True,
  'contains(goalkeeper)': True,
  'contains(catched)': True,
  'contains(ball)': True},
 {'contains(the)': True,
  'contains(other)': True,
  'contains(team)': True,
  'contains(controlled)': True,
  'contains(ball)': True}]