In [5]:
import nltk 
from nltk.corpus import names
import random
import copy

In [6]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [7]:
names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

In [8]:
## So now we have a list of string tuples, names and gender.  
print (names[:10])
print len(names)

[(u'Page', 'female'), (u'Prent', 'male'), (u'Annalise', 'female'), (u'Janetta', 'female'), (u'Omar', 'male'), (u'Kent', 'male'), (u'Gretta', 'female'), (u'Cami', 'female'), (u'Alister', 'male'), (u'Ambrose', 'male')]
7944


In [9]:
## Broken down as in the book
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [10]:
print nltk.classify.accuracy(classifier, test_set)

0.764


In [11]:
## Broken down as request for project 3
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [12]:
#check the lengths
print len(train_set)
print len(dev_set)
print len(test_set)

500
500
6944


In [13]:
## check accuracy of the dev_set
print nltk.classify.accuracy(classifier, dev_set)

0.754


In [14]:
## Note these features are different from those in the text
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = u'o'             male : female =     16.7 : 1.0
             last_letter = u't'             male : female =     13.0 : 1.0
             last_letter = u'r'             male : female =     11.5 : 1.0
             last_letter = u'm'             male : female =      5.6 : 1.0
             last_letter = u'd'             male : female =      5.2 : 1.0


In [14]:
## includes first letter, last letter, has letter, and counts of each letter
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [15]:
## Regenerate featuresets, with same order as before
featuresets = [(gender_features2(n), g) for (n,g) in names]
train_names, dev_names, test_names = names[0:500], names[500:1000], names[1000:]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier2 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier2, dev_set)

0.742


In [16]:
## counts of letters o and a appear important
classifier2.show_most_informative_features(10)

Most Informative Features
              lastletter = u'a'           female : male   =     29.7 : 1.0
              lastletter = u'r'             male : female =     16.5 : 1.0
              lastletter = u't'             male : female =      8.0 : 1.0
              lastletter = u's'             male : female =      5.9 : 1.0
              lastletter = u'o'             male : female =      5.9 : 1.0
                count(e) = 3              female : male   =      5.5 : 1.0
              lastletter = u'd'             male : female =      5.3 : 1.0
                count(h) = 2                male : female =      4.9 : 1.0
                count(w) = 1                male : female =      4.6 : 1.0
                count(i) = 2              female : male   =      4.5 : 1.0


In [33]:
## Seeing what was important above, generate a new feature list without the 'has' function.
## includes first letter, last letter, and counts of each letter
def gender_features3(name):
    m=int(round(len(name)/2))
    midletter = name[m].lower()
    features = {}
    features['first_letter']  =  name[0].lower()
    features['middle_letter'] =  midletter 
    features['last_letter']   =  name[-1].lower()
    return features
## Regenerate featuresets, with same order as before
featuresets = [(gender_features3(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier3 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier3, dev_set)

0.764


In [18]:
errors = []
for (name, tag) in dev_names:
    guess = classifier3.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [19]:
## it guess a lot of male name as females.
for (tag, guess, name) in sorted(errors): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print 'correct=%-8s guess=%-8s name=%-30s' %(tag, guess, name)

correct=male     guess=female   name=Alberto                       
correct=male     guess=female   name=Alic                          
correct=male     guess=female   name=Allan                         
correct=male     guess=female   name=Aloysius                      
correct=male     guess=female   name=Alston                        
correct=male     guess=female   name=Amery                         
correct=male     guess=female   name=Antin                         
correct=male     guess=female   name=Antonius                      
correct=male     guess=female   name=Ariel                         
correct=male     guess=female   name=Artur                         
correct=male     guess=female   name=Arvin                         
correct=male     guess=female   name=Ave                           
correct=male     guess=female   name=Avery                         
correct=male     guess=female   name=Barde                         
correct=male     guess=female   name=Barnett    

In [20]:
## Lets make a function for generating a new dev and train set, while keeping the same test set
def getNewSets(names, gender_features):
    n = copy.deepcopy(names)
    n = n[0:1000]
    n = sorted(n, key=lambda k: random.random())
    namesnew = n + names[1000:]
    featuresets = [(gender_features(n), g) for (n,g) in namesnew]
    train_names, dev_names, test_names = names[0:500], names[500:1000], names[1000:]
    train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
    return train_names, dev_names, test_names,train_set, dev_set, test_set

In [21]:
##looks at a combination of first and last letter
def gender_features4(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["firstAndLast"] = name[0].lower() + name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features
## Regenerate featuresets, with same order as before

In [22]:
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features4)

In [23]:
## this is worse
classifier4 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier4, dev_set)

0.756


In [24]:
##looks at a combination of first two letters
def gender_features5(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["twofirst"] = name[0].lower() + name[1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features


In [25]:
## and still worse
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features5)
classifier5 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier5, dev_set)

0.758


In [26]:
## as you can see, version 3 is better even afer re-randomizing
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features3)
classifier3 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier3, dev_set)

0.77


In [27]:
##looks at a combination of first two letters
def gender_features6(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["twolast"] = name[-2].lower() + name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features


In [28]:
## this really has no improvement
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features6)
classifier6 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier6, dev_set)

0.746


In [29]:
##include second to last
def gender_features7(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["secondToLast"] = name[-2].lower() 
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features


In [30]:
## this really has no improvement either
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features7)
classifier7 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier7, dev_set)

0.77


So at this point, version 3 appears to do as good as any, with ~77-78%. Still, names like Dave and Mike are assigned incorrectly.