In [93]:
import nltk 
from nltk.corpus import names
import random
import copy

In [94]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [95]:
names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

In [96]:
## So now we have a list of string tuples, names and gender.  
print (names[:10])
print len(names)

[(u'Xever', 'male'), (u'Raoul', 'male'), (u'Arleyne', 'female'), (u'Saraann', 'female'), (u'Mady', 'female'), (u'Jean-Marc', 'male'), (u'Park', 'male'), (u'Cassaundra', 'female'), (u'Samson', 'male'), (u'Bartholomeo', 'male')]
7944


In [9]:
## Broken down as in the book
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [10]:
print nltk.classify.accuracy(classifier, test_set)

0.788


In [11]:
## Broken down as request for project 3
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [12]:
#check the lengths
print len(train_set)
print len(dev_set)
print len(test_set)

500
500
6944


In [13]:
## check accuracy of the dev_set
print nltk.classify.accuracy(classifier, dev_set)

0.75


In [14]:
## Note these features are different from those in the text
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = u'o'             male : female =     16.7 : 1.0
             last_letter = u't'             male : female =     13.0 : 1.0
             last_letter = u'r'             male : female =     11.5 : 1.0
             last_letter = u'm'             male : female =      5.6 : 1.0
             last_letter = u'd'             male : female =      5.2 : 1.0


In [17]:
## includes first letter, last letter, has letter, and counts of each letter
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [32]:
## Regenerate featuresets, with same order as before
featuresets = [(gender_features2(n), g) for (n,g) in names]
train_names, dev_names, test_names = names[0:500], names[500:1000], names[1000:]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier2 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier2, dev_set)

0.756


In [24]:
## counts of letters o and a appear important
classifier2.show_most_informative_features(10)

Most Informative Features
              lastletter = u'o'             male : female =     16.7 : 1.0
              lastletter = u't'             male : female =     13.0 : 1.0
              lastletter = u'r'             male : female =     11.5 : 1.0
                count(o) = 2                male : female =      6.5 : 1.0
              lastletter = u'm'             male : female =      5.6 : 1.0
              lastletter = u'd'             male : female =      5.2 : 1.0
                count(a) = 2              female : male   =      5.2 : 1.0
              lastletter = u'k'             male : female =      4.3 : 1.0
             firstletter = u'z'             male : female =      4.3 : 1.0
             firstletter = u'h'             male : female =      3.9 : 1.0


In [27]:
## Seeing what was important above, generate a new feature list without the 'has' function.
## includes first letter, last letter, and counts of each letter
def gender_features3(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features
## Regenerate featuresets, with same order as before
featuresets = [(gender_features3(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier3 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier3, dev_set)

0.768


In [33]:
errors = []
for (name, tag) in dev_names:
    guess = classifier3.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [35]:
## it guess a lot of male name as females.
for (tag, guess, name) in sorted(errors): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print 'correct=%-8s guess=%-8s name=%-30s' %(tag, guess, name)

correct=male     guess=female   name=Aaron                         
correct=male     guess=female   name=Abdulkarim                    
correct=male     guess=female   name=Abelard                       
correct=male     guess=female   name=Abraham                       
correct=male     guess=female   name=Albert                        
correct=male     guess=female   name=Alfredo                       
correct=male     guess=female   name=Alphonso                      
correct=male     guess=female   name=Andy                          
correct=male     guess=female   name=Antoni                        
correct=male     guess=female   name=Antonin                       
correct=male     guess=female   name=Arthur                        
correct=male     guess=female   name=Augusto                       
correct=male     guess=female   name=Augustus                      
correct=male     guess=female   name=Austen                        
correct=male     guess=female   name=Avram      

In [128]:
## Lets make a function for generating a new dev and train set, while keeping the same test set
def getNewSets(names, gender_features):
    n = copy.deepcopy(names)
    n = n[0:1000]
    n = sorted(n, key=lambda k: random.random())
    namesnew = n + names[1000:]
    featuresets = [(gender_features(n), g) for (n,g) in namesnew]
    train_names, dev_names, test_names = names[0:500], names[500:1000], names[1000:]
    train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
    return train_names, dev_names, test_names,train_set, dev_set, test_set

In [126]:
##looks at a combination of first and last letter
def gender_features4(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["firstAndLast"] = name[0].lower() + name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features
## Regenerate featuresets, with same order as before

In [129]:
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features4)

In [130]:
## this is worse
classifier4 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier4, dev_set)

0.75


In [131]:
##looks at a combination of first two letters
def gender_features5(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["twofirst"] = name[0].lower() + name[1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features


In [132]:
## and still worse
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features5)
classifier5 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier5, dev_set)

0.724


In [138]:
## as you can see, version 3 is better even afer re-randomizing
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features3)
classifier3 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier3, dev_set)

0.782


In [135]:
##looks at a combination of first two letters
def gender_features6(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["twolast"] = name[-2].lower() + name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features


In [139]:
## this really has no improvement
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features6)
classifier6 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier6, dev_set)

0.78


In [141]:
##include second to last
def gender_features7(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["secondToLast"] = name[-2].lower() 
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features


In [144]:
## this really has no improvement either
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features7)
classifier7 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier7, dev_set)

0.772


So at this point, version 3 appears to do as good as any, with ~77-78%. Still, names like Dave and Mike are assigned incorrectly.