In [1]:
import nltk 
from nltk.corpus import names
import random
import copy

In [2]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [3]:
names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

In [4]:
## So now we have a list of string tuples, names and gender.  
print (names[:10])
print len(names)

[(u'Lana', 'female'), (u'Vonni', 'female'), (u'Suzy', 'female'), (u'Georgia', 'male'), (u'Nonie', 'female'), (u'Bennet', 'male'), (u'Elicia', 'female'), (u'Jefry', 'male'), (u'Stella', 'female'), (u'Maryanne', 'female')]
7944


In [5]:
## Broken down as in the book
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [6]:
print nltk.classify.accuracy(classifier, test_set)

0.758


In [7]:
## Broken down as request for project 3
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [8]:
#check the lengths
print len(train_set)
print len(dev_set)
print len(test_set)

500
500
6944


In [9]:
## check accuracy of the dev_set
print nltk.classify.accuracy(classifier, dev_set)

0.776


In [10]:
## Note these features are different from those in the text
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = u'a'           female : male   =     19.8 : 1.0
             last_letter = u'd'             male : female =     11.2 : 1.0
             last_letter = u'i'           female : male   =      8.1 : 1.0
             last_letter = u'o'             male : female =      5.7 : 1.0
             last_letter = u's'             male : female =      4.5 : 1.0


In [11]:
## includes first letter, last letter, has letter, and counts of each letter
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [12]:
## Regenerate featuresets, with same order as before
featuresets = [(gender_features2(n), g) for (n,g) in names]
train_names, dev_names, test_names = names[0:500], names[500:1000], names[1000:]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier2 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier2, dev_set)

0.754


In [13]:
## counts of letters o and a appear important
classifier2.show_most_informative_features(10)

Most Informative Features
              lastletter = u'a'           female : male   =     19.8 : 1.0
              lastletter = u'd'             male : female =     11.2 : 1.0
              lastletter = u'i'           female : male   =      8.1 : 1.0
                  has(w) = True             male : female =      8.1 : 1.0
                count(w) = 1                male : female =      8.1 : 1.0
                count(o) = 2                male : female =      6.2 : 1.0
              lastletter = u'o'             male : female =      5.7 : 1.0
              lastletter = u's'             male : female =      4.5 : 1.0
              lastletter = u'r'             male : female =      4.3 : 1.0
                count(g) = 2                male : female =      3.8 : 1.0


In [14]:
## Seeing what was important above, generate a new feature list without the 'has' function.
## includes first letter, last letter, and counts of each letter
def gender_features3(name):
    m=int(round(len(name)/2))
    midletter = name[m].lower()
    features = {}
    features['first_letter']  =  name[0].lower()
    features['middle_letter'] =  midletter 
    features['last_letter']   =  name[-1].lower()
    return features
## Regenerate featuresets, with same order as before
featuresets = [(gender_features3(n), g) for (n,g) in names]
train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
classifier3 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier3, dev_set)

0.758


In [15]:
errors = []
for (name, tag) in dev_names:
    guess = classifier3.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [16]:
## it guess a lot of male name as females.
for (tag, guess, name) in sorted(errors): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print 'correct=%-8s guess=%-8s name=%-30s' %(tag, guess, name)

correct=female   guess=male     name=Adrian                        
correct=female   guess=male     name=April                         
correct=female   guess=male     name=Bell                          
correct=female   guess=male     name=Caril                         
correct=female   guess=male     name=Carlyn                        
correct=female   guess=male     name=Carlynn                       
correct=female   guess=male     name=Carmen                        
correct=female   guess=male     name=Carol-Jean                    
correct=female   guess=male     name=Caron                         
correct=female   guess=male     name=Celestyn                      
correct=female   guess=male     name=Chandal                       
correct=female   guess=male     name=Charlott                      
correct=female   guess=male     name=Chrystel                      
correct=female   guess=male     name=Ciel                          
correct=female   guess=male     name=Doralynn   

In [17]:
## Lets make a function for generating a new dev and train set, while keeping the same test set
def getNewSets(names, gender_features):
    n = copy.deepcopy(names)
    n = n[0:1000]
    n = sorted(n, key=lambda k: random.random())
    namesnew = n + names[1000:]
    featuresets = [(gender_features(n), g) for (n,g) in namesnew]
    train_names, dev_names, test_names = names[0:500], names[500:1000], names[1000:]
    train_set, dev_set, test_set = featuresets[0:500], featuresets[500:1000], featuresets[1000:]
    return train_names, dev_names, test_names,train_set, dev_set, test_set

In [18]:
##looks at a combination of first and last letter
def gender_features4(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["firstAndLast"] = name[0].lower() + name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features
## Regenerate featuresets, with same order as before

In [19]:
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features4)

In [20]:
## this is worse
classifier4 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier4, dev_set)

0.76


In [21]:
##looks at a combination of first two letters
def gender_features5(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["twofirst"] = name[0].lower() + name[1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features


In [22]:
## and still worse
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features5)
classifier5 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier5, dev_set)

0.732


In [23]:
## as you can see, version 3 is better even afer re-randomizing
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features3)
classifier3 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier3, dev_set)

0.738


In [24]:
##looks at a combination of first two letters
def gender_features6(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["twolast"] = name[-2].lower() + name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features


In [25]:
## this really has no improvement
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features6)
classifier6 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier6, dev_set)

0.752


In [26]:
##include second to last
def gender_features7(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    features["secondToLast"] = name[-2].lower() 
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
    return features


In [27]:
## this really has no improvement either
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features7)
classifier7 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier7, dev_set)

0.778


In [28]:
def gender_features8(name):
    m=int(round(len(name)/2))
    midletter = name[m].lower()
    features = {}
    features['first_letter']  =  name[0].lower()
    features['middle_letter'] =  midletter 
    features['last_letter']   =  name[-1].lower()
    return features


In [29]:
## this does worse
train_names, dev_names, test_names,train_set, dev_set, test_set = getNewSets(names, gender_features8)
classifier8 = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier8, dev_set)

0.748


So at this point, version 3 appears to do as good as any, with ~77-78%. Still, names like Dave and Mike are assigned incorrectly.

In [30]:
print nltk.classify.accuracy(classifier3, dev_set)
print nltk.classify.accuracy(classifier3, test_set)

0.788
0.759936635945


## How does the performance on the test set compare to the performance on the dev-test set? 

###  The performance on the dev-test set is slightly better than the performance on the test set.  Classifier 3 seems to have the highest accuracy with about 78% accuracy on the dev-test set and 76% accuracy on the test set.     

## Is this what you'd expect?

###  Yes.  Since you're using the dev_test set to develop the model, you would expect the performance of the model on the dev_test to be a little better than the performance on the test set.