In [134]:
from __future__ import print_function
import string
import random
from nltk.corpus import names
from nltk.classify import apply_features

In [123]:
# Construct the feature extractor
def gender_features(word):
    features = {
        'last_letter': word[-1],
        'first_letter': word[0],
        'suffix2': word[-2:],
    }
    for letter in string.ascii_lowercase:
        features['count({})'.format(letter)] = name.lower().count(letter)
        features['has({})'.format(letter)] = letter in name.lower()
    return features

In [124]:
# Load the data
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])

random.shuffle(labeled_names)
crossval_names = labeled_names[500:1500]

In [125]:
# Extract features and prepare train/test datasets
train_set = apply_features(gender_features, labeled_names[1500:])
crossval_set = apply_features(gender_features, crossval_names)
test_set = apply_features(gender_features, labeled_names[:500])

In [126]:
# Train the model
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [127]:
# Evaluate accuracy
nltk.classify.accuracy(classifier, crossval_set)

0.791

In [135]:
# Evaluate errors
errors = []
for (name, tag) in crossval_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

print('Number of errors: ', len(errors))
errors

Number of errors:  485


[('female', 'male', u'Kelci'),
 ('female', 'male', u'Elise'),
 ('female', 'male', u'Roberta'),
 ('female', 'male', u'Maggy'),
 ('female', 'male', u'Moria'),
 ('female', 'male', u'Fred'),
 ('female', 'male', u'Karrie'),
 ('female', 'male', u'Jeanine'),
 ('female', 'male', u'Valerye'),
 ('female', 'male', u'Billi'),
 ('female', 'male', u'Kissiah'),
 ('female', 'male', u'Juliane'),
 ('female', 'male', u'Chrysler'),
 ('female', 'male', u'Viki'),
 ('female', 'male', u'Randee'),
 ('female', 'male', u'Clio'),
 ('female', 'male', u'Fannie'),
 ('female', 'male', u'Tabitha'),
 ('female', 'male', u'Vivienne'),
 ('female', 'male', u'Goldi'),
 ('female', 'male', u'Debora'),
 ('female', 'male', u'Ami'),
 ('female', 'male', u'Alexis'),
 ('female', 'male', u'Kelcy'),
 ('female', 'male', u'Ulrika'),
 ('female', 'male', u'Tori'),
 ('female', 'male', u'Faydra'),
 ('female', 'male', u'Cathrine'),
 ('female', 'male', u'Kit'),
 ('female', 'male', u'Lily'),
 ('female', 'male', u'Ros'),
 ('female', 'male', u'