In [48]:
import nltk
from nltk.corpus import names
import random
from nltk.classify import apply_features
import pandas as pd

# Creating the training, dev_test, and test sets

First we load the data into a variable and random shuffle it to make sure three data set are randomly selected
Then we seperate data into three data set with the required size 

In [49]:
names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
test_names = names[:500]
devtest_names = names[500:1000]
train_names = names[1000:]

In [50]:
train_names[1:10]

[('Adolf', 'male'),
 ('Fairfax', 'male'),
 ('Kala', 'female'),
 ('Eloise', 'female'),
 ('Margie', 'female'),
 ('Selle', 'female'),
 ('Murphy', 'male'),
 ('Durand', 'male'),
 ('Giffy', 'male')]

Each row in data set is a tuple include name and gender of a person

# Build features for the model

For the first feature set, we are gooing to look first letter, last letter, if last letter is vowel. We also look at the occurrence of each letter and suffix of last two characters 

In [51]:
def gender_features(name):
    features = {}
    features["FirstLetter"] = name[0].lower()
    features["LastLetter"] = name[-1].lower()
    features["LastIsVowel"] = (name[-1] in 'aeiouy')
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
        features["first(%s)" % letter] = name.lower().find(letter)
    features["suffix2"] = name[-2].lower()
    features["last2"]  = (name[-2].lower() + name[-1].lower())
    if len(name) >= 3:
        features["last3"] = (name[-3].lower() + name[-2].lower() + name[-1].lower())
    else:
        features["last3"] = (" " + name[-2].lower() + name[-1].lower())
    features["length"] = len(name)
    return features

# Building a Naive Bayes classifier 


First we apply the features to all three data sets and then train the model using Naive Bayes classifier

Development accurancy is 0.81

In [52]:
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]
nbclassifier = nltk.NaiveBayesClassifier.train(train_set) 
print (nltk.classify.accuracy(nbclassifier, devtest_set))

0.81


Test accurancy is 0.8

In [56]:
print (nltk.classify.accuracy(nbclassifier, test_set))

0.8


# Error review

In [57]:
errors = []
for (name, tag) in devtest_names:
    guess = nbclassifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

for (tag, guess, name) in sorted(errors):
    print ('correct=%-8s guess=%-8s name=%-30s' %(tag, guess, name))

correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Alyss                         
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Angel                         
correct=female   guess=male     name=Audrey                        
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Bev                           
correct=female   guess=male     name=Bliss                         
correct=female   guess=male     name=Brittan                       
correct=female   guess=male     name=Chad                          
correct=female   guess=male     name=Corliss                       
correct=female   guess=male     name=Corry                         
correct=female   guess=male     name=Darb                          
correct=female   guess=male     name=Dion                          
correct=female   guess=male     name=Dody       

The first model has a high accurancy and both development and test data set get similar accurancy. so the model is applicable but looks like the model has problem to predict names with e as last letter and with ie as suffix

# Building Decision Tree classifier 

For the second model we use decision tree classifier and the same feature set as before

Development accurancy is 0.726

In [58]:
dtclassifier = nltk.classify.DecisionTreeClassifier.train(train_set, entropy_cutoff=0,support_cutoff=0)
print (nltk.classify.accuracy(dtclassifier, devtest_set))

0.726


Test accurancy is 0.732

In [59]:
print (nltk.classify.accuracy(dtclassifier, test_set))

0.732


The accurenacy is lower than naive bayes but the result is still consistant between test and development data set. 

# Building Naive Bayes clissifier using last letter and syllables count 

For the third model we add syllables count into considersion 

In [60]:
def count_syllables(word):
    vowels = 'aeiouy'
    word = word.lower()
    count = 0
    if word[0] in vowels:
        count += 1
    for i in range(1, len(word)):
        if word[i] in vowels and word[i-1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if word.endswith('bile'):
        count -= 1
    if count == 0:
        count += 1
    return count


def gender_features2(word):
    return {
        'last_letter': word[-1],
        'last_two': word[-2:],
        'last_is_vowel': (word[-1] in 'aeiouy'),
        'num_syll': count_syllables(word)
    }

In [61]:
train_set = [(gender_features2(n), g) for (n,g) in train_names]
devtest_set = [(gender_features2(n), g) for (n,g) in devtest_names]
test_set = [(gender_features2(n), g) for (n,g) in test_names]
nbclassifier = nltk.NaiveBayesClassifier.train(train_set) 
print (nltk.classify.accuracy(nbclassifier, devtest_set))

0.776


In [62]:
print (nltk.classify.accuracy(dtclassifier, test_set))

0.622


This model still has higher accurancy than the decision tree but the result from development data set and test data set are not very consistant

# Error review for model with syllables count

In [63]:
errors = []
for (name, tag) in devtest_names:
    guess = nbclassifier.classify(gender_features2(name))
    if guess != tag:
        errors.append((tag, guess, name))

for (tag, guess, name) in sorted(errors):
    print ('correct=%-8s guess=%-8s name=%-30s' %(tag, guess, name))

correct=female   guess=male     name=Aeriel                        
correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Alyss                         
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Angel                         
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Bev                           
correct=female   guess=male     name=Bliss                         
correct=female   guess=male     name=Brittan                       
correct=female   guess=male     name=Carlen                        
correct=female   guess=male     name=Cathleen                      
correct=female   guess=male     name=Cathyleen                     
correct=female   guess=male     name=Catlin                        
correct=female   guess=male     name=Chad                          
correct=female   guess=male     name=Charil     

From the error review it seems like remove suffix reduced accurancy because names end with en, een are predicted wrong
and it doesn't slove the low predict power for names end with ie

# How does the performance on the test set compare to the performance on the dev-test set

It is generally the model has predict power becasue all models are much better than random guess since accurancy is higher than 50% and the model with more features seems better performace becasue not only the accurancy are higer but also the predict power to new names are also high. In addition, Naive Bayes model seems to be better than decision tree in this problem. I think it makes sence to me becasue decision tree tend to overfit the model and Naive Bayse has a strong independence assumption and letters within a name should be independent 