# I. Data gathering and exploration

Let's read the data and explore the corpus a bit

In [1]:
import collections
from nltk.corpus import names

# Check the girl names
girl_names = names.words('female.txt') 
print(girl_names[:10], '...') 
print('#GirlNames=', len(girl_names)) # 5001

# Check the boy names
boy_names = names.words('male.txt') 
print(boy_names[:10], '...') 
print('#BoyNames=', len(boy_names)) # 2943

# There are a few less boy names than girl names

# In this case, because the data was very conveniently stored, this step took very little

# Check that there are names belonging to both the classes

['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale'] ...
#GirlNames= 5001
['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim'] ...
#BoyNames= 2943


In [2]:
# We know that there are a lot of girl names that end in `a`
# Let's see how many
girl_names_ending_in_a = [name for name in girl_names if name.endswith('a')] 
print('#GirlNamesEndingInA=', len(girl_names_ending_in_a)) # 1773

# Approx. a third of girl names end in `a`. That's a good insight
# Let's see what are the most common letters girl names end with 
girl_ending_letters = collections.Counter([name[-1] for name in girl_names]) 
print("MostCommonEndingLettersForGirls=", girl_ending_letters)

# Our intuition was right. The most common letter is `a` 
# Here are the first 3: 'a': 1773, 'e': 1432, 'y': 461

#GirlNamesEndingInA= 1773
MostCommonEndingLettersForGirls= Counter({'a': 1773, 'e': 1432, 'y': 461, 'n': 386, 'i': 317, 'l': 179, 'h': 105, 's': 93, 't': 68, 'r': 47, 'd': 39, 'o': 33, 'm': 13, 'g': 10, 'x': 10, 'b': 9, 'u': 6, 'w': 5, 'z': 4, 'k': 3, 'v': 2, 'p': 2, 'f': 2, ' ': 1, 'j': 1})


In [3]:
# I'm not sure what's the most common last letter for English boy names

# Let's see the stats
boy_ending_letters = collections.Counter([name[-1] for name in boy_names]) 
print("MostCommonEndingLettersForBoys=", boy_ending_letters)
# Here are the first 3: 'n': 478, 'e': 468, 'y': 332,

# Here’s a problem: the 2nd and 3rd most common last letters are the same for both genders. 

MostCommonEndingLettersForBoys= Counter({'n': 478, 'e': 468, 'y': 332, 's': 230, 'd': 228, 'r': 190, 'l': 187, 'o': 165, 't': 164, 'h': 93, 'm': 70, 'k': 69, 'i': 50, 'g': 32, 'a': 29, 'f': 25, 'c': 25, 'b': 21, 'p': 18, 'w': 17, 'v': 16, 'u': 12, 'z': 11, 'x': 10, 'j': 3})


# II. Feature Engineering

What are the best markers that predict what gender a pronoun is?

In [5]:
def features1(name): 
    """
    Iteration1: just take the last letter into consideration
    """
    return {
        'last_letter': name[-1] 
    }

In [7]:
import random


def build_dataset(feature_extractor):
    """
    Build a name traning set
    """
    # Get the names
    boy_names, girl_names = names.words('male.txt'), names.words('female.txt')
    
    # Build the dataset
    boy_names_dataset = [(feature_extractor(name), 'boy') for name in boy_names] 
    girl_names_dataset = [(feature_extractor(name), 'girl') for name in girl_names]
    
    # Put all the names together
    data = boy_names_dataset + girl_names_dataset # Mix everything together
    random.shuffle(data)

    # Split the dataset into training data and test data
    cutoff = int(0.75 * len(data))
    train_data, test_data = data[:cutoff], data[cutoff + 1:]
    return train_data, test_data

In [13]:
def features2(name): 
    """
    Iteration2: last letter + last 2 letters
    """
    return {
        'last_letter': name[-1], 
        'last2_letter': name[-2:] 
    }

In [15]:
def features3(name): 
    """
    Iteration2: first letter, last letter, last 2 letters, vowel count
    """
    return {
        'last_letter': name[-1], 
        'first_letter': name[0], 
        'last2_letter': name[-2:],
        'vowel_count': len([c for c in name if c in 'AEIOUaeiou'])
    }

# III. Building a model

Let's choose a simple model and train it on our dataset. Assess the performance.

In [10]:
import nltk

train_data, test_data = build_dataset(features1)

# Let's train probably the most popular classifier in the world
name_classifier = nltk.DecisionTreeClassifier.train(train_data)

# Take if for a spin 
print(name_classifier.classify(features1('Bono'))) # boy 
print(name_classifier.classify(features1('Latiffa'))) # girl
print(name_classifier.classify(features1('Gaga'))) # girl 
print(name_classifier.classify(features1('Joey'))) # girl

print(nltk.classify.accuracy(name_classifier, test_data)) # 0.7420654911838791
print(name_classifier.pretty_format())

boy
girl
girl
girl
0.7596977329974811
last_letter= ? ........................................ girl
last_letter=a? ........................................ girl
last_letter=b? ........................................ boy
last_letter=c? ........................................ boy
last_letter=d? ........................................ boy
last_letter=e? ........................................ girl
last_letter=f? ........................................ boy
last_letter=g? ........................................ boy
last_letter=h? ........................................ girl
last_letter=i? ........................................ girl
last_letter=j? ........................................ boy
last_letter=k? ........................................ boy
last_letter=l? ........................................ girl
last_letter=m? ........................................ boy
last_letter=n? ........................................ boy
last_letter=o? ........................................ 

In [14]:
import nltk

train_data, test_data = build_dataset(features2)

# Let's train probably the most popular classifier in the world
name_classifier = nltk.DecisionTreeClassifier.train(train_data)

# Take if for a spin 
print(name_classifier.classify(features2('Bono'))) # boy 
print(name_classifier.classify(features2('Latiffa'))) # girl
print(name_classifier.classify(features2('Gaga'))) # girl 
print(name_classifier.classify(features2('Joey'))) # girl

print(nltk.classify.accuracy(name_classifier, test_data)) # 0.783375314861461
print(name_classifier.pretty_format())

boy
girl
girl
girl
0.783375314861461
last2_letter=Al? ...................................... boy
last2_letter=Bo? ...................................... boy
last2_letter=Ed? ...................................... boy
last2_letter=Em? ...................................... girl
last2_letter=Er? ...................................... boy
last2_letter=Ev? ...................................... boy
last2_letter=Ez? ...................................... boy
last2_letter=Hy? ...................................... boy
last2_letter=Jo? ...................................... girl
last2_letter=Ki? ...................................... girl
last2_letter=La? ...................................... girl
last2_letter=Si? ...................................... boy
last2_letter=Ty? ...................................... boy
last2_letter=Vi? ...................................... girl
last2_letter=aa? ...................................... boy
last2_letter=ab? ...................................... bo

In [17]:
import nltk

train_data, test_data = build_dataset(features3)

# Let's train probably the most popular classifier in the world
name_classifier = nltk.DecisionTreeClassifier.train(train_data)

# Take if for a spin 
print(name_classifier.classify(features3('Bono'))) # boy 
print(name_classifier.classify(features3('Latiffa'))) # girl
print(name_classifier.classify(features3('Gaga'))) # girl 
print(name_classifier.classify(features3('Joey'))) # girl

print(nltk.classify.accuracy(name_classifier, test_data)) # 0.783375314861461
print(name_classifier.pretty_format())

boy
girl
girl
girl
0.7944584382871537
last2_letter=Ag? ...................................... girl
last2_letter=Bo? ...................................... boy
last2_letter=Cy? ...................................... boy
last2_letter=Di? ...................................... girl
last2_letter=Ed? ...................................... boy
last2_letter=Ev? ...................................... boy
last2_letter=Ez? ...................................... boy
last2_letter=Hy? ...................................... boy
last2_letter=Jo? ...................................... girl
last2_letter=Ki? ...................................... girl
last2_letter=La? ...................................... girl
last2_letter=Ty? ...................................... boy
last2_letter=Vi? ...................................... girl
last2_letter=aa? ...................................... boy
last2_letter=ab? ...................................... girl
  first_letter=B? .....................................