# I. Data gathering and exploration

Let's read the data and explore the corpus a bit

In [5]:
from nltk.corpus import names
girl_names = names.words('female.txt')
print(girl_names[:10])
boy_names = names.words('male.txt')
print(boy_names[:10])
print("#GirlNames=", len(girl_names))
print("#BoyNames=", len(boy_names))



['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale']
['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim']
#GirlNames= 5001
#BoyNames= 2943


In [7]:
from collections import Counter
fl_girls = Counter([name[-1] for name in girl_names])
print(fl_girls)
fl_boys = Counter([name[-1] for name in boy_names])
print(fl_boys)

Counter({'a': 1773, 'e': 1432, 'y': 461, 'n': 386, 'i': 317, 'l': 179, 'h': 105, 's': 93, 't': 68, 'r': 47, 'd': 39, 'o': 33, 'm': 13, 'g': 10, 'x': 10, 'b': 9, 'u': 6, 'w': 5, 'z': 4, 'k': 3, 'v': 2, 'p': 2, 'f': 2, ' ': 1, 'j': 1})
Counter({'n': 478, 'e': 468, 'y': 332, 's': 230, 'd': 228, 'r': 190, 'l': 187, 'o': 165, 't': 164, 'h': 93, 'm': 70, 'k': 69, 'i': 50, 'g': 32, 'a': 29, 'f': 25, 'c': 25, 'b': 21, 'p': 18, 'w': 17, 'v': 16, 'u': 12, 'z': 11, 'x': 10, 'j': 3})


# II. Feature Engineering

What are the best markers that predict what gender a pronoun is?

In [9]:
def features1(name):
    return {'last_letter': name[-1]}

print(features1('Abbe'))

{'last_letter': 'e'}


In [23]:
def features2(name):
    return {
        'last_letter': name[-1], 
        'last2_letter': name[-2:],
    }

In [32]:
def features3(name):
    return {
        'last_letter': name[-1], 
        'last2_letter': name[-2:],
        'length': len(name),
        'first_letter': name[0]
    }

In [14]:
import random
def build_dataset(feature_extractor):
    boy_names, girl_names = names.words('male.txt'), names.words('female.txt')
    boy_dataset = [(feature_extractor(name), 'boy') for name in boy_names]
    girl_dataset = [(feature_extractor(name), 'girl') for name in girl_names]
    dataset = boy_dataset + girl_dataset
    random.shuffle(dataset)
    cutoff = int(0.75 * len(dataset))
    train_data, test_data = dataset[:cutoff], dataset[cutoff + 1:]
    return train_data, test_data

train_data, test_data = build_dataset(features1)
print(train_data[:10])

[({'last_letter': 's'}, 'girl'), ({'last_letter': 'r'}, 'boy'), ({'last_letter': 'n'}, 'boy'), ({'last_letter': 'y'}, 'boy'), ({'last_letter': 'e'}, 'girl'), ({'last_letter': 'y'}, 'girl'), ({'last_letter': 'l'}, 'boy'), ({'last_letter': 'a'}, 'girl'), ({'last_letter': 'n'}, 'girl'), ({'last_letter': 'n'}, 'girl')]


# III. Building a model

Let's choose a simple model and train it on our dataset. Assess the performance.

In [22]:
import nltk

train_data, test_data = build_dataset(features1)
# print(train_data[:10])
name_guesser = nltk.DecisionTreeClassifier.train(train_data)
name_guesser.classify(features1('Luca'))

[({'last_letter': 'o'}, 'boy'), ({'last_letter': 'a'}, 'girl'), ({'last_letter': 'y'}, 'boy'), ({'last_letter': 'u'}, 'boy'), ({'last_letter': 'n'}, 'boy'), ({'last_letter': 'r'}, 'boy'), ({'last_letter': 'h'}, 'girl'), ({'last_letter': 'a'}, 'girl'), ({'last_letter': 'a'}, 'girl'), ({'last_letter': 'a'}, 'girl')]


'girl'

In [27]:
import nltk

train_data, test_data = build_dataset(features2)
# print(train_data[:10])

name_guesser = nltk.DecisionTreeClassifier.train(train_data)
name_guesser.classify(features2('Luca'))

print(name_guesser.pretty_format())

last2_letter=Ag? ...................................... girl
last2_letter=Al? ...................................... boy
last2_letter=Bo? ...................................... girl
last2_letter=Di? ...................................... girl
last2_letter=Er? ...................................... boy
last2_letter=Ev? ...................................... boy
last2_letter=Ez? ...................................... boy
last2_letter=Hy? ...................................... boy
last2_letter=Jo? ...................................... girl
last2_letter=Ki? ...................................... girl
last2_letter=La? ...................................... girl
last2_letter=Si? ...................................... boy
last2_letter=Ty? ...................................... boy
last2_letter=Vi? ...................................... girl
last2_letter=aa? ...................................... boy
last2_letter=ab? ...................................... boy
last2_letter=ac? ................

In [45]:
import nltk

train_data, test_data = build_dataset(features2)
# print(train_data[:10])

name_guesser = nltk.DecisionTreeClassifier.train(train_data)
print(name_guesser.classify(features3('Luca')))
print(name_guesser.classify(features3('Luke')))
print(name_guesser.classify(features3('John')))
print(name_guesser.classify(features3('Lara')))
print(name_guesser.classify(features3('Laura')))
print(name_guesser.classify(features3('Andrea')))
print(name_guesser.classify(features3('Andrea')))

print(nltk.classify.accuracy(name_guesser, test_data))

girl
boy
boy
girl
girl
girl
girl
0.7808564231738035


In [51]:
print(test_data[:10])
labels, predictions = [], []
for data_point in test_data:
    labels.append(data_point[1])
    predictions.append(name_guesser.classify(data_point[0]))
    
print(predictions[:10])
print(labels[:10])

correct_predictions = 0
total_prediction = 0

for p, l in zip(predictions, labels):
    if p == l:
        correct_predictions += 1
        
    total_prediction += 1
    
print(correct_predictions / total_prediction)

[({'last_letter': 'h', 'last2_letter': 'th'}, 'boy'), ({'last_letter': 'a', 'last2_letter': 'ta'}, 'girl'), ({'last_letter': 'a', 'last2_letter': 'ha'}, 'girl'), ({'last_letter': 'y', 'last2_letter': 'ny'}, 'girl'), ({'last_letter': 'i', 'last2_letter': 'fi'}, 'girl'), ({'last_letter': 'i', 'last2_letter': 'ri'}, 'boy'), ({'last_letter': 't', 'last2_letter': 'tt'}, 'boy'), ({'last_letter': 't', 'last2_letter': 'at'}, 'girl'), ({'last_letter': 'a', 'last2_letter': 'ea'}, 'boy'), ({'last_letter': 'i', 'last2_letter': 'di'}, 'girl')]
['girl', 'girl', 'girl', 'girl', 'girl', 'girl', 'boy', 'boy', 'girl', 'girl']
['boy', 'girl', 'girl', 'girl', 'girl', 'boy', 'boy', 'girl', 'boy', 'girl']
0.7808564231738035
