## Feature extraction

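The code below is taken from Chapter 6 of the NLTK book (https://www.nltk.org/book/ch06.html). It trains a Naive Bayes classifier that predicts the gender label of a first name from simple letter-based features.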

In [1]:
import random
import nltk
from nltk.corpus import names

In [2]:
# Report how many entries the names corpus lists in male.txt.
print("\nNumber of male names:")
male_words = names.words('male.txt')
print(len(male_words))


Number of male names:
2943


In [3]:
# Report how many entries the names corpus lists in female.txt.
print("\nNumber of female names:")
female_words = names.words('female.txt')
print(len(female_words))


Number of female names:
5001


In [4]:
# Number of names to preview from each list. The headings are built from this
# value so they cannot disagree with the slice length (previously the text
# said "First 10" while 15 names were printed).
preview_count = 15
male_names = names.words('male.txt')
female_names = names.words('female.txt')
print("\nFirst {} male names:".format(preview_count))
print(male_names[:preview_count])
print("\nFirst {} female names:".format(preview_count))
print(female_names[:preview_count])


First 10 male names:
['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim', 'Abdullah', 'Abe', 'Abel', 'Abelard', 'Abner']

First 10 female names:
['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale', 'Abra', 'Acacia', 'Ada', 'Adah', 'Adaline']


In [5]:
# Pair every corpus name with its gender label: all male entries first, then
# all female entries.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
# Shuffle with a dedicated, seeded generator so the order -- and therefore the
# positional train/test split, the trained model and every reported number --
# is identical on each Restart & Run All. Using a private Random instance
# leaves the module-level random state untouched.
random.Random(42).shuffle(labeled_names)

In [6]:
def gender_features(word):
    """Return the one-feature dict used by the baseline classifier.

    Args:
        word: The name to describe, as a string.

    Returns:
        dict: ``{'last_letter': c}`` where ``c`` is the final character of
        ``word``. For an empty string ``c`` is ``''`` instead of the call
        raising ``IndexError``.
    """
    # Slicing, unlike indexing with -1, is safe on an empty string and yields
    # the same single character for any non-empty one.
    return {'last_letter': word[-1:]}

In [18]:
# Peek at the first ten (name, label) pairs after shuffling.
labeled_names[:10]

[('Andromeda', 'female'),
 ('Kerrill', 'female'),
 ('Romonda', 'female'),
 ('Bjorne', 'male'),
 ('Christian', 'female'),
 ('Janus', 'male'),
 ('Kerstin', 'female'),
 ('Loree', 'female'),
 ('Arne', 'male'),
 ('Rochette', 'female')]

In [8]:
# Encode each (name, label) pair as (feature dict, label), hold out the first
# 500 encoded examples for testing, and train Naive Bayes on the remainder.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
# Accuracy of the last-letter classifier on the 500 held-out examples.
print(nltk.classify.accuracy(classifier, test_set))

0.744


In [10]:
# Predict a label for 'Neo' from its last-letter feature.
classifier.classify(gender_features('Neo'))

'male'

In [11]:
# Predict a label for 'Andrea' from its last-letter feature.
classifier.classify(gender_features('Andrea'))

'female'

In [12]:
# Predict a label for 'Pepe' from its last-letter feature.
classifier.classify(gender_features('Pepe'))

'female'

In [13]:
# Predict a label for 'Marcela' from its last-letter feature.
classifier.classify(gender_features('Marcela'))

'female'

In [14]:
# Predict a label for 'Luis' from its last-letter feature.
classifier.classify(gender_features('Luis'))

'male'

In [16]:
# Print up to 15 feature values, ranked by how strongly they favour one label
# over the other in the trained model.
classifier.show_most_informative_features(15)

Most Informative Features
             last_letter = 'a'            female : male   =     36.8 : 1.0
             last_letter = 'k'              male : female =     33.2 : 1.0
             last_letter = 'f'              male : female =     17.3 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'p'              male : female =     10.5 : 1.0
             last_letter = 'd'              male : female =      9.7 : 1.0
             last_letter = 'm'              male : female =      8.1 : 1.0
             last_letter = 'o'              male : female =      8.0 : 1.0
             last_letter = 'r'              male : female =      7.3 : 1.0
             last_letter = 'w'              male : female =      6.6 : 1.0
             last_letter = 'g'              male : female =      5.5 : 1.0
             last_letter = 't'              male : female =      4.4 : 1.0
             last_letter = 'z'              male : female =      4.3 : 1.0

In [19]:
def gender_features2(name):
    """Build a letter-based feature dict for a name.

    The comparison is case-insensitive: every feature is computed on the
    lowercased name.

    Args:
        name: The name to describe, as a string.

    Returns:
        dict: In insertion order, ``'first_letter'`` and ``'last_letter'``
        (lowercased; ``''`` for an empty name), followed for each letter
        ``a``..``z`` by ``'count(<letter>)'`` (number of occurrences) and
        ``'has(<letter>)'`` (bool, whether it occurs at all).
    """
    # Lowercase once up front; the loop below would otherwise redo it twice
    # for each of the 26 letters.
    lowered = name.lower()
    features = {}
    # Slice rather than index so an empty name yields '' instead of raising
    # IndexError; for non-empty names the result is unchanged.
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = (letter in lowered)
    return features

In [20]:
# Inspect the full feature dict produced for a single example name.
gender_features2('Marcela')

{'first_letter': 'm',
 'last_letter': 'a',
 'count(a)': 2,
 'has(a)': True,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 1,
 'has(c)': True,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 1,
 'has(e)': True,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 0,
 'has(h)': False,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 0,
 'has(j)': False,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 1,
 'has(l)': True,
 'count(m)': 1,
 'has(m)': True,
 'count(n)': 0,
 'has(n)': False,
 'count(o)': 0,
 'has(o)': False,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 1,
 'has(r)': True,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [21]:
# Re-encode every labeled name with gender_features2, keep the same positional
# split (first 500 for testing, the rest for training), and retrain.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [44]:
# Inspect the first held-out example: a (feature dict, label) pair.
test_set[0]

({'first_letter': 'a',
  'last_letter': 'a',
  'count(a)': 2,
  'has(a)': True,
  'count(b)': 0,
  'has(b)': False,
  'count(c)': 0,
  'has(c)': False,
  'count(d)': 2,
  'has(d)': True,
  'count(e)': 1,
  'has(e)': True,
  'count(f)': 0,
  'has(f)': False,
  'count(g)': 0,
  'has(g)': False,
  'count(h)': 0,
  'has(h)': False,
  'count(i)': 0,
  'has(i)': False,
  'count(j)': 0,
  'has(j)': False,
  'count(k)': 0,
  'has(k)': False,
  'count(l)': 0,
  'has(l)': False,
  'count(m)': 1,
  'has(m)': True,
  'count(n)': 1,
  'has(n)': True,
  'count(o)': 1,
  'has(o)': True,
  'count(p)': 0,
  'has(p)': False,
  'count(q)': 0,
  'has(q)': False,
  'count(r)': 1,
  'has(r)': True,
  'count(s)': 0,
  'has(s)': False,
  'count(t)': 0,
  'has(t)': False,
  'count(u)': 0,
  'has(u)': False,
  'count(v)': 0,
  'has(v)': False,
  'count(w)': 0,
  'has(w)': False,
  'count(x)': 0,
  'has(x)': False,
  'count(y)': 0,
  'has(y)': False,
  'count(z)': 0,
  'has(z)': False},
 'female')

In [22]:
# Accuracy of the classifier trained on gender_features2 feature sets,
# measured on its 500 held-out examples.
print(nltk.classify.accuracy(classifier, test_set))

0.744


In [40]:
# The classifier in scope at this point was trained on gender_features2
# feature sets, so the query has to be encoded with the same extractor.
# Encoding it with gender_features would supply only 'last_letter', and the
# prediction would not use the first-letter, count(...) or has(...) features.
classifier.classify(gender_features2('Viviane'))

'female'

In [41]:
# Print up to 15 feature values, ranked by how strongly they favour one label
# over the other in the retrained model.
classifier.show_most_informative_features(15)

Most Informative Features
             last_letter = 'a'            female : male   =     36.8 : 1.0
             last_letter = 'k'              male : female =     33.2 : 1.0
             last_letter = 'f'              male : female =     17.3 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'p'              male : female =     10.5 : 1.0
             last_letter = 'd'              male : female =      9.7 : 1.0
                count(v) = 2              female : male   =      8.8 : 1.0
             last_letter = 'm'              male : female =      8.1 : 1.0
             last_letter = 'o'              male : female =      8.0 : 1.0
             last_letter = 'r'              male : female =      7.3 : 1.0
             last_letter = 'w'              male : female =      6.6 : 1.0
             last_letter = 'g'              male : female =      5.5 : 1.0
                count(w) = 2                male : female =      5.1 : 1.0

## References
* https://www.nltk.org/book/ch06.html