## 1 Supervised classification:

##### Gender identification:

In [150]:
import nltk

def gender_features(name):
    return {'last_letter': name[-1]
            #, 'first_letter': word[0]
            #, 'name_length': len(word)
           } # builds a feature set: human-readable feature name and value pair

gender_features('Shrek')

{'last_letter': 'k'}

In [151]:
def gender_features2(name):
    features = {}
    features['first_letter'] = name[0].lower()
    features['last_letter'] = name[-1].lower()
    for x in 'abcdefghijklmnopqrstuvwxyz':
        features['count({})'.format(x)] = name.lower().count(x)
        features['has({})'.format(x)] = (x in name.lower())
    return features

In [152]:
from nltk.corpus import names

labeled_names = ([(x, 'male') for x in names.words('male.txt')] +
                [(x, 'female') for x in names.words('female.txt')])

import random
random.shuffle(labeled_names)
labeled_names[:5]

[('Cristen', 'female'),
 ('Moore', 'male'),
 ('Ursala', 'female'),
 ('Alston', 'male'),
 ('Kellen', 'female')]

In [153]:
nltk.FreqDist([y for x, y in labeled_names])

FreqDist({'female': 5001, 'male': 2943})

In [169]:
# When feature set is relatively small

#featuresets = [(gender_features2(x), y) for x, y in labeled_names]
#train_set, test_set = featuresets[500:], featuresets[:500]

In [156]:
# When feature set is large

from nltk.classify import apply_features

train_set = apply_features(gender_features2, labeled_names[500:])
test_set = apply_features(gender_features2, labeled_names[:500])

In [157]:
len(featuresets), len(train_set)

(7944, 7444)

In [166]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

name = 'Trinity'
print('{} is a '.format(name) + classifier.classify(gender_features2(name)))

Trinity is a female


In [159]:
print(nltk.classify.accuracy(classifier, test_set))

0.752


Find most the most effective features:

In [168]:
classifier.show_most_informative_features(6) # likelihood ratios

Most Informative Features
             last_letter = 'a'            female : male   =     34.4 : 1.0
             last_letter = 'k'              male : female =     30.0 : 1.0
             last_letter = 'f'              male : female =     17.1 : 1.0
             last_letter = 'p'              male : female =     12.4 : 1.0
             last_letter = 'v'              male : female =     11.1 : 1.0
                count(v) = 2              female : male   =      9.7 : 1.0


##### Choosing the right features:

In [161]:
def gender_features2(name):
    features = {}
    features['first_letter'] = name[0].lower()
    features['last_letter'] = name[-1].lower()
    for x in 'abcdefghijklmnopqrstuvwxyz':
        features['count({})'.format(x)] = name.lower().count(x)
        features['has({})'.format(x)] = (x in name.lower())
    return features

In [190]:
#gender_features2('John')

In [191]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]
#len(train_names), len(devtest_names), len(test_names)

Apply features to the dataset:

In [192]:
train_set = apply_features(gender_features, train_names)
devtest_set = apply_features(gender_features, devtest_names)
test_set = apply_features(gender_features, test_names)

Train the classifier:

In [193]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.762


Review errors:

In [195]:
errors = []

for x, y in devtest_names:
    guess = classifier.classify(gender_features(x))
    if guess != y:
        errors.append((y, guess, x))
        
for (y, guess, x) in sorted(errors)[:10]:
    print('correct={:<8}' 'guess={:<8s}' 'name={:<30}'.format(y, guess, x))

correct=female  guess=male    name=Adel                          
correct=female  guess=male    name=Ailyn                         
correct=female  guess=male    name=Allison                       
correct=female  guess=male    name=Angil                         
correct=female  guess=male    name=Annabel                       
correct=female  guess=male    name=Anne-Mar                      
correct=female  guess=male    name=Arabel                        
correct=female  guess=male    name=Arlen                         
correct=female  guess=male    name=Berget                        
correct=female  guess=male    name=Bliss                         


Update features:

In [199]:
def gender_features(name):
    return {'suffix1': name[-1],
            'suffix2': name[-2:]}

gender_features('Ailyn')

{'suffix1': 'n', 'suffix2': 'yn'}

Retrain the classifier:

In [204]:
train_set = apply_features(gender_features, train_names)
devtest_set = apply_features(gender_features, devtest_names)
test_set = apply_features(gender_features, test_names)

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.787


In [202]:
classifier.show_most_informative_features(6)

Most Informative Features
                 suffix2 = 'na'           female : male   =     88.7 : 1.0
                 suffix2 = 'la'           female : male   =     63.6 : 1.0
                 suffix2 = 'us'             male : female =     61.1 : 1.0
                 suffix2 = 'ia'           female : male   =     32.8 : 1.0
                 suffix1 = 'a'            female : male   =     32.3 : 1.0
                 suffix2 = 'sa'           female : male   =     30.9 : 1.0


Final accuracy test:

In [208]:
print(nltk.classify.accuracy(classifier, test_set))
name = 'Neo'
print('{} is a '.format(name) + classifier.classify(gender_features2(name)))

0.758
Neo is a female
