Name Gender Classifier

In [1]:
import nltk

# Pair every entry of the NLTK names corpus with the label of the file it was
# read from; all "male.txt" entries come first, followed by "female.txt".
names = [
    (name, gender)
    for (gender, fileid) in (("male", "male.txt"), ("female", "female.txt"))
    for name in nltk.corpus.names.words(fileid)
]

In [6]:
def extract_gender_features(name):
    """Build the feature dictionary used to classify a name by gender.

    The name is lower-cased and then described by its trailing and leading
    character n-grams, its length, and a per-letter occurrence count.

    Args:
        name: The name to featurize (str). May be empty.

    Returns:
        dict with the keys "suffix", "suffix2", "suffix3", "prefix",
        "prefix2" .. "prefix5" (str values), "wordLen" (int) and
        "<letter>-count" (int) for each letter a-z.
    """
    name = name.lower()
    length = len(name)

    # Value used for an n-gram feature when the name is shorter than n.
    # name[:1] equals name[0] for every non-empty name, but yields "" instead
    # of raising IndexError when the name is empty.
    first = name[:1]

    features = {}
    features["suffix"] = name[-1:]
    for width in (2, 3):
        features["suffix%d" % width] = name[-width:] if length >= width else first
    features["prefix"] = first
    for width in (2, 3, 4, 5):
        features["prefix%d" % width] = name[:width] if length >= width else first
    features["wordLen"] = length

    for letter in "abcdefghijklmnopqrstuvwxyz":
        features[letter + "-count"] = name.count(letter)

    return features


def classify(name):
    """Print the gender label the trained classifier assigns to ``name``.

    Relies on the module-level ``bayes`` classifier, which must already be
    trained when this function is called. Returns nothing.
    """
    name_features = extract_gender_features(name)
    predicted = bayes.classify(name_features)
    print("name=", name, "classified as: ", predicted)

In [5]:
# Featurize every labeled name: each item is (feature dict, gender label).
data = [(extract_gender_features(name), gender) for (name, gender) in names]

import random

# Shuffle with a dedicated, seeded generator so the order of `data` (and so
# any split taken from it) is identical on every fresh run, without touching
# the global `random` state that other cells may rely on.
NAME_SHUFFLE_SEED = 42
random.Random(NAME_SHUFFLE_SEED).shuffle(data)

In [7]:
# 80/20 split: the head of `data` trains the model, the tail evaluates it.
dataCount = len(data)
trainCount = int(0.8 * dataCount)
trainData, testData = data[:trainCount], data[trainCount:]

# Fit the Naive Bayes model on the training slice only.
bayes = nltk.NaiveBayesClassifier.train(trainData)

# Accuracy on the examples the model was fit on vs. the held-out examples.
print("trainData accuracy=", nltk.classify.accuracy(bayes, trainData))
print("testData accuracy=", nltk.classify.accuracy(bayes, testData))

# Spot-check a few hand-picked names.
classify("david")
classify("susan")
classify("alex")

bayes.show_most_informative_features(25)

trainData accuracy= 0.9142407553107789
testData accuracy= 0.8521082441787288
name= david classified as:  male
name= susan classified as:  female
name= alex classified as:  male
Most Informative Features
                 suffix2 = 'na'           female : male   =     84.6 : 1.0
                 suffix2 = 'la'           female : male   =     65.0 : 1.0
                 suffix2 = 'ia'           female : male   =     46.4 : 1.0
                  suffix = 'a'            female : male   =     34.0 : 1.0
                 suffix2 = 'sa'           female : male   =     31.8 : 1.0
                  suffix = 'k'              male : female =     28.9 : 1.0
                 suffix2 = 'do'             male : female =     27.2 : 1.0
                 suffix2 = 'rt'             male : female =     27.0 : 1.0
                 suffix2 = 'rd'             male : female =     25.0 : 1.0
                 suffix2 = 'io'             male : female =     23.9 : 1.0
                 suffix2 = 'us'             mal

In [20]:
# Collect every labeled name whose predicted gender disagrees with its label.
# Note: this iterates over all of `names`, not just the held-out examples.
errors = []

for (name, labels) in names:
    if bayes.classify(extract_gender_features(name)) == labels:
        continue  # prediction matches the label; nothing to record
    errors.append({"name": name,  "label": labels})

errors

[{'name': 'Abbey', 'label': 'male'},
 {'name': 'Abby', 'label': 'male'},
 {'name': 'Addie', 'label': 'male'},
 {'name': 'Aditya', 'label': 'male'},
 {'name': 'Adrian', 'label': 'male'},
 {'name': 'Adrien', 'label': 'male'},
 {'name': 'Alaa', 'label': 'male'},
 {'name': 'Alan', 'label': 'male'},
 {'name': 'Alfie', 'label': 'male'},
 {'name': 'Ali', 'label': 'male'},
 {'name': 'Alix', 'label': 'male'},
 {'name': 'Allah', 'label': 'male'},
 {'name': 'Allen', 'label': 'male'},
 {'name': 'Allie', 'label': 'male'},
 {'name': 'Allin', 'label': 'male'},
 {'name': 'Allyn', 'label': 'male'},
 {'name': 'Amery', 'label': 'male'},
 {'name': 'Anatole', 'label': 'male'},
 {'name': 'Andie', 'label': 'male'},
 {'name': 'Andre', 'label': 'male'},
 {'name': 'Andrea', 'label': 'male'},
 {'name': 'Andri', 'label': 'male'},
 {'name': 'Andy', 'label': 'male'},
 {'name': 'Angel', 'label': 'male'},
 {'name': 'Angie', 'label': 'male'},
 {'name': 'Antoine', 'label': 'male'},
 {'name': 'Antone', 'label': 'male'},

In [28]:
# part 1
from nltk.corpus import movie_reviews as reviews
import random

# One (word list, category) pair per review file, for every category in the
# corpus. `fileid` is used instead of `id` to avoid shadowing the builtin.
docs = [
    (list(reviews.words(fileid)), cat)
    for cat in reviews.categories()
    for fileid in reviews.fileids(cat)
]

# Shuffle with a dedicated, seeded generator so the document order is the
# same on every fresh run, without touching the global `random` state.
REVIEW_SHUFFLE_SEED = 42
random.Random(REVIEW_SHUFFLE_SEED).shuffle(docs)

# Keep the 2000 most frequent lower-cased words across all reviews.
fd = nltk.FreqDist(word.lower() for word in reviews.words())
topKeys = [key for (key, _count) in fd.most_common(2000)]

In [30]:
# part 2
def review_features(doc, vocabulary=None):
    """Encode a review as word-presence features.

    Args:
        doc: Iterable of the review's word tokens.
        vocabulary: Iterable of words to test for. When omitted, the
            module-level ``topKeys`` list is used, which matches the
            original single-argument behavior.

    Returns:
        dict mapping each vocabulary word to True if it occurs in ``doc``
        and False otherwise.
    """
    if vocabulary is None:
        vocabulary = topKeys

    # Build the set once so each membership test is a hash lookup rather
    # than a scan over the token list.
    docSet = set(doc)

    return {word: (word in docSet) for word in vocabulary}
    

# Featurize every review: each item is (word-presence dict, category label).
data = [(review_features(words), category) for (words, category) in docs]

# 80/20 split: the head of `data` trains the model, the tail evaluates it.
dataCount = len(data)
trainCount = int(0.8 * dataCount)
trainData, testData = data[:trainCount], data[trainCount:]

# Fit a separate Naive Bayes model for the review task.
bayes2 = nltk.NaiveBayesClassifier.train(trainData)

# Accuracy on the examples the model was fit on vs. the held-out examples.
print("train AccuracY=", nltk.classify.accuracy(bayes2, trainData))
print("test accuracy=", nltk.classify.accuracy(bayes2, testData))
bayes2.show_most_informative_features(20)

train AccuracY= 0.86875
test accuracy= 0.815
Most Informative Features
             outstanding = True              pos : neg    =      9.3 : 1.0
                    lame = True              neg : pos    =      7.5 : 1.0
                  seagal = True              neg : pos    =      6.2 : 1.0
                   damon = True              pos : neg    =      6.1 : 1.0
             wonderfully = True              pos : neg    =      5.9 : 1.0
                   flynt = True              pos : neg    =      5.6 : 1.0
                   awful = True              neg : pos    =      5.5 : 1.0
              ridiculous = True              neg : pos    =      5.2 : 1.0
                  poorly = True              neg : pos    =      5.2 : 1.0
                  wasted = True              neg : pos    =      5.0 : 1.0
                   waste = True              neg : pos    =      5.0 : 1.0
               fantastic = True              pos : neg    =      5.0 : 1.0
                    jedi = Tr