In [1]:
import nltk

In [5]:
def gender_features(word):
    return {"last_letter":word[-1]}

In [14]:
from nltk.corpus import names
print(names.words(["male.txt"])[:20])
print(names.words(["female.txt"])[:20])

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim', 'Abdullah', 'Abe', 'Abel', 'Abelard', 'Abner', 'Abraham', 'Abram', 'Ace', 'Adair', 'Adam']
['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale', 'Abra', 'Acacia', 'Ada', 'Adah', 'Adaline', 'Adara', 'Addie', 'Addis', 'Adel', 'Adela']


In [17]:
namegender = [(name, "male") for name in names.words(["male.txt"])] +\
    [(name, "female") for name in names.words(["female.txt"])]
print(len(namegender))
print(namegender[:10])
print(namegender[7924:])

7944
[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male')]
[('Zena', 'female'), ('Zenia', 'female'), ('Zia', 'female'), ('Zilvia', 'female'), ('Zita', 'female'), ('Zitella', 'female'), ('Zoe', 'female'), ('Zola', 'female'), ('Zonda', 'female'), ('Zondra', 'female'), ('Zonnya', 'female'), ('Zora', 'female'), ('Zorah', 'female'), ('Zorana', 'female'), ('Zorina', 'female'), ('Zorine', 'female'), ('Zsa Zsa', 'female'), ('Zsazsa', 'female'), ('Zulema', 'female'), ('Zuzana', 'female')]


In [21]:
import random
random.shuffle(namegender)
print(namegender[:20])

[('Forbes', 'male'), ('Eba', 'female'), ('Linet', 'female'), ('Susana', 'female'), ('Celie', 'female'), ('Cinnamon', 'female'), ('Engracia', 'female'), ('Roobbie', 'female'), ('Bonny', 'female'), ('Jaimie', 'female'), ('Manuel', 'male'), ('Arthur', 'male'), ('Elke', 'female'), ('Nadine', 'female'), ('Kendal', 'male'), ('Jenny', 'female'), ('Jakob', 'male'), ('Shana', 'female'), ('Tommi', 'female'), ('Lee', 'female')]


In [43]:
trainnames = namegender[500:]
testnames = namegender[:500]

In [44]:
train_set = [(gender_features(name), gender ) for name, gender in trainnames]
test_set = [(gender_features(name), gender ) for name, gender in testnames]  

In [45]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [46]:
print(classifier.classify(gender_features("Neo")))
print(classifier.classify(gender_features("Mary")))
print(classifier.classify(gender_features("Trinity")))

male
female
female


In [47]:
nltk.classify.accuracy(classifier, test_set)

0.786

In [48]:
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'k'              male : female =     43.5 : 1.0
             last_letter = 'a'            female : male   =     35.6 : 1.0
             last_letter = 'f'              male : female =     15.8 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
             last_letter = 'v'              male : female =     10.4 : 1.0
             last_letter = 'd'              male : female =     10.0 : 1.0
             last_letter = 'm'              male : female =      8.4 : 1.0
             last_letter = 'o'              male : female =      7.7 : 1.0
             last_letter = 'r'              male : female =      6.6 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0
             last_letter = 'g'              male : female =      5.1 : 1.0
             last_letter = 'z'              male : female =      4.3 : 1.0
             last_letter = 't'              male : female =      4.2 : 1.0

In [51]:
def get_errors(test):
    errors = []
    for (name, gender) in testnames:
        guess = classifier.classify(gender_features(name))
        if guess != gender:
            errors.append((gender, guess, name))
    return errors

In [64]:
errors = get_errors(test_set)
print(len(errors))
for gender, guess, name in errors:
    print("Tag: {:s}\tGuess: {:s}\tName: {:s}".format(gender, guess, name))

107
Tag: female	Guess: male	Name: Linet
Tag: female	Guess: male	Name: Cinnamon
Tag: male	Guess: female	Name: Shorty
Tag: male	Guess: female	Name: Moe
Tag: male	Guess: female	Name: Tedie
Tag: male	Guess: female	Name: Trace
Tag: female	Guess: male	Name: Nell
Tag: female	Guess: male	Name: Fanchon
Tag: female	Guess: male	Name: Dagmar
Tag: male	Guess: female	Name: Sherlocke
Tag: male	Guess: female	Name: Ossie
Tag: female	Guess: male	Name: Kirstyn
Tag: male	Guess: female	Name: Kalle
Tag: male	Guess: female	Name: Barth
Tag: female	Guess: male	Name: Madel
Tag: female	Guess: male	Name: Karylin
Tag: female	Guess: male	Name: Noelyn
Tag: female	Guess: male	Name: Imogen
Tag: female	Guess: male	Name: Wileen
Tag: male	Guess: female	Name: Wylie
Tag: male	Guess: female	Name: Augustine
Tag: male	Guess: female	Name: Joe
Tag: female	Guess: male	Name: Gwenn
Tag: female	Guess: male	Name: Mirabel
Tag: female	Guess: male	Name: Alison
Tag: male	Guess: female	Name: Pooh
Tag: female	Guess: male	Name: Moll
Tag: f

Define a new feature extraction function that includes features for two-letter suffixes, such as the one here:

def gender_features3(word):

return {‘suffix1’: word[-1],

‘suffix2’: word[-2]}

Keep the variables train_names and test_names that define the training and test set.

Make new train_set and test_set variables. Carry out the classification and look at the errors in the test set.

Is this classification more accurate? Can you see error examples that you could use to make new features to improve? (You don’t have to do this, just observe it.)

Make an answer with your original accuracy on the test set and the new accuracy, and you may also make any observations that you can about the remaining errors.

If you have time, you can make a new gender_features function that keeps three suffix letters, but make allowances if any names are only two characters long. Or perhaps a gender_features function that uses the first letter and the last two letters.

In [53]:
def letter_extractions(name):
    return {"last_letter": name[-1], "previous_letter":name[-1]}

In [54]:
train_set = [(letter_extractions(name), gender) for name, gender in trainnames]
test_set = [(letter_extractions(name), gender) for name, gender in testnames]

In [57]:
classifier2 = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier2, test_set)

0.788

In [59]:
def get_errors2(test):
    errors = []
    for (name, gender) in testnames:
        guess = classifier2.classify(letter_extractions(name))
        if guess != gender:
            errors.append((gender, guess, name))
    return errors

In [67]:
errors2 = get_errors2(testnames)
print(len(errors))
errors2

107


[('female', 'male', 'Linet'),
 ('female', 'male', 'Cinnamon'),
 ('male', 'female', 'Shorty'),
 ('male', 'female', 'Moe'),
 ('male', 'female', 'Tedie'),
 ('male', 'female', 'Trace'),
 ('female', 'male', 'Nell'),
 ('female', 'male', 'Fanchon'),
 ('female', 'male', 'Dagmar'),
 ('male', 'female', 'Sherlocke'),
 ('male', 'female', 'Ossie'),
 ('female', 'male', 'Kirstyn'),
 ('male', 'female', 'Kalle'),
 ('female', 'male', 'Ardith'),
 ('female', 'male', 'Madel'),
 ('female', 'male', 'Karylin'),
 ('female', 'male', 'Noelyn'),
 ('female', 'male', 'Rebekah'),
 ('female', 'male', 'Imogen'),
 ('female', 'male', 'Wileen'),
 ('male', 'female', 'Wylie'),
 ('male', 'female', 'Augustine'),
 ('male', 'female', 'Joe'),
 ('female', 'male', 'Gwenn'),
 ('female', 'male', 'Mirabel'),
 ('female', 'male', 'Alison'),
 ('female', 'male', 'Moll'),
 ('female', 'male', 'Jesselyn'),
 ('female', 'male', 'Meagan'),
 ('male', 'female', 'Noble'),
 ('male', 'female', 'Zane'),
 ('female', 'male', 'Cristen'),
 ('male', 'fe

In [62]:
classifier2.show_most_informative_features(20)

Most Informative Features
             last_letter = 'k'              male : female =     43.5 : 1.0
         previous_letter = 'k'              male : female =     43.5 : 1.0
             last_letter = 'a'            female : male   =     35.6 : 1.0
         previous_letter = 'a'            female : male   =     35.6 : 1.0
             last_letter = 'f'              male : female =     15.8 : 1.0
         previous_letter = 'f'              male : female =     15.8 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
         previous_letter = 'p'              male : female =     12.5 : 1.0
         previous_letter = 'v'              male : female =     10.4 : 1.0
             last_letter = 'v'              male : female =     10.4 : 1.0
             last_letter = 'd'              male : female =     10.0 : 1.0
         previous_letter = 'd'              male : female =     10.0 : 1.0
             last_letter = 'm'              male : female =      8.4 : 1.0

In [72]:
better_names = [item for item in errors if item not in errors2]
better_names

[('male', 'female', 'Barth'),
 ('male', 'female', 'Pooh'),
 ('male', 'female', 'Hirsch'),
 ('male', 'female', 'Garth'),
 ('male', 'female', 'Rolph'),
 ('male', 'female', 'Ramesh'),
 ('male', 'female', 'Zebadiah'),
 ('male', 'female', 'Worth')]

In [78]:
def first_last(word):
    if len(word) >= 3:
        return {"first":word[0], "middle":word[-2], "last":word[-1]}

train_set = [(first_last(name), gender) for name, gender in trainnames if first_last(name) is not None]
test_set = [(first_last(name), gender) for name, gender in testnames  if first_last(name) is not None]

classifier3 = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier3, test_set))

0.806


In [84]:
from nltk.corpus import movie_reviews
movie_reviews.categories()

['neg', 'pos']

In [107]:
documents = [(list(movie_reviews.words(fileid)), cat) for cat in 
             movie_reviews.categories() for fileid in movie_reviews.fileids(cat)]

In [112]:
random.shuffle(documents)
test = documents[:100]
train = documents[100:]

In [113]:
word_freq = nltk.FreqDist([w.lower() for document in documents for w  in document[0]])

In [156]:
most_common = word_freq.most_common(2000)
word_feature = [w for w, c in most_common]

def get_features(document, feature_set):
    document_words = set(document)
    feature_dict = {}
    for w in feature_set:
        key = "V_{:s}".format(w)
        feature_dict[key] = (w in document_words)
    return feature_dict

In [157]:
train_set = [(get_features(doc, word_feature), cat) for doc, cat in train]
test_set = [(get_features(doc, word_feature), cat) for doc, cat in test]

In [158]:
movie_classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(movie_classifier, test_set))

0.84


In [165]:
from nltk.corpus import stopwords
import re

stop = stopwords.words("english")
pat = re.compile('[^a-z]+')
words_only = [w for document in documents for w in document[0]]
most_common = nltk.FreqDist(words_only)
word_features = [w for w, c in most_common.most_common(500)]

In [166]:
train_set = [(get_features(doc, word_features), cat) for doc, cat in train]
test_set = [(get_features(doc, word_features), cat) for doc, cat in test] 
movie_classifier2 = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(movie_classifier2, test_set))

0.84
