This example is taken from the gender identification example at https://www.nltk.org/book/ch06.html

# A first try at Naive Bayes classification using NLTK

### Feature extractor
We will use one feature from a name: the last letter

In [None]:
def gender_features(word):
    """Build the feature dict for a name: a single feature, its final character.

    Raises IndexError when `word` is empty.
    """
    final_char = word[len(word) - 1]
    return dict(last_letter=final_char)
gender_features('Stanley')

### Examples
Prepare a list of examples with corresponding class labels

In [None]:
import nltk
from nltk.corpus import names
import random

# display(names.words('male.txt'))

# Seed the global RNG so the shuffle -- and therefore the train/test split
# derived from it -- is the same on every Restart & Run All.
random.seed(42)

# One (name, label) pair per entry of each corpus word list.
male = [(name, 'male') for name in names.words('male.txt')]
female = [(name, 'female') for name in names.words('female.txt')]
labeled_names = male + female

# Shuffle in place so the slices taken later mix both classes.
random.shuffle(labeled_names)

# Preview a handful of examples instead of dumping the entire list as output.
labeled_names[:10]

### Training/test data
Use the feature extractor to prepare training and testing data

In [189]:
# Pair every name's feature dict with its gender label.
featuresets = [(gender_features(name), label) for name, label in labeled_names]

# Hold out the first 500 examples for testing; train on everything after them.
test_set = featuresets[:500]
train_set = featuresets[500:]

# Uses multinomial naive Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

### Try it out on a few names

In [191]:
# Print the predicted label for a few hand-picked names, one per line.
for sample_name in ('Neo', 'Trinity', 'Amy', 'Andy', 'Laren'):
    print(classifier.classify(gender_features(sample_name)))


male
female
female
female
male


### Check accuracy

In [None]:
# Score the trained classifier against the held-out test set.
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

### Likelihood ratios
Names in the training set that end in "a" are female 34 times more often than they are male.

In [None]:
# List the five features with the most lopsided likelihood ratios.
classifier.show_most_informative_features(n=5)

# Can we do better?
Update the feature extractor to see if we can do better.

In [None]:
from nltk.metrics.scores import (precision, recall)

def gender_features(name):
    """Extract a richer, case-insensitive feature dict from a name.

    Features, in insertion order:
      * "first_letter" / "last_letter": the lower-cased first and last character.
      * "count(x)" and "has(x)" for every letter x in a-z: how many times the
        letter occurs in the lower-cased name, and whether it occurs at all.

    Raises IndexError when `name` is empty.
    """
    # Lower-case once up front instead of twice per letter inside the loop.
    lowered = name.lower()
    features = {
        "first_letter": name[0].lower(),
        "last_letter": name[-1].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = letter in lowered
    return features

gender_features('Hephzibah')

In [195]:
# Rebuild the labelled feature sets with the current gender_features and
# retrain; the first 500 examples are again held out for testing.
featuresets = [(gender_features(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Predicted label for a few hand-picked names, one per line.
for sample_name in ('Neo', 'Trinity', 'Amy', 'Andy'):
    print(classifier.classify(gender_features(sample_name)))


print(nltk.classify.accuracy(classifier, test_set))

# TODO: precision/recall reporting was sketched here but never finished.

classifier.show_most_informative_features(5)

male
female
female
female
0.77
Most Informative Features
             last_letter = 'a'            female : male   =     37.3 : 1.0
             last_letter = 'k'              male : female =     32.6 : 1.0
             last_letter = 'f'              male : female =     15.9 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0


# Let's do it with scikit-learn
Adapted from https://scikit-learn.org/stable/modules/naive_bayes.html

In [196]:
from sklearn import datasets
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

iris = datasets.load_iris()

# Use Gaussian for general datasets
clf = GaussianNB()

# Fix random_state so the 75/25 split, and hence the printed scores, are
# reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=42
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Precision and recall are printed; f-score and support are returned but unused here.
p, r, f, s = precision_recall_fscore_support(y_test, y_pred)
print(p, r)


[1.         0.84615385 1.        ] [1.         1.         0.85714286]


In [178]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.metrics import precision_recall_fscore_support
import numpy
import numpy as np

# The 26 lower-case letters, and a label encoder fitted on them.
alphabet = 'abcdefghijklmnopqrstuvwxyz'
le = preprocessing.LabelEncoder().fit(list(alphabet))

def gender_features(name):
    """Encode a name as a flat numeric feature list.

    Layout of the returned list:
      * len(alphabet) entries: one-hot encoding of the lower-cased first character
      * len(alphabet) entries: one-hot encoding of the lower-cased last character
      * for each letter in `alphabet`, two entries: the number of occurrences of
        that letter in the lower-cased name, then whether it occurs at all

    A first or last character that is not in `alphabet` leaves its one-hot
    segment all zeros. Raises IndexError when `name` is empty.
    """
    lowered = name.lower()

    def one_hot(letter):
        # Position in `alphabet` is the same index the LabelEncoder fitted on
        # the sorted letters would give; find() returns -1 for anything outside
        # the alphabet, in which case the vector stays all zeros.
        vec = np.zeros(len(alphabet))
        idx = alphabet.find(letter)
        if idx > -1:
            vec[idx] = 1
        return vec

    features = []
    # Both ends of the name are guarded the same way, so a non a-z first
    # character no longer raises from the encoder.
    features.extend(one_hot(name[0].lower()))
    features.extend(one_hot(name[-1].lower()))

    for letter in alphabet:
        features.append(lowered.count(letter))
        features.append(letter in lowered)
    return features


# display(gender_features("Hank"))

# Reshuffle in place, then build the feature matrix and label vector in the
# same order so row i of X corresponds to y[i].
random.shuffle(labeled_names)
X = [gender_features(name) for (name, _) in labeled_names]
y = [gender for (_, gender) in labeled_names]

# Hold out the first 500 examples for testing; train on the rest.
X_train, X_test = X[500:], X[:500]
y_train, y_test = y[500:], y[:500]

# Use multinomial for text classification
clf = MultinomialNB()
# Fit once and predict once; the earlier chained .predict() result was discarded.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Precision, recall and f-score are printed; support is returned but unused here.
p, r, f, s = precision_recall_fscore_support(y_test, y_pred)
print(p, r, f)


[0.81619938 0.70391061] [0.83174603 0.68108108] [0.82389937 0.69230769]
