This example is taken from the gender identification example at https://www.nltk.org/book/ch06.html

In [31]:
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import names
import random
from sklearn.metrics import precision_recall_fscore_support


# A first try at Naive Bayes classification using NLTK

### Feature extractor
We will use one feature from a name: the last letter

In [47]:
def gender_features(word):
    """Build the feature dict for a name: a single feature, its last character.

    The character is used as-is (no case folding).
    """
    final_char = word[-1]
    return dict(last_letter=final_char)
gender_features('Stanley')

{'last_letter': 'y'}

### Examples
Prepare a list of examples with corresponding class labels

In [None]:
# Label every name in the NLTK names corpus with the gender of the file it came from.
male = [(name, 'male') for name in names.words('male.txt')]
female = [(name, 'female') for name in names.words('female.txt')]
labeled_names = male + female

# Seed the RNG before shuffling so the example order is the same on every run.
random.seed(42)
random.shuffle(labeled_names)

# Preview only: the full list holds several thousand (name, gender) pairs.
labeled_names[:10]

### Training/test data
Use the feature extractor to prepare training and testing data

In [48]:
# Pair each name's feature dict with its gender label.
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]

# Random train/test split (scikit-learn holds out 25% for testing by default).
# A fixed random_state makes the split, and therefore the scores, repeatable.
train_set, test_set = train_test_split(featuresets, random_state=42)

# Uses multinomial naive Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

### Try it out on a few names

In [None]:
# Print the predicted label for a handful of sample names, one per line.
for sample_name in ('Neo', 'Trinity', 'Amy', 'Andy', 'Laren'):
    print(classifier.classify(gender_features(sample_name)))


### Check precision/recall

In [50]:

# Transpose the (features, label) pairs of the test set into two parallel columns.
test_columns = list(zip(*test_set))
test_features = list(test_columns[0])
y_true = list(test_columns[1])

# Predict a label for every held-out feature dict.
y_predict = [classifier.classify(features) for features in test_features]

# Per-class precision, recall, F-score and support (number of true examples).
p, r, f, s = precision_recall_fscore_support(y_true, y_predict)
print(p, r, f, s)

[0.79517134 0.7008547 ] [0.82940699 0.65165563] [0.81192843 0.67536033] [1231  755]


### Likelihood ratios
Names in the training set that end in "a" are female 34 times more often than they are male.

In [None]:
# Print the 5 features NLTK ranks as most informative, with their likelihood ratios between the labels.
classifier.show_most_informative_features(5)

# Can we do better?
Update the feature extractor to see if we can do better.

In [None]:
from nltk.metrics.scores import (precision, recall)

def gender_features(name):
    """Build a richer feature dict for a name.

    Features (all case-insensitive):
      - "first_letter" / "last_letter": the name's first and last character, lowercased
      - "count(x)": how many times letter x occurs in the name, for each x in a-z
      - "has(x)": whether letter x occurs at all, for each x in a-z
    """
    features = {
        "first_letter": name[0].lower(),
        "last_letter": name[-1].lower(),
    }
    # Lowercase once; every per-letter feature below reads from this copy.
    lowered = name.lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count({})".format(letter)] = occurrences
        features["has({})".format(letter)] = occurrences > 0
    return features

gender_features('Hephzibah')

In [None]:
# Re-extract features with the richer extractor and retrain on a fresh split.
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]
# A fixed random_state makes the split, and therefore the scores below, repeatable.
train_set, test_set = train_test_split(featuresets, random_state=42)
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Spot-check the same sample names as before.
for sample_name in ('Neo', 'Trinity', 'Amy', 'Andy'):
    print(classifier.classify(gender_features(sample_name)))

# Alternative single-number metric: nltk.classify.accuracy(classifier, test_set)
# Split the held-out (features, label) pairs into parallel lists.
test_features = [features for (features, _) in test_set]
y_true = [label for (_, label) in test_set]
y_predict = [classifier.classify(features) for features in test_features]

# Per-class precision, recall, F-score and support.
p, r, f, s = precision_recall_fscore_support(y_true, y_predict)
print(p, r, f, s)

classifier.show_most_informative_features(15)

# Gaussian Naive Bayes Classifier
This type of classifier works with quantitative variables (numbers).

## We'll do it with scikit-learn

Adapted from https://scikit-learn.org/stable/modules/naive_bayes.html

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

# Load the iris dataset and list the fields it provides.
iris = datasets.load_iris()
print(iris.keys())

# Use Gaussian for datasets with quantitative variables
clf = GaussianNB()

# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split, and therefore the printed scores, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=42
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision, recall and F-score (support is computed but not printed).
p, r, f, s = precision_recall_fscore_support(y_test, y_pred)
print(p, r, f)


## Use a label encoder to convert categorical variables to quantitative "dummy" variables
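The label encoder maps each category to an integer index; setting that position to 1 in an all-zero vector is known as one-hot encoding.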

In [78]:
# `np` and `preprocessing` are otherwise first imported in a later cell, so
# import them here to keep this cell runnable on a fresh kernel.
import numpy as np
from sklearn import preprocessing

alphabet = 'abcdefghijklmnopqrstuvwxyz'
le = preprocessing.LabelEncoder()

# Initialize the label encoder with one class per letter
le.fit(list(alphabet))

# Create a feature vector with all zeros, one for each letter of the alphabet
# a b c d ... x y z
# 0 0 0 0 ... 0 0 0
features = np.zeros(len(alphabet))

# Use the label encoder to get the index of the feature
var_index = le.transform(['c'])

# Mark the 'c' feature as present
# a b c d ... x y z
# 0 0 1 0 ... 0 0 0
features[var_index] = 1
display(features)

# Mark the 'd' and 'x' features as present
# a b c d ... x y z
# 0 0 1 1 ... 1 0 0
features[le.transform(['d'])] = 1
features[le.transform(['x'])] = 1

display(features)

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

array([0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0.])

Do the name example again with a scikit-learn Naive Bayes classifier (the code below uses `MultinomialNB`), using dummy variables to account for the categorical variables.

In [71]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.metrics import precision_recall_fscore_support
import numpy
import numpy as np

# The letters that get a one-hot position in the feature vectors.
alphabet = 'abcdefghijklmnopqrstuvwxyz'
# Fit a label encoder with one class per letter, so transform() maps a letter to its index.
le = preprocessing.LabelEncoder().fit(list(alphabet))

def _one_hot_letter(letter):
    """Return a len(alphabet) vector of zeros with a 1 at `letter`'s encoded index.

    A character outside `alphabet` (or an empty string) yields an all-zero
    vector instead of making the label encoder raise on an unseen label.
    """
    vector = np.zeros(len(alphabet))
    if len(letter) == 1 and letter in alphabet:
        vector[le.transform([letter])[0]] = 1
    return vector


def gender_features(name):
    """Build a numeric feature vector for a name.

    Layout of the returned list (104 entries for the 26-letter alphabet):
      - one-hot encoding of the lowercased first character (26 entries)
      - one-hot encoding of the lowercased last character (26 entries)
      - for each letter a-z: its occurrence count, then whether it occurs (52 entries)

    First and last characters outside a-z are encoded as all zeros; an empty
    name yields all-zero one-hot sections rather than raising.
    """
    features = []
    # Slices (not indexing) so an empty name gives '' instead of IndexError.
    features.extend(_one_hot_letter(name[:1].lower()))
    features.extend(_one_hot_letter(name[-1:].lower()))

    # Lowercase once; the per-letter features below all read from this copy.
    lowered = name.lower()
    for letter in alphabet:
        features.append(lowered.count(letter))
        features.append(letter in lowered)
    return features


# display(gender_features("Hank"))

# Reshuffle so the fixed 500-example hold-out below is a random sample.
random.shuffle(labeled_names)
X = [gender_features(name) for (name, _) in labeled_names]
y = [gender for (_, gender) in labeled_names]

# Hold out the first 500 examples for testing; train on the rest.
X_train, X_test = X[500:], X[:500]
y_train, y_test = y[500:], y[:500]

# Use multinomial for text classification
clf = MultinomialNB()
clf.fit(X_train, y_train)
# Predict once; the result is needed for the metrics below.
y_pred = clf.predict(X_test)

# Per-class precision, recall and F-score (support is computed but not printed).
p, r, f, s = precision_recall_fscore_support(y_test, y_pred)
print(p, r, f)


[0.81651376 0.73988439] [0.85576923 0.68085106] [0.83568075 0.70914127]


# OneHotEncoder from sklearn

In [87]:
from sklearn.preprocessing import OneHotEncoder

# The dataset. There are two features per row: gender and years of college.
X = [['Male', 1], ['Female', 3], ['Female', 2]]

# handle_unknown='ignore': a category not seen during fit is encoded as all zeros.
enc = OneHotEncoder(handle_unknown='ignore').fit(X)

# One output column per learned category: [Female, Male, 1, 2, 3]
print(enc.categories_)

# Encode three data points. The last one (4 years) was never seen during fit,
# so its row has no entry set for 1, 2, or 3.
feature_vectors = enc.transform(
    [['Female', 1], ['Male', 2], ['Male', 4]]
).toarray()
print(feature_vectors)

# Map two encoded rows back to their original categories.
enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])

[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
[[1. 0. 1. 0. 0.]
 [0. 1. 0. 1. 0.]
 [0. 1. 0. 0. 0.]]


array([['Male', 1],
       [None, 2]], dtype=object)