This example is taken from the gender identification example at https://www.nltk.org/book/ch06.html

In [16]:
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import names
import random
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing
import numpy as np

# A first try at Naive Bayes classification using NLTK

### Feature extractor
We will use one feature from a name: the last letter

In [2]:
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('Stanley')

{'last_letter': 'y'}

In [3]:
nltk.download('names')

[nltk_data] Downloading package names to /home/ben/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

### Examples
Prepare a list of examples with corresponding class labels

In [4]:
# display(names.words('male.txt'))

male = [(name, 'male') for name in names.words('male.txt')]
female = [(name, 'female') for name in names.words('female.txt')]
labeled_names = male + female

random.shuffle(labeled_names)
labeled_names

[('Benny', 'female'),
 ('Emelita', 'female'),
 ('Anjela', 'female'),
 ('Octavia', 'female'),
 ('Clint', 'male'),
 ('Clemmy', 'female'),
 ('Aliza', 'female'),
 ('Vincents', 'male'),
 ('Cordula', 'female'),
 ('Cybill', 'female'),
 ('Meara', 'female'),
 ('Noell', 'female'),
 ('Megen', 'female'),
 ('Gabbie', 'female'),
 ('Jotham', 'male'),
 ('Sindee', 'female'),
 ('Barth', 'male'),
 ('Verna', 'female'),
 ('Herrmann', 'male'),
 ('Anne', 'female'),
 ('Orrin', 'male'),
 ('Bayard', 'male'),
 ('Pepi', 'female'),
 ('Maribeth', 'female'),
 ('Eddie', 'male'),
 ('Isadore', 'male'),
 ('Carlton', 'male'),
 ('Sapphire', 'female'),
 ('Cole', 'male'),
 ('Morgana', 'female'),
 ('Helaine', 'female'),
 ('Paolina', 'female'),
 ('Fergus', 'male'),
 ('Goober', 'male'),
 ('Wini', 'female'),
 ('Storey', 'female'),
 ('Barbee', 'female'),
 ('Kath', 'female'),
 ('Johnette', 'female'),
 ('Brigitta', 'female'),
 ('Clemmie', 'male'),
 ('Van', 'female'),
 ('Merridie', 'female'),
 ('Shalna', 'female'),
 ('Dora', 'femal

### Training/test data
Use the feature extractor to prepare training and testing data

In [5]:
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]
# train_set, test_set = featuresets[500:], featuresets[:500]
train_set, test_set = train_test_split(featuresets)
# display(train_set[:3])

# Uses multinomial naive Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

### Try it out on a few names

In [6]:
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))
print(classifier.classify(gender_features('Amy')))
print(classifier.classify(gender_features('Andy')))
print(classifier.classify(gender_features('Laren')))


male
female
female
female
male


### Check precision/recall

In [7]:

y_true = list(list(zip(*test_set))[1])
test_features = list(list(zip(*test_set))[0])
y_predict = [classifier.classify(features) for features in test_features]


(p,r,f,s) = precision_recall_fscore_support(y_true, y_predict)
print(p,r,f,s)

[0.81654957 0.6822695 ] [0.82362205 0.67178771] [0.82007056 0.67698804] [1270  716]


### Likelihood ratios
Names in the training set that end in "a" are female 34 times more often than they are male.

In [8]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     34.8 : 1.0
             last_letter = 'k'              male : female =     33.1 : 1.0
             last_letter = 'f'              male : female =     24.0 : 1.0
             last_letter = 'v'              male : female =     12.8 : 1.0
             last_letter = 'd'              male : female =      9.7 : 1.0


# Can we do better?
Update the feature extractor to see if we can do better.

In [9]:
from nltk.metrics.scores import (precision, recall)

def gender_features(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

gender_features('Hephzibah')

{'first_letter': 'h',
 'last_letter': 'h',
 'count(a)': 1,
 'has(a)': True,
 'count(b)': 1,
 'has(b)': True,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 1,
 'has(e)': True,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 3,
 'has(h)': True,
 'count(i)': 1,
 'has(i)': True,
 'count(j)': 0,
 'has(j)': False,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 0,
 'has(n)': False,
 'count(o)': 0,
 'has(o)': False,
 'count(p)': 1,
 'has(p)': True,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 1,
 'has(z)': True}

In [10]:
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]
train_set, test_set = train_test_split(featuresets)
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))
print(classifier.classify(gender_features('Amy')))
print(classifier.classify(gender_features('Andy')))

male
female
female
female


In [11]:
# print(nltk.classify.accuracy(classifier, test_set))
y_true = list(list(zip(*test_set))[1])
test_features = list(list(zip(*test_set))[0])
y_predict = [classifier.classify(features) for features in test_features]
(p,r,f,s) = precision_recall_fscore_support(y_true, y_predict)
print(p,r,f,s)

classifier.show_most_informative_features(15)

[0.82096404 0.69808542] [0.83959311 0.66949153] [0.83017408 0.68348955] [1278  708]
Most Informative Features
             last_letter = 'k'              male : female =     38.9 : 1.0
             last_letter = 'a'            female : male   =     37.5 : 1.0
             last_letter = 'p'              male : female =     16.1 : 1.0
             last_letter = 'd'              male : female =     12.2 : 1.0
             last_letter = 'm'              male : female =     11.2 : 1.0
             last_letter = 'f'              male : female =     10.3 : 1.0
             last_letter = 'o'              male : female =      9.1 : 1.0
             last_letter = 'v'              male : female =      7.6 : 1.0
                count(v) = 2              female : male   =      7.0 : 1.0
             last_letter = 'r'              male : female =      6.4 : 1.0
             last_letter = 'w'              male : female =      5.0 : 1.0
             last_letter = 'g'              male : female =      

# Gaussian Naive Bayes Classifer
This type of classifer works with quantitative variables (numbers)

## We'll do it with scipy-learn

Adapted from https://scikit-learn.org/stable/modules/naive_bayes.html

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

iris = datasets.load_iris()
print(iris.keys())

# Use Gaussian for datasets with quantitative variables
clf = GaussianNB()

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
p,r,f,s = precision_recall_fscore_support(y_test, y_pred)
print(p, r, f)
print(X_train.shape)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
[1.         0.93333333 1.        ] [1.         1.         0.90909091] [1.         0.96551724 0.95238095]
(112, 4)


In [13]:
X_train
y_train

array([0, 0, 2, 1, 1, 2, 2, 0, 0, 0, 2, 1, 1, 0, 2, 2, 1, 0, 0, 2, 0, 2,
       2, 2, 1, 1, 0, 2, 1, 1, 0, 1, 0, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2,
       2, 0, 0, 1, 0, 1, 2, 2, 0, 1, 1, 1, 1, 0, 2, 2, 0, 1, 2, 1, 0, 0,
       0, 1, 2, 1, 2, 0, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2, 1, 2, 0, 0, 1, 0,
       1, 1, 2, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 1, 2, 1, 0, 2, 1, 0, 2, 0,
       1, 2])

## Use a label encoder to convert categorical variables to quantitative "dummy" variables
This is known as one-hot encoding

In [17]:
alphabet = 'abcdefghijklmnopqrstuvwxyz'
le = preprocessing.LabelEncoder()

# Initialize the label encoder
le.fit([l for l in alphabet])

# Create a feature vector with all zeros, one for each letter of the alphabet
# a b c d ... x y z
# 0 0 0 0 ... 0 0 0
features = np.zeros(len(alphabet))
features

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [18]:
# Use the label encoder to get the index of the feature
var_index = le.transform(['c'])
var_index

array([2])

In [115]:
# Mark the 'c' feature as present
# a b c d ... x y z
# 0 0 1 0 ... 0 0 0
features[var_index] = 1
display(features)

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [117]:
# Mark the 'd' and 'x' features as present
# a b c d ... x y z
# 0 0 1 1 ... 1 0 0
features[le.transform(['d'])] = 1
features[le.transform(['x'])] = 1

features[le.transform(['a','b'])] = 1

display(features)

array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0.])

Do the name example again with a Gaussian Naive Bayes Classifier, using dummy variables to account for the categorical variables.

In [118]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.metrics import precision_recall_fscore_support
import numpy
import numpy as np

alphabet = 'abcdefghijklmnopqrstuvwxyz'
le = preprocessing.LabelEncoder()
le.fit([l for l in alphabet])

def gender_features(name):
    features = []
    first = name[0].lower()
    last = name[-1].lower()

    a = np.zeros(len(alphabet))
    a[le.transform([first])[0]] = 1
    features.extend(a)

    a = np.zeros(len(alphabet))
    if alphabet.find(last) > -1:
        a[le.transform([last])[0]] = 1
    features.extend(a)

    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features.append(name.lower().count(letter))
        features.append(letter in name.lower())
    return features


# display(gender_features("Hank"))

random.shuffle(labeled_names)
X = [gender_features(name) for (name, _) in labeled_names]
y = [gender for (_, gender) in labeled_names]
X

X_train, X_test = X[500:], X[:500]
y_train, y_test = y[500:], y[:500]

# Use multinomial for text classification
clf = MultinomialNB()
clf.fit(X_train, y_train).predict(X_test)
y_pred = clf.predict(X_test)

p,r,f,s = precision_recall_fscore_support(y_test, y_pred)
print(p, r, f)                                                


[0.79935275 0.7486911 ] [0.83728814 0.69756098] [0.81788079 0.72222222]


# OneHotEncoder from sklearn

In [120]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
# The dataset. There are two features: gender and years of college
X = [['Male', 1], ['Female', 3], ['Female', 2], ['Female', 6]]
enc.fit(X)

# features are [female, male, 1, 2, 3]
print(enc.categories_)

[array(['Female', 'Male'], dtype=object), array([1, 2, 3, 6], dtype=object)]


In [123]:
# Encode three data points. Note that the last data point doesn't have any entry for 1, 2, or 3
feature_vectors = enc.transform([['Female', 1], ['Male', 2], ['Male', 4]]).toarray()
print(feature_vectors)

# Get the semantic meaning for two encoded data points
enc.inverse_transform([[0, 1, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]])

[[1. 0. 1. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]]


array([['Male', 1],
       [None, 2]], dtype=object)