In [1]:
import pickle
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pylab as pl
import numpy as np
import warnings
import random
import os
import spacy
from PIL import Image

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn import model_selection as ms
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

# Study examples

## Sklearn example

In [3]:
ds = datasets.load_iris()

In [4]:
ds.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [5]:
ds['target'].shape, ds['data'].shape

((150,), (150, 4))

In [6]:
ds['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [7]:
ds.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [8]:
print(ds['DESCR'])

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [9]:
gnb=GaussianNB()

In [10]:
y_pred = gnb.fit(ds.data, ds.target).predict(ds.data)

In [11]:
(ds.target!=y_pred).sum()

6

In [12]:
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])

In [13]:
clf=GaussianNB()
clf.fit(X,Y)

GaussianNB(priors=None)

In [14]:
clf.predict([[10,-1]])

array([2])

## Udacity example

### Functions

In [62]:
def prettyPicture(clf, X_test, y_test, img_title=None):
    plt.clf()
    x_min = 0.0; x_max = 1.0
    y_min = 0.0; y_max = 1.0

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, m_max]x[y_min, y_max].
    h = .01  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)

    # Plot also the test points
    grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
    bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
    grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
    bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]

    plt.scatter(grade_sig, bumpy_sig, color = "b", label="fast")
    plt.scatter(grade_bkg, bumpy_bkg, color = "r", label="slow")
    plt.legend()
    plt.xlabel("bumpiness")
    plt.ylabel("grade")
    
    if img_title==None:
        plt.savefig("test.png")
    else:
        plt.savefig(f'{img_title}.png')
    

In [16]:
def makeTerrainData(n_points=1000):
###############################################################################
### make the toy dataset
    random.seed(42)
    grade = [random.random() for ii in range(0,n_points)]
    bumpy = [random.random() for ii in range(0,n_points)]
    error = [random.random() for ii in range(0,n_points)]
    y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii]) for ii in range(0,n_points)]
    for ii in range(0, len(y)):
        if grade[ii]>0.8 or bumpy[ii]>0.8:
            y[ii] = 1.0

### split into train/test sets
    X = [[gg, ss] for gg, ss in zip(grade, bumpy)]
    split = int(0.75*n_points)
    X_train = X[0:split]
    X_test  = X[split:]
    y_train = y[0:split]
    y_test  = y[split:]

    grade_sig = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==0]
    bumpy_sig = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==0]
    grade_bkg = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==1]
    bumpy_bkg = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==1]

#    training_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
#            , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}


    grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
    bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
    grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
    bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]

    test_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
            , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}

    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

### Classifier

In [17]:
features_train, labels_train, features_test, labels_test= makeTerrainData()

In [18]:
### the training data (features_train, labels_train) have both "fast" and "slow" points mixed
### in together--separate them so we can give them different colors in the scatterplot,
### and visually identify them
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1]

In [19]:
## fit the classifier
clf = GaussianNB()
clf.fit(features_train, labels_train)

GaussianNB(priors=None)

In [20]:
### draw the decision boundary with the text points overlaid
prettyPicture(clf, features_test, labels_test)

<img src="test.png">

In [21]:
## predict labels
predictions = clf.predict(features_test)

In [22]:
predictions.shape

(250,)

In [23]:
## measure accuracy with metrics module
metrics.accuracy_score(labels_test, predictions)

0.884

In [24]:
## measure accuracy with GaussianNB module
clf.score(features_test, labels_test)

0.884

# Mini-project

Use a Naive Bayes Classifier to identify emails by their authors.
    
Authors and labels:
- Sara has label 0
- Chris has label 1

### Load and prepare data

In [2]:
words_file = '../tools/word_data.pkl'
authors_file = '../tools/email_authors.pkl'

In [3]:
with open(words_file,'rb') as words_file_handler:
    word_data = pickle.load(words_file_handler)
with open(authors_file,'rb') as authors_file_handler:
    author = pickle.load(authors_file_handler)  

In [5]:
type(word_data), type(author)

(list, list)

In [12]:
len(word_data)

17578

In [4]:
## create train and test split
features_train, features_test,\
labels_train, labels_test = ms.train_test_split(word_data, author, 
                                                test_size = 0.1, 
                                                random_state=42)

In [5]:
type(features_train), len(features_train), type(features_test),len(features_test)

(list, 15820, list, 1758)

In [7]:
len(features_train), len(labels_train)

(15820, 15820)

#### Vectorizer without tokenizer

In [20]:
## vectorise words
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, 
                             stop_words='english')

In [21]:
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [22]:
features_train_transformed = vectorizer.fit_transform(features_train)
features_test_transformed  = vectorizer.transform(features_test)

In [23]:
features_train_transformed.shape,  features_test_transformed.shape

((15820, 37851), (1758, 37851))

In [86]:
### feature selection, because text is super high dimensional and 
### can be really computationally chewy as a result
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(features_train_transformed, labels_train)
features_train_transformed = selector.transform(features_train_transformed).toarray()
features_test_transformed  = selector.transform(features_test_transformed).toarray()

In [87]:
### info on the data
print ("no. of Chris training emails:", sum(labels_train))
print ("no. of Sara training emails:", len(labels_train)-sum(labels_train))

no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [88]:
features_train_transformed.shape, features_test_transformed.shape

((15820, 3785), (1758, 3785))

In [89]:
type(labels_train),type(labels_test)

(list, list)

In [90]:
len(labels_train), len(labels_test)

(15820, 1758)

#### Vectorizer with Spacy as tokenizer

In [31]:
# create a spaCy tokenizer
spacy.load('en')
lemmatizer = spacy.lang.en.English()

In [41]:
# tokenize the doc and lemmatize its tokens
def my_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens])

In [33]:
## vectorise words
vec_spacy = TfidfVectorizer(sublinear_tf=True, max_df=0.5, 
                             stop_words='english', tokenizer=my_tokenizer)

In [34]:
features_train_spacy = vectorizer.fit_transform(features_train)
features_test_spacy  = vectorizer.transform(features_test)

In [35]:
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(features_train_spacy, labels_train)
features_train_spacy = selector.transform(features_train_spacy).toarray()
features_test_spacy  = selector.transform(features_test_spacy).toarray()

### Classifier

#### Score without tokenizer

In [59]:
clf = GaussianNB()

In [61]:
clf.fit(features_train_transformed, labels_train)

GaussianNB(priors=None)

In [100]:
pred = clf.predict(features_test_transformed)

In [101]:
clf.score(features_test_transformed, labels_test)

0.9732650739476678

#### Score with tokenizer

In [36]:
sp_clf = GaussianNB()
sp_clf.fit(features_train_spacy, labels_train)

GaussianNB(priors=None)

In [37]:
sp_pred = sp_clf.predict(features_test_spacy)

In [39]:
sp_clf.score(features_test_spacy, labels_test)

0.9732650739476678

#### Classifier without feature selection

In [40]:
features_train_vect = vectorizer.fit_transform(features_train)
features_test_vect = vectorizer.transform(features_test)

In [44]:
full_clf = GaussianNB()
full_clf.fit(features_train_vect.toarray(), labels_train)

GaussianNB(priors=None)

In [46]:
full_pred = full_clf.predict(features_test_vect.toarray())

In [48]:
full_clf.score(features_test_vect.toarray(), labels_test)

0.9630261660978384