# Other Popular Machine Learning Methods
## Naive Bayes Classifiers

In [1]:
import numpy as np
import pandas as pd
import urllib
import sklearn

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

## Naive Bayes
### Using Naive Bayes to predict spam

In [3]:
# Spambase data file hosted in the UCI machine-learning repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# a bare `import urllib` does not guarantee that the `request` submodule is loaded
import urllib.request

# Download inside a context manager so the HTTP response is closed as soon as
# the contents have been parsed (previously the connection was left open).
with urllib.request.urlopen(url) as raw_data:
    # the file holds comma-separated numbers, so it can be parsed like a CSV file
    dataset = np.loadtxt(raw_data, delimiter=',')

# preview the first record (row 0 of the array, not a column)
print(dataset[0])

[  0.      0.64    0.64    0.      0.32    0.      0.      0.      0.
   0.      0.      0.64    0.      0.      0.      0.32    0.      1.29
   1.93    0.      0.96    0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.778   0.      0.
   3.756  61.    278.      1.   ]


In [7]:
# show the loaded array followed by its (rows, columns) shape
display(dataset, dataset.shape)

array([[0.000e+00, 6.400e-01, 6.400e-01, ..., 6.100e+01, 2.780e+02,
        1.000e+00],
       [2.100e-01, 2.800e-01, 5.000e-01, ..., 1.010e+02, 1.028e+03,
        1.000e+00],
       [6.000e-02, 0.000e+00, 7.100e-01, ..., 4.850e+02, 2.259e+03,
        1.000e+00],
       ...,
       [3.000e-01, 0.000e+00, 3.000e-01, ..., 6.000e+00, 1.180e+02,
        0.000e+00],
       [9.600e-01, 0.000e+00, 0.000e+00, ..., 5.000e+00, 7.800e+01,
        0.000e+00],
       [0.000e+00, 0.000e+00, 6.500e-01, ..., 5.000e+00, 4.000e+01,
        0.000e+00]])

(4601, 58)

In [8]:
# Split the raw array into predictors and target.
#
# The variables that describe word frequencies are standardized on the same
# scale, whereas the other variables have differing magnitudes. Restricting the
# analysis to the 48 word-frequency features therefore has the side benefit
# that no pre-processing is required.

# target variable: the spam label in the last column (spam -> 1, not spam -> 0)
y = dataset[:, -1]

# predictive variables: only the first 48 columns
X = dataset[:, :48]

In [9]:
# hold out 20% of the rows as a test set (80-20 split);
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [11]:
# The data consists of continuous variables describing word frequencies, while
# Bernoulli naive Bayes models binary features, so the inputs are thresholded
# ("binarized") before fitting.

# binarize -> threshold for binarizing (mapping to booleans) the sample
# features: values greater than the threshold become 1, all others 0.
# If None, input is presumed to already consist of binary vectors.
#
# NOTE: binarize is a numeric threshold, not an on/off switch, and the
# scikit-learn default is 0.0. Passing `binarize=True` is interpreted as the
# number 1, so the threshold is written out as 1.0 here. The fitted model and
# its predictions are the same; only the printed repr shows 1.0 instead of True.
BernNB = BernoulliNB(binarize=1.0)
BernNB.fit(X_train, y_train)
print(BernNB)

# test the model's accuracy on the held-out split
y_expect = y_test
y_pred = BernNB.predict(X_test)

# get accuracy score
print(accuracy_score(y_expect, y_pred))

# accuracy comes out at about 86% for this model

BernoulliNB(binarize=True)
0.8577633007600435


In [12]:
# Multinomial naive Bayes is a great candidate for this data
MultiNB = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator itself
print(MultiNB)

# y_test is not referenced again because the expected labels were already
# stored in y_expect when the Bernoulli model was scored
y_pred = MultiNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

# accuracy comes out at about 88% for this model,
# a little better than the Bernoulli model

MultinomialNB()
0.8816503800217155


In [14]:
# all of the features are numeric, so technically a Gaussian naive Bayes
# classifier can be tried out as well
GausNB = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator itself
print(GausNB)

y_pred = GausNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

# accuracy comes out at about 82% for this model

GaussianNB()
0.8197611292073833


In [15]:
# Improve the Bernoulli model: by trial and error, a binarize threshold of 0.1
# was found to give the best results
y_expect = y_test

BernNB = BernoulliNB(binarize=0.1).fit(X_train, y_train)  # fit() returns the estimator itself
print(BernNB)

y_pred = BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))
# accuracy comes out at about 91% for this model, the best of the models tried.
# Takeaway: adjust your model parameter settings to get the best performance.

BernoulliNB(binarize=0.1)
0.9109663409337676


In [16]:
import numpy as np
from sklearn.naive_bayes import BernoulliNB

# Toy data from a fixed seed: 6 samples x 100 integer features in the range 0..4.
# NOTE: this rebinds X, which the earlier cells use for the spam feature matrix.
rng = np.random.RandomState(1)
X = rng.randint(0, 5, size=(6, 100))

# one class label per sample
Y = np.array([1, 2, 3, 4, 4, 5])

clf = BernoulliNB()
# keep fit() as the final expression so the notebook displays the fitted estimator
clf.fit(X, Y)

BernoulliNB()

In [17]:
# predict the class of the third toy sample; the 2:3 row slice keeps the input two-dimensional
print(clf.predict(X[2:3, :]))

[3]
