# Naive Bayes

- algorithm: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

### 1. First example

In [1]:
import numpy as np
from sklearn.naive_bayes import GaussianNB

In [2]:
# toy training set: three 2-D points per class, listed class by class
class_1_points = [[-1, -1], [-2, -1], [-3, -2]]
class_2_points = [[1, 1], [2, 1], [3, 2]]
X = np.array(class_1_points + class_2_points)
# one label per row of X, in the same order
Y = np.array([1] * len(class_1_points) + [2] * len(class_2_points))

In [3]:
# create a Gaussian Naive Bayes classifier with its default constructor arguments
clf = GaussianNB()

In [4]:
# fit the classifier to the points in X and their class labels in Y
clf.fit(X,Y)

GaussianNB()

In [5]:
# predict the class of one new data point (prints that it is classified as 1)
new_point = [[-0.8, -1]]  # predict takes a 2-D list: one row per sample
print(clf.predict(new_point))

[1]


### 2. Second example

In [6]:
from prep_terrain_data import makeTerrainData
from class_vis import prettyPicture, output_image
from ClassifyNB import classify
import pylab as pl

In [7]:
# unpack the four values returned by makeTerrainData (helper from prep_terrain_data,
# not shown here) into training and test features/labels
features_train, labels_train, features_test, labels_test = makeTerrainData()

In [8]:
### the training data (features_train, labels_train) mixes "fast" (label 0) and
### "slow" (label 1) points -- partition them by label first, then pull out each
### coordinate so the two groups can get different colors in a scatterplot
fast_points = [point for idx, point in enumerate(features_train) if labels_train[idx] == 0]
slow_points = [point for idx, point in enumerate(features_train) if labels_train[idx] == 1]

### first column of each point is "grade", second is "bumpy"
grade_fast = [point[0] for point in fast_points]
bumpy_fast = [point[1] for point in fast_points]
grade_slow = [point[0] for point in slow_points]
bumpy_slow = [point[1] for point in slow_points]

In [9]:
# classify() is imported from the local ClassifyNB script and has to be completed
# there (course exercise). Its body is not shown here; presumably it returns a
# fitted classifier, since clf.predict() is called on the result later -- verify.
# NOTE: this rebinds `clf`, replacing the classifier from the first example.
clf = classify(features_train, labels_train)

In [10]:
### draw the decision boundary with the test points overlaid
prettyPicture(clf, features_test, labels_test)

#### evaluate classifier: print accuracy
- using clf.score()

In [11]:
from studentCode import NBAccuracy

In [12]:
# accuracy reported by the NBAccuracy helper from studentCode (implementation not shown here)
NBAccuracy(features_train, labels_train, features_test, labels_test)

0.884

- use accuracy_score

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
# predicted labels for the test features
pred = clf.predict(features_test)

In [16]:
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
# Accuracy is symmetric, so the printed value is the same either way, but keeping
# the documented order avoids wrong results if this is adapted to an asymmetric
# metric such as precision or recall.
print(accuracy_score(labels_test, pred))

0.884


### note: train and test on two different sets of data
- if you evaluate on the training data, the accuracy is overly optimistic and overfitting goes undetected
- thus, always hold out part of the data (e.g. 10%) as a testing set

### Bayes' rule
- combines the prior probability with the test evidence to give the posterior probability

#### Question: what is probability of having cancer with positive test result?

In [20]:
prior = 0.01 # P(C): probability of cancer before running the test
test_sensitivity = 0.90 # P(pos | C): chance of positive result given cancer
test_specificity = 0.90 # P(neg | not C): chance of negative result given no cancer

#### step-by-step: get posterior

##### joint probabilities of two events

In [26]:
# joint probability of having cancer AND testing positive:
# P(C, pos) = P(C) * P(pos | C)
# (this is not yet the conditional P(C | pos); it still has to be divided by P(pos))
p_c_pos = prior * test_sensitivity

In [27]:
# joint probability of NOT having cancer AND testing positive:
# P(not C, pos) = P(not C) * P(pos | not C)
# The false-positive rate P(pos | not C) is 1 - specificity, not 1 - sensitivity;
# the two only coincide here because both are set to 0.90.
p_not_c_pos = (1 - prior) * (1 - test_specificity)

##### normalizer: probability of positive test result

In [28]:
# total probability of a positive test (true positives + false positives):
# P(pos) = P(C) * P(pos | C) + P(not C) * P(pos | not C)
# the false-positive term uses 1 - specificity, not 1 - sensitivity
norm = (prior * test_sensitivity) + ((1 - prior) * (1 - test_specificity))

##### posteriors

In [29]:
# posterior P(C | pos): probability of cancer given a positive test result
p_c_pos/norm

0.08333333333333336

In [31]:
# posterior P(not C | pos): probability of no cancer given a positive test result
p_not_c_pos/norm

0.9166666666666666

In [32]:
# sum of the two posteriors (as expected = 1)
(p_c_pos/norm) + (p_not_c_pos/norm)

1.0