# 0. Dependências

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

%matplotlib inline
pd.options.display.max_rows = 10

  _nan_object_mask = _nan_object_array != _nan_object_array


# 1. Introdução 

# 2. Dados

In [2]:
iris = load_iris()

df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['class'] = iris.target
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [3]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [4]:
x = df.drop(labels='class', axis=1).values
y = df['class'].values

print(x.shape, y.shape)

(150, 4) (150,)


# 3. Implementação 

In [5]:
class GaussianNaiveBayes():
    def __init__(self):
        self.class_prior_ = None
        self.theta_ = 0.0
        self.sigma_ = 0.0
    
    def fit(self, x, y):
        classes, counts = np.unique(y, return_counts=True)
        self.class_prior_ = counts / counts.sum()
        
        self.theta_ = np.array([np.mean(x[y==c], axis=0) for c in classes])
        self.sigma_ = np.array([np.var(x[y==c], axis=0) for c in classes])
            
    def predict(self, x):
        y_pred = []
        for sample in x:
            joint_prob = self.__joint_prob(sample)
            maximum_posterior = self.__maximum_posterior(joint_prob)
            y_pred.append(np.argmax(maximum_posterior))
        return np.array(y_pred)
    
    def predict_proba(self, x):
        y_pred = []
        for sample in x:
            joint_prob = self.__joint_prob(sample)
            maximum_posterior = self.__maximum_posterior(joint_prob)
            y_pred.append(maximum_posterior)
        return np.array(y_pred)
            
    def __normal_pdf(self, x, mean_c, var_c):
        exponent = ((x - mean_c)**2) / (2 * var_c)
        f = (1.0 / np.sqrt(2.0 * np.pi * var_c)) * np.exp(-exponent)
        return np.prod(f)
    
    def __joint_prob(self, x):
        joint_prob = []
        for p, t, s in zip(self.class_prior_, self.theta_, self.sigma_):
            joint_prob.append(p * self.__normal_pdf(x, t, s))
        return joint_prob
    
    def __maximum_posterior(self, joint_prob):
        marginal_pdf = np.sum(joint_prob)
        return joint_prob / marginal_pdf

# 4. Teste 

In [6]:
clf = GaussianNaiveBayes()
clf.fit(x, y)

print(clf.theta_)
print(clf.sigma_)
print(clf.predict(x[::15]))
print(clf.predict_proba(x[::15]))

[[ 5.006  3.428  1.462  0.246]
 [ 5.936  2.77   4.26   1.326]
 [ 6.588  2.974  5.552  2.026]]
[[ 0.121764  0.140816  0.029556  0.010884]
 [ 0.261104  0.0965    0.2164    0.038324]
 [ 0.396256  0.101924  0.298496  0.073924]]
[0 0 0 0 1 1 1 2 2 2]
[[  1.00000000e+000   1.35784018e-018   7.11282484e-026]
 [  1.00000000e+000   3.04074000e-017   1.66211220e-023]
 [  1.00000000e+000   1.07709981e-016   2.38609139e-024]
 [  1.00000000e+000   1.95283750e-016   1.97346867e-024]
 [  1.07497743e-041   9.99999765e-001   2.35068154e-007]
 [  3.30674649e-094   9.87471083e-001   1.25289167e-002]
 [  1.13183494e-082   9.99873680e-001   1.26320291e-004]
 [  4.65992514e-273   2.40976795e-010   1.00000000e+000]
 [  7.89094248e-222   1.17116772e-008   9.99999988e-001]
 [  6.08057313e-254   8.98577633e-011   1.00000000e+000]]


In [7]:
y_pred = clf.predict(x)
print(accuracy_score(y, y_pred))

0.96


### Comparação com o Scikit-learn

In [8]:
clf_sk = GaussianNB()
clf_sk.fit(x, y)

print(clf_sk.theta_)
print(clf_sk.sigma_)
print(clf_sk.predict(x[::15]))
print(clf_sk.predict_proba(x[::15]))

[[ 5.006  3.428  1.462  0.246]
 [ 5.936  2.77   4.26   1.326]
 [ 6.588  2.974  5.552  2.026]]
[[ 0.121764  0.140816  0.029556  0.010884]
 [ 0.261104  0.0965    0.2164    0.038324]
 [ 0.396256  0.101924  0.298496  0.073924]]
[0 0 0 0 1 1 1 2 2 2]
[[  1.00000000e+000   1.35784265e-018   7.11283512e-026]
 [  1.00000000e+000   3.04074398e-017   1.66211400e-023]
 [  1.00000000e+000   1.07710163e-016   2.38609460e-024]
 [  1.00000000e+000   1.95284044e-016   1.97347120e-024]
 [  1.07499306e-041   9.99999765e-001   2.35068227e-007]
 [  3.30685495e-094   9.87471082e-001   1.25289184e-002]
 [  1.13186560e-082   9.99873680e-001   1.26320322e-004]
 [  4.66035479e-273   2.40976995e-010   1.00000000e+000]
 [  7.89162997e-222   1.17116897e-008   9.99999988e-001]
 [  6.08114349e-254   8.98578646e-011   1.00000000e+000]]


In [9]:
y_pred = clf_sk.predict(x)
print(accuracy_score(y, y_pred))

0.96


## 5. Referências

- [Repositório do GitHub](https://github.com/odubno/GaussNaiveBayes)