In [None]:
#https://towardsdatascience.com/introduction-to-naive-bayes-classification-4cffabb1ae54
#http://scikit-learn.org/stable/modules/naive_bayes.html

#example:
#https://www.analyticsvidhya.com/blog/2017/09/naive-bayes-explained/

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import datasets

In [3]:
iris_aux = datasets.load_iris()
iris = pd.DataFrame(iris_aux.data, columns=iris_aux.feature_names)
iris['target'] = pd.Series(iris_aux.target)
iris['species'] = pd.Categorical.from_codes(iris_aux.target, iris_aux.target_names)
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,species
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


### Gaussian Naive Bayes

In [4]:
X = iris.iloc[:,:4]
y = iris['target']
y = np.ravel(y)

In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [6]:
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut
cv_kfold = KFold(n_splits=3, shuffle=False, random_state=None)
cv_loo = LeaveOneOut()

In [7]:
from sklearn.model_selection import GridSearchCV
clf = GaussianNB(priors=None)

parameters = {
    'priors' : [None]
}

#scoring pode ser: accuracy, roc_auc, recall, precision... 
### regressao logisitca nao tem parametros para ficar testando
model_gnb = GridSearchCV(estimator = clf, param_grid = parameters, fit_params=None, n_jobs=1, iid=True, refit=True, 
                     cv=cv_kfold, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score= True)
model_gnb


GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise', estimator=GaussianNB(priors=None),
       fit_params=None, iid=True, n_jobs=1, param_grid={'priors': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [8]:
fitted_gnb = model_gnb.fit(X_train, y_train)

In [9]:
fitted_gnb.best_score_

0.9553571428571429

In [10]:
predict_gnp = fitted_gnb.predict(X_test)

In [11]:
pd.DataFrame(metrics.confusion_matrix(y_test, predict_gnp))

Unnamed: 0,0,1,2
0,13,0,0
1,0,16,0
2,0,0,9


In [12]:
print(metrics.classification_report(y_test, predict_gnp))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       1.00      1.00      1.00        16
          2       1.00      1.00      1.00         9

avg / total       1.00      1.00      1.00        38



### Bernoulli Naive Bayes

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html

In [14]:
X_bernoulli = np.random.randint(2, size=(6, 100))
X_bernoulli

array([[1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
        1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
        0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1],
       [1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
        1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
        0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1],
       [0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,
        0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
        1, 1, 1, 0, 1,

In [15]:
Y = np.array([1, 2, 3, 4, 4, 5])
Y

array([1, 2, 3, 4, 4, 5])

In [16]:
from sklearn.naive_bayes import BernoulliNB

In [17]:
clf_bnv = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)

parameters = {
    'alpha' : [1.0],
    'binarize' : [0.0],
    'fit_prior' : [True],
    'class_prior' : [True]    
}

#scoring pode ser: accuracy, roc_auc, recall, precision... 
### regressao logisitca nao tem parametros para ficar testando
model_bnb = GridSearchCV(estimator = clf_bnv, param_grid = parameters, fit_params=None, n_jobs=1, iid=True, refit=True, 
                     cv=cv_kfold, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score= True)
model_bnb


GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1.0], 'binarize': [0.0], 'fit_prior': [True], 'class_prior': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
grid_fitted_bnb = model_bnb.fit(X_bernoulli, Y)

TypeError: object of type 'bool' has no len()

In [None]:
model_final_bnb = grid_fitted_bnb.bes