# Objective 

To review the Naive Bayes algorithm

## Preliminaries

In [4]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

# The Algorithm

$$
P(A|B) = \dfrac{P(A) P(B|A)}{P(B)}
$$

$$
P(y|x_1, x_2,\ldots,x_n)=\dfrac{P(y) \times [P(x_1|y) P(x_2|y) \ldots P(x_n|y)]}{P(x_1, x_2, \ldots, x_n)}
$$

When the features are categorical, the count of each feature value among the examples of each class lets us estimate the conditional probabilities. Here we assume that the features are conditionally independent of one another given the class.

$$
P(y = \text{5-star}| \text{'wow'}, \text{'awesome'}) = \dfrac{P(\text{5-star}) \times [P(\text{'wow'}| \text{5-star}) P(\text{'awesome'}| \text{5-star})]}{P(\text{'wow'}, \text{'awesome'})}
$$

When the features are numeric, we assume that each feature follows a Gaussian distribution within each class and compute the conditional probability densities from it. So, for numeric features we make two assumptions - conditional independence and normality. These are rather strong assumptions; the independence assumption in particular is what gives the method the name 'naive'.

$$
P(y = \text{Diabetes}| \text{BP}, \text{Glucose}) = \dfrac{P(\text{Diabetes}) \times [P(\text{BP}| \text{Diabetes}) P(\text{Glucose}| \text{Diabetes})]}{P(\text{BP}, \text{Glucose})}
$$

# Example: **Yelp**

## Data

In [5]:
# NOTE(review): this path looks like a Colab Google Drive mount; presumably the
# drive has to be mounted before this cell runs - confirm.
yelp_csv_path = "/content/drive/MyDrive/AI-ML/supervised-learning-revision/Day2/data/yelp.csv"
yelp_df = pd.read_csv(yelp_csv_path)

In [6]:
# Preview the first rows of the raw reviews table
yelp_df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [7]:
# (rows, columns) of the raw reviews table
yelp_df.shape

(10000, 10)

In [8]:
# Keep only the extreme ratings (1-star and 5-star) so the task is binary.
# A plain string is enough here: the query has no placeholders to interpolate.
yelp15_df = yelp_df.query("stars in [1, 5]")

In [9]:
# Class balance of the filtered reviews
yelp15_df["stars"].value_counts()

5    3337
1     749
Name: stars, dtype: int64

In [10]:
# Feature: the raw review text; target: the star rating
yelp_X = yelp15_df["text"]
yelp_y = yelp15_df["stars"]

In [11]:
# Hold out 20% of the reviews as the test set; the fixed seed keeps the split repeatable
yelp_Xtrain, yelp_Xtest, yelp_ytrain, yelp_ytest = train_test_split(
    yelp_X,
    yelp_y,
    test_size=0.2,
    random_state=20130810,
)

In [12]:
# Carve a validation set (20%) out of the remaining training data.
# NOTE: this overwrites yelp_Xtrain / yelp_ytrain with the smaller split, so
# re-running this cell on its own shrinks the training set again; re-run the
# train/test split cell first.
yelp_Xtrain, yelp_Xvalid, yelp_ytrain, yelp_yvalid = train_test_split(
    yelp_Xtrain,
    yelp_ytrain,
    test_size=0.2,
    random_state=20130810,
)

In [13]:
# Sample of the training reviews (the original row index is preserved by the split)
yelp_Xtrain.head()

5313    I love Mexican food. Today I ate half of a tac...
9430    Great place - nice ambiance, cool decor, tasty...
1249    Anything I write will not do justice to this a...
9162    Going to Desert Botanical Garden is the perfec...
3814    My house is clean using pet-friendly products....
Name: text, dtype: object

In [14]:
# Bag-of-words encoder that drops scikit-learn's built-in English stop words
count_vectorizer = CountVectorizer(stop_words="english")

In [15]:
# Learn the vocabulary from the training reviews only and encode them as counts
yelp_Xvec_train = count_vectorizer.fit_transform(yelp_Xtrain)

# Encode validation and test reviews with the vocabulary fitted above (no refit)
yelp_Xvec_valid = count_vectorizer.transform(yelp_Xvalid)
yelp_Xvec_test = count_vectorizer.transform(yelp_Xtest)

In [18]:
# Summarise the fitted vocabulary instead of dumping every token into the
# cell output: show its size and the first few terms in alphabetical order.
vocabulary_terms = sorted(count_vectorizer.vocabulary_)
len(vocabulary_terms), vocabulary_terms[:10]



In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
try:
    feature_names = count_vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = count_vectorizer.get_feature_names()

# Dense copy of the count matrix, with one column per vocabulary term
words_df = pd.DataFrame(yelp_Xvec_train.toarray(),
                        columns=feature_names)

# Total occurrences of every term across the training reviews
words_frequency = pd.DataFrame(words_df.sum(axis=0)).reset_index()

words_frequency.columns = ['word', 'frequency']

# Ten most frequent terms
(words_frequency.sort_values(by='frequency',
                             ascending=False)
                .head(10))

Unnamed: 0,word,frequency
10256,place,1605
5477,food,1495
6101,great,1326
6001,good,1283
7982,like,1118
7511,just,1082
13899,time,841
12105,service,810
14595,ve,760
8185,love,738


## Model

In [20]:
# Multinomial Naive Bayes classifier with default settings.
# NOTE(review): the variable name misspells "multinomial"; it is kept as-is
# because the following cells reference this exact name.
learner_multiomialnb = MultinomialNB()

In [21]:
%%time

# Fit the classifier on the training word counts and their star labels
learner_multiomialnb.fit(yelp_Xvec_train, yelp_ytrain)

CPU times: user 3.96 ms, sys: 3.15 ms, total: 7.11 ms
Wall time: 7.23 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
# Accuracy on the data the model was trained on (an optimistic estimate)
learner_multiomialnb.score(yelp_Xvec_train, yelp_ytrain)

0.9797245600612089

In [23]:
# Rows are the true star ratings, columns the predicted ones (validation set)
yelp_valid_pred = learner_multiomialnb.predict(yelp_Xvec_valid)
confusion_matrix(yelp_yvalid, yelp_valid_pred)

array([[ 68,  52],
       [ 15, 519]])

In [24]:
# Per-class precision / recall / F1 on the validation set
yelp_valid_report = classification_report(
    yelp_yvalid,
    learner_multiomialnb.predict(yelp_Xvec_valid),
)
print(yelp_valid_report)

              precision    recall  f1-score   support

           1       0.82      0.57      0.67       120
           5       0.91      0.97      0.94       534

    accuracy                           0.90       654
   macro avg       0.86      0.77      0.80       654
weighted avg       0.89      0.90      0.89       654



# Example: **Diabetes**

## Data

In [25]:
# Diabetes dataset, read straight from a public GitHub repository
diabetes_csv_url = "https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"
diabetes_df = pd.read_csv(diabetes_csv_url)

In [26]:
# Every column except 'Outcome' is a feature; 'Outcome' is the target
diabetes_X = diabetes_df.drop(columns="Outcome")
diabetes_y = diabetes_df["Outcome"]

In [27]:
# Step 1: hold out 20% of the rows as the test set
diabetes_Xtrain, diabetes_Xtest, diabetes_ytrain, diabetes_ytest = train_test_split(
    diabetes_X,
    diabetes_y,
    test_size=0.2,
    random_state=20130810,
)

# Step 2: split 20% of the remaining rows off as the validation set
# (this overwrites the training variables with the smaller training split)
diabetes_Xtrain, diabetes_Xvalid, diabetes_ytrain, diabetes_yvalid = train_test_split(
    diabetes_Xtrain,
    diabetes_ytrain,
    test_size=0.2,
    random_state=20130810,
)

## Model

In [28]:
# Gaussian Naive Bayes classifier with default settings, for the numeric features
learner_gaussiannb = GaussianNB()

In [29]:
# Fit on the training split only; validation and test rows stay unseen
learner_gaussiannb.fit(diabetes_Xtrain, diabetes_ytrain)

GaussianNB(priors=None, var_smoothing=1e-09)

In [30]:
# Per-class precision / recall / F1 on the validation set
diabetes_valid_pred = learner_gaussiannb.predict(diabetes_Xvalid)
print(classification_report(diabetes_yvalid, diabetes_valid_pred))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84        79
           1       0.77      0.55      0.64        44

    accuracy                           0.78       123
   macro avg       0.78      0.73      0.74       123
weighted avg       0.78      0.78      0.77       123

