# Bernoulli/Multinomial Naive Bayes

Naive Bayes using Bernoulli

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from sklearn.naive_bayes import BernoulliNB

In [3]:
newsgroups = fetch_20newsgroups(subset = 'all')

In [4]:
vectorizer_BNB = CountVectorizer(binary = True)

In [6]:
X1 = vectorizer_BNB.fit_transform(newsgroups.data)

In [7]:
y = newsgroups.target

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
xtrain1,xtest1,ytrain,ytest = train_test_split(X1,y,test_size=0.25,random_state=42)

In [10]:
BNB = BernoulliNB()

In [12]:
BNB.fit(xtrain1,ytrain)

In [14]:
y_pred1 = BNB.predict(xtest1)

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
accuracy_score(ytest,y_pred1)

0.6878183361629882

Naive Bayes using Multinomial

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
newsgroups = fetch_20newsgroups(subset = 'all')

In [22]:
vectorizer_MNB = CountVectorizer(binary = False)

In [23]:
X2 = vectorizer_MNB.fit_transform(newsgroups.data)

In [24]:
y = newsgroups.target

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
xtrain2,xtest2,ytrain,ytest = train_test_split(X2,y,test_size=0.25,random_state=42)

In [27]:
MNB = MultinomialNB()

In [28]:
MNB.fit(xtrain2,ytrain)

In [29]:
y_pred2 = MNB.predict(xtest2)

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
accuracy_score(ytest,y_pred2)

0.8469864176570459

Conclusion: multinomial is better compared to Bernoulli because frequency based vectors gave better accuracy

Naive Bayes using Multinomial and Tfid Vectorizer

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
from sklearn.pipeline import make_pipeline

In [34]:
model = make_pipeline(TfidfVectorizer(),MultinomialNB())

In [35]:
train_data = fetch_20newsgroups(subset = 'train')
test_data = fetch_20newsgroups(subset = 'test')                                

In [36]:
model.fit(train_data.data,train_data.target)

In [37]:
predictions_tf = model.predict(test_data.data)

In [38]:
accuracy_score(test_data.target,predictions_tf)

0.7738980350504514