## Naive Bayes and Bernouli Naive Bayes

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_doc = pd.read_csv('example_train.csv')

In [3]:
train_doc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Document  5 non-null      object
 1   Class     5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes


In [4]:
train_doc['Class'] = train_doc.Class.map({'cinema' : 0, 'education' : 1})
train_doc

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,1
1,Educational greatness depends on ethics,1
2,A story of great ethics and educational greatness,1
3,Sholey is a great cinema,0
4,good movie depends on good story,0


In [24]:
train_array = train_doc.values

X_train = train_array[:,0]
y_train = train_array[:,1]
y_train = y_train.astype('int') # sklearn needs y as integers

print(y_train)
y_train.info()

[1 1 1 0 0]


AttributeError: 'numpy.ndarray' object has no attribute 'info'

### Create the Bags of Words representation

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
vect = CountVectorizer()

In [8]:
vect.fit(X_train)
vect.vocabulary_

{'upgrad': 15,
 'is': 9,
 'great': 6,
 'educational': 3,
 'institution': 8,
 'greatness': 7,
 'depends': 2,
 'on': 12,
 'ethics': 4,
 'story': 14,
 'of': 11,
 'and': 0,
 'sholey': 13,
 'cinema': 1,
 'good': 5,
 'movie': 10}

In [9]:
print(vect.get_feature_names())
print(len(vect.get_feature_names()))

['and', 'cinema', 'depends', 'educational', 'ethics', 'good', 'great', 'greatness', 'institution', 'is', 'movie', 'of', 'on', 'sholey', 'story', 'upgrad']
16


In [10]:
vect2 = CountVectorizer(stop_words='english')
vect2.fit(X_train)
len(vect2.vocabulary_)

12

In [11]:
X_train_transformed = vect2.transform(X_train)
X_train_transformed   #compressed sparse row

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [12]:
print(X_train_transformed)

  (0, 2)	1
  (0, 5)	1
  (0, 7)	1
  (0, 11)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
  (2, 6)	1
  (2, 10)	1
  (3, 0)	1
  (3, 5)	1
  (3, 9)	1
  (4, 1)	1
  (4, 4)	2
  (4, 8)	1
  (4, 10)	1


In [13]:
X_train_transformed.toarray()

array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0]])

In [14]:
pd.DataFrame(X_train_transformed.toarray(), columns = vect2.get_feature_names())

Unnamed: 0,cinema,depends,educational,ethics,good,great,greatness,institution,movie,sholey,story,upgrad
0,0,0,1,0,0,1,0,1,0,0,0,1
1,0,1,1,1,0,0,1,0,0,0,0,0
2,0,0,1,1,0,1,1,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,2,0,0,0,1,0,1,0


In [15]:
## Read the test data
test_doc = pd.read_csv('example_test.csv')

test_doc.head()

Unnamed: 0,Document,Class
0,very good educational institution,education


In [16]:
test_doc['Class'] = test_doc['Class'].map({'cinema': 0, 'education':1})

In [25]:
test_doc.head()

test_array = test_doc.values

X_test = test_array[:,0]
y_test = test_array[:,1]
y_test = y_test.astype('int') # sklearn needs y as integers
X_test

array(['very good educational institution'], dtype=object)

In [18]:
X_test_transformed = vect2.transform(X_test)

In [19]:
print(X_test_transformed.toarray())
pd.DataFrame(X_test_transformed.toarray(), columns = vect2.get_feature_names())

[[0 0 1 0 1 0 0 1 0 0 0 0]]


Unnamed: 0,cinema,depends,educational,ethics,good,great,greatness,institution,movie,sholey,story,upgrad
0,0,0,1,0,1,0,0,1,0,0,0,0


## Build the Multinomial Naive Bayes Model

In [26]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()

mnb.fit(X_train_transformed, y_train)

MultinomialNB()

In [29]:
proba = mnb.predict_proba(X_test_transformed)

In [30]:
print("probability of test document belonging to class CINEMA" , proba[:,0])
print("probability of test document belonging to class EDUCATION" , proba[:,1])

probability of test document belonging to class CINEMA [0.32808399]
probability of test document belonging to class EDUCATION [0.67191601]


## Build the Beurnoli Naive Bayes model

In [33]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

bnb.fit(X_train_transformed, y_train)

bnb_proba = bnb.predict_proba(X_test_transformed)
bnb_proba

array([[0.2326374, 0.7673626]])