In [21]:
import pandas as pd
# Load the labelled training documents; the path is relative to the kernel's
# working directory.
docs = pd.read_csv('example_train.csv')
# Last expression of the cell, so the frame is rendered as the cell output.
docs

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


In [22]:
# Encode the text labels as integers for the classifiers: education -> 1, cinema -> 0.
class_codes = {'education': 1, 'cinema': 0}
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time used to wipe out the already-encoded column.
# Mapping each code to itself makes the cell safe to re-run; labels that are
# genuinely unknown still become NaN, exactly as before.
rerun_safe_codes = {**class_codes, **{code: code for code in class_codes.values()}}
docs['Class'] = docs['Class'].map(rerun_safe_codes)

In [23]:
# Preview the first rows to check that Class now holds the integer codes.
docs.head()

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,1
1,Educational greatness depends on ethics,1
2,A story of great ethics and educational greatness,1
3,Sholey is a great cinema,0
4,good movie depends on good story,0


In [24]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# DataFrame.to_numpy() is the supported replacement.
# Selecting the two columns by name keeps the positional slices below correct
# even if the CSV gains extra columns or a different column order.
np_array = docs[['Document', 'Class']].to_numpy()
X = np_array[:, 0]                # raw document strings
Y = np_array[:, 1].astype('int')  # integer class codes (1 = education, 0 = cinema)
print("X")
print(X)
print("Y")
print(Y)

X
['Upgrad is a great educational institution.'
 'Educational greatness depends on ethics'
 'A story of great ethics and educational greatness'
 'Sholey is a great cinema' 'good movie depends on good story']
Y
[1 1 1 0 0]


Vectorize the words

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer. stop_words='english' drops common English words
# (e.g. "is", "a", "of") so they never become vocabulary entries.
vec = CountVectorizer(stop_words='english')

In [26]:
# Learn the vocabulary from the training documents and show the resulting
# word -> column-index mapping. fit() returns the vectorizer itself, so the
# attribute can be read straight off the call.
vec.fit(X).vocabulary_

{'cinema': 0,
 'depends': 1,
 'educational': 2,
 'ethics': 3,
 'good': 4,
 'great': 5,
 'greatness': 6,
 'institution': 7,
 'movie': 8,
 'sholey': 9,
 'story': 10,
 'upgrad': 11}

In [27]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(), which returns an ndarray.
# .tolist() keeps the printed output a plain Python list; the fallback keeps the
# cell working on older scikit-learn releases that only have the old method.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()
print(feature_names)

['cinema', 'depends', 'educational', 'ethics', 'good', 'great', 'greatness', 'institution', 'movie', 'sholey', 'story', 'upgrad']


Let's create a feature vector for each document

In [28]:
# Turn the training documents into a sparse count matrix using the fitted
# vocabulary: one row per document, one column per vocabulary word.
X_transformed = vec.transform(X)
# Last expression of the cell: shows the matrix summary (shape, dtype, format).
X_transformed

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [29]:
# Printing a sparse matrix lists only its non-zero entries as
# "(document row, vocabulary column)  count".
print(X_transformed)

  (0, 2)	1
  (0, 5)	1
  (0, 7)	1
  (0, 11)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
  (2, 6)	1
  (2, 10)	1
  (3, 0)	1
  (3, 5)	1
  (3, 9)	1
  (4, 1)	1
  (4, 4)	2
  (4, 8)	1
  (4, 10)	1


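Four steps performed so far:

1. Import `CountVectorizer`
2. Instantiate `CountVectorizer` (with English stop words removed)
3. Fit it on the training documents to learn the vocabulary
4. Transform the documents into a sparse document-term count matrix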

In [30]:
# Load the held-out test document(s); same column layout as the training file.
test_docs = pd.read_csv('example_test.csv')
# Last expression of the cell, so the frame is rendered as the cell output.
test_docs

Unnamed: 0,Document,Class
0,very good educational institution,education


In [31]:
# Encode the test labels with the same scheme as the training labels:
# education -> 1, cinema -> 0.
class_codes = {'education': 1, 'cinema': 0}
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time used to wipe out the already-encoded column.
# Mapping each code to itself makes the cell safe to re-run; labels that are
# genuinely unknown still become NaN, exactly as before.
rerun_safe_codes = {**class_codes, **{code: code for code in class_codes.values()}}
test_docs['Class'] = test_docs['Class'].map(rerun_safe_codes)

In [33]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# DataFrame.to_numpy() is the supported replacement.
# Selecting the two columns by name keeps the positional slices below correct
# even if the CSV gains extra columns or a different column order.
np_array = test_docs[['Document', 'Class']].to_numpy()
X_test = np_array[:, 0]                # raw document strings
Y_test = np_array[:, 1].astype('int')  # integer class codes (1 = education, 0 = cinema)
print("X_test")
print(X_test)
print("Y_test")
print(Y_test)

X_test
['very good educational institution']
Y_test
[1]


In [34]:
# Reuse the vectorizer that was fitted on the training documents (transform
# only, no re-fit) so the test matrix has the same vocabulary columns.
X_test_transformed = vec.transform(X_test)
# Last expression of the cell: shows the matrix summary (shape, dtype, format).
X_test_transformed

<1x12 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

# Multinomial Naive Bayes

In [35]:
from sklearn.naive_bayes import MultinomialNB

In [36]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
mb = MultinomialNB()

In [38]:
# Train on the document-term counts and the integer class labels. As the last
# expression of the cell, the fitted estimator is also displayed.
mb.fit(X_transformed,Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [40]:
# Class probabilities for the test document: one row per document, one column
# per class. NOTE(review): column order presumably follows mb.classes_
# ([0, 1], i.e. cinema then education) - confirm by inspecting mb.classes_.
mb.predict_proba(X_test_transformed)

array([[0.32808399, 0.67191601]])

# Bernoulli Naive Bayes

In [41]:
from sklearn.naive_bayes import BernoulliNB

# Build the Bernoulli Naive Bayes model and train it in one step on the same
# count matrix and labels used for the multinomial model; fit() returns the
# estimator, so `bnb` is the fitted classifier.
bnb = BernoulliNB().fit(X_transformed, Y)

# Last expression of the cell: display the fitted estimator.
bnb

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [42]:
# Class probabilities for the test document under the Bernoulli model: one row
# per document, one column per class. NOTE(review): column order presumably
# follows bnb.classes_ ([0, 1], i.e. cinema then education) - confirm.
bnb.predict_proba(X_test_transformed)

array([[0.2326374, 0.7673626]])