In [6]:
import pandas as pd
import numpy as np
import sklearn
import warnings
warnings.filterwarnings('ignore')


In [7]:
# Load the training set: the 'Document' column holds the text and the
# 'Class' column holds that text's label.
docs = pd.read_csv('example_train1.csv')

docs

Unnamed: 0,Document,Class
0,Teclov is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


In [8]:
#convert class to binary class

def change_class(x):
    """Map a class label to a binary target.

    Returns 1 when ``x`` equals 'education' and 0 for every other value.
    """
    return 1 if x == 'education' else 0
    
# NOTE: not idempotent - running this a second time sends the already-converted
# 0/1 labels through change_class again, and every row becomes 0.
docs['Class'] = docs['Class'].apply(change_class)    

In [9]:
docs

Unnamed: 0,Document,Class
0,Teclov is a great educational institution.,1
1,Educational greatness depends on ethics,1
2,A story of great ethics and educational greatness,1
3,Sholey is a great cinema,0
4,good movie depends on good story,0


In [10]:
# Convert the frame to a NumPy array and split it into features (X) and labels (Y).
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.

numpy_array = docs.to_numpy()

X = numpy_array[:, 0]   # column 0: raw document text
Y = numpy_array[:, 1]   # column 1: binary class label
Y = Y.astype('int')     # labels come out of the mixed text/number array untyped; cast to int

print('X')
print(X)
print('\n')
print('Y')
print(Y)


X
['Teclov is a great educational institution.'
 'Educational greatness depends on ethics'
 'A story of great ethics and educational greatness'
 'Sholey is a great cinema' 'good movie depends on good story']


Y
[1 1 1 0 0]


In [11]:
# Create a CountVectorizer with default settings (no stop-word removal yet).

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()

In [12]:
# fit() learns the vocabulary from the corpus; vocabulary_ maps each unique
# token to its column index in the document-term matrix.

vec.fit(X)
vec.vocabulary_

{'teclov': 15,
 'is': 9,
 'great': 6,
 'educational': 3,
 'institution': 8,
 'greatness': 7,
 'depends': 2,
 'on': 12,
 'ethics': 4,
 'story': 14,
 'of': 11,
 'and': 0,
 'sholey': 13,
 'cinema': 1,
 'good': 5,
 'movie': 10}

In [13]:
# Rebuild the vectorizer with English stop words removed and refit it; this
# replaces the vectorizer created earlier, so the column indices change.

vec = CountVectorizer(stop_words='english')
vec.fit(X)
vec.vocabulary_

{'teclov': 11,
 'great': 5,
 'educational': 2,
 'institution': 7,
 'greatness': 6,
 'depends': 1,
 'ethics': 3,
 'story': 10,
 'sholey': 9,
 'cinema': 0,
 'good': 4,
 'movie': 8}

In [14]:
# Encode the training documents as a sparse document-term count matrix
# (one row per document, one column per vocabulary word).
X_transformed = vec.transform(X)
X_transformed

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [15]:
print(X_transformed)

  (0, 2)	1
  (0, 5)	1
  (0, 7)	1
  (0, 11)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
  (2, 6)	1
  (2, 10)	1
  (3, 0)	1
  (3, 5)	1
  (3, 9)	1
  (4, 1)	1
  (4, 4)	2
  (4, 8)	1
  (4, 10)	1


This representation can be understood as follows:

Consider the first 4 rows of the output: (0, 2), (0, 5), (0, 7) and (0, 11). They say that the first document (row index 0) contains the words at vocabulary indices 2, 5, 7 and 11 ('educational', 'great', 'institution' and 'teclov'), and that each of them appears exactly once in the document, as indicated by the right-hand column.

Similarly, consider the entry (4, 4) (third from bottom). It says that the fifth document (row index 4) contains the word at vocabulary index 4 twice. Indeed, that word ('good') appears twice in the fifth document.

In real problems, you often work with large documents and vocabularies, and each document contains only a few of the words in the vocabulary. So it would be a waste of space to store the word counts in a typical (dense) dataframe, since most entries would be zero. Also, matrix products, additions etc. are much faster with sparse matrices. That's why we use sparse matrices to store the data.

In [16]:
# Convert the sparse matrix to a dense array.
# NOTE: this rebinds X from the raw documents to the count matrix, so cells
# that call vec.fit(X) / vec.transform(X) must not be re-run after this one
# without first re-running the cell that builds X from docs.
X = X_transformed.toarray()
X

array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [18]:
# Convert the dense count array to a data frame with one named column per word.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() (available from 1.0) and fall back on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
X = pd.DataFrame(X, columns = feature_names)
X

Unnamed: 0,cinema,depends,educational,ethics,good,great,greatness,institution,movie,sholey,story,teclov
0,0,0,1,0,0,1,0,1,0,0,0,1
1,0,1,1,1,0,0,1,0,0,0,0,0
2,0,0,1,1,0,1,1,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,2,0,0,0,1,0,1,0


## Now we will transform the test data

In [20]:
# Load the test set; it has the same Document/Class layout as the training file.
# NOTE(review): the file is named 'example_text.csv' (not '..._test.csv') - confirm this is intended.
test_docs = pd.read_csv('example_text.csv')

In [21]:
test_docs

Unnamed: 0,Document,Class
0,very good educational institution,education


In [22]:
# Convert the class column to binary with the same mapping used for training.
# NOTE: not idempotent - re-running maps the already-converted labels to 0.
test_docs['Class'] = test_docs['Class'].apply(change_class)

In [23]:
test_docs

Unnamed: 0,Document,Class
0,very good educational institution,1


In [24]:
# Convert the test frame to a NumPy array and split it into X_test and Y_test.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
test_numpy_array = test_docs.to_numpy()

X_test = test_numpy_array[:, 0]   # column 0: raw document text
Y_test = test_numpy_array[:, 1]   # column 1: binary class label
Y_test = Y_test.astype('int')

print('X_test \n')
print(X_test)
print('Y_test \n')
print(Y_test)

X_test 

['very good educational institution']
Y_test 

[1]


In [26]:
# Encode the test document with the vectorizer fitted on the training data, so
# it gets the same 12 columns; words outside that vocabulary are ignored.
X_test_transformed = vec.transform(X_test)
X_test_transformed

<1x12 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [27]:
# Convert to a dense array (this rebinds X_test from raw text to counts).
X_test = X_test_transformed.toarray()
X_test

array([[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]], dtype=int64)

## Multinomial Naive Bayes

In [28]:
# Fit a multinomial Naive Bayes model on the training counts and get the
# class probabilities for the test document.
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()

mnb.fit(X,Y)

# predict_proba returns one column per class, ordered as in mnb.classes_
mnb.predict_proba(X_test)

array([[0.32808399, 0.67191601]])

In [29]:
# Put the predicted probabilities into a labelled data frame.

proba = mnb.predict_proba(X_test)

# Column labels assume mnb.classes_ is [0, 1], i.e. 0 = cinema and 1 = education
# (the encoding produced by change_class).
pred_probability = pd.DataFrame(proba, columns = ['Cinema', 'Education'])

pred_probability

Unnamed: 0,Cinema,Education
0,0.328084,0.671916


### As we can see, the test phrase belongs to class Education, since its predicted probability (about 0.67) is greater than 0.5

# Bernoulli Naive Bayes

In [31]:
# Fit a Bernoulli Naive Bayes model on the same data and tabulate its
# class probabilities for the test document.
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

bnb.fit(X,Y)

probabilities = bnb.predict_proba(X_test)

# Column labels assume bnb.classes_ is [0, 1] (0 = cinema, 1 = education).
probabilities = pd.DataFrame(probabilities, columns = ['Cinema', 'Education'])

probabilities

Unnamed: 0,Cinema,Education
0,0.232637,0.767363


## So Bernoulli also predicted the test phrase to belong to class education, with a probability of about 77%