In [60]:
import numpy as np
import pandas as pd

# Try with an example

In [1]:
# Toy training corpus: the first four reviews are labelled 1, the last three 0
# (see `y` below, which is aligned with `x` by position).
x = [
    "This was an awesome movie",
    "Great movie! , I liked it a lot",
    "Happy ending! Awesome acting by the hero",
    "Loved it , truly great",
    "bad not upto the mark",
    "could have been better",
    "Surely a disappointing movie",
]

# One label per review in `x`.
y = [1] * 4 + [0] * 3

# Unlabelled reviews used later to try out the fitted classifiers.
xtest = [
    "I was happy and I loved the acting in the movie",
    "The movie is bad",
]

# Each review is a plain Python string.
print(type(x[0]))


<class 'str'>


## 1. Cleaning

In [2]:

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import PorterStemmer
from nltk.corpus import stopwords

# Lazily-built (tokenizer, stop-word set, stemmer) tuple shared by every call.
_REVIEW_CLEANING_TOOLS = None


def _getReviewCleaningTools():
    """Build the tokenizer, stop-word set and stemmer once and reuse them."""
    global _REVIEW_CLEANING_TOOLS
    if _REVIEW_CLEANING_TOOLS is None:
        # stopwords.words() re-reads the corpus on every call, so cache the
        # result; a frozenset gives O(1) membership tests and cannot be
        # mutated by accident between calls.
        _REVIEW_CLEANING_TOOLS = (
            RegexpTokenizer(r'\w+'),
            frozenset(stopwords.words('english')),
            PorterStemmer(),
        )
    return _REVIEW_CLEANING_TOOLS


def getStemmedReview(review):
    """Clean one raw review and return it as a space-joined string of stems.

    Steps, in order:
      1. Replace the literal '<br /><br />' marker with a space.
      2. Tokenize on runs of word characters (``\\w+``), which drops
         punctuation, and lower-case every token.
      3. Drop NLTK's English stop words. Note that this also removes
         negations such as "not" (e.g. "bad not upto the mark" becomes
         "bad upto mark").
      4. Stem each remaining token with the Porter stemmer.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review, e.g. "This was an awesome movie" -> 'awesom movi'.
        An input with no surviving tokens yields the empty string.
    """
    tokenizer, stop_words, stemmer = _getReviewCleaningTools()

    review = review.replace('<br /><br />', " ")

    # Step 1 : Tokenize, then lower-case so stop-word matching is case-insensitive
    tokens = [token.lower() for token in tokenizer.tokenize(review)]

    # Step 2 + 3 : Remove stop words and stem whatever is left
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Return as a sentence
    return " ".join(stems)

In [5]:
# Sanity-check the cleaning function on the first training review.
getStemmedReview(x[0])

'awesom movi'

In [3]:
# Run the training and the test reviews through the same cleaning function.
xclean = list(map(getStemmedReview, x))
xtest_clean = list(map(getStemmedReview, xtest))

In [4]:
# Inspect the cleaned training reviews (lower-cased, stop words removed, stemmed).
print(xclean)

['awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad upto mark', 'could better', 'sure disappoint movi']


## 2. Vectorization

In [118]:
from sklearn.feature_extraction.text import CountVectorizer

In [119]:
# Count-based features over n-grams: ngram_range=(1,3) extracts unigrams,
# bigrams and trigrams from each cleaned review.
cv = CountVectorizer(ngram_range=(1,3))

In [120]:
# Learn the n-gram vocabulary from the cleaned training reviews and turn them
# into a dense count matrix (one row per review, one column per n-gram).
x_vectorizedCorpus = cv.fit_transform(xclean).toarray()

# Show the matrix, its shape and the n-gram -> column-index mapping,
# each separated by a blank line.
print(x_vectorizedCorpus, x_vectorizedCorpus.shape, cv.vocabulary_, sep="\n\n")

[[0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0
  0 0 0 0 0]
 [1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
  0 1 1 0 0]
 [0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1
  1 0 0 0 0]]

(7, 41)

{'awesom': 2, 'movi': 31, 'awesom movi': 5, 'great': 17, 'like': 24, 'lot': 26, 'great movi': 18, 'movi like': 32, 'like lot': 25, 'great movi like': 19, 'movi like lot': 33, 'happi': 20, 'end': 14, 'act': 0, 'hero': 23, 'happi end': 21, 'end awesom': 15, 'awesom act': 3, 'act hero': 1, 'happi end awesom': 22, 'end awesom act': 16, 'awesom act hero': 4, 'love': 27, 'truli': 37,

In [121]:
# Test set: only transform (no refit), so the already-fitted vocabulary is
# reused and the columns line up with the training matrix.
xtest_vectorizedCorpus = cv.transform(xtest_clean).toarray()

# Matrix, then the vocabulary, then a blank line, then the matrix shape.
print(xtest_vectorizedCorpus, cv.vocabulary_, "", xtest_vectorizedCorpus.shape, sep="\n")

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0]]
{'awesom': 2, 'movi': 31, 'awesom movi': 5, 'great': 17, 'like': 24, 'lot': 26, 'great movi': 18, 'movi like': 32, 'like lot': 25, 'great movi like': 19, 'movi like lot': 33, 'happi': 20, 'end': 14, 'act': 0, 'hero': 23, 'happi end': 21, 'end awesom': 15, 'awesom act': 3, 'act hero': 1, 'happi end awesom': 22, 'end awesom act': 16, 'awesom act hero': 4, 'love': 27, 'truli': 37, 'love truli': 28, 'truli great': 38, 'love truli great': 29, 'bad': 6, 'upto': 39, 'mark': 30, 'bad upto': 7, 'upto mark': 40, 'bad upto mark': 8, 'could': 10, 'better': 9, 'could better': 11, 'sure': 34, 'disappoint': 12, 'sure disappoint': 35, 'disappoint movi': 13, 'sure disappoint movi': 36}

(2, 41)


## 3. Multinomial Event Naive Bayes

In [122]:
from sklearn.naive_bayes import MultinomialNB

In [123]:
# Multinomial Naive Bayes classifier with the library's default settings.
mnb = MultinomialNB()

In [124]:
# Display the estimator's repr to see which defaults are in use
# (alpha, class_prior, fit_prior).
mnb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [125]:
# Perform training: fit on the n-gram count matrix of the training reviews,
# using the labels in y (aligned with the rows by position).
mnb.fit(x_vectorizedCorpus, y )

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [126]:
# Predicted label for each of the two test reviews.
mnb.predict(xtest_vectorizedCorpus)

array([1, 0])

In [127]:
# Per-class probabilities, one row per test review. Columns presumably follow
# mnb.classes_ (label 0, then label 1), which is consistent with the
# predict() result for these reviews — confirm via mnb.classes_.
mnb.predict_proba(xtest_vectorizedCorpus)

array([[0.13904125, 0.86095875],
       [0.61648526, 0.38351474]])

In [137]:
# NOTE: this scores the model on the same data it was fitted on, so the value
# is training accuracy, not an estimate of performance on unseen reviews.
print(mnb.score(x_vectorizedCorpus , y))

1.0


## 4. Multivariate Bernoulli Event Naive Bayes

In [128]:
from sklearn.naive_bayes import BernoulliNB

In [129]:
# Bernoulli Naive Bayes classifier with the library's default settings.
# The default binarize=0.0 thresholds the features at 0, so the model sees
# only presence/absence of each n-gram rather than its count.
bnb = BernoulliNB()

In [130]:
# Print the estimator's repr to see which settings are in use.
print(bnb)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


In [131]:
# Fit on the same training matrix and labels used for the multinomial model.
# fit() returns the estimator itself, so its repr is what gets printed.
print(bnb.fit(x_vectorizedCorpus , y))

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


In [132]:
# Per-class probabilities, one row per test review. Columns presumably follow
# bnb.classes_ (label 0, then label 1) — confirm via bnb.classes_.
print(bnb.predict_proba(xtest_vectorizedCorpus))

[[0.13312578 0.86687422]
 [0.80373674 0.19626326]]


In [133]:
# Predicted label for each of the two test reviews.
print(bnb.predict(xtest_vectorizedCorpus))

[1 0]


In [136]:
# NOTE: this scores the model on the same data it was fitted on, so the value
# is training accuracy, not an estimate of performance on unseen reviews.
print(bnb.score(x_vectorizedCorpus , y))

1.0
