In [1]:
import numpy as np
import pandas as pd

# Create an NLP Pipeline to Clean Reviews Data 

- Step 1 : Tokenization
- Step 2 : Stop words Removal
- Step 3 : Stemming

In [2]:
# NLTK

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import PorterStemmer
from nltk.corpus import stopwords



In [3]:
_STEMMING_RESOURCES = None  # lazily-built (tokenizer, stop-word set, stemmer) shared by every call


def _getStemmingResources():
    """Return the shared (tokenizer, stop-word set, stemmer) triple, building it on first use."""
    global _STEMMING_RESOURCES
    if _STEMMING_RESOURCES is None:
        _STEMMING_RESOURCES = (
            RegexpTokenizer(r'\w+'),
            set(stopwords.words('english')),
            PorterStemmer(),
        )
    return _STEMMING_RESOURCES


def getStemmedReview(review):
    """Clean one raw review: tokenize, lower-case, drop stop words, stem.

    Parameters
    ----------
    review : str
        Raw review text; may contain HTML ``<br />`` line breaks.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    # These objects do not depend on the review, so they are built once and
    # reused instead of being re-created for every review.
    tokenizer, stopWords, stemmer = _getStemmingResources()

    # Blank out every HTML line break, not only doubled ones; otherwise a lone
    # '<br />' is tokenized into a spurious 'br' word.
    review = review.replace('<br />', " ")

    # Step 1 : Tokenize into runs of word characters and lower-case them
    wordsList = [word.lower() for word in tokenizer.tokenize(review)]

    # Step 2 : Remove Stop Words
    wordsList = [word for word in wordsList if word not in stopWords]

    # Step 3 : Stemming
    wordsList = [stemmer.stem(word) for word in wordsList]

    # return as a sentence
    return " ".join(wordsList)
    

In [22]:
def getCleanDataInFile(inputFileName , outputFileName):
    """Stem every review in a text file and write the results to another file.

    Parameters
    ----------
    inputFileName : str
        Path of a UTF-8 text file holding one raw review per line.
    outputFileName : str
        Path of the UTF-8 text file to (over)write; it receives one cleaned
        review per line, in the same order as the input.
    """
    # Read everything before the output is opened, so passing the same path
    # for input and output cannot truncate the data before it has been read.
    with open(inputFileName,encoding="utf8") as f:
        reviews = f.readlines()

    # Output stemmed data in an output file.
    # The context manager closes (and flushes) the file even when stemming a
    # review raises, which the manual open()/close() pair did not guarantee.
    with open(outputFileName , 'w' , encoding="utf8") as outfile:
        for review in reviews:
            print(getStemmedReview(review) , file=outfile)
    
    
# Raw review files (input) and the files that receive their stemmed copies (output).
# NOTE(review): these are hard-coded absolute paths, so the cell only runs on a
# machine where this location exists.
TrainXInputFileName = "/Volumes/part3/IMDB_Dataset/imdb_trainX.txt"
TestXInputFileName = "/Volumes/part3/IMDB_Dataset/imdb_testX.txt"

TrainXOutputFileName = "/Volumes/part3/IMDB_Dataset/StemmedReviews_Xtrain.txt"
TestXOutputFileName = "/Volumes/part3/IMDB_Dataset/StemmedReviews_Xtest.txt"
    
# Run the cleaning pipeline once per split; each call overwrites its output file.
getCleanDataInFile(TrainXInputFileName , TrainXOutputFileName)
getCleanDataInFile(TestXInputFileName , TestXOutputFileName)

In [4]:
def getDataFromFile(filename):
    """Load a text file as a NumPy array with one element per line.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    numpy.ndarray
        1-D array of the file's lines; each element keeps its trailing newline.
    """
    # The context manager closes the handle (the original leaked it), and the
    # explicit encoding matches how the other cells of this notebook read and
    # write the review files.
    with open(filename , 'r' , encoding="utf8") as f:
        X = f.readlines()

    return np.array(X)

In [5]:
# Paths of the review texts (X) and of their ratings (Y) for both splits.
# NOTE(review): the X file names end in a lower-case "x" here, while the cleaning
# cell above spells them "imdb_trainX.txt" / "imdb_testX.txt". The two spellings
# name different files on a case-sensitive filesystem — confirm which is correct.
XTrainingDataFileName = "/Volumes/part3/IMDB_Dataset/imdb_trainx.txt"
XTestingDataFileName = "/Volumes/part3/IMDB_Dataset/imdb_testx.txt"

YTrainingDataFileName = "/Volumes/part3/IMDB_Dataset/imdb_trainY.txt"
YTestingDataFileName = "/Volumes/part3/IMDB_Dataset/imdb_testY.txt"

In [6]:
# Load the raw (un-stemmed) training reviews, one array element per line of the file.
Xtrain = getDataFromFile(XTrainingDataFileName)
print(Xtrain.shape)

(25000,)


In [7]:
# Load the raw (un-stemmed) test reviews, one array element per line of the file.
Xtest = getDataFromFile(XTestingDataFileName)
print(Xtest.shape)

(25000,)


In [8]:
# Load the training ratings; at this point every element is still a text line.
Ytrain = getDataFromFile(YTrainingDataFileName)
print(Ytrain.shape)
# Drop the first line so the labels line up with the reviews.
# NOTE(review): presumably a header row — the file has one more line than Xtrain; confirm.
Ytrain = Ytrain[1:]
print(Ytrain.shape)

# The test ratings are used as read, with no line dropped.
Ytest = getDataFromFile(YTestingDataFileName)
print(Ytest.shape)

(25001,)
(25000,)
(25000,)


In [9]:
def CleanYData(Y):
    """Convert an array of rating labels to integers.

    Parameters
    ----------
    Y : numpy.ndarray
        Array of ratings, typically the text lines read from a label file.

    Returns
    -------
    numpy.ndarray
        A new integer array with the same shape as ``Y``.
    """
    # A single vectorized cast converts every element; the original repeated this
    # same whole-array cast once per element. The builtin ``int`` replaces the
    # ``np.int`` alias, which no longer exists in current NumPy releases.
    return Y.astype(int)


# Rebind both label arrays to their integer versions (the names are reused).
Ytrain = CleanYData(Ytrain)
Ytest = CleanYData(Ytest)


# Spot-check the first few labels and the container type of each split.
print(Ytrain[:6])
print(Ytest[:6])
print(type(Ytrain))
print(type(Ytest))

[10  8  7  8  8  8]
[ 7 10 10 10  8 10]
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [10]:
# Clean every training review; the result is a list of stemmed strings in the original order.
Xtrain_Stemmed = list(map(getStemmedReview, Xtrain))

In [11]:
# Sanity check: show the type and content of the first cleaned training review.
print(type(Xtrain_Stemmed[0]))
print(Xtrain_Stemmed[0])

<class 'str'>
love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag


In [12]:
# Clean every test review; the result is a list of stemmed strings in the original order.
Xtest_Stemmed = list(map(getStemmedReview, Xtest))

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Bag-of-words model over single words only (ngram_range=(1,1) means unigrams).
cv = CountVectorizer(ngram_range=(1,1))
# Learn the vocabulary from the training reviews and count word occurrences per review.
# NOTE(review): .toarray() makes the matrix dense. Going by the outputs recorded in
# this notebook (25000 reviews, 51229 columns, int64), that is roughly 10 GB of memory.
Xtrain_vectorizedCorpus = cv.fit_transform(Xtrain_Stemmed).toarray()
print(Xtrain_vectorizedCorpus)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [15]:
# Length of one row of the count matrix, i.e. the number of columns.
print(len(Xtrain_vectorizedCorpus[0]))

51229


In [16]:
# NOTE(review): this value is computed but never shown — a notebook cell only
# displays its last expression.
len(cv.vocabulary_)
# d maps each vocabulary word to its column index in the count matrix; the
# Naive Bayes functions below take it as their `d` argument.
d = cv.vocabulary_
# Displays the whole mapping (tens of thousands of entries).
d

{'love': 26940,
 'movi': 30327,
 'sinc': 41077,
 'saw': 39227,
 'open': 32503,
 'day': 11520,
 'touch': 45871,
 'beauti': 4660,
 'strongli': 43289,
 'recommend': 36960,
 'see': 39837,
 'watch': 49223,
 'famili': 15667,
 'far': 15732,
 'mpaa': 30366,
 'rate': 36689,
 'pg': 34151,
 '13': 112,
 'themat': 45048,
 'element': 14327,
 'prolong': 35619,
 'scene': 39351,
 'disastor': 12657,
 'nuditi': 31909,
 'sexual': 40199,
 'languag': 25598,
 'first': 16383,
 'thing': 45157,
 'edison': 14090,
 'chen': 8548,
 'fantast': 15719,
 'believ': 4850,
 'job': 23683,
 'cambodian': 7420,
 'hit': 20993,
 'man': 27753,
 'born': 6164,
 'bred': 6470,
 'dump': 13731,
 'gladiatori': 18432,
 'ring': 37860,
 'hone': 21259,
 'craft': 10539,
 'savag': 39204,
 'batteri': 4521,
 'order': 32584,
 'surviv': 43881,
 'live': 26583,
 'mantra': 27905,
 'kill': 24663,
 'role': 38142,
 'littl': 26571,
 'dialogu': 12370,
 'least': 25907,
 'line': 26449,
 'thai': 44982,
 'perform': 33956,
 'compel': 9711,
 'probabl': 35523,

In [17]:
# Sorted distinct rating values; Posterior_Prob and getPrediction below read this
# global as the list of candidate classes.
# NOTE(review): the classes are taken from the *test* labels. A rating that only
# occurs in Ytrain would never be considered — confirm this is intended, or
# derive the classes from Ytrain instead.
ratings = np.unique(Ytest)
print(ratings)

[ 1  2  3  4  7  8  9 10]


In [18]:
# Boolean-mask the count matrix: keep only the rows of reviews whose rating is 7.
myRating_X = Xtrain_vectorizedCorpus[Ytrain==7]
# Display the word-count vector of the first such review.
(myRating_X[0])

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Multinomial Event Model Naive Bayes Classifier (without sklearn)

In [19]:
def prior(Ytrain , rating):
    """Return the fraction of training labels that equal ``rating``.

    Parameters
    ----------
    Ytrain : numpy.ndarray
        Array of training labels.
    rating
        The label value whose prior probability is wanted.

    Returns
    -------
    float
        Number of matching labels divided by the total number of labels.
    """
    matches = np.sum(Ytrain == rating)
    total = float(Ytrain.shape[0])

    return matches / total

def cond_prob(Xtrain , Ytrain , word , rating, d , Xtrain_vectorizedCorpus):
    """Laplace-smoothed multinomial likelihood P(word | rating).

    Computed as::

        (occurrences of `word` in reviews with this rating + 1)
        -------------------------------------------------------
        (total word count in reviews with this rating + |vocabulary|)

    Parameters
    ----------
    Xtrain
        Unused; kept so existing callers keep working.
    Ytrain : numpy.ndarray
        Label of each row of ``Xtrain_vectorizedCorpus``.
    word : str
        The token whose likelihood is wanted.
    rating
        The class label to condition on.
    d : dict
        Maps each vocabulary word to its column index in the count matrix.
    Xtrain_vectorizedCorpus : numpy.ndarray
        2-D matrix of word counts, one row per training review.

    Returns
    -------
    float
        The smoothed probability. A word missing from ``d`` is treated as
        having zero occurrences, so it gets the pure smoothing mass instead
        of raising ``KeyError``.
    """
    # Rows of the count matrix that belong to the requested class
    myRating_X = Xtrain_vectorizedCorpus[Ytrain==rating]

    index = d.get(word)
    word_count = 0 if index is None else myRating_X[:, index].sum()

    numerator = 1.0 + word_count
    # Sum over *all* reviews of the class plus the vocabulary size. The earlier
    # loop overwrote the denominator on every iteration, leaving only the last
    # review's word count and dropping the smoothing term.
    denominator = len(d) + myRating_X.sum()

    return numerator/denominator
    
    
    
    


def Posterior_Prob(Xtrain , Ytrain , testReview ,d , Xtrain_vectorizedCorpus):
    """Posterior probability of every candidate rating for one tokenized review.

    Parameters
    ----------
    Xtrain
        Passed through to ``cond_prob``.
    Ytrain : numpy.ndarray
        Training labels.
    testReview : iterable of str
        The tokens of the review to classify.
    d : dict
        Vocabulary mapping word -> column index.
    Xtrain_vectorizedCorpus : numpy.ndarray
        Word-count matrix of the training reviews.

    Returns
    -------
    numpy.ndarray
        One value per entry of the notebook-level ``ratings`` array, in the same
        order. The values are normalized to sum to 1, so ``argmax`` selects the
        most probable rating exactly as before.
    """
    log_post = []

    for rating in ratings:
        # Work in log space: multiplying one small probability per token
        # underflows to 0.0 for reviews of realistic length, which made every
        # class tie at zero.
        with np.errstate(divide='ignore'):
            # log(0) = -inf for a rating that never occurs in Ytrain
            log_score = np.log(prior(Ytrain , rating))
        for word in testReview:
            log_score += np.log(cond_prob(Xtrain , Ytrain , word , rating , d , Xtrain_vectorizedCorpus))

        log_post.append(log_score)

    log_post = np.array(log_post, dtype=float)
    if log_post.size == 0:
        return log_post

    best = log_post.max()
    if not np.isfinite(best):
        # No class has a usable score; report zero probability for all of them.
        return np.zeros_like(log_post)

    # Shift by the best score before exponentiating so the largest term is exactly 1.
    post_prob = np.exp(log_post - best)
    return post_prob / post_prob.sum()

def getPrediction(Xtrain , Ytrain , Xtest , d , Xtrain_vectorizedCorpus):
    """Predict a rating for every review in ``Xtest``.

    Parameters
    ----------
    Xtrain
        Passed through to ``Posterior_Prob``.
    Ytrain : numpy.ndarray
        Training labels.
    Xtest : sequence of str
        Reviews to classify, each a whitespace-separated string of tokens.
    d : dict
        Vocabulary mapping word -> column index.
    Xtrain_vectorizedCorpus : numpy.ndarray
        Word-count matrix of the training reviews.

    Returns
    -------
    numpy.ndarray
        The predicted rating for each review, taken from the notebook-level
        ``ratings`` array.
    """
    ypred = []

    for review in Xtest:
        # split() with no argument discards empty tokens and the trailing
        # newline. split(' ') produced '' entries and left '\n' glued to the
        # last word, so neither could match a vocabulary entry.
        post_prob = Posterior_Prob(Xtrain , Ytrain , review.split() ,d , Xtrain_vectorizedCorpus)
        index = post_prob.argmax()
        ypred.append(ratings[index])

    return np.array(ypred)

def getAccuracy(Xtrain , Ytrain , Xtest , Ytest , Xtrain_vectorizedCorpus , d):
    """Return the fraction of ``Xtest`` reviews whose predicted rating equals ``Ytest``.

    The predictions come from ``getPrediction``; the remaining arguments are
    forwarded to it unchanged.
    """
    predicted = getPrediction(Xtrain , Ytrain , Xtest , d , Xtrain_vectorizedCorpus)

    # Count the positions where prediction and true label agree
    hits = np.sum(predicted == Ytest)
    sample_count = Ytest.shape[0]

    return hits / sample_count

In [21]:
# Column index of the stem 'love' in the count matrix; the next cell reuses `i`.
i = d['love']
print(i)

26940


In [None]:
# Manual check of the count used by cond_prob: how often does vocabulary column
# `i` (set in the previous cell) occur in reviews rated 7?
myRating_X = Xtrain_vectorizedCorpus[Ytrain==7]
temp = []
# Collect that column's count from every review of the class
for freq in myRating_X:
    temp.append(freq[i])
    
temp=np.array(temp)
# Total occurrences across the class
np.sum(temp)

In [None]:
# Display the first raw (un-stemmed) test review.
Xtest[0]

## Using Scikit Library Multinomial Naive Bayes

In [31]:
from sklearn.naive_bayes import MultinomialNB

# Fit scikit-learn's multinomial Naive Bayes on the training word counts.
mnb = MultinomialNB()
mnb.fit(Xtrain_vectorizedCorpus , Ytrain)


# Vectorize the stemmed test reviews with the vocabulary learned on the training set
# (transform only, no re-fitting); the Bernoulli and Gaussian cells below reuse this matrix.
Xtest_vectorizedCorpus = cv.transform(Xtest_Stemmed).toarray()
print(mnb.predict(Xtest_vectorizedCorpus))
print()
# NOTE(review): this scores the model on the same data it was fitted on, so the
# number is training accuracy. Held-out accuracy would be
# mnb.score(Xtest_vectorizedCorpus, Ytest).
print(mnb.score(Xtrain_vectorizedCorpus, Ytrain))

[1 7 4 ... 1 4 1]

0.67512


## Using Scikit Library Bernoulli Naive Bayes

In [32]:
from sklearn.naive_bayes import BernoulliNB , GaussianNB
# Fit Bernoulli Naive Bayes on the same training count matrix.
bnb = BernoulliNB()
bnb.fit(Xtrain_vectorizedCorpus , Ytrain)
# Predictions for the test reviews (uses Xtest_vectorizedCorpus from the multinomial cell).
print(bnb.predict(Xtest_vectorizedCorpus))
print()
# NOTE(review): scored on the training data, so this is training accuracy, not test accuracy.
print(bnb.score(Xtrain_vectorizedCorpus, Ytrain))

[ 1 10  4 ...  1  1  1]

0.55048


In [None]:
# Fit Gaussian Naive Bayes on the same training count matrix
# (GaussianNB is imported in the Bernoulli cell above).
gnb = GaussianNB()
gnb.fit(Xtrain_vectorizedCorpus , Ytrain)
# Predictions for the test reviews (uses Xtest_vectorizedCorpus from the multinomial cell).
print(gnb.predict(Xtest_vectorizedCorpus))
print()
# NOTE(review): scored on the training data, so this is training accuracy, not test accuracy.
print(gnb.score(Xtrain_vectorizedCorpus, Ytrain))