In [2]:
import numpy as np

In [3]:
trainXfilePath = './Dataset/imdb_trainX.txt'
trainYfilePath = './Dataset/imdb_trainY.txt'

testXfilePath  = './Dataset/imdb_testX.txt'
testYfilePath  = './Dataset/imdb_testY.txt'

## Reading data from the text files

In [4]:
with open(trainXfilePath) as f:
    reviews = f.readlines()
X_train = []
for review in reviews:
    X_train.append(review)

    
with open(trainYfilePath) as f:
    labels = f.readlines()

Y_train = []
for label in labels:
    Y_train.append(int(label))
    

with open(testXfilePath) as f:
    reviews = f.readlines()

X_test = []
for review in reviews:
    X_test.append(review)


with open(testYfilePath) as f:
    labels = f.readlines()

Y_test = []
for label in labels:
    Y_test.append(int(label))

In [5]:
Y_train = np.array(Y_train)

In [6]:
Y_train.shape

(25000,)

In [7]:
Y_test = np.asarray(Y_test)

In [8]:
Y_test.shape

(25000,)

# Creating own Multinomial Naive Bayes model from scratch

### Function for stop word removal and stemming

In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

tokenizer = RegexpTokenizer(r'\w+')
sw = set(stopwords.words('english'))
ps = PorterStemmer()

#returns "words" from the input text after stemming and removing stop words 
def get_useful_words(text):
    text = text.lower()
    text = text.replace("<br />","")
    
    #tokenization
    word_list = tokenizer.tokenize(text)
    
    #stop word removal
    useful_words = [w for w in word_list if w not in sw]
    
    #stemming
    stemmed_words = [ps.stem(w) for w in useful_words]
    
    return stemmed_words

#returns "text" after cleaning i.e. removing stop words and stemming 
def getCleanReview(review):
    
    review = review.lower()
    review = review.replace("<br /><br />"," ")
    
    #Tokenize
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in sw]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    
    return cleaned_review

### Function to train our Multinomial Naive Bayes model

In [118]:
def train(X_train,Y_train):
    #initialize number of training documents read till now as 0
    N = 0

    #initialize the count of each class's documents as 0
    Nc = []
    index = 0
    #dictionary which maps each label's value with its corresponding index in Nc
    label_index = {}
    #list of dictionaries of each class words mapped to their count(i.e. frequency)
    count_w_c = []
    #list of count of all words of each class
    count_c = []
    #vocabulary
    vocab = {}

    for i in np.unique(Y_train):
        Nc.append(0)
        count_c.append(0)
        count_w_c.append({})
        label_index[i] = index
        index += 1

    for i in range(Y_train.shape[0]):
        N += 1
        l_index = label_index[Y_train[i]]
        Nc[l_index] += 1

        #reading a document
        document = X_train[i]

        #extracting words from the document (and removing stop words and stemming)
        words = get_useful_words(document)
#         words = tokenizer.tokenize(document.lower())

        for word in words:
            count_c[l_index] += 1
            try:
                count_w_c[l_index][word] += 1
                vocab[word] +=1
            except:
                count_w_c[l_index][word] = 1
                vocab[word] =1
    return N, Nc, vocab, label_index, count_w_c, count_c

In [119]:
%%time
N, Nc, vocab, label_index, count_w_c, count_c = train(X_train, Y_train)

CPU times: user 52.5 s, sys: 13.3 ms, total: 52.5 s
Wall time: 52.5 s


In [120]:
print(N)
print(Nc)
print(label_index)
# print(count_w_c)
print(count_c)

25000
[5100, 2284, 2420, 2696, 2496, 3009, 2263, 4732]
{1: 0, 2: 1, 3: 2, 4: 3, 7: 4, 8: 5, 9: 6, 10: 7}
[547932, 265451, 312604, 355564, 342091, 399534, 290496, 510811]


In [121]:
list(label_index.keys())

[1, 2, 3, 4, 7, 8, 9, 10]

In [122]:
vocab_size = np.max(np.asarray(list(vocab.values())))
vocab_size

29007

### Function to get the prediction on the test data

In [123]:
def prior_prob(label):
    return Nc[label_index[label]]/N


def predict(xtest):
#     words = tokenizer.tokenize(xtest.lower())
    words = get_useful_words(xtest)
    
    classes = list(label_index.keys())
    post_prob = []
    for label in classes:
        log_likelihood = 0
        total_words = count_c[label_index[label]] + vocab_size
        for word in words:
            try:
                count_word = count_w_c[label_index[label]][word] + 1
            except:
                count_word = 1
            finally:
                log_likelihood += np.log2(count_word/total_words)
                
        prior = prior_prob(label)
        
        post = log_likelihood + np.log2(prior)
        post_prob.append(post)
    
#     print(post_prob)
    pred = np.argmax(post_prob)
    return pred
            
        

In [124]:
%%time
index = 15
pred = predict(X_test[index])
print(pred)
print(np.unique(Y_train)[pred])
print(Y_test[index])

7
10
9
CPU times: user 4.48 ms, sys: 19 µs, total: 4.5 ms
Wall time: 3.5 ms


In [125]:
type(predict(X_test[1]))

numpy.int64

### Function to find out the accuracy on the test data

In [126]:
def acc(X_test,Y_test):
    correct = 0
    itr = 0
    for i in range(Y_test.shape[0]):
        itr += 1
        pred = predict(X_test[i])
        if Y_test[i] == np.unique(Y_train)[pred]:
            correct += 1
        print('Iterations : %d      Accuracy : %.4f'%(itr,(correct/float(itr))), end='\r')
    print('\n')

### Accuracy with stemming

In [127]:
%%time
acc(X_test,Y_test)

Iterations : 25000      Accuracy : 0.3754

CPU times: user 1min 56s, sys: 1.25 s, total: 1min 57s
Wall time: 1min 53s


### Accuracy with stop word removal

In [116]:
%%time
acc(X_test,Y_test)

Iterations : 25000      Accuracy : 0.3647

CPU times: user 1min 3s, sys: 928 ms, total: 1min 4s
Wall time: 60 s


### Accuracy without stop word removal

In [102]:
%%time
acc(X_test,Y_test)

Iterations : 25000      Accuracy : 0.3765

CPU times: user 1min 47s, sys: 1.32 s, total: 1min 48s
Wall time: 1min 44s


### Random guess accuracy

In [103]:
from numpy import random

In [104]:
def acc(Y_test):
    correct = 0
    itr = 0
    for i in range(Y_test.shape[0]):
        itr += 1
        pred = random.randint(10)
        if Y_test[i] == pred:
            correct += 1
    print('Iterations : %d      Accuracy : %.4f'%(itr,(correct/float(itr))))

In [105]:
%%time
acc(Y_test)

Iterations : 25000      Accuracy : 0.0799
CPU times: user 44.5 ms, sys: 3.86 ms, total: 48.3 ms
Wall time: 66.3 ms


# Using Sklearn MultinomialNB and BernoulliNB models

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

### Data Cleaning

In [11]:
%%time
#Getting cleaned text
X_clean = [getCleanReview(i) for i in X_train]
Xt_clean = [getCleanReview(i) for i in X_test]

CPU times: user 1min 48s, sys: 256 ms, total: 1min 49s
Wall time: 1min 49s


### Vectorization

In [12]:
%%time
cv = CountVectorizer()                               #No stop word removal or stemming
X_vec = cv.fit_transform(X_clean)

CPU times: user 2.36 s, sys: 28 ms, total: 2.39 s
Wall time: 2.39 s


In [13]:
%%time
cv1 = CountVectorizer(tokenizer=get_useful_words)   #Stop word removal and stemming using our own tokenizer
X1_vec = cv1.fit_transform(X_clean)

CPU times: user 49 s, sys: 7.96 ms, total: 49 s
Wall time: 49 s


## MultinomialNB model

In [132]:
from sklearn.naive_bayes import MultinomialNB
mnb  = MultinomialNB()
mnb1 = MultinomialNB()

In [133]:
#Training
mnb.fit(X_vec,Y_train)
mnb1.fit(X1_vec,Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [136]:
%%time
#vectorizing test set's 1 example
#Following two are vectors of same test example but they have different number of features
#as both are generated from different countvectorizer objects.
Xt_vec  = cv.transform(Xt_clean[1:2]).toarray()
Xt_vec1 = cv1.transform(Xt_clean[1:2]).toarray()

CPU times: user 3.2 ms, sys: 0 ns, total: 3.2 ms
Wall time: 2.88 ms


In [137]:
print(mnb.predict(Xt_vec[:1])[0])     #without stop word removal or stemming

print(mnb1.predict(Xt_vec1[:1])[0])    #with stop word removal and stemming

7
7


In [138]:
Y_test[:1][0]

7

### Accuracy without stemming (default)

In [141]:
%%time
itr = 0
correct = 0
for i in range(Y_test.shape[0]):
    itr += 1
    Xt_vec = cv.transform(Xt_clean[i:i+1]).toarray()
    if Y_test[i] == mnb.predict(Xt_vec)[0]:
        correct += 1
print('Iterations : %d      Accuracy : %.4f'%(itr,(correct/float(itr))))


Iterations : 25000      Accuracy : 0.3832
CPU times: user 27.5 s, sys: 14.7 ms, total: 27.5 s
Wall time: 13.8 s


### Accuracy with stop word removal and stemming using own tokenizer

In [143]:
%%time
itr = 0
correct = 0
for i in range(Y_test.shape[0]):
    itr += 1
    Xt_vec = cv1.transform(Xt_clean[i:i+1]).toarray()
    if Y_test[i] == mnb1.predict(Xt_vec)[0]:
        correct += 1
    print('Iterations : %d      Accuracy : %.4f'%(itr,(correct/float(itr))),end='\r')
print('\n')

Iterations : 25000      Accuracy : 0.3872

CPU times: user 2min 35s, sys: 1.99 s, total: 2min 37s
Wall time: 1min 16s


In [14]:
from sklearn.naive_bayes import BernoulliNB
bnb  = BernoulliNB()
bnb1 = BernoulliNB()

In [15]:
#Training
bnb.fit(X_vec,Y_train)
bnb1.fit(X1_vec,Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [17]:
%%time
#vectorizing test set's 1 example
#Following two are vectors of same test example but they have different number of features
#as both are generated from different countvectorizer objects.
Xt_vec  = cv.transform(Xt_clean[1:2]).toarray()
Xt_vec1 = cv1.transform(Xt_clean[1:2]).toarray()

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.57 ms


In [18]:
print(bnb.predict(Xt_vec[:1])[0])     #without stop word removal or stemming

print(bnb1.predict(Xt_vec1[:1])[0])    #with stop word removal and stemming

10
10


In [19]:
Y_test[:1][0]

7

### Accuracy without stemming (default)

In [21]:
%%time
itr = 0
correct = 0
for i in range(Y_test.shape[0]):
    itr += 1
    Xt_vec = cv.transform(Xt_clean[i:i+1]).toarray()
    if Y_test[i] == bnb.predict(Xt_vec)[0]:
        correct += 1
    print('Iterations : %d      Accuracy : %.4f'%(itr,(correct/float(itr))),end='\r')
print('\n')


Iterations : 25000      Accuracy : 0.3708

CPU times: user 3min, sys: 2.4 s, total: 3min 2s
Wall time: 1min 29s


### Accuracy with stop word removal and stemming using own tokenizer

In [22]:
%%time
itr = 0
correct = 0
for i in range(Y_test.shape[0]):
    itr += 1
    Xt_vec = cv1.transform(Xt_clean[i:i+1]).toarray()
    if Y_test[i] == bnb1.predict(Xt_vec)[0]:
        correct += 1
    print('Iterations : %d      Accuracy : %.4f'%(itr,(correct/float(itr))),end='\r')
print('\n')

Iterations : 25000      Accuracy : 0.3747

CPU times: user 4min 50s, sys: 2.53 s, total: 4min 53s
Wall time: 2min 24s
