## Implementing Naive Bayes Classifier on Large Movie Reviews dataset

### ANKIT KHANNA
### ID: 1001553616

In [4]:
import numpy as np
import pandas as pd
import nltk
import glob
import os
import random
import re
import string

from IPython.core.display import display
from nltk.corpus import stopwords

In [34]:
data_dir = 'aclImdb/'
review_files = ['pos', 'neg']
cols = ['review', 'sentiment', 'prob_pos', 'prob_neg', 'pred_sentiment']

### Load and Read Dataset

In [35]:
def load_read_data(data_set="train"):
    df = pd.DataFrame(columns=cols)
    for file in review_files:
        new_path = data_dir + data_set + "/" + file + "/"
        for filename in glob.glob(new_path + "*.txt"):
            content = ''.join(open(filename, 'r').readlines())
            df = df.append(
                {"review": content, "sentiment": file}, ignore_index=True)
    return df

train_data = pd.DataFrame(columns=cols)
test_data = pd.DataFrame(columns=cols)

train_flag = False
test_flag = False

# pickling the dataframe for faster computations
train_pkl = "./train.pkl"
test_pkl = "./test.pkl"

if os.path.isfile(train_pkl) or train_flag:
    train_data = pd.read_pickle(train_pkl, compression="gzip")
else:
    train_data = load_read_data("train")
    train_data.to_pickle(train_pkl, compression="gzip")

if os.path.isfile(test_pkl) or test_flag:
    test_data = pd.read_pickle(test_pkl, compression="gzip")
else:
    test_data = load_read_data("test")
    test_data.to_pickle(test_pkl, compression="gzip")

train_data = train_data.sample(frac=1).reset_index(drop=True)
test_data = test_data.sample(frac=1).reset_index(drop=True)

print("Train data post pickling:")
display(train_data)
print("Test data post pickling:")
display(test_data)

Train data post pickling:


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,"I don't know if I'm just weird, but I thorough...",pos,,,
1,This is the first American film to successfull...,pos,,,
2,Esther Williams gets her first post MGM starri...,pos,,,
3,I doubt this will ever even be a cult film. I ...,neg,,,
4,I really liked this movie because I have a hus...,pos,,,
...,...,...,...,...,...
24995,"This is a really fun, breezy, light hearted ro...",pos,,,
24996,"Without ""mental anachronism"", this film which ...",pos,,,
24997,Disney? What happened? I really wish the movie...,neg,,,
24998,A Classic Hollywood Biopic is the best sense o...,pos,,,


Test data post pickling:


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,Another example that we should stay away from ...,neg,,,
1,A few buddies and myself have the strange hobb...,neg,,,
2,Did the first travesty actually make money? Th...,neg,,,
3,"as a 'physically challenged' person (god, how ...",pos,,,
4,"I always liked this movie, I have seen it so m...",pos,,,
...,...,...,...,...,...
24995,"I really enjoyed this movie, and I'm not a cla...",pos,,,
24996,My guess is that the producers of this low-bud...,neg,,,
24997,"""Haaaarrrryyy!"" <br /><br />The amplified, dis...",neg,,,
24998,Trilogies are very interesting. Some go out wi...,pos,,,


### Data Cleaning (1st of Data Pre-processing)

In [36]:
def clean_data(txt):
    txt = txt.lower().strip()
    txt = " ".join([w for w in txt.split() if len(w) > 2])
    txt = re.sub('\[.*?\]', '', txt)
    txt = re.sub('https?://\S+|www\.\S+', '', txt)
    txt = re.sub('<.*?>+', '', txt)
    txt = re.sub('[%s]' % re.escape(string.punctuation), '', txt)
    txt = re.sub('\n', '', txt)
    txt = re.sub('\w*\d\w*', '', txt)
    return txt

train_data['review'] = train_data['review'].apply(clean_data)
test_data['review'] = test_data['review'].apply(clean_data)

print("Data post cleaning:")
display(train_data.head())
display(test_data.head())

Data post cleaning:


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,dont know im just weird but thoroughly enjoyed...,pos,,,
1,this the first american film successfully adop...,pos,,,
2,esther williams gets her first post mgm starri...,pos,,,
3,doubt this will ever even cult film loved gram...,neg,,,
4,really liked this movie because have husband j...,pos,,,


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,another example that should stay away from try...,neg,,,
1,few buddies and myself have the strange hobby ...,neg,,,
2,did the first travesty actually make money thi...,neg,,,
3,physically challenged person god how hate that...,pos,,,
4,always liked this movie have seen many times b...,pos,,,


### Tokenising Data (2nd of Data Pre-processing)

In [37]:
token = nltk.tokenize.RegexpTokenizer(r'\w+')

train_data['review'] = train_data['review'].apply(token.tokenize)
test_data['review'] = test_data['review'].apply(token.tokenize)

print("Data post Tokenising:")
display(train_data.head())
display(test_data.head())

Data post Tokenising:


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,"[dont, know, im, just, weird, but, thoroughly,...",pos,,,
1,"[this, the, first, american, film, successfull...",pos,,,
2,"[esther, williams, gets, her, first, post, mgm...",pos,,,
3,"[doubt, this, will, ever, even, cult, film, lo...",neg,,,
4,"[really, liked, this, movie, because, have, hu...",pos,,,


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,"[another, example, that, should, stay, away, f...",neg,,,
1,"[few, buddies, and, myself, have, the, strange...",neg,,,
2,"[did, the, first, travesty, actually, make, mo...",neg,,,
3,"[physically, challenged, person, god, how, hat...",pos,,,
4,"[always, liked, this, movie, have, seen, many,...",pos,,,


### Stopwords Removal (3rd of Data Pre-processing)

In [38]:
def stopword_removal(text):
    words = [
        w for w in text if w not in stop_words and w in corpus or not w.isalpha()]
    words = list(filter(lambda word: words.count(word) >= 2, set(words)))
    return words

corpus = set(nltk.corpus.words.words())
stop_words = set(stopwords.words('english'))

remove_words = ['movie', 'film', 'one', 'made', 'many', 'time', 'story', 'character', 'still', 'seen', 'picture', 'people', 'see', 'never', 'come',
          'even', 'way', 'plot', 'house', 'horror', 'think', 'make', 'first', 'scene', 'director', 'two', 'show', 'become', 'brother', 'che', 'got', 'ago']
stop_words = stop_words.union(remove_words)

train_data['review'] = train_data['review'].apply(stopword_removal)
test_data['review'] = test_data['review'].apply(stopword_removal)

print("Data after removing stopwords:")
display(train_data.head())
display(test_data.head())

Data after removing stopwords:


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,"[also, probably, cabin, lake, fact, looking]",pos,,,
1,"[stays, crown, though, childhood, three, city,...",pos,,,
2,[universal],pos,,,
3,"[gram, good, dull]",neg,,,
4,"[life, funny, ben, baseball]",pos,,,


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,[],neg,,,
1,"[plane, dumb, case, monster, cast, reason, lik...",neg,,,
2,"[sequel, money, another]",neg,,,
3,"[wait, disabled, physically, person, nothing, ...",pos,,,
4,"[always, dont]",pos,,,


### Lemmatizing (4th of Data Pre-processing)

In [39]:
def lemmatization_func(txt):
    return [lemmatize_obj.lemmatize(w) for w in txt]

lemmatize_obj = nltk.stem.WordNetLemmatizer()

train_data['review'] = train_data['review'].apply(lemmatization_func)
test_data['review'] = test_data['review'].apply(lemmatization_func)
display(train_data.head())
display(test_data.head())

Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,"[also, probably, cabin, lake, fact, looking]",pos,,,
1,"[stay, crown, though, childhood, three, city, ...",pos,,,
2,[universal],pos,,,
3,"[gram, good, dull]",neg,,,
4,"[life, funny, ben, baseball]",pos,,,


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
0,[],neg,,,
1,"[plane, dumb, case, monster, cast, reason, lik...",neg,,,
2,"[sequel, money, another]",neg,,,
3,"[wait, disabled, physically, person, nothing, ...",pos,,,
4,"[always, dont]",pos,,,


### Omitting Rare Words; Occurence <5 times (5th of Data Pre-processing) 

In [40]:
def omit_words(txt, rare_words):
    omitted_words = list(set(txt) - set(rare_words))
    return omitted_words


def locate_and_omit_words(frame):
    df_rows = frame.explode('review')
    word_counter = df_rows.review.value_counts(ascending=True)
    rare_words = word_counter[word_counter <= 5].index.to_list()

    frame['review'] = frame['review'].apply(
        lambda x: omit_words(x, rare_words))

    print("Before:", df_rows.shape[0], "\nAfter:", frame.explode('review').shape[0])
    return frame


print("For Train data:")
train_data = locate_and_omit_words(train_data)
print()

print("For Test data:")
test_data = locate_and_omit_words(test_data)

For Train data:
Before: 196213 
After: 184125

For Test data:
Before: 191739 
After: 179867


### Splitting Data Set into Train:Dev:Test

In [41]:
rows = 10000
split = 0.8

mid = int(np.floor_divide(rows, 1/split))

sample_train_data = train_data[:mid].copy(deep=True)
sample_dev_data = train_data[mid:mid + (rows - mid)].copy(deep=True)
sample_test_data = test_data[:rows].copy(deep=True)

print("Train dataset:", sample_train_data.shape[0])
print("Development dataset:", sample_dev_data.shape[0])
print("Test dataset:", sample_test_data.shape[0])

Train dataset: 8000
Development dataset: 2000
Test dataset: 10000


### Probability Computation and Naive Bayes Implementation

In [42]:
# Probability of the occurrence
#     P[“the”] = num of documents containing ‘the’ / num of all documents
# Conditional probability based on the sentiment
#     P[“the” | Positive]  = # of positive documents containing “the” / num of all positive review documents

def word_prob(train):
    word_counts = {}
    
    pos_docs = train[train.sentiment == "pos"]
    num_pos_sent_docs = pos_docs.shape[0]

    neg_docs = train[train.sentiment == "neg"]
    num_neg_sent_docs = neg_docs.shape[0]

    for row in train.itertuples():
        review = row.review

        for word in review:
            pos_sent_flag = None
            neg_sent_flag = None

            if word in word_counts.keys():
                pos_sent_flag = word_counts[word]['prob_pos']
                neg_sent_flag = word_counts[word]['prob_neg']
            else:
                num_pos_docs = pos_docs[pos_docs.review.apply(lambda x: bool(set(x) & {word}))].shape[0]
                num_neg_docs = neg_docs[neg_docs.review.apply(lambda x: bool(set(x) & {word}))].shape[0]
                
                pos_sent_flag = round(num_pos_docs / num_pos_sent_docs, 4)
                neg_sent_flag = round(num_neg_docs / num_neg_sent_docs, 4)
                
                word_counts[word] = {'prob_pos': pos_sent_flag, 'prob_neg': neg_sent_flag}
    return word_counts

def naive_bayes(train, test, smoothing=False):
    train_word_probs = word_prob(train)
    correct = 0
    smoothing_param = 0

    if smoothing:
        smoothing_param = 1 / \
            sample_train_data.explode('review').review.shape[0]

    for row in test.itertuples():
        review = row.review
        pos_prob = 1.0
        neg_prob = 1.0

        for word in review:
            pos_sent_flag = 0.0
            neg_sent_flag = 0.0
            
            if word in train_word_probs.keys():
                probs_word = train_word_probs[word]
                pos_sent_flag = probs_word['prob_pos']
                neg_sent_flag = probs_word['prob_neg']
            
            pos_prob = pos_prob * (pos_sent_flag + smoothing_param)
            neg_prob = neg_prob * (neg_sent_flag + smoothing_param)

        total_train_docs = train.shape[0]
    
        num_pos_sent_docs = train[train.sentiment == "pos"].shape[0]
        num_neg_sent_docs = train[train.sentiment == "neg"].shape[0]
        
        prob_pos_sent = round(num_pos_sent_docs / total_train_docs, 4)
        prob_neg_sent = round(num_neg_sent_docs / total_train_docs, 4)
        
        pos_prob = prob_pos_sent * pos_prob
        neg_prob = prob_neg_sent * neg_prob

        pred_sent = 0

        if pos_prob > neg_prob:
            pred_sent = "pos"
        elif pos_prob < neg_prob:
            pred_sent = "neg"

        if row.sentiment == pred_sent:
            correct += 1

        test.at[row.Index, 'prob_pos'] = pos_prob
        test.at[row.Index, 'prob_neg'] = neg_prob
        test.at[row.Index, 'pred_sentiment'] = pred_sent

    accuracy = round(correct / test.shape[0] * 100, 2)
    print("Accuracy: {}%".format(accuracy))
    return

print("Predicting sentiment of reviews without smoothing")
naive_bayes(sample_train_data, sample_dev_data)

display(sample_dev_data.head(10))

Predicting sentiment of review:
Accuracy: 64.9%


Unnamed: 0,review,sentiment,prob_pos,prob_neg,pred_sentiment
8000,"[something, love, telling, bag, segment, anoth...",neg,0.0,4.92736e-36,neg
8001,[],pos,0.5046,0.4954,pos
8002,[clever],pos,0.00060552,0.00064402,neg
8003,[watching],neg,0.00938556,0.0136235,neg
8004,"[bad, noise, whole, dont, really, scary, jump,...",neg,1.3524300000000002e-18,5.62262e-16,neg
8005,"[get, start, series, really, dead, nightmare, ...",neg,1.06911e-14,4.23588e-14,neg
8006,"[club, music, rather, actually, really, true, ...",pos,4.3541e-45,2.92443e-46,pos
8007,"[animation, great]",pos,0.000517962,6.75726e-05,pos
8008,"[saw, main, part, something]",neg,4.73337e-08,3.63974e-08,pos
8009,"[telling, clearly, either, need, religion, lau...",neg,2.14549e-28,0.0,pos


### Cross Validation (5 Fold)

In [43]:
def cross_val(train, k, smoothing=False):
    dev = 1/k
    for i in range(1, k + 1):
        dev_sample = train.sample(
            frac=dev, replace=False, random_state=i).copy(deep=True)
        train_sample = train.drop(dev_sample.index, axis=0).copy(deep=True)

        if smoothing:
            print("CV Pass", i, "with smoothing")
        else:
            print("CV Pass", i)
        
        naive_bayes(train_sample, dev_sample, smoothing)
        print()
    return

cross_val(sample_train_data, 5)

CV Pass 1
Accuracy: 60.06%

CV Pass 2
Accuracy: 61.31%

CV Pass 3
Accuracy: 61.5%

CV Pass 4
Accuracy: 62.31%

CV Pass 5
Accuracy: 59.81%



### Accuracy with Smoothing

In [44]:
print("Predicting sentiment of reviews with smoothing")
naive_bayes(sample_train_data, sample_dev_data, smoothing=True)

Predicting sentiment of reviews with smoothing
Accuracy: 69.95%


### Cross Validation with Smoothing

In [45]:
cross_val(sample_train_data, 5, smoothing=True)

CV Pass 1 with smoothing
Accuracy: 67.38%

CV Pass 2 with smoothing
Accuracy: 68.62%

CV Pass 3 with smoothing
Accuracy: 68.62%

CV Pass 4 with smoothing
Accuracy: 69.94%

CV Pass 5 with smoothing
Accuracy: 67.88%



### Top 10 Words that predict Pos and Neg class

In [46]:
acc_pred = sample_dev_data[(sample_dev_data.sentiment == sample_dev_data.pred_sentiment) & (sample_dev_data.review.str.len() == 1)]

pos_preds = acc_pred[acc_pred.sentiment == "pos"].sort_values(by=['prob_pos'], ascending=False)
top_ten_pos = pos_preds.explode('review').review.unique()[:10].tolist()

print("Top 10 words that predict Positive class:")
for i, word in enumerate(top_ten_pos):
    print("{}. {}".format(i + 1, word))
print()

neg_preds = acc_pred[acc_pred.sentiment == "neg"].sort_values(by=['prob_neg'], ascending=False)
top_ten_neg = neg_preds.explode('review').review.unique()[:10].tolist()

print("Top 10 words that predict Negative class:")
for i, word in enumerate(top_ten_neg):
    print("{}. {}".format(i + 1, word))

Top 10 words that predict Positive class:
1. good
2. great
3. also
4. well
5. life
6. love
7. little
8. family
9. world
10. role

Top 10 words that predict Negative class:
1. like
2. bad
3. would
4. really
5. dont
6. get
7. much
8. could
9. acting
10. ever


### Final Accuracy using Test Dataset with Smoothing

In [47]:
naive_bayes(train_data, test_data, smoothing=True)

Accuracy: 65.88%


### References

1. https://towardsdatascience.com/unfolding-na%C3%AFve-bayes-from-scratch-2e86dcae4b01
2. https://lazyprogrammer.me/probability-smoothing-for-natural-language-processing/
3. https://monkeylearn.com/blog/practical-explanation-naive-bayes-classifier/
4. https://machinelearningmastery.com/k-fold-cross-validation/
5. https://www.analyticsvidhya.com/blog/2019/08/how-to-remove-stopwords-text-normalization-nltk-spacy-gensim-python/