Loading Dependencies

In [3]:
import re

import pandas as pd

import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score

import math

import nltk

from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict

Mount the drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load the data

In [5]:
data_file_path = '/content/drive/MyDrive/5334_Assignment2/rt_reviews.csv'
data = pd.read_csv(data_file_path, encoding='ISO-8859-1')
data

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...
...,...,...
479995,rotten,Zemeckis seems unable to admit that the motio...
479996,fresh,Movies like The Kids Are All Right -- beautif...
479997,rotten,Film-savvy audiences soon will catch onto Win...
479998,fresh,An odd yet enjoyable film.


Checking for class imbalance

In [6]:
pd.value_counts(data['Freshness'])

fresh     240000
rotten    240000
Name: Freshness, dtype: int64

Data preprocessing - Stopwords

In [18]:
#code taken from [5]
def remove_tags(string):
    removelist = ""
    result = re.sub('','',string)          #remove HTML tags
    result = re.sub('https://.*','',result)   #remove URLs
    result = re.sub(r'[^w'+removelist+']', ' ',result)    #remove non-alphanumeric characters 
    result = result.lower()
    return result

data['Review']=data['Review'].apply(lambda cw : remove_tags(cw)) 
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['Review'] = data['Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data preprocessing - Lemmatization

In [19]:
#code taken from [5]
nltk.download('wordnet')

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st

data['Review'] = data.Review.apply(lemmatize_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Encoding labels

In [20]:
reviews = data['Review'].values
labels = data['Freshness'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

Data split into Train and Test sets

In [21]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)

Building Naive Bayes Classifier

In [22]:
#code taken from [5]
vec = CountVectorizer(max_features = 3000)
X = vec.fit_transform(train_sentences)
vocab = sorted(vec.vocabulary_.keys())
X = X.toarray()
word_counts = {}
for l in range(2):
    word_counts[l] = defaultdict(lambda: 0)
for i in range(X.shape[0]):
    l = train_labels[i]
    for j in range(len(vocab)):
        word_counts[l][vocab[j]] += X[i][j]

Defining Laplace Smoothing

In [23]:
#code taken from [5]
def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    a = word_counts[text_label][word] + 1
    b = n_label_items[text_label] + len(vocab)
    return math.log(a/b)

Defining fit function for Naive Bayes Classifier

In [24]:
#code taken from [5]
def group_by_label(x, y, labels):
    data = {}
    for l in labels:
        data[l] = x[np.where(y == l)]
    return data

def fit(x, y, labels):
    n_label_items = {}
    log_label_priors = {}
    n = len(x)
    grouped_data = group_by_label(x, y, labels)
    for l, data in grouped_data.items():
        n_label_items[l] = len(data)
        log_label_priors[l] = math.log(n_label_items[l] / n)
    return n_label_items, log_label_priors

Defining predict function for Naive Bayes Classifier

In [25]:
#code taken from [5]
def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        words = set(w_tokenizer.tokenize(text))
        for word in words:
            if word not in vocab: continue
            for l in labels:
                log_w_given_l = laplace_smoothing(n_label_items, vocab, word_counts, word, l)
                label_scores[l] += log_w_given_l
        result.append(max(label_scores, key=label_scores.get))
    return result

Fitting the model on the training set and evaluating accuracy on test set

In [26]:
labels = [0,1]
n_label_items, log_label_priors = fit(train_sentences,train_labels,labels)
pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_sentences)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred))

Accuracy of prediction on test set :  0.5000333333333333


**My Contribution: removing the duplicate data in the dataset**

Checking for duplicates

In [41]:
#Checking for duplicates
data2 = pd.read_csv(data_file_path, encoding='ISO-8859-1')
data2.loc[data2.duplicated()]

Unnamed: 0,Freshness,Review
953,fresh,Uma ï¿½ï¿½tima releitura da sï¿½ï¿½rie de tev...
1775,rotten,Many complaints were lobbed at Fantastic Four...
1820,rotten,"Michelle Pfeiffer is quite good, but the scri..."
1957,rotten,This is a tedious tale badly told.
2144,fresh,Harkens back to the initial days of the ought...
...,...,...
479988,rotten,"Reaching for Terrence Malick territory, the v..."
479991,fresh,Disney brilliantly executed another film in t...
479992,rotten,Director John Crowley overestimates the comed...
479993,rotten,Here's a sobering thought: If every war gets ...


We can see that there are a lot of duplicate data

Dropping duplicate rows and reseting index

In [42]:
data_without_dups = data2.loc[~data2.duplicated()].reset_index(drop=True)
data_without_dups

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...
...,...,...
339711,rotten,Roland Joffe's deeply ridiculous movie is cau...
339712,fresh,Movies like The Kids Are All Right -- beautif...
339713,rotten,Film-savvy audiences soon will catch onto Win...
339714,fresh,An odd yet enjoyable film.


Performing data preprocessing - stopwords

In [43]:
data_without_dups['Review']=data_without_dups['Review'].apply(lambda cw : remove_tags(cw)) 
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data_without_dups['Review'] = data_without_dups['Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Performing data preprocessing - lemmatization

In [45]:
data_without_dups['Review'] = data_without_dups.Review.apply(lemmatize_text)

In [46]:
reviews2 = data_without_dups['Review'].values
labels2 = data_without_dups['Freshness'].values
encoder2 = LabelEncoder()
encoded_labels2 = encoder2.fit_transform(labels2)

Data_without_dups split into Train, Test, and Validation sets

In [47]:
#splits into train = 90% and test = 10%
train_sentences2, test_sentences2, train_labels2, test_labels2 = train_test_split(reviews2, encoded_labels2, test_size=0.1, stratify=encoded_labels2)

encoder2_2 = LabelEncoder()
encoded_labels2_2 = encoder2_2.fit_transform(train_labels2)

#splits remaining train set into train = 79.92% overall and val = 10.08% overall
train_sentences2, val_sentences2, train_labels2, val_labels2 = train_test_split(train_sentences2, train_labels2, test_size=0.112, stratify=encoded_labels2_2) 

Building Naive Bayes Classifier for data_without_dups

In [48]:
vec2 = CountVectorizer(max_features = 3000)
X2 = vec2.fit_transform(train_sentences2)
vocab2 = sorted(vec2.vocabulary_.keys())
X2 = X2.toarray()
word_counts2 = {}
for l in range(2):
    word_counts2[l] = defaultdict(lambda: 0)
for i in range(X2.shape[0]):
    l = train_labels2[i]
    for j in range(len(vocab2)):
        word_counts2[l][vocab2[j]] += X2[i][j]

Fitting the model on training set without duplicates and evaluating accuracy on test set

In [49]:
labels2 = [0,1]
n_label_items2, log_label_priors2 = fit(train_sentences2,train_labels2,labels2)
pred2 = predict(n_label_items2, vocab, word_counts, log_label_priors2, labels2, test_sentences2)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels2,pred2))

Accuracy of prediction on test set :  0.5537795831861533


References list:
[1] https://www.kaggle.com/datasets/ulrikthygepedersen/rotten-tomatoes-reviews

[2] https://www.sciencedirect.com/topics/mathematics/text-classification

[3] https://levity.ai/blog/text-classification

[4] https://www.oreilly.com/library/view/practical-natural-language/9781492054047/ch04.html

[5] https://www.analyticsvidhya.com/blog/2022/03/building-naive-bayes-classifier-from-scratch-to-perform-sentiment-analysis/

[6] https://towardsdatascience.com/a-guide-to-text-classification-and-sentiment-analysis-2ab021796317

[7] https://monkeylearn.com/what-is-text-classification/

[8] https://monkeylearn.com/blog/text-classification-machine-learning/

[9] https://en.wikipedia.org/wiki/Bayes%27_theorem

[10] https://www.investopedia.com/terms/b/bayes-theorem.asp

[11] https://www.gigacalculator.com/calculators/bayes-theorem-calculator.php

[12] https://seeve.medium.com/machine-learning-bayes-theorem-2f48c33d51e5

[13] https://towardsdatascience.com/laplace-smoothing-in-na%C3%AFve-bayes-algorithm-9c237a8bdece

[14] https://en.wikipedia.org/wiki/Additive_smoothing

[15] https://en.wikipedia.org/wiki/Probability

[16] https://www.mathworksheetsland.com/7/35freqgen.html

[17] https://www.analyticsvidhya.com/blog/2021/06/5-techniques-to-handle-imbalanced-data-for-a-classification-problem/