# Sentiment Analysis with Naive Bayes Classifier

In [1]:
import ast
import math
import pandas as pd
import numpy as np
from collections import Counter
from utils import split_sentiment, split_train_test
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Load the cleaned reviews. The preview below shows a `review_id`, a `text`
# column holding a stringified token list, and a `sentiment` label per row.
reviews_csv = "cleaned_data/cleaned_reviews_2.csv"
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,review_id,text,sentiment
0,705b4be0c87fc1a69fbbd6a12d4973a7,"['ugh', 'hat', 'last', 'book', 'in', 'series',...",0
1,9c8dfa25aa1c02eaa1784558401ada6c,"['start', 'out', 'strong', 'but', 'go', 'downh...",0
2,674c63c16f14e97d0d5b92237d061e04,"['decently', 'write', 'ya', 'book', 'but', 'ca...",0
3,e59c12c5107de7baeffa922d37f9b862,"['book', 'simultaneously', 'bore', 'death', 'a...",0
4,6805d23d0e5030a6bb9b6666623bedea,"['ugh', 'try', 'honestly', 'try', 'huge', 'fan...",0


## Naive Bayes with SKLearn

In [14]:
# Training with Count Vectorizer

# Hold out 20% of the reviews for evaluation; the fixed random_state keeps
# the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"], df["sentiment"], test_size=0.2, random_state=1
)

# The vocabulary is learned from the training reviews only, so nothing from
# the held-out reviews influences the features.
cv = CountVectorizer().fit(x_train)
x_train_vec = cv.transform(x_train)

MNB = MultinomialNB()
MNB.fit(x_train_vec, y_train)
predicted = MNB.predict(cv.transform(x_test))

# accuracy_score takes (y_true, y_pred); accuracy is symmetric so the value
# is the same either way, but the documented order keeps this consistent
# with the other metrics calls.
accuracy = metrics.accuracy_score(y_test, predicted)
print(accuracy)

0.851


In [15]:
# Training with Term Frequency - Inverse Document Frequency

# Same test_size and random_state as the Count Vectorizer cell, so both
# models are scored on the same held-out reviews.
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
    df["text"], df["sentiment"], test_size=0.2, random_state=1
)

tfidf = TfidfVectorizer().fit(x_train_2)
x_train_vec_2 = tfidf.transform(x_train_2)

# Use a dedicated estimator instead of refitting a classifier created in
# another cell: this cell then runs on its own and does not overwrite a
# model that was fitted on different features.
MNB_2 = MultinomialNB()
MNB_2.fit(x_train_vec_2, y_train_2)
predicted_2 = MNB_2.predict(tfidf.transform(x_test_2))

# accuracy_score takes (y_true, y_pred).
accuracy_2 = metrics.accuracy_score(y_test_2, predicted_2)
print(accuracy_2)

0.852


In [16]:
# Pipeline - CountVectorizer and MultinomialNB bundled into one estimator.
# This mirrors the Count Vectorizer cell (not the TF-IDF one) and reuses the
# x_train / x_test / y_train / y_test split created there, so that cell must
# have been run most recently for these names to hold the raw text split.

model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(x_train, y_train)
labels = model.predict(x_test)

# accuracy_score takes (y_true, y_pred).
accuracy_3 = metrics.accuracy_score(y_test, labels)
print(accuracy_3)

0.851


## Naive Bayes from Scratch

In [3]:
# Create x and y for training and testing
def create_xy(df_train, df_test):
    """Build dense count matrices and label arrays from two review frames.

    A ``CountVectorizer`` is fitted on ``df_train["text"]`` only and then
    applied to both frames, so the test matrix uses the training vocabulary.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Frames that provide a ``"text"`` and a ``"sentiment"`` column.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` where the ``x_*`` entries are
        dense arrays of token counts and the ``y_*`` entries are numpy
        arrays of the ``"sentiment"`` values.
    """
    vectorizer = CountVectorizer()
    vectorizer.fit(df_train["text"])

    def to_counts(frame):
        # .toarray() densifies the sparse count matrix.
        return vectorizer.transform(frame["text"]).toarray()

    def to_labels(frame):
        return np.array(frame["sentiment"])

    return (
        to_counts(df_train),
        to_counts(df_test),
        to_labels(df_train),
        to_labels(df_test),
    )


# Feature counts and log probabilities
def feature_log_probs(x_train, y_train, alpha=1):
    """Estimate smoothed per-class log feature probabilities.

    For every distinct label the feature counts of its training rows are
    summed, ``alpha`` is added to each count, and the result is normalised
    per class and returned in log space.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Feature counts per training sample.
    y_train : array-like of shape (n_samples,)
        Class label of each training sample.
    alpha : number, default 1
        Additive smoothing term added to every per-class feature count
        (1 is Laplace smoothing). Must be positive so that no probability
        is zero before taking the log.

    Returns
    -------
    ndarray of shape (n_classes, n_features)
        ``log P(feature | class)``. Rows follow the sorted order of the
        distinct labels (the order produced by ``np.unique``). There is one
        row per label that actually occurs in ``y_train``.

    Raises
    ------
    ValueError
        If ``y_train`` is empty or ``alpha`` is not positive.
    """
    labels = np.asarray(y_train).ravel()
    if labels.size == 0:
        raise ValueError("y_train has 0 samples")
    if alpha <= 0:
        raise ValueError("alpha must be positive")

    # One-hot encode the labels with numpy: column j marks the samples that
    # belong to the j-th sorted class. This covers the binary and the
    # multi-class case uniformly, without a special branch for two classes.
    classes, class_idx = np.unique(labels, return_inverse=True)
    one_hot = (class_idx.reshape(-1, 1) == np.arange(classes.size)).astype(int)

    # (n_classes, n_samples) @ (n_samples, n_features) -> per-class counts.
    fc = np.matmul(one_hot.T, x_train)

    smoothed_fc = fc + alpha
    smoothed_cc = smoothed_fc.sum(axis=1)
    log_probs = np.log(smoothed_fc) - np.log(smoothed_cc.reshape(-1, 1))

    return log_probs


# Predict using posterior probabilities
def predict(x_test, log_probs, class_log_prior=None):
    """Pick the highest-scoring class for every test sample.

    The score of a class is the sum of its log feature probabilities
    weighted by the sample's feature counts. Without ``class_log_prior``
    this is the log-likelihood only, i.e. all classes are treated as equally
    likely a priori. Pass ``class_log_prior`` to obtain the (unnormalised)
    log posterior instead.

    Parameters
    ----------
    x_test : array-like of shape (n_samples, n_features)
        Feature counts per test sample.
    log_probs : ndarray of shape (n_classes, n_features)
        Per-class log feature probabilities.
    class_log_prior : array-like of shape (n_classes,), optional
        Log prior probability of each class, in the same row order as
        ``log_probs``. Omitted by default, which keeps the uniform-prior
        behaviour.

    Returns
    -------
    ndarray of shape (n_samples,)
        Row index into ``log_probs`` of the best class for each sample.
        These are positions, not the original label values; the two only
        coincide when the labels are 0..n_classes-1 in row order.

    Raises
    ------
    ValueError
        If ``class_log_prior`` does not have one entry per class.
    """
    scores = np.matmul(x_test, log_probs.T)

    if class_log_prior is not None:
        prior = np.asarray(class_log_prior).ravel()
        if prior.size != log_probs.shape[0]:
            raise ValueError("class_log_prior must have one entry per class")
        # Broadcast the per-class prior across all samples.
        scores = scores + prior.reshape(1, -1)

    return np.argmax(scores, axis=1)

In [5]:
# Evaluate the from-scratch Naive Bayes functions defined above.
# split_sentiment / split_train_test come from the project-local utils
# module; their exact behaviour is not visible in this notebook.
df_pos, df_neg = split_sentiment(df)
# NOTE(review): the meaning of 222 is not shown here - presumably a seed or
# a split size; confirm against utils.split_train_test.
df_train, df_test = split_train_test(df_pos, df_neg, 222)

# NOTE: this rebinds x_train, x_test, y_train and y_test (and `accuracy`
# below), names the sklearn cells also assign. After this cell they hold
# dense count arrays / label arrays rather than the raw text split, so the
# Count Vectorizer cell has to be re-run before the Pipeline cell is.
x_train, x_test, y_train, y_test = create_xy(df_train, df_test)
log_probs = feature_log_probs(x_train, y_train)

# predict returns row indices into log_probs; they are compared directly
# with the sentiment values in y_test here.
y_predict = predict(x_test, log_probs)
accuracy = metrics.accuracy_score(y_test, y_predict)
print(accuracy)

0.8585
