In [1]:
import ast
import math
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelBinarizer

In [23]:
# Load the pre-cleaned reviews. Later cells read the "text" column (token
# lists stored as strings, parsed elsewhere with ast.literal_eval) and the
# "sentiment" column (compared against 0 and 1).
df = pd.read_csv("cleaned_reviews.csv")
df.head()

Unnamed: 0,review_id,text,sentiment
0,31eac73fd3732e9fda22e6a2122e160c,"['check', 'out', 'other', 'book', 'review', 'b...",0
1,4fb438ca422c972b576f4b21772e5f81,['dnf'],0
2,454d50490492ff349e9c44ac941ee082,"['book', 'on', 'recommendation', 'shelf', 'sad...",0
3,173b777cd040cf0d3b9ea5f116a021fe,"['good', 'premise', 'but', 'halfway', 'through...",0
4,22fe845937341a1a797a7e33beb2b5e2,"['horrible', 'book', 'dont', 'know', 'buy', 'm...",0


In [52]:
# Split training data by sentiment

def split_sentiment(df):
    """Partition a review frame by its "sentiment" label.

    Returns
    -------
    (df_pos, df_neg) : rows whose sentiment equals 1, then rows whose
        sentiment equals 0. Rows with any other label are in neither frame.
    """
    labels = df["sentiment"]
    is_positive = labels == 1
    is_negative = labels == 0
    return df[is_positive], df[is_negative]

# Split training and testing data

def split_data(df_pos, df_neg, random_state=None):
    """Split each class 80/20 and stack the pieces into train and test frames.

    Parameters
    ----------
    df_pos, df_neg : frames holding the positive and the negative reviews.
    random_state : forwarded unchanged to both train_test_split calls.

    Returns
    -------
    (df_train, df_test) : positive rows first, negative rows second in each.
    """
    # Split the positive frame first, then the negative one, so both
    # train_test_split calls happen in the same order as the inputs.
    per_class = [
        train_test_split(frame, test_size=0.2, random_state=random_state)
        for frame in (df_pos, df_neg)
    ]
    train_parts = [train_part for train_part, _ in per_class]
    test_parts = [test_part for _, test_part in per_class]

    return pd.concat(train_parts), pd.concat(test_parts)

In [53]:
# Per-class 80/20 split. The seed is fixed so that the vocabulary size and
# accuracy reported by the following cells are reproducible after
# Restart Kernel -> Run All.
df_pos, df_neg = split_sentiment(df)
df_train, df_test = split_data(df_pos, df_neg, random_state=42)

In [54]:
# Learn the vocabulary from the training reviews only, then encode both
# splits as dense document-term count matrices with that vocabulary.
cv = CountVectorizer()
cv.fit(df_train["text"])

x_train_vec, x_test_vec = (
    cv.transform(split["text"]).toarray() for split in (df_train, df_test)
)

print(x_train_vec.shape)

(8000, 28586)


In [55]:
# Indicator matrix of the training labels: one row per document.
y = LabelBinarizer().fit_transform(np.array(df_train["sentiment"]))

# When the binarizer hands back a single column, prepend its complement so
# that there is one indicator column per class.
if y.shape[1] == 1:
    y = np.hstack((1 - y, y))

# fc[c, j] = summed count of vocabulary term j over the documents of class c.
fc = y.T @ x_train_vec

In [57]:
# Column sums of the label indicator matrix = number of training documents per class.
y.sum(axis=0)

array([4000, 4000])

In [58]:
# Add-one smoothing: every (class, term) count is incremented by 1 so that no
# term ends up with zero probability.
smoothed_fc = 1 + fc

# Total smoothed count per class (one value per row of smoothed_fc).
smoothed_cc = np.sum(smoothed_fc, axis=1)

# log P(term | class) = log(count) - log(class total), broadcast row-wise.
log_probs = np.log(smoothed_fc) - np.log(smoothed_cc)[:, np.newaxis]

In [59]:
# Per-class log-likelihood of every test document: term counts weighted by the
# per-class log term probabilities. No class-prior term is added here.
posterior = x_test_vec @ log_probs.T

# Predicted class = column index with the highest score in each row.
prediction = posterior.argmax(axis=1)

In [60]:
# Sanity check: one predicted class index per row of x_test_vec.
prediction.shape

(2000,)

In [61]:
# `prediction` holds column indices of the score matrix; they are compared
# directly against the raw sentiment labels of the test split.
y_test = df_test["sentiment"].to_numpy()
accuracy = metrics.accuracy_score(y_test, prediction)
print(accuracy)

0.8575


In [9]:
# Toy example: 2 classes x 4 terms, no smoothing.
fc = np.array([
    [2, 5, 3, 1],
    [3, 1, 1, 2],
])

# Total count per class, first as a flat vector and then as a column so it
# broadcasts against the rows of fc.
cc = np.sum(fc, axis=1)
cc_2 = cc[:, np.newaxis]

# log(count / class total), written as a difference of logs.
logprobs = np.log(fc) - np.log(cc_2)
logprobs

array([[-1.70474809, -0.78845736, -1.29928298, -2.39789527],
       [-0.84729786, -1.94591015, -1.94591015, -1.25276297]])

In [22]:
# Toy corpus used to build a small vocabulary for the toy log-probabilities.
train = [
    "this is sparta yeah",
    "this this yeah",
    "is is is sparta",
]

# "no" does not occur in the training sentences.
test = ["is sparta yeah yeah no"]

cv = CountVectorizer()
cv.fit(train)
train_vec, test_vec = cv.transform(train), cv.transform(test)

# Score the test sentence against both toy classes.
posterior = test_vec.toarray() @ logprobs.T
print(posterior)

# Position of the best score in the (flattened) score matrix.
index = posterior.argmax()
print(index)

[[-7.288996   -5.29873395]]
1


## (ref) Naive Bayes from Scratch

In [None]:
# Small hand-written corpus: each sentence is labelled as a statement or a question.
columns = ["sent", "class"]

rows = [
    ["This is my book", "stmt"],
    ["They are novels", "stmt"],
    ["have you read this book", "question"],
    ["who is the author", "question"],
    ["what are the characters", "question"],
    ["This is how I bought the book", "stmt"],
    ["I like fictions", "stmt"],
    ["what is your favorite book", "question"],
]

dfx = pd.DataFrame(rows, columns=columns)
dfx.head()

In [None]:
def convert_to_bow(corpus):
    """Encode a corpus as a dense bag-of-words count matrix.

    Parameters
    ----------
    corpus : iterable of documents accepted by CountVectorizer.fit_transform.

    Returns
    -------
    (counts, name_index) : the dense count array, and a dict mapping each
        feature name to its column position in that array.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(corpus)

    vocabulary = vectorizer.get_feature_names_out()
    name_index = {term: position for position, term in enumerate(vocabulary)}

    return counts.toarray(), name_index

In [None]:
def count_based_on_class(X, y):
    """Sum the feature counts of ``X`` separately for every class in ``y``.

    Parameters
    ----------
    X : array of shape (n_datapoints, n_features) holding feature counts.
    y : array-like with one class label per row of ``X``.

    Returns
    -------
    count_matrix : array of shape (n_classes, n_features); row c is the sum
        of the rows of ``X`` that belong to class c.
    y : the label indicator matrix of shape (n_datapoints, n_classes).
    classes : the ``classes_`` attribute of the fitted LabelBinarizer.
    """
    lb = LabelBinarizer()
    y = lb.fit_transform(np.array(y))

    # When the binarizer returns a single column, prepend its complement so
    # the indicator matrix has one column per class.
    if y.shape[1] == 1:
        y = np.concatenate((1 - y, y), axis=1)

    # y.T is (n_classes, n_datapoints) and X is (n_datapoints, n_features),
    # so the product is (n_classes, n_features).
    count_matrix = np.matmul(y.T, X)

    return count_matrix, y, lb.classes_

In [None]:
def feature_log_probs(count_matrix, alpha=1):
    """Turn per-class feature counts into smoothed log-probabilities.

    Parameters
    ----------
    count_matrix : array of shape (n_classes, n_features).
    alpha : additive smoothing constant added to every count.

    Returns
    -------
    Array of the same shape whose entry (c, j) is
    log(count[c, j] + alpha) - log(sum over j of (count[c, j] + alpha)).
    """
    counts = count_matrix + alpha

    # Per-class totals as a column vector so they broadcast across features.
    denominator = counts.sum(axis=1).reshape(-1, 1)

    return np.log(counts) - np.log(denominator)

In [None]:
def predicting(query, log_probs, classes):
    """Return the class whose log-probabilities score ``query`` highest.

    Parameters
    ----------
    query : feature-count vector for one document.
    log_probs : array of per-class feature log-probabilities.
    classes : sequence indexed by the winning position.
    """
    # One score per class: the log-probabilities weighted by the query counts.
    scores = np.matmul(log_probs, query.T)

    best = np.argmax(scores)
    return classes[best]

In [None]:
# Convert into BOW
X, name_index = convert_to_bow(df["text"])

# Calculate the counts w.r.t. each class
count_matrix, y, classes = count_based_on_class(X, df['sentiment'])

# Calculate the log-probabilities
log_probabilities = feature_log_probs(count_matrix, alpha = 1)

# Use the log-probabilities to predict a class for one row.
# NOTE: this row is part of the data the counts were built from, so this is
# a smoke test of the pipeline, not an evaluation.
output = predicting(X[100], log_probabilities, classes)

print('Predicted class - ',output)
# X[100] is a positional row, so read the matching label positionally as well
# (a plain [100] on the Series is a label lookup and depends on the index).
print('Actual class -', df['sentiment'].iloc[100])

In [None]:
# Seeded 80/20 split of the raw text and labels.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
)

# The vocabulary is learned from the training texts only; both splits are
# then encoded with it (the results stay in CountVectorizer's output format).
cv = CountVectorizer()
cv.fit(x_train)
x_train_vec, x_test_vec = cv.transform(x_train), cv.transform(x_test)

In [None]:
# Convert the training labels to a plain NumPy array and display them.
y_train = np.array(y_train)
y_train

## (actual manual) Naive Bayes from Scratch

In [None]:
# Split training data by sentiment

def split_sentiment(df):
    """Return (df_pos, df_neg): the rows with sentiment 1 and with sentiment 0.

    NOTE: a function with this name and the same behaviour is already defined
    in an earlier cell of this notebook; this definition replaces it when run.
    """
    df_pos, df_neg = (df[df["sentiment"] == label] for label in (1, 0))
    return df_pos, df_neg

In [None]:
# Split training and testing data

def split_data(df_pos, df_neg, random_state=None):
    """Split each class 80/20 and combine the pieces into train/test frames.

    Parameters
    ----------
    df_pos, df_neg : frames holding the positive and the negative reviews.
    random_state : seed (or None) forwarded to both train_test_split calls.
        The default of None keeps the previous unseeded behaviour; pass an
        int to make the split reproducible. The parameter matches the
        signature of the split_data defined earlier in this notebook, which
        this definition replaces when run.

    Returns
    -------
    (df_train, df_test) : positive rows followed by negative rows in each.
    """
    df_train_pos, df_test_pos = train_test_split(df_pos, test_size=0.2, random_state=random_state)
    df_train_neg, df_test_neg = train_test_split(df_neg, test_size=0.2, random_state=random_state)

    df_train = pd.concat([df_train_pos, df_train_neg])
    df_test = pd.concat([df_test_pos, df_test_neg])

    return df_train, df_test

In [None]:
# Get vocabulary from training data

def get_vocab(df_train):
    """Collect the set of distinct tokens found in the "text" column.

    Each cell of the column is a token list stored as a string; it is parsed
    with ast.literal_eval before its tokens are added to the vocabulary.
    """
    vocab_set = set()

    for raw_tokens in df_train["text"]:
        vocab_set.update(ast.literal_eval(raw_tokens))

    return vocab_set

In [None]:
# Get frequency and probability of words in vocab

def get_prob(df_train):
    """Estimate add-one-smoothed word probabilities for each sentiment class.

    Parameters
    ----------
    df_train : frame with a "text" column (token lists stored as strings)
        and a "sentiment" column.

    Returns
    -------
    (pos_prob, neg_prob) : dicts mapping every vocabulary word to
        (count of the word in the class + 1) / (words in the class + V),
        where V is the vocabulary size of ``df_train``.
    """
    vocab = get_vocab(df_train)
    V = len(vocab)

    df_pos, df_neg = split_sentiment(df_train)

    def count_words(frame):
        # Tally every token of every review in one class.
        counts = Counter()
        for raw_tokens in frame["text"]:
            counts.update(ast.literal_eval(raw_tokens))
        return counts

    # Each class is counted on its own, so the two classes may contain
    # different numbers of reviews.
    pos_words_freq = count_words(df_pos)
    neg_words_freq = count_words(df_neg)

    N_pos = sum(pos_words_freq.values())
    N_neg = sum(neg_words_freq.values())

    # Laplacian (add-one) smoothing over the shared vocabulary.
    pos_prob = {}
    neg_prob = {}
    for word in vocab:
        pos_prob[word] = (pos_words_freq.get(word, 0) + 1) / (N_pos + V)
        neg_prob[word] = (neg_words_freq.get(word, 0) + 1) / (N_neg + V)

    return pos_prob, neg_prob

In [None]:
# Predict sentiment of text

def predict(text, vocab, pos_prob, neg_prob):
    """Classify one review as positive (1) or negative (0).

    Parameters
    ----------
    text : the review's tokens, either as an iterable of words or as the
        string form of a token list (as stored in the "text" column), which
        is parsed with ast.literal_eval first.
    vocab : container of known words; words outside it are ignored.
    pos_prob, neg_prob : dicts mapping each vocabulary word to its
        probability in the positive / negative class.

    Returns
    -------
    1 if the summed log-likelihood ratio is strictly positive, else 0.
    """
    # Without this, iterating a string would score single characters
    # instead of words.
    if isinstance(text, str):
        text = ast.literal_eval(text)

    score = 0
    for word in text:
        if word in vocab:
            score += math.log(pos_prob[word] / neg_prob[word])

    return 1 if score > 0 else 0

In [None]:
# Per-class split, then fit the word probabilities on the training part only.
df_pos, df_neg = split_sentiment(df)
df_train, df_test = split_data(df_pos, df_neg)

vocab = get_vocab(df_train)

pos_prob, neg_prob = get_prob(df_train)

# The "text" column stores token lists as strings; parse each one so that
# predict receives words rather than a raw string to iterate character by
# character.
df_test["prediction"] = df_test["text"].apply(
    lambda raw: predict(ast.literal_eval(raw), vocab, pos_prob, neg_prob)
)

In [None]:
# Share of test reviews whose predicted sentiment equals the true label.
accuracy_4 = metrics.accuracy_score(df_test["sentiment"], df_test["prediction"])
print(accuracy_4)