In [26]:
from collections import Counter

# NOTE: the original relative order is kept on purpose -- the star import stays
# ahead of numpy so a name exported by `utils` cannot shadow `np`.
from datasets import load_dataset
import pandas as pd
from utils import *
import numpy as np

# Fetch the SST corpus and materialise its train / test splits as DataFrames.
dataset = load_dataset("sst")

train_df, test_df = (
    pd.DataFrame(dataset[split_name]) for split_name in ('train', 'test')
)


## Preprocessing

In [27]:
def generate_vocab(train_df):
    """Collect the distinct tokens of a data frame.

    Every entry of ``train_df['tokens']`` is one string whose tokens are
    separated by ``'|'``.

    Returns:
        The result of ``np.unique`` over all tokens of all rows, i.e. a sorted
        array in which each distinct token appears once.
    """
    all_tokens = [
        token
        for token_string in train_df['tokens']
        for token in token_string.split('|')
    ]
    return np.unique(all_tokens)

def get_number_of_words_in_class(df):
    """Total the number of tokens per sentiment class.

    Reads two columns of ``df``: ``'tokens'`` (a ``'|'``-separated string per
    row) and ``'sentiment'`` (the class of that row).

    Returns:
        A dict mapping each sentiment value present in ``df`` to the summed
        token count of all rows carrying that value.  Keys appear in order of
        first occurrence.
    """
    totals = {}
    for token_string, label in zip(df['tokens'], df['sentiment']):
        n_tokens = len(token_string.split('|'))
        totals[label] = totals.get(label, 0) + n_tokens
    return totals

# Add a `sentiment` column to both splits by passing every raw `label`
# through `map_classes`.
for split_df in (train_df, test_df):
    split_df['sentiment'] = split_df['label'].apply(map_classes)

# The vocabulary is built from the training split only.
vocab = generate_vocab(train_df)

## Training

In [None]:
# Train a multinomial Naive Bayes model with add-one (Laplace) smoothing.
#
#   log_prior[c]         = log2( N_c / N_doc )
#   log_likelihood[i][c] = log2( (count(vocab[i], c) + 1) / (words_in_c + |V|) )
#
# Tokens are taken from the '|'-separated `tokens` column, the same source used
# to build `vocab` and `class_count`, so all counts agree with each other.
# NOTE(review): assumes `sentiment` holds the integer classes 0..NUM_CLASSES-1
# -- confirm against `map_classes`.
NUM_CLASSES = 5

n_doc = len(train_df)
vocab_size = len(vocab)
log_prior = [0.0] * NUM_CLASSES
# Build every row separately: `[[0] * 5] * len(vocab)` would repeat a reference
# to ONE inner list, so writing a single word's score would overwrite all words.
log_likelihood = [[0.0] * NUM_CLASSES for _ in range(vocab_size)]
class_count = get_number_of_words_in_class(train_df)

for c in range(NUM_CLASSES):
    in_class = train_df['sentiment'] == c
    n_c = int(in_class.sum())
    # A class with no training documents has prior probability 0; store -inf
    # explicitly instead of letting log2(0) emit a runtime warning.
    log_prior[c] = np.log2(n_c / n_doc) if n_c else -np.inf

    # One pass over the class's documents gives the count of every token,
    # instead of re-scanning the whole frame once per vocabulary word.
    word_counts = Counter(
        token
        for token_string in train_df.loc[in_class, 'tokens']
        for token in token_string.split('|')
    )

    # Total number of tokens in class c (0 when the class never occurs).
    no_words_in_c = class_count.get(c, 0)
    smoothing_denominator = no_words_in_c + vocab_size

    for index, w in enumerate(vocab):
        n_wc = word_counts[w]  # Counter yields 0 for words unseen in class c
        # Parenthesise the numerator: `n_wc + 1 / denom` would add the raw
        # count to a tiny fraction instead of forming a probability.
        log_likelihood[index][c] = np.log2((n_wc + 1) / smoothing_denominator)
        



## Test Naive Bayes

In [1]:
import numpy as np

def test_naive_bayes(testdoc, logprior, loglikelihood, C, V):
    sum_scores = np.zeros(len(C))  # Initialize scores for each class
    
    for c in C:
        # Initialize score with the log prior for the class
        sum_scores[c] = logprior[c]
        
        for word in testdoc.split():  # Split the test document into words
            if word in V:
                # Update score by adding loglikelihood of the word given the class
                sum_scores[c] += loglikelihood[V.index(word)][c]
                
    # Return the class with the maximum score
    return np.argmax(sum_scores)
