In [9]:
from datasets import load_dataset
import numpy as np
from collections import defaultdict
dataset = load_dataset("sst", "default")

In [10]:
#naive bayes
def train_naive_bayes(D, C):
    N_doc = len(D)
    logprior = {}
    loglikelihood = {}
    V = set()
    bigdoc = defaultdict(list)
    word_counts_per_class = defaultdict(lambda: defaultdict(int))

    # Calculate logprior and collect documents for each class
    for c in C:
        D_c = [doc_tokens for doc_tokens, cl in D if cl == c]
        N_c = len(D_c)
        logprior[c] = np.log(N_c / N_doc)
        for doc in D_c:
            bigdoc[c] += doc
            V.update(doc)
            for word in doc:
                word_counts_per_class[word][c] += 1

    V = list(V)

    # Calculate loglikelihood
    for w in V:
        for c in C:
            count_w_c = word_counts_per_class[w][c]
            loglikelihood[(w, c)] = np.log((count_w_c + 1) / (sum(word_counts_per_class[w].values()) + len(V)))

    return logprior, loglikelihood, V

def test_naive_bayes(testdoc, logprior, loglikelihood, C, V):
    sum_scores = {c: logprior[c] for c in C}
    for word in testdoc:
        if word in V:
            for c in C:
                sum_scores[c] += loglikelihood.get((word, c), 0)
    return max(sum_scores, key=sum_scores.get)

In [11]:
classes = {
    0: "Very Negative",
    1: "Negative",
    2: "Neutral",
    3: "Positive",
    4: "Very Positive",
}
def map_dataset(data):
    if data <= 0.2:
        return 0  # very negative
    elif data <= 0.4:
        return 1  # negative
    elif data <= 0.6:
        return 2  # neutral
    elif data <= 0.8:
        return 3  # positive
    else:
        return 4  # very positive
def data_to_tokens(data):
    documents = []
    for entry in data:
        tokens = entry['tokens'].split("|")
        label = map_dataset(entry['label'])
        documents.append((tokens, label))
    return documents

In [12]:
# Prepare training and testing data
train_documents = data_to_tokens(dataset['train'])
test_documents = data_to_tokens(dataset['test'])

In [13]:
# Train Naive Bayes classifier
logprior, loglikelihood, vocab_list = train_naive_bayes(train_documents, [0, 1, 2, 3, 4])

# Test the classifier on the test dataset
correct_predictions = 0
total_predictions = len(test_documents)

for doc_tokens, actual_class in test_documents:
    predicted_class = test_naive_bayes(doc_tokens, logprior, loglikelihood, [0, 1, 2, 3, 4], vocab_list)
    if predicted_class == actual_class:
        correct_predictions += 1

# Calculate accuracy
accuracy = correct_predictions / total_predictions
print("Accuracy:", accuracy)

# Test the classifier on a sample document
random_phrase = "This movie was fantastic! I loved every minute of it."
random_score = np.random.uniform(0, 1)
actual_class = map_dataset(random_score)
predicted_class = test_naive_bayes(random_phrase.split(), logprior, loglikelihood, [0, 1, 2, 3, 4], vocab_list)
print("Random phrase:", random_phrase)
print("Predicted class:", predicted_class)
print("Actual class:", actual_class)

Accuracy: 0.36742081447963804
Random phrase: This movie was fantastic! I loved every minute of it.
Predicted class: 1
Actual class: 4


In [14]:
#comparison with sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Prepare training and testing data
train_texts = [' '.join(tokens) for tokens, _ in train_documents]
test_texts = [' '.join(tokens) for tokens, _ in test_documents]
train_labels = [label for _, label in train_documents]
test_labels = [label for _, label in test_documents]

# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB(alpha=1.0))
])

# Train the classifier
pipeline.fit(train_texts, train_labels)

# Predict on the test set
predicted_labels = pipeline.predict(test_texts)

# Calculate accuracy
accuracy = accuracy_score(test_labels, predicted_labels)
print("Accuracy using scikit-learn:", accuracy)


Accuracy using scikit-learn: 0.4090497737556561
