In [26]:
from datasets import load_dataset

# Fetch the "sst" dataset ("default" configuration) through `load_dataset`.
# Later cells read its 'train' and 'test' splits and, for each entry, the
# 'tokens' and 'label' fields.
dataset = load_dataset("sst", "default")

In [27]:
# Naive Bayes classifier with add-one (Laplace) smoothing
import numpy as np

def train_naive_bayes(D, C):
    """Train a multinomial Naive Bayes classifier with add-one smoothing.

    Args:
        D: Sequence of ``(tokens, label)`` pairs, where ``tokens`` is a
            sequence of words and ``label`` is compared against the values
            in ``C``.
        C: Iterable of class labels.

    Returns:
        A ``(logprior, loglikelihood, V)`` tuple where

        * ``logprior[c]`` is ``log(N_c / N_doc)`` (``-inf`` for a class that
          has no training documents),
        * ``loglikelihood[(w, c)]`` is
          ``log((count(w, c) + 1) / (total_tokens(c) + |V|))`` for every
          word ``w`` in the vocabulary and every class ``c``,
        * ``V`` is the set of all words seen in documents of the classes
          in ``C``.

    Raises:
        ZeroDivisionError: If ``D`` is empty while ``C`` is not.
    """
    from collections import Counter

    Ndoc = len(D)
    logprior = {}
    loglikelihood = {}
    V = set()
    word_counts = {}  # class label -> Counter of token occurrences

    # Pass 1: class priors, per-class token counts and the vocabulary.
    for c in C:
        class_docs = [d[0] for d in D if d[1] == c]
        # A class without documents legitimately has prior log(0) = -inf;
        # silence numpy's divide warning for that case only.
        with np.errstate(divide="ignore"):
            logprior[c] = np.log(len(class_docs) / Ndoc)
        counts = Counter(token for doc in class_docs for token in doc)
        word_counts[c] = counts
        V.update(counts)

    # Pass 2: smoothed likelihoods. The denominator is the total number of
    # tokens in the class plus |V|; it is constant per class, so it is
    # computed once instead of once per (word, class) pair.
    vocab_size = len(V)
    for c, counts in word_counts.items():
        denominator = sum(counts.values()) + vocab_size
        for w in V:
            # Counter yields 0 for words never seen in this class.
            loglikelihood[(w, c)] = np.log((counts[w] + 1) / denominator)

    return logprior, loglikelihood, V

def test_naive_bayes(testdoc, logprior, loglikelihood, C, V):
    sum_scores = {c: logprior[c] for c in C}
    for word in testdoc:
        if word in V:
            for c in C:
                sum_scores[c] += loglikelihood.get((word, c), 0)
    return max(sum_scores, key=sum_scores.get)


In [28]:
# Display names of the five sentiment buckets, keyed by class id 0-4.
classes = dict(enumerate((
    "Very Negative",
    "Negative",
    "Neutral",
    "Positive",
    "Very Positive",
)))
def map_dataset(data):
    """Bucket a sentiment score into a class id from 0 to 4.

    The buckets have inclusive upper bounds 0.2, 0.4, 0.6 and 0.8
    (ids 0-3: very negative, negative, neutral, positive); any score that
    is not ``<=`` one of those bounds gets id 4 (very positive).
    """
    upper_bounds = (0.2, 0.4, 0.6, 0.8)
    for class_id, bound in enumerate(upper_bounds):
        if data <= bound:
            return class_id
    return len(upper_bounds)

def map_to_label(data):
    """Look up the display name stored in ``classes`` for class id ``data``."""
    label_name = classes[data]
    return label_name

def data_to_tokens(data):
    """Turn dataset entries into ``(tokens, class_id)`` pairs.

    For each entry, the ``'tokens'`` string is split on ``"|"`` and the
    ``'label'`` value is bucketed with ``map_dataset``. The pairs are
    returned as a list in the order the entries were iterated.
    """
    return [
        (entry['tokens'].split("|"), map_dataset(entry['label']))
        for entry in data
    ]



In [29]:
import random
import re

# Class ids used for both training and inference, taken from the `classes`
# table so the two call sites cannot drift apart.
class_ids = sorted(classes)

train_documents = data_to_tokens(dataset['train'])
logprior, loglikelihood, vocab_list = train_naive_bayes(train_documents, class_ids)

# Preprocess the test dataset
test_documents = data_to_tokens(dataset['test'])


def classify_document(document):
    """Predict the class id of one document with the trained model.

    Args:
        document: Either a sequence of tokens, or a raw string. A raw string
            is split into word and punctuation tokens first; without that
            step the classifier would iterate over single characters.

    Returns:
        The class id chosen by ``test_naive_bayes``.
    """
    if isinstance(document, str):
        # Simple word/punctuation split; only an approximation of however
        # the dataset's own 'tokens' field was produced.
        document = re.findall(r"\w+|[^\w\s]", document)
    return test_naive_bayes(document, logprior, loglikelihood, class_ids, vocab_list)


# Test the classifier on the test dataset
predicted_classes = [classify_document(doc_tokens) for doc_tokens, _ in test_documents]
actual_classes = [actual_class for _, actual_class in test_documents]

# Compare the predicted classes with the actual classes
correct_predictions = sum(
    pred == actual for pred, actual in zip(predicted_classes, actual_classes)
)
total_predictions = len(predicted_classes)
accuracy = correct_predictions / total_predictions

print("Accuracy:", accuracy)

# Classify an ad-hoc phrase. It has no gold label, so the randomly drawn
# score below is only a chance-level baseline and is reported as such
# rather than as an "actual" class.
random_phrase = "This movie was fantastic! I loved every minute of it."
predicted_class = classify_document(random_phrase)
random_score = random.uniform(0, 1)
random_class = map_dataset(random_score)
print("Random phrase:", random_phrase)
print("Predicted class:", map_to_label(predicted_class))
print("Random baseline class:", map_to_label(random_class))

# Test the classifier on a sample document
sample_document = "This movie was really good. I enjoyed it a lot."
predicted_class = classify_document(sample_document)
print("Predicted class:", map_to_label(predicted_class))

KeyboardInterrupt: 