In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Load the SST dataset
dataset = load_dataset("sst", "default")

# Access the training split of the dataset
train_dataset = dataset["train"]

# Extract features (X) and labels (y) from the training split
X = train_dataset["sentence"]
sentiment_scores = np.array(train_dataset["label"])

# Convert sentiment scores to discrete class labels using the mapping function
def mapping(a):
    if a <= 0.2:
        return "very negative"
    elif a <= 0.4:
        return 'negative'
    elif a <= 0.6:
        return 'neutral'
    elif a <= 0.8:
        return 'positive'
    elif a<=1 and a>0.8:
        return 'very positive'

y = np.array([mapping(score) for score in sentiment_scores])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with CountVectorizer and Multinomial Naive Bayes
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

# Predict the class for new instances
new_sentences = ["I love this movie!", "This movie is terrible."]
predicted_classes = model.predict(new_sentences)
print("Predicted classes:", predicted_classes)


Accuracy: 0.3926272674078408
Predicted classes: ['positive' 'very negative']


In [2]:
import numpy as np
from datasets import load_dataset

# Load the SST dataset
dataset = load_dataset("sst", "default")

# Access the training split of the dataset
train_dataset = dataset["train"]

# Extract features (xs) and labels (ys) from the training split
sentence = train_dataset["sentence"]
labels = train_dataset["label"]

# Convert labels to numpy array if needed
labels = np.array(labels)

# Print the shapes to verify
print("Number of sentences in training dataset:", len(sentence))
print("Number of labels in training dataset:", len(labels))


def maping(a):
    if a <= 0.2:
        return "very negative"
    elif a <= 0.4:
        return 'negative'
    elif a <= 0.6:
        return 'neutral'
    elif a <= 0.8:
        return 'positive'
    elif a <= 1 and a > 0.8:
        return 'very positive'

# Apply mapping function to labels
classes = np.vectorize(maping)(labels)


def train_naive_bayes(D, C):
    # Calculate the number of documents in each class.
    class_counts = {c: 0 for c in C}
    for doc in D:
        class_counts[doc['class']] += 1

    # Calculate the log-prior probabilities of each class.
    log_prior = {c: np.log(class_counts[c] / len(D)) for c in C}

    # Create a dictionary to store the word counts for each class.
    word_counts_by_class = {c: {} for c in C}

    # Create a vocabulary of all unique words in the training data.
    vocabulary = set()
    for doc in D:
        for word in doc['text'].split():
            vocabulary.add(word)
            word_counts_by_class[doc['class']][word] = word_counts_by_class[doc['class']].get(word, 0) + 1  # .get(word,0)

    # Calculate the log-likelihood probabilities of each word given each class.
    log_likelihood = {}
    for word in vocabulary:
        log_likelihood[word] = {c: np.log((word_counts_by_class[c].get(word, 0) + 1) /
                                          (sum(word_counts_by_class[c].values()) + len(vocabulary)))
                                 for c in C}

    return log_prior, log_likelihood, vocabulary


# Prepare documents
documents = [{'text': s, 'class': c} for s, c in zip(sentence, classes)]


# Train the Naive Bayes classifier
log_prior, log_likelihood, vocabulary = train_naive_bayes(documents, set(classes))

# Convert vocabulary set to a list
vocabulary = list(vocabulary)


def TestNaiveBayes(TestDoc, logPrior, log_likelihood, vocabulary):
    Sum = np.zeros(len(logPrior))  # Get the number of classes from logPrior
    best_c = None
    max_Sum = -np.inf

    for c, class_label in enumerate(logPrior):
        Sum[c] = logPrior[class_label]
        for word in TestDoc:
            if word in vocabulary and c in log_likelihood.get(word, {}):  # Check if word exists for class c
                Sum[c] += log_likelihood[word][c]
        if Sum[c] > max_Sum:
            max_Sum = Sum[c]
            best_c = c
    return best_c


# Access the testing split of the dataset
test_dataset = dataset["test"]

# Extract features (xs) and labels (ys) from the testing split
test_sentences = test_dataset["sentence"]
test_labels = test_dataset["label"]

# Initialize variables for accuracy calculation
total_correct = 0
total_samples = len(test_sentences)

# Iterate over the test dataset and make predictions
for test_doc, true_class in zip(test_sentences, test_labels):
    predicted_class = TestNaiveBayes(test_doc.split(), log_prior, log_likelihood, vocabulary)
    if predicted_class == true_class:
        total_correct += 1
        print(f"Predicted: {predicted_class}, True: {true_class}, Document: {test_doc}")
    # else:
    #     print(f"Predicted: {predicted_class}, True: {true_class}, Document: {test_doc}")

# Calculate accuracy
accuracy = total_correct / total_samples
print("Accuracy:", accuracy)


Number of sentences in training dataset: 8544
Number of labels in training dataset: 8544
Predicted: 1, True: 1.0, Document: It 's the best film of the year so far , the benchmark against which all other Best Picture contenders should be measured .
Predicted: 1, True: 1.0, Document: Aside from being the funniest movie of the year , Simone , Andrew Niccol 's brilliant anti-Hollywood satire , has a wickedly eccentric enchantment to it .
Predicted: 1, True: 1.0, Document: A stunning piece of visual poetry that will , hopefully , be remembered as one of the most important stories to be told in Australia 's film history .
Predicted: 1, True: 1.0, Document: Invincible is a wonderful movie .
Accuracy: 0.0018099547511312218
