In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Fetch the SST dataset and keep a handle on its training split
dataset = load_dataset("sst", "default")
train_dataset = dataset["train"]

# The "sentence" column provides the inputs (X); the "label" column is
# converted to a NumPy array so it can be mapped to class names below
X, sentiment_scores = train_dataset["sentence"], np.array(train_dataset["label"])

# Convert sentiment scores to discrete class labels using the mapping function
def mapping(a):
    """Convert a numeric sentiment score into one of five class names.

    The score is binned by its upper bound:
        a <= 0.2 -> "very negative"
        a <= 0.4 -> "negative"
        a <= 0.6 -> "neutral"
        a <= 0.8 -> "positive"
        a <= 1   -> "very positive"

    Args:
        a: Numeric sentiment score.

    Returns:
        The class name (str) of the first bin whose upper bound is >= a.

    Raises:
        ValueError: If `a` is greater than 1 or is NaN. Previously such values
            fell through every branch and silently produced None, which then
            ended up inside the label array.
    """
    if a <= 0.2:
        return "very negative"
    if a <= 0.4:
        return 'negative'
    if a <= 0.6:
        return 'neutral'
    if a <= 0.8:
        return 'positive'
    # Reaching this point already implies a > 0.8 (or NaN), so only the upper
    # bound needs checking.
    if a <= 1:
        return 'very positive'
    raise ValueError(f"sentiment score must be <= 1, got {a!r}")

# Map every sentiment score to its class name
y = np.array(list(map(mapping, sentiment_scores)))

# Hold out 20% of the sentences for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Word counts feeding a multinomial Naive Bayes classifier, fitted in one step
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out portion
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

# Example of classifying new sentences (currently disabled)
# new_sentences = ["I love this movie!", "This movie is terrible."]
# predicted_classes = model.predict(new_sentences)
# print("Predicted classes:", predicted_classes)


Accuracy: 0.3926272674078408
Predicted classes: ['positive' 'very negative']


In [21]:
import numpy as np
from datasets import load_dataset

# Fetch the SST dataset and select its training split
dataset = load_dataset("sst", "default")
train_dataset = dataset["train"]

# Pull out the sentence column and the label column (labels as a NumPy array)
sentence = train_dataset["sentence"]
labels = np.array(train_dataset["label"])

# Sanity check: both columns should report the same number of entries
print("Number of sentences in training dataset:", len(sentence))
print("Number of labels in training dataset:", len(labels))


def maping(a):
    """Convert a numeric sentiment score into one of five class names.

    Bins are closed on the right: a score equal to a threshold belongs to the
    lower class (e.g. 0.4 -> 'negative', 0.8 -> 'positive').

    Args:
        a: Numeric sentiment score.

    Returns:
        The class name (str) of the first bin whose upper bound is >= a.

    Raises:
        ValueError: If `a` is greater than 1 or is NaN. Previously such values
            matched no branch and the function silently returned None.
    """
    # (inclusive upper bound, class name), checked in ascending order
    bins = (
        (0.2, "very negative"),
        (0.4, 'negative'),
        (0.6, 'neutral'),
        (0.8, 'positive'),
        (1, 'very positive'),
    )
    for upper_bound, class_name in bins:
        if a <= upper_bound:
            return class_name
    raise ValueError(f"sentiment score must be <= 1, got {a!r}")

# Element-wise conversion of the training labels into class names
to_class_name = np.vectorize(maping)
classes = to_class_name(labels)


def train_naive_bayes(D, C):
    """Estimate multinomial Naive Bayes parameters with add-one smoothing.

    Args:
        D: Sized collection of documents. Each document is a mapping with a
            'text' entry (a string, tokenised with ``str.split()``) and a
            'class' entry (one of the labels in C).
        C: Collection of class labels. It is iterated more than once, so it
            must be re-iterable (list, set, ...).

    Returns:
        A tuple ``(log_prior, log_likelihood, vocabulary)`` where
        ``log_prior[c]`` is log(number of docs of class c / number of docs),
        ``log_likelihood[word][c]`` is
        log((count(word, c) + 1) / (total tokens in c + vocabulary size)),
        and ``vocabulary`` is the set of all tokens seen in D.

    Raises:
        KeyError: If a document's 'class' is not in C.
        ZeroDivisionError: If D is empty while C is not.
    """
    class_counts = {c: 0 for c in C}
    word_counts_by_class = {c: {} for c in C}
    vocabulary = set()

    # Single pass over the corpus: count documents per class and tokens per class.
    for doc in D:
        doc_class = doc['class']
        class_counts[doc_class] += 1
        counts = word_counts_by_class[doc_class]
        for word in doc['text'].split():
            vocabulary.add(word)
            counts[word] = counts.get(word, 0) + 1

    # A class with no documents yields log(0) = -inf here (NumPy emits a warning).
    log_prior = {c: np.log(class_counts[c] / len(D)) for c in C}

    # The smoothing denominator depends only on the class, so compute it once
    # per class instead of once per (word, class) pair.
    vocabulary_size = len(vocabulary)
    denominators = {
        c: sum(word_counts_by_class[c].values()) + vocabulary_size for c in C
    }

    log_likelihood = {
        word: {
            c: np.log((word_counts_by_class[c].get(word, 0) + 1) / denominators[c])
            for c in C
        }
        for word in vocabulary
    }

    return log_prior, log_likelihood, vocabulary



def TestNaiveBayes(TestDoc, logPrior, log_likelihood, C, vocabulary):
    """Return the most probable class for a document under Naive Bayes.

    Args:
        TestDoc: Document text; tokenised with ``str.split()``.
        logPrior: Mapping from class label to its log prior.
        log_likelihood: Mapping ``word -> {class: log P(word | class)}``.
        C: Iterable of candidate class labels.
        vocabulary: Collection of known words; tokens outside it are ignored.

    Returns:
        The class in C with the highest ``logPrior[c]`` plus the sum of the
        log-likelihoods of the document's known words. On ties, the class
        encountered first in C wins. Returns None if C is empty or no class
        scores above negative infinity.

    Raises:
        KeyError: If a class is missing from logPrior, or a word that is in
            `vocabulary` is missing from `log_likelihood`.
    """
    # Membership tests on a list are linear scans; use a hashed container so
    # each lookup is O(1) even when the caller passes a list.
    if isinstance(vocabulary, (set, frozenset, dict)):
        known_vocabulary = vocabulary
    else:
        known_vocabulary = set(vocabulary)

    # Filtering out-of-vocabulary tokens does not depend on the class, so do
    # it once per document rather than once per class.
    known_words = [word for word in TestDoc.split() if word in known_vocabulary]

    best_c = None
    max_Sum = -np.inf
    for c in C:
        score = logPrior[c]
        for word in known_words:
            word_likelihoods = log_likelihood[word]
            if c in word_likelihoods:
                score += word_likelihoods[c]
        # Strict comparison keeps the first class seen when scores tie.
        if score > max_Sum:
            max_Sum = score
            best_c = c

    return best_c


# Evaluate the hand-written Naive Bayes classifier on the dataset's test split
test_dataset = dataset["test"]
test_sentences = test_dataset["sentence"]
test_labels = test_dataset["label"]
total_correct = 0
total_samples = len(test_sentences)


# Prepare documents
documents = [{'text': s, 'class': c} for s, c in zip(sentence, classes)]

# Build the label set once; rebuilding it from `classes` for every test
# sentence repeats the same pass over all training labels.
class_labels = set(classes)

# `vocabulary` is kept as the set returned by training so that membership
# tests during prediction are hash lookups rather than list scans.
log_prior, log_likelihood, vocabulary = train_naive_bayes(documents, class_labels)

y_predicted = []
for test_doc, true_class in zip(test_sentences, test_labels):
    predicted_class = TestNaiveBayes(test_doc, log_prior, log_likelihood, class_labels, vocabulary)
    y_predicted.append(predicted_class)
    # `true_class` is the raw label value; convert it once per sentence
    true_class_name = maping(true_class)
    if predicted_class == true_class_name:
        total_correct += 1
        print(f"Predicted: {predicted_class}, True: {true_class_name}, Document: {test_doc}")

# Calculate accuracy
accuracy = total_correct / total_samples
print("Accuracy:", accuracy)


Number of sentences in training dataset: 8544
Number of labels in training dataset: 8544
Predicted: neutral, True: neutral, Document: Effective but too-tepid biopic
Predicted: positive, True: positive, Document: If you sometimes like to go to the movies to have fun , Wasabi is a good place to start .
Predicted: very positive, True: very positive, Document: Offers that rare combination of entertainment and education .
Predicted: positive, True: positive, Document: Take Care of My Cat offers a refreshingly different slice of Asian cinema .
Predicted: positive, True: positive, Document: What really surprises about Wisegirls is its low-key quality and genuine tenderness .
Predicted: positive, True: positive, Document: ( Wendigo is ) why we go to the cinema : to be fed through the eye , the heart , the mind .
Predicted: very positive, True: very positive, Document: One of the greatest family-oriented , fantasy-adventure movies ever .
Predicted: very positive, True: very positive, Document: 

In [36]:
def confusion_matrix_impl(true_labels, predicted_labels, labels=None):
    """Count (true, predicted) label pairs in a square confusion matrix.

    Args:
        true_labels: Sequence of ground-truth labels.
        predicted_labels: Sequence of predicted labels, same length as
            `true_labels`.
        labels: Optional ordering of the labels along both axes (list or
            array). Defaults to the sorted unique labels found in the inputs.

    Returns:
        Integer array of shape (len(labels), len(labels)); entry [i, j] is the
        number of samples whose true label is labels[i] and whose predicted
        label is labels[j].

    Raises:
        ValueError: If the two label sequences differ in length.
        IndexError: If a label in the inputs is not present in `labels`.
    """
    if len(true_labels) != len(predicted_labels):
        # Without this check extra predictions were silently ignored.
        raise ValueError(
            "true_labels and predicted_labels must have the same length "
            f"({len(true_labels)} != {len(predicted_labels)})"
        )

    if labels is None:
        labels = np.unique(np.concatenate((true_labels, predicted_labels)))
    else:
        # Accept plain lists as well as arrays.
        labels = np.asarray(labels)

    # Map each label to the first position at which it appears, so every
    # sample needs a single hash lookup instead of an array scan.
    position_of = {}
    for position, label in enumerate(labels.tolist()):
        position_of.setdefault(label, position)

    num_labels = len(labels)
    cm = np.zeros((num_labels, num_labels), dtype=int)
    for true_label, predicted_label in zip(true_labels, predicted_labels):
        try:
            true_idx = position_of[true_label]
            pred_idx = position_of[predicted_label]
        except KeyError as missing:
            raise IndexError(f"label {missing.args[0]!r} is not in labels") from None
        cm[true_idx, pred_idx] += 1
    return cm
# Convert the raw test labels to class names, then tabulate true vs. predicted
label_to_class = np.vectorize(maping)
y_test = label_to_class(test_labels)
confusionmatrix = confusion_matrix_impl(y_test, y_predicted)

confusionmatrix

array([[409,  50, 147,  20,   7],
       [167,  32, 176,   8,   6],
       [ 96,  24, 356,   1,  33],
       [201,   7,  55,  16,   0],
       [ 38,  14, 281,   2,  64]])

In [37]:
import numpy as np

def precision_score_per_class(confusion_matrix):
    """Compute precision for each class from a confusion matrix.

    Args:
        confusion_matrix: Square array whose rows are true classes and whose
            columns are predicted classes.

    Returns:
        1-D float array; entry k is TP_k / (TP_k + FP_k), i.e. the diagonal
        entry divided by its column sum. A class that was never predicted
        (column sum 0) gets precision 0.0 instead of NaN from 0/0.
    """
    cm = np.asarray(confusion_matrix)
    true_positives = np.diag(cm)
    # Column sum = true positives + false positives for that predicted class.
    predicted_totals = np.sum(cm, axis=0)
    precision = np.zeros(len(true_positives), dtype=float)
    # Divide only where the denominator is non-zero; other entries stay 0.0.
    np.divide(true_positives, predicted_totals, out=precision, where=predicted_totals > 0)
    return precision

def recall_score_per_class(confusion_matrix):
    """Compute recall for each class from a confusion matrix.

    Args:
        confusion_matrix: Square array whose rows are true classes and whose
            columns are predicted classes.

    Returns:
        1-D float array; entry k is TP_k / (TP_k + FN_k), i.e. the diagonal
        entry divided by its row sum. A class with no true samples (row sum 0)
        gets recall 0.0 instead of NaN from 0/0.
    """
    cm = np.asarray(confusion_matrix)
    true_positives = np.diag(cm)
    # Row sum = true positives + false negatives for that true class.
    actual_totals = np.sum(cm, axis=1)
    recall = np.zeros(len(true_positives), dtype=float)
    # Divide only where the denominator is non-zero; other entries stay 0.0.
    np.divide(true_positives, actual_totals, out=recall, where=actual_totals > 0)
    return recall

def f1_score_per_class(confusion_matrix):
    """Compute the F1 score for each class from a confusion matrix.

    Args:
        confusion_matrix: Square array whose rows are true classes and whose
            columns are predicted classes.

    Returns:
        1-D float array of 2 * P * R / (P + R) per class, where P and R are the
        per-class precision and recall. Where P + R is not positive (both are
        zero, or either is undefined) the score is 0.0 instead of NaN.
    """
    precision = precision_score_per_class(confusion_matrix)
    recall = recall_score_per_class(confusion_matrix)
    numerator = 2 * (precision * recall)
    denominator = precision + recall
    f1_score = np.zeros(np.shape(denominator), dtype=float)
    # Divide only where the harmonic mean is defined; other entries stay 0.0.
    np.divide(numerator, denominator, out=f1_score, where=denominator > 0)
    return f1_score

def macro_averaged_precision(confusion_matrix):
    """Return the unweighted mean of the per-class precision values."""
    return np.mean(precision_score_per_class(confusion_matrix))

def macro_averaged_recall(confusion_matrix):
    """Return the unweighted mean of the per-class recall values."""
    return np.mean(recall_score_per_class(confusion_matrix))

def macro_averaged_f1_score(confusion_matrix):
    """Return the unweighted mean of the per-class F1 scores."""
    return np.mean(f1_score_per_class(confusion_matrix))


# Report the per-class metrics first, then their macro averages
metric_reports = (
    ("Precision per class:", precision_score_per_class),
    ("Recall per class:", recall_score_per_class),
    ("F1 Score per class:", f1_score_per_class),
    ("Macro-averaged Precision:", macro_averaged_precision),
    ("Macro-averaged Recall:", macro_averaged_recall),
    ("Macro-averaged F1 Score:", macro_averaged_f1_score),
)
for heading, compute_metric in metric_reports:
    print(heading, compute_metric(confusion_matrix=confusionmatrix))

Precision per class: [0.44895719 0.2519685  0.35073892 0.34042553 0.58181818]
Recall per class: [0.64612954 0.08226221 0.69803922 0.05734767 0.160401  ]
F1 Score per class: [0.52979275 0.12403101 0.46688525 0.09815951 0.25147348]
Macro-averaged Precision: 0.3947816647654897
Macro-averaged Recall: 0.32883592822089813
Macro-averaged F1 Score: 0.2940683972753402


In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
# scikit-learn's confusion matrix for the same labels, for comparison
confusion_matrix(y_test, y_predicted)

array([[409,  50, 147,  20,   7],
       [167,  32, 176,   8,   6],
       [ 96,  24, 356,   1,  33],
       [201,   7,  55,  16,   0],
       [ 38,  14, 281,   2,  64]])

In [39]:
# scikit-learn metric functions, in the order the results are reported
sklearn_metrics = (precision_score, recall_score, f1_score)

# Per-class scores (average=None returns one value per class)
precision_per_class_skitl, recall_per_class_skitl, f1_score_per_class_skitl = (
    metric(y_test, y_predicted, average=None) for metric in sklearn_metrics
)

# Macro averages of the same metrics
macro_avg_precision_skitl, macro_avg_recall_skitl, macro_avg_f1_score_skitl = (
    metric(y_test, y_predicted, average='macro') for metric in sklearn_metrics
)

sklearn_report = (
    ("Precision per class:", precision_per_class_skitl),
    ("Recall per class:", recall_per_class_skitl),
    ("F1 Score per class:", f1_score_per_class_skitl),
    ("Macro-averaged Precision:", macro_avg_precision_skitl),
    ("Macro-averaged Recall:", macro_avg_recall_skitl),
    ("Macro-averaged F1 Score:", macro_avg_f1_score_skitl),
)
for heading, value in sklearn_report:
    print(heading, value)

Precision per class: [0.44895719 0.2519685  0.35073892 0.34042553 0.58181818]
Recall per class: [0.64612954 0.08226221 0.69803922 0.05734767 0.160401  ]
F1 Score per class: [0.52979275 0.12403101 0.46688525 0.09815951 0.25147348]
Macro-averaged Precision: 0.3947816647654897
Macro-averaged Recall: 0.32883592822089813
Macro-averaged F1 Score: 0.2940683972753402
