In [32]:
import pandas as pd
from utils import *
import numpy as np

train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')


## Preprocessing

In [43]:
def generate_vocab(train_df):
    tokens = []
    for _, row in train_df.iterrows():
        row_tokens = row['sentence'].split()
        tokens.extend(row_tokens)
    return np.unique(tokens) 

def get_number_of_words_in_class(df):
    class_count = {}
    for _, row in df.iterrows():
        row_tokens = row['sentence'].split()
        if row['label'] in class_count:
            class_count[row['label']] += len(row_tokens)
        else:
            class_count[row['label']] = len(row_tokens)
    return class_count   

# generate vocabulary
vocab = generate_vocab(train_df)
print(f'Vocabulary size: {len(vocab)}')
train_df.head()

Vocabulary size: 18278


Unnamed: 0,sentence,label,tokens,score
0,The Rock is destined to be the 21st Century 's...,3,The|Rock|is|destined|to|be|the|21st|Century|'s...,0.69444
1,The gorgeously elaborate continuation of `` Th...,4,The|gorgeously|elaborate|continuation|of|``|Th...,0.83333
2,Singer\/composer Bryan Adams contributes a sle...,3,Singer\/composer|Bryan|Adams|contributes|a|sle...,0.625
3,You 'd think by now America would have had eno...,2,You|'d|think|by|now|America|would|have|had|eno...,0.5
4,Yet the act is still charming here .,3,Yet|the|act|is|still|charming|here|.,0.72222


## Training

In [44]:
n_doc = len(train_df)
log_prior = np.log(train_df['label'].value_counts().sort_index() / n_doc).tolist()
log_likelihood = np.zeros((len(vocab), 5))

class_count = get_number_of_words_in_class(train_df)

# Precompute counts for each word in each class
for c in range(5):  # 5 classes
    # Filter once by class
    class_df = train_df[train_df['label'] == c]
    
    # Count occurrences of each word in vocab for this class
    word_counts = {w: class_df['sentence'].apply(lambda x: x.count(w)).sum() for w in vocab}
    
    # Calculate log likelihood for each word in vocab
    for index, w in enumerate(vocab):
        n_wc = word_counts[w]
        log_likelihood[index][c] = np.log((n_wc + 1) / (class_count[c] + len(vocab)))


## Test Naive Bayes

In [45]:
import numpy as np

def test_naive_bayes(testdoc, logprior, loglikelihood, C, V):
    # Create a dictionary to map words to their indices for quick lookup
    word_to_index = {word: i for i, word in enumerate(V)}
    sum_scores = np.zeros(len(C))  # Initialize scores for each class
    
    for c in range(len(C)):
        # Initialize score with the log prior for the class
        sum_scores[c] = logprior[c]
        
        for word in testdoc.split():  # Split the test document into words
            if word in word_to_index:  # Check if the word is in V
                # Update score by adding loglikelihood of the word given the class
                index = np.where(vocab == word)[0]
                sum_scores[c] += loglikelihood[index[0]][c]
                
    # Return the class with the maximum score
    return np.argmax(sum_scores)


## Training Accuracy

In [36]:
cnt=0
for _, row in train_df.iterrows():
    predicted_class=test_naive_bayes(row['sentence'], log_prior, log_likelihood, [0, 1, 2, 3, 4], vocab)
    if predicted_class == row['label']:
        cnt+=1

# compute accuracy
accuracy = cnt/len(train_df)

print(f'Train Accuracy: {accuracy}')
    


Train Accuracy: 0.7468398876404494


## Test Accuracy

In [46]:
cnt=0
for _, row in test_df.iterrows():
    predicted_class=test_naive_bayes(row['sentence'], log_prior, log_likelihood, [0, 1, 2, 3, 4], vocab)
    if predicted_class == row['label']:
        cnt+=1

# compute accuracy
accuracy = cnt/len(test_df)

print(f'Test Accuracy: {accuracy}')
print(f'Number of wrong predicted sentences:{len(test_df)-cnt}')

Test Accuracy: 0.38552036199095024
Number of wrong predicted sentences:1358


In [41]:
print(len(vocab))

18280


## Metrics

In [None]:

def confusion_matrix(true_labels, pred_labels, num_classes):
    matrix = np.zeros((num_classes, num_classes), dtype=int)
    for t, p in zip(true_labels, pred_labels):
        matrix[t][p] += 1
    return matrix


def calculate_metrics(conf_matrix):
    num_classes = conf_matrix.shape[0]
    precision = []
    recall = []
    f1_score = []

    for c in range(num_classes):
        tp = conf_matrix[c, c]
        fp = conf_matrix[:, c].sum() - tp
        fn = conf_matrix[c, :].sum() - tp

        precision_c = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall_c = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_c = 2 * precision_c * recall_c / (precision_c + recall_c) if (precision_c + recall_c) > 0 else 0

        precision.append(precision_c)
        recall.append(recall_c)
        f1_score.append(f1_c)

    macro_precision = np.mean(precision)
    macro_recall = np.mean(recall)
    macro_f1 = np.mean(f1_score)

    metrics_df = pd.DataFrame({
        "Class": ["Class 0", "Class 1", "Class 2", "Class 3", "Class 4"],
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1_score
    })

    metrics_df.loc[len(metrics_df)] = ["Macro Average", macro_precision, macro_recall, macro_f1]

    return metrics_df


In [None]:
def print_confusion_matrix_table(conf_matrix, class_labels):
    df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)
    print("Confusion Matrix:")
    print(df)

In [None]:
true_labels = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
pred_labels = [0, 1, 1, 3, 4, 0, 0, 2, 3, 4, 2, 1, 2, 3, 3]
# Define the class labels
class_labels = ['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4']
num_classes = 5

conf_matrix = confusion_matrix(true_labels, pred_labels, num_classes)
# Print the confusion matrix in table format
print_confusion_matrix_table(conf_matrix, class_labels)

In [None]:
metrics = calculate_metrics(conf_matrix)
print(metrics.to_string(index=False))

## Comparison With Sklearn

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

pipeline.set_params(
    vectorizer__max_features=len(vocab),  
    vectorizer__ngram_range=(1, 1), 
    vectorizer__binary=False , 
)

pipeline.set_params(classifier__alpha=1)  # Laplace smoothing

pipeline.fit(train_df['sentence'], train_df['label'])
predictions = pipeline.predict(test_df['sentence'])

# compute accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test_df['label'], predictions)
print("Accuracy:", accuracy)





Accuracy: 0.4090497737556561
