In [5]:
import pandas as pd
from utils import *
import numpy as np

# Load the train and test splits; the paths are relative to the notebook's
# working directory. Later cells read the 'sentence' and 'label' columns.
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')


## Preprocessing

In [7]:
# Tokenizer: lower-case, then split on whitespace.
def tokenize(text):
    """Return the lower-cased, whitespace-separated tokens of ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens; empty when ``text`` is empty or only whitespace.
        Punctuation is not stripped: a token is any run of non-whitespace
        characters.
    """
    lowered = text.lower()
    return lowered.split()

# Vocabulary builder: every distinct token seen in the 'sentence' column.
def generate_vocab(train_df):
    """Collect the distinct tokens of all sentences into a sorted array.

    Args:
        train_df: DataFrame with a 'sentence' column of strings.

    Returns:
        A 1-D numpy array holding each distinct token (as produced by
        ``tokenize``) exactly once, in sorted order so that a token's
        position is stable between runs.
    """
    unique_tokens = set()
    for sentence in train_df['sentence']:
        unique_tokens |= set(tokenize(sentence))
    ordered = sorted(unique_tokens)
    return np.array(ordered)

# Count-matrix builder: one row per document, one column per vocabulary word.
def create_document_term_matrix(train_df, vocab):
    """Count how often each vocabulary word occurs in each sentence.

    Args:
        train_df: DataFrame with a 'sentence' column of strings.
        vocab: Sequence of vocabulary words; a word's position is its column.

    Returns:
        A dense integer numpy array of shape (len(train_df), len(vocab)) where
        entry [d, w] is the number of times word w occurs in document d.
        Tokens that are not in ``vocab`` are ignored.
    """
    n_docs, n_terms = len(train_df), len(vocab)
    counts = np.zeros((n_docs, n_terms), dtype=int)

    # Word -> column lookup so each token is resolved in constant time.
    column_of = dict(zip(vocab, range(n_terms)))

    for doc_row, sentence in enumerate(train_df['sentence']):
        for token in tokenize(sentence):
            column = column_of.get(token)
            if column is not None:
                counts[doc_row, column] += 1

    return counts

# Per-class token totals.
def get_number_of_words_in_class(df):
    """Sum the number of tokens of all sentences belonging to each label.

    Args:
        df: DataFrame with 'sentence' (string) and 'label' columns.

    Returns:
        A dict mapping each label found in ``df`` to the total number of
        tokens (as produced by ``tokenize``) over the rows with that label.
    """
    totals = {}
    for _, record in df.iterrows():
        n_tokens = len(tokenize(record['sentence']))
        label = record['label']
        totals[label] = totals.get(label, 0) + n_tokens
    return totals

# Build the vocabulary from the training sentences.
# train_df must provide a 'sentence' column (read by generate_vocab).
vocab = generate_vocab(train_df)
print(f'Vocabulary size: {len(vocab)}')

# Build the (documents x vocabulary words) count matrix for the training set.
doc_term_matrix = create_document_term_matrix(train_df, vocab)
print(f'Document-Term Matrix shape: {doc_term_matrix.shape}')

Vocabulary size: 16579
Document-Term Matrix shape: (8544, 16579)


Vocabulary size: 18278


Unnamed: 0,sentence,label,tokens,score
0,The Rock is destined to be the 21st Century 's...,3,The|Rock|is|destined|to|be|the|21st|Century|'s...,0.69444
1,The gorgeously elaborate continuation of `` Th...,4,The|gorgeously|elaborate|continuation|of|``|Th...,0.83333
2,Singer\/composer Bryan Adams contributes a sle...,3,Singer\/composer|Bryan|Adams|contributes|a|sle...,0.625
3,You 'd think by now America would have had eno...,2,You|'d|think|by|now|America|would|have|had|eno...,0.5
4,Yet the act is still charming here .,3,Yet|the|act|is|still|charming|here|.,0.72222


## Training

In [8]:
# Total number of training tokens per class; used in the denominator of the
# smoothed likelihood below.
class_count = get_number_of_words_in_class(train_df)

# Log prior of each class: log(documents in the class / all documents),
# ordered by ascending label so that position c corresponds to label c.
n_doc = len(train_df)
log_prior = np.log(train_df['label'].value_counts().sort_index() / n_doc).tolist()

# One row per vocabulary word, one column per class. The number of classes is
# taken from the priors instead of being hard-coded; this relies on the labels
# being the consecutive integers 0..n-1, as the indexing below already does.
log_likelihood = np.zeros((len(vocab), len(log_prior)))

for c in range(len(log_prior)):
    # Occurrences of every vocabulary word over the documents of class c.
    # The pandas mask is converted explicitly before indexing the numpy array.
    n_wc = doc_term_matrix[(train_df['label'] == c).to_numpy()].sum(axis=0)
    # Add-one (Laplace) smoothing, computed for the whole vocabulary at once:
    # log((count(w, c) + 1) / (tokens in c + |V|)).
    log_likelihood[:, c] = np.log((n_wc + 1) / (class_count[c] + len(vocab)))

## Test Naive Bayes

In [10]:
import numpy as np

def test_naive_bayes(testdoc, logprior, loglikelihood, C, V):
    # Create a dictionary to map words to their indices for quick lookup
    word_to_index = {word: i for i, word in enumerate(V)}
    sum_scores = np.zeros(len(C))  # Initialize scores for each class
    
    for c in range(len(C)):
        # Initialize score with the log prior for the class
        sum_scores[c] = logprior[c]
        
        for word in testdoc.split():  # Split the test document into words
            if word in word_to_index:  # Check if the word is in V
                # Update score by adding loglikelihood of the word given the class
                index = np.where(vocab == word)[0]
                sum_scores[c] += loglikelihood[index[0]][c]
                
    # Return the class with the maximum score
    return np.argmax(sum_scores)


## Training Accuracy

In [11]:
# Training accuracy: share of training sentences whose predicted class
# equals the label stored in the row.
cnt = 0
for _, row in train_df.iterrows():
    predicted_class = test_naive_bayes(
        testdoc=row['sentence'],
        logprior=log_prior,
        loglikelihood=log_likelihood,
        C=[0, 1, 2, 3, 4],
        V=vocab,
    )
    # A correct prediction contributes 1, a wrong one 0.
    cnt += int(predicted_class == row['label'])

accuracy = cnt / len(train_df)

print(f'Train Accuracy: {accuracy}')
    


Train Accuracy: 0.7400514981273408


## Test Accuracy

In [12]:

# Test accuracy. The true and predicted labels are kept in parallel lists
# because the metrics cells further down build confusion matrices from them.
true_labels, predicted_labels = [], []

cnt = 0
for _, row in test_df.iterrows():
    predicted_class = test_naive_bayes(
        testdoc=row['sentence'],
        logprior=log_prior,
        loglikelihood=log_likelihood,
        C=[0, 1, 2, 3, 4],
        V=vocab,
    )
    true_labels.append(row['label'])
    predicted_labels.append(predicted_class)
    # A correct prediction contributes 1, a wrong one 0.
    cnt += int(predicted_class == row['label'])

accuracy = cnt / len(test_df)

print(f'Test Accuracy: {accuracy}')
print(f'Number of wrong predicted sentences:{len(test_df)-cnt}')

Test Accuracy: 0.40090497737556563
Number of wrong predicted sentences:1324


In [55]:
# Sanity check: number of entries in the vocabulary array.
print(len(vocab))

18278


## Metrics

In [67]:

def compute_confusion_matrix(true_labels, pred_labels, num_classes):
    """Tally (true label, predicted label) pairs into a square count matrix.

    Args:
        true_labels: Iterable of integer true class indices.
        pred_labels: Iterable of integer predicted class indices, paired
            position-by-position with ``true_labels``.
        num_classes: Number of classes; the result is num_classes x num_classes.

    Returns:
        An integer numpy array where entry [t, p] is the number of samples
        with true class t that were predicted as class p.
    """
    counts = np.zeros((num_classes, num_classes), dtype=int)
    for actual, predicted in zip(true_labels, pred_labels):
        counts[actual, predicted] += 1
    return counts


def calculate_metrics(conf_matrix):
    """Compute per-class and macro-averaged precision, recall and F1.

    Args:
        conf_matrix: Square array-like of counts where entry [t, p] is the
            number of samples of true class t predicted as class p.

    Returns:
        A DataFrame with columns "Class", "Precision", "Recall" and
        "F1-Score": one row per class (named "Class 0", "Class 1", ...)
        followed by a final "Macro Average" row holding the unweighted mean
        of each metric over the classes.
    """
    conf_matrix = np.asarray(conf_matrix)
    num_classes = conf_matrix.shape[0]
    precision = []
    recall = []
    f1_score = []

    for c in range(num_classes):
        tp = conf_matrix[c, c]
        fp = conf_matrix[:, c].sum() - tp  # predicted as c but truly another class
        fn = conf_matrix[c, :].sum() - tp  # truly c but predicted as another class

        # A metric with an empty denominator is reported as 0 rather than NaN.
        precision_c = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall_c = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_c = 2 * precision_c * recall_c / (precision_c + recall_c) if (precision_c + recall_c) > 0 else 0

        precision.append(precision_c)
        recall.append(recall_c)
        f1_score.append(f1_c)

    macro_precision = np.mean(precision)
    macro_recall = np.mean(recall)
    macro_f1 = np.mean(f1_score)

    metrics_df = pd.DataFrame({
        # Row names follow the matrix size; a fixed five-name list would make
        # the constructor fail for any matrix that is not 5x5.
        "Class": [f"Class {c}" for c in range(num_classes)],
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1_score
    })

    metrics_df.loc[len(metrics_df)] = ["Macro Average", macro_precision, macro_recall, macro_f1]

    return metrics_df


In [58]:
def print_confusion_matrix_table(conf_matrix, class_labels):
    """Print a confusion matrix as a table labelled on both axes.

    Args:
        conf_matrix: 2-D array-like of counts.
        class_labels: Labels used for both the row index and the column
            headers; must match the matrix dimensions.

    Returns:
        None. A "Confusion Matrix:" heading and the table go to stdout.
    """
    table = pd.DataFrame(data=conf_matrix, index=class_labels, columns=class_labels)
    print("Confusion Matrix:", table, sep="\n")

In [92]:

# Define the class labels
# Number of classes and the display name of each, in class-index order.
num_classes = 5
class_labels = [
    'Class 0',
    'Class 1',
    'Class 2',
    'Class 3',
    'Class 4',
]

# Confusion matrix of the hand-written classifier on the test set, built from
# the label lists collected in the test-accuracy cell, then shown as a table.
conf_matrix = compute_confusion_matrix(
    true_labels=true_labels,
    pred_labels=predicted_labels,
    num_classes=num_classes,
)
print_confusion_matrix_table(conf_matrix=conf_matrix, class_labels=class_labels)

Confusion Matrix:
         Class 0  Class 1  Class 2  Class 3  Class 4
Class 0       11      205        5       58        0
Class 1        9      395       38      181       10
Class 2        2      167       21      193        6
Class 3        1       98       11      366       34
Class 4        1       39        7      293       59


In [93]:
# Per-class and macro-averaged precision / recall / F1 computed from
# conf_matrix, printed without the DataFrame's integer index.
metrics = calculate_metrics(conf_matrix)
print(metrics.to_string(index=False))

        Class  Precision   Recall  F1-Score
      Class 0   0.458333 0.039427  0.072607
      Class 1   0.436947 0.624013  0.513988
      Class 2   0.256098 0.053985  0.089172
      Class 3   0.335472 0.717647  0.457214
      Class 4   0.541284 0.147870  0.232283
Macro Average   0.405627 0.316588  0.273053


## Comparison With Sklearn

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Reference model: unigram counts fed into a multinomial Naive Bayes
# classifier with add-one (alpha=1) smoothing. The number of features is
# capped at the size of the hand-built vocabulary.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(
        max_features=len(vocab),
        ngram_range=(1, 1),
        binary=False,
    )),
    ('classifier', MultinomialNB(alpha=1)),
])

# Fit on the training sentences, then label the test sentences.
pipeline.fit(train_df['sentence'], train_df['label'])
predictions = pipeline.predict(test_df['sentence'])

# Fraction of test sentences whose predicted label matches the true label.
accuracy = accuracy_score(test_df['label'], predictions)
print("Accuracy:", accuracy)





Accuracy: 0.4090497737556561


In [231]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the scikit-learn pipeline's predictions against the
# true_labels list collected in the test-accuracy cell, printed with the same
# table helper as the hand-written classifier's matrix.
conf_matrix_sk = confusion_matrix(true_labels, predictions)
print_confusion_matrix_table(conf_matrix_sk, class_labels)

Confusion Matrix:
         Class 0  Class 1  Class 2  Class 3  Class 4
Class 0       10      215        7       47        0
Class 1       15      429       35      147        7
Class 2        4      167       17      193        8
Class 3        1       92       17      371       29
Class 4        1       41        9      295       53


In [232]:
# Metrics for the scikit-learn pipeline, computed from conf_matrix_sk.
# NOTE: this rebinds `metrics`, replacing any value assigned by an earlier cell.
metrics = calculate_metrics(conf_matrix_sk)
print(metrics.to_string(index=False))

        Class  Precision   Recall  F1-Score
      Class 0   0.322581 0.035842  0.064516
      Class 1   0.454449 0.677725  0.544071
      Class 2   0.200000 0.043702  0.071730
      Class 3   0.352327 0.727451  0.474728
      Class 4   0.546392 0.132832  0.213710
Macro Average   0.375150 0.323510  0.273751


In [165]:
# Recompute and print the metrics from conf_matrix (the hand-written
# classifier's confusion matrix), presumably to read them next to the
# scikit-learn figures. NOTE: this rebinds `metrics`.
metrics = calculate_metrics(conf_matrix)
print(metrics.to_string(index=False))

        Class  Precision   Recall  F1-Score
      Class 0   0.458333 0.039427  0.072607
      Class 1   0.436947 0.624013  0.513988
      Class 2   0.256098 0.053985  0.089172
      Class 3   0.335472 0.717647  0.457214
      Class 4   0.541284 0.147870  0.232283
Macro Average   0.405627 0.316588  0.273053
