In [6]:

from google.colab import files
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Ask the Colab user for the dataset; the first uploaded file name is used.
uploaded = files.upload()
filename = list(uploaded)[0]

# Load the CSV and separate the raw message text from the binary target.
data = pd.read_csv(filename)
X_text = data['Message'].values
# Encode the label column: 'spam' becomes 1, every other category becomes 0.
y = np.where(data['Category'].eq('spam'), 1, 0)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text,
    y,
    random_state=42,
    test_size=0.2,
)

# Multinomial Naive Bayes implemented from scratch
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier with additive (Laplace) smoothing.

    A sample ``x`` is scored for every class ``c`` as
    ``log P(c) + sum_j x[j] * log P(feature_j | c)`` and assigned to the
    class with the highest score.

    Attributes set by ``fit``:
        classes: array of the distinct labels found in ``y`` (as returned
            by ``np.unique``).
        class_priors: dict mapping label -> fraction of training rows
            carrying that label.
        word_counts: dict mapping label -> per-feature totals over that
            class's rows, with ``alpha`` already added to every entry.
        class_word_totals: dict mapping label -> sum of that class's
            smoothed ``word_counts``.
        vocab_size: number of feature columns in the training matrix.
    """

    def __init__(self, alpha=1.0):
        """Store the smoothing strength.

        Args:
            alpha: constant added to every per-class feature total. With
                ``alpha=0`` a feature never observed in a class gets
                probability zero, whose logarithm is ``-inf``.
        """
        self.alpha = alpha  # Laplace smoothing factor

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class feature totals.

        Args:
            X: dense 2-D array-like, one row per sample and one column per
                feature (e.g. term counts or TF-IDF weights).
            y: 1-D array-like of labels, one per row of ``X``.
        """
        X = np.asarray(X)
        # A plain Python list compared with ``==`` yields a single bool, not
        # a per-row mask, so the labels are coerced to an array first.
        y = np.asarray(y)

        self.classes = np.unique(y)
        self.class_priors = {}
        self.word_counts = {}
        self.class_word_totals = {}
        self.vocab_size = X.shape[1]

        n_samples = X.shape[0]
        for c in self.classes:
            X_c = X[y == c]
            self.class_priors[c] = X_c.shape[0] / n_samples
            self.word_counts[c] = np.sum(X_c, axis=0) + self.alpha
            # Summing the already-smoothed counts gives the usual
            # (raw total + alpha * vocab_size) denominator.
            self.class_word_totals[c] = np.sum(self.word_counts[c])

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Args:
            X: dense 2-D array-like with the same number of columns as the
                matrix passed to ``fit``.

        Returns:
            1-D array holding one element of ``self.classes`` per row of
            ``X``. When several classes tie, the one that comes first in
            ``self.classes`` is chosen.
        """
        X = np.asarray(X)
        classes = np.asarray(self.classes)
        if X.shape[0] == 0:
            return classes[:0]

        # The log-probability table depends only on the fitted state, so it
        # is built once per call instead of once per (sample, class) pair.
        log_priors = np.log([self.class_priors[c] for c in classes])
        log_likelihoods = np.vstack([
            np.log(self.word_counts[c] / self.class_word_totals[c])
            for c in classes
        ])

        # scores[i, k] = log P(class k) + sum_j X[i, j] * log P(feature j | class k)
        scores = X @ log_likelihoods.T + log_priors
        return classes[np.argmax(scores, axis=1)]


results = []

# Run the same classifier once on raw term counts and once on TF-IDF weights.
for vectorizer_name, Vectorizer in (
    ('CountVectorizer', CountVectorizer),
    ('TfidfVectorizer', TfidfVectorizer),
):
    # The vectorizer is fitted on the training split only and then reused,
    # unchanged, to encode the held-out messages.
    vectorizer = Vectorizer()
    X_train = vectorizer.fit_transform(X_train_text).toarray()
    X_test = vectorizer.transform(X_test_text).toarray()

    nb = MultinomialNaiveBayes(alpha=1.0)
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    # Score the held-out predictions with each metric in turn.
    acc, prec, rec, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    cm = confusion_matrix(y_test, y_pred)

    # One summary row per vectorizer; metric values are rounded for the table only.
    results.append({
        'Model': 'Naive Bayes',
        'Vectorizer': vectorizer_name,
        'Reg. λ': '-',
        **{
            label: round(value, 4)
            for label, value in (
                ('Accuracy', acc),
                ('Precision', prec),
                ('Recall', rec),
                ('F1', f1),
            )
        },
    })

    # Full-precision report for this vectorizer.
    print(
        f"\n=== Results using {vectorizer_name} ===",
        f"Accuracy: {acc}",
        f"Precision: {prec}",
        f"Recall: {rec}",
        f"F1-score: {f1}",
        f"Confusion Matrix:\n{cm}",
        sep="\n",
    )

# Collect the per-vectorizer rows into one comparison table.
results_df = pd.DataFrame(results)
print("\n\n=== Summary of Results ===", results_df, sep="\n")



Saving spam - spam.csv to spam - spam (1).csv

=== Results using CountVectorizer ===
Accuracy: 0.9919282511210762
Precision: 1.0
Recall: 0.9395973154362416
F1-score: 0.9688581314878892
Confusion Matrix:
[[966   0]
 [  9 140]]

=== Results using TfidfVectorizer ===
Accuracy: 0.9650224215246637
Precision: 1.0
Recall: 0.738255033557047
F1-score: 0.8494208494208494
Confusion Matrix:
[[966   0]
 [ 39 110]]


=== Summary of Results ===
         Model       Vectorizer Reg. λ  Accuracy  Precision  Recall      F1
0  Naive Bayes  CountVectorizer      -    0.9919        1.0  0.9396  0.9689
1  Naive Bayes  TfidfVectorizer      -    0.9650        1.0  0.7383  0.8494
