In [2]:
import pandas as pd

df = pd.read_csv('spam_classify.csv' , encoding='latin-1')

df = df[['v1', 'v2']]
df.columns = ['label', 'message']
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
print(df.shape)
df['label'].value_counts()

(5572, 2)


label
0    4825
1     747
Name: count, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split
X = df['message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (4457,)
Testing set size: (1115,)


In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Get the English stopwords set
stop_words = set(stopwords.words('english'))

# Define the stopwords removal function
def stopwords_removal(text):
    tokens = text.lower().split()  # simple whitespace tokenizer + lowercase
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)


def remove_punctuation(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    text = remove_punctuation(text)
    text = stopwords_removal(text)
    # Convert to lowercase
    text = text.lower()
    tokens = text.split()
    return tokens

class CustomCountVectorizer:
    def __init__(self):
        self.vocab = {}

    def fit(self,texts):
        idx = 0
        for text in texts:
            tokens = preprocess_text(text)
            for token in tokens:
                if token not in self.vocab:
                    self.vocab[token] = idx
                    idx += 1
        return self

    def transform(self, texts):
        vectors = []
        for text in texts:
            tokens = preprocess_text(text)   
            vector = [0] * len(self.vocab)
            for token in tokens:
                if token in self.vocab:
                    index = self.vocab[token]
                    vector[index] += 1
            vectors.append(vector)
        return vectors
    
my_vectorizer = CustomCountVectorizer()
my_vectorizer.fit(X_train)
X_train_vec = my_vectorizer.transform(X_train)
X_test_vec = my_vectorizer.transform(X_test)

print("Number of features (unique words):", len(my_vectorizer.vocab))
print("Shape of training set:", len(X_train_vec), "messages with", len(X_train_vec[0]), "features each")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of features (unique words): 8317
Shape of training set: 4457 messages with 8317 features each


In [None]:
print("Shape of training set:", X_train_vec.shape)

Shape of training set: (4457, 8317)


In [16]:
import numpy as np
class CustomNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.class_counts = {}
        self.feature_counts = {}
        self.class_priors = {}

        for c in self.classes:
            X_c = X[y == c]
            self.class_counts[c] = X_c.shape[0]
            self.feature_counts[c] = np.sum(X_c, axis=0) + 1  # Laplace smoothing
            self.class_priors[c] = X_c.shape[0] / X.shape[0]

        self.feature_totals = {
            c: np.sum(self.feature_counts[c]) for c in self.classes
        }

    def predict(self, X):
        predictions = []
        for x in X:
            class_probs = {}
            for c in self.classes:
                log_prob = np.log(self.class_priors[c])
                log_prob += np.sum(x * np.log(self.feature_counts[c] / self.feature_totals[c]))
                class_probs[c] = log_prob
            predictions.append(max(class_probs, key=class_probs.get))
        return np.array(predictions)

model = CustomNaiveBayes()
model.fit(X_train_vec, y_train)

# Predict on test set
predictions = model.predict(X_test_vec)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

Accuracy: 0.9802690582959641
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [21]:
test_message_spam = ["you have won a lottery! Click here to claim your prize."]
test_message_ham = ["You schlorarship application has been received. We will get back to you soon."]

# Vectorize it using your custom vectorizer
test_vec_spam = my_vectorizer.transform(test_message_spam)
test_vec_ham = my_vectorizer.transform(test_message_ham)


# Predict using your trained classifier
prediction_spam = model.predict(test_vec_spam)
prediction_ham = model.predict(test_vec_ham)

# Check the result
print("Spam" if prediction_spam[0] == 1 else "Ham")
print("Spam" if prediction_ham[0] == 1 else "Ham")


Spam
Ham


In [19]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Convert custom vectors to numpy arrays
import numpy as np
X_train_vec_np = np.array(X_train_vec)
X_test_vec_np = np.array(X_test_vec)

# Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train_vec_np, y_train)
mnb_pred = mnb.predict(X_test_vec_np)

print("MultinomialNB Results:")
print(classification_report(y_test, mnb_pred))

# Bernoulli Naive Bayes
bnb = BernoulliNB()
bnb.fit(X_train_vec_np, y_train)
bnb_pred = bnb.predict(X_test_vec_np)

print("BernoulliNB Results:")
print(classification_report(y_test, bnb_pred))

# Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train_vec_np, y_train)
gnb_pred = gnb.predict(X_test_vec_np)

print("GaussianNB Results:")
print(classification_report(y_test, gnb_pred))


MultinomialNB Results:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

BernoulliNB Results:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.99      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

GaussianNB Results:
              precision    recall  f1-score   support

           0       0.99      0.89      0.93       965
           1       0.55      0.91      0.69       150

    accuracy                           0.89      1115
   macro avg       0.77      0.90      0.81      1115
weighted a