In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [4]:
# Load the labelled messages, decoding the CSV bytes as latin-1.
data = pd.read_csv(
    'dataset.csv',
    encoding='latin-1',
)

# Sanity check: list the columns that were read.
print(data.columns)

Index(['label', 'message'], dtype='object')


In [5]:
# Fetch the NLTK corpora this notebook uses: the stopword lists read via
# stopwords.words(...) and the WordNet data backing WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/zero/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/zero/nltk_data...


True

In [6]:
# English stopwords held in a set so each token lookup is a hash check.
stop_words = set(stopwords.words('english'))
# One shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

In [7]:
def preprocess_text(text):
    """Normalise one raw message into a space-joined string of lemmatised tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the text
    is lowercased and split on whitespace, tokens found in ``stop_words`` are
    dropped and the remaining tokens are lemmatised with ``lemmatizer``.

    Args:
        text: The raw message. Non-string values (for example ``None`` or the
            float ``NaN`` pandas uses for an empty CSV cell) are treated as an
            empty message instead of raising ``TypeError``.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when nothing
        survives cleaning.
    """
    if not isinstance(text, str):
        # re.sub() only accepts str/bytes, so a missing cell (None/NaN) would
        # crash the whole .apply(). Numbers contain no letters and would clean
        # to '' anyway, so an empty result is consistent for them as well.
        return ''

    # Keep letters only, then lowercase so stopword matching is case-insensitive.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    # Stopwords are filtered before lemmatisation, on the lowercased surface form.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

In [8]:
# Clean every message with preprocess_text and write the result back into the
# same column. NOTE: the raw text is overwritten, so re-running this cell
# feeds already-cleaned text through preprocess_text again.
data['message'] = data['message'].apply(preprocess_text)

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
messages, labels = data['message'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Build the vocabulary from the training messages only, then reuse that same
# fitted vectorizer to encode the test messages, so the held-out data never
# influences the features.
vectorizer = CountVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

In [11]:
# Train a multinomial Naive Bayes classifier on the vectorized training messages.
model = MultinomialNB() # no arguments passed: library default hyperparameters
model.fit(X_train_transformed, y_train)

In [12]:
# Evaluate the model on the held-out split

# Predicted label for every test message.
y_pred = model.predict(X_test_transformed)

# Fraction of test messages whose predicted label equals the true label.
# NOTE(review): if this prints a perfect 1.0, confirm that the same messages
# do not appear in both the training and the test split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [13]:
# Per-class precision, recall, F1 and support on the test split

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       103
        spam       1.00      1.00      1.00        97

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [14]:
# test spam filter 


# Function to predict if a message is spam or ham
def predict_spam(message):
    """Return the predicted label for a single raw message.

    The message is cleaned with ``preprocess_text``, encoded with the
    already-fitted ``vectorizer`` and classified by ``model``. The input is
    wrapped in a one-element list because ``vectorizer.transform`` is given a
    collection of documents; the only prediction is then unwrapped.
    """
    cleaned = preprocess_text(message)
    features = vectorizer.transform([cleaned])
    predictions = model.predict(features)
    return predictions[0]

# Try the classifier on two hand-written examples
sample_messages = (
    "You've won a $1000 Apple gift card. Click here to claim now.",
    "Is the meeting still on? Cannot see you on Zoom!",
)
for sample in sample_messages:
    print(predict_spam(sample))


spam
ham
