Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


Step 2: Load the Dataset

In [None]:
# Upload the CSV through the Colab file picker. `files.upload()` returns a
# dict that maps each uploaded file's name to its raw bytes.
import io

from google.colab import files

uploaded = files.upload()

# Parse the bytes that were just uploaded instead of a hard-coded path.
# When a file named spam.csv already exists, Colab stores the new upload under
# a de-duplicated name (e.g. "spam (1).csv"), so reading 'spam.csv' from disk
# would silently load the older copy rather than the file just selected.
if uploaded:
    # Use the first uploaded file, whatever it was called.
    csv_source = io.BytesIO(next(iter(uploaded.values())))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a copy on disk.
    csv_source = 'spam.csv'

df = pd.read_csv(csv_source, encoding='latin-1')
df.head()


Saving spam.csv to spam (1).csv


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Step 3: Data Cleaning

In [None]:
# Keep only the label/text columns and give them descriptive names.
# Renaming *before* selecting keeps this cell safe to re-run: on a second run
# 'v1'/'v2' no longer exist, rename() is then a no-op, and the selection below
# still succeeds instead of raising KeyError.
df = df.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']]

# Encode labels: ham = 0, spam = 1
# assign() builds a new frame rather than writing a column into a slice.
df = df.assign(label_num=df['label'].map({'ham': 0, 'spam': 1}))

# map() turns any label outside the dictionary into NaN, which would only
# surface later as an obscure error while fitting the model. Fail here with a
# message that names the offending values instead.
unknown_labels = df.loc[df['label_num'].isna(), 'label'].unique()
assert len(unknown_labels) == 0, f"Unexpected label values: {list(unknown_labels)}"

df.head()


Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



Step 4: Split Data into Train and Test Sets

In [None]:
# Features are the raw message text; the target is the numeric label.
X, y = df['message'], df['label_num']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Step 5: Convert Text to Numerical Data (Bag of Words)

In [None]:
# Bag-of-words encoder. The vocabulary is learned from the training messages
# only, and the test messages are then encoded with that same vocabulary.
cv = CountVectorizer()

# The tuple is evaluated left to right, so the fit on X_train happens before
# X_test is transformed.
X_train_cv, X_test_cv = cv.fit_transform(X_train), cv.transform(X_test)


Step 6: Train the Naive Bayes Classifier

In [None]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train_cv, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed.
model


Step 7: Make Predictions and Evaluate

In [None]:
# Predict labels for the held-out messages.
y_pred = model.predict(X_test_cv)

# Compute every metric first, then report them together.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


Accuracy: 0.9838565022421525

Confusion Matrix:
 [[963   2]
 [ 16 134]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



Step 8: Test with Your Own Messages

In [None]:
# Hand-written messages used to sanity-check the trained classifier.
sample_emails = [
    "Congratulations! You've won a free iPhone! Click here to claim now!",
    "Hi John, can we meet for lunch tomorrow?",
    "You have been selected for a $1000 Amazon voucher. Visit our site now!",
    "Hey, just wanted to confirm our meeting at 3 PM.",
]

# Encode with the vocabulary learned during training, then classify.
sample_emails_cv = cv.transform(sample_emails)
predictions = model.predict(sample_emails_cv)

# Print each message next to its predicted class (1 = spam, anything else = ham).
for text, predicted in zip(sample_emails, predictions):
    if predicted == 1:
        verdict = "🚨 SPAM"
    else:
        verdict = "✅ HAM (Not Spam)"
    print(f"\nEmail: {text}")
    print("Prediction:", verdict)



Email: Congratulations! You've won a free iPhone! Click here to claim now!
Prediction: 🚨 SPAM

Email: Hi John, can we meet for lunch tomorrow?
Prediction: ✅ HAM (Not Spam)

Email: You have been selected for a $1000 Amazon voucher. Visit our site now!
Prediction: 🚨 SPAM

Email: Hey, just wanted to confirm our meeting at 3 PM.
Prediction: ✅ HAM (Not Spam)
