Load Dataset

Text Preprocessing

Vectorization (Convert text to numbers)

Train Naive Bayes Model

Evaluate Accuracy

Predict on New Emails

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

#  1. Load Dataset

In [13]:
# Read the latin-1 encoded CSV and keep only the `v1` and `v2` columns;
# every other column in the file is discarded.
selected_columns = ['v1', 'v2']
df = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')[selected_columns]


# 2. Preprocessing
Convert text to lowercase

Remove punctuation, numbers, stopwords (optional)

Convert the ham/spam labels to 0/1 (done below with a dictionary passed to `Series.map`; scikit-learn's `LabelEncoder` is an alternative)

In [14]:
# Rename by column name instead of assigning `df.columns` positionally, so
# re-running this cell after later cells have added columns does not raise a
# length-mismatch error.
df = df.rename(columns={'v1': 'label', 'v2': 'text'})

# Encode ham/spam as 0/1 only while the labels are still non-numeric. Calling
# .map() a second time on the already-encoded 0/1 values would find no
# matching keys and silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})  # convert labels to 0/1


In [3]:
# Display the DataFrame for a visual check of its current contents.
df

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [4]:
import string
import re

def clean_text(text):
    """Normalise a message for bag-of-words vectorization.

    The text is lowercased, then every ASCII punctuation character listed in
    ``string.punctuation`` and every run of decimal digits is deleted.
    Whitespace is left untouched, so removed tokens may leave extra spaces.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned, lowercased string.
    """
    text = text.lower()
    # re.escape is required here: string.punctuation contains `\`, `]`, `^`
    # and `-`, all of which are special inside a character class. Unescaped,
    # the `\` was consumed as an escape for `]`, so backslashes were never
    # stripped from the text.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Raw string so `\d` reaches the regex engine as written instead of being
    # an invalid Python string escape (which Python warns about).
    text = re.sub(r'\d+', '', text)
    return text

# Run every raw message through clean_text and store the result alongside
# the original text in a new `clean_text` column.
message_texts = df['text']
df['clean_text'] = message_texts.apply(clean_text)


  text = re.sub('\d+', '', text)


# 3. Train-Test Split

In [5]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
features = df['clean_text']
targets = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)


# 4. Vectorization (Bag of Words)
We'll convert text into numbers using CountVectorizer:

In [6]:
vectorizer = CountVectorizer()

# Learn the vocabulary from the training messages only, then reuse that same
# vocabulary to encode the test messages.
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)


#  5. Train Naive Bayes Classifier

In [7]:
# Multinomial Naive Bayes with default settings, trained on the
# vectorized training messages and their 0/1 labels.
model = MultinomialNB()
model.fit(X=X_train_vec, y=y_train)


# 📊 6. Evaluation

In [8]:
# Predict labels for the held-out test messages.
y_pred = model.predict(X_test_vec)

# Overall accuracy first, then the per-class precision/recall/F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.979372197309417
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [9]:
def predict_spam(email):
    """Classify a single email with the trained model.

    The text is cleaned with ``clean_text``, encoded with the fitted
    ``vectorizer``, and passed to ``model``; all three are taken from the
    notebook's global scope.

    Args:
        email: The raw email text.

    Returns:
        ``"Spam"`` when the model predicts label 1, otherwise ``"Ham"``.
    """
    features = vectorizer.transform([clean_text(email)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Ham"

# Example: classify two sample emails and print each verdict.
for sample_email in (
    "Win a free iPhone now!!! Click here!",
    "Meeting scheduled at 10am tomorrow.",
):
    print(predict_spam(sample_email))


Spam
Ham


In [12]:
# Interactive demo: read one email from the user, classify it, and print the
# verdict. This notebook classifies emails as spam or ham with a Naive Bayes
# classifier, covering text preprocessing, model training, and prediction.
tr = input("Enter an email to classify: ")
verdict = predict_spam(tr)
print(verdict)  # "Spam" or "Ham"
print("Thank you for using the spam classifier!")  # closing acknowledgment


Ham
Thank you for using the spam classifier!


In [11]:
# Classify one more everyday message and print the verdict.
greeting = "how was your day? Hope you are doing well."
print(predict_spam(greeting))

Ham
