In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [18]:
df = pd.read_csv("spam_ham_dataset.csv")

In [19]:
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [17]:
X = df['text']
y = df['label_num']  # 0 for ham, 1 for spam

In [21]:
X

Unnamed: 0,text
0,Subject: enron methanol ; meter # : 988291\r\n...
1,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,"Subject: photoshop , windows , office . cheap ..."
4,Subject: re : indian springs\r\nthis deal is t...
...,...
5166,Subject: put the 10 on the ft\r\nthe transport...
5167,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,Subject: calpine daily gas nomination\r\n>\r\n...
5169,Subject: industrial worksheets for august 2000...


In [24]:
y

Unnamed: 0,label_num
0,0
1,0
2,0
3,1
4,0
...,...
5166,0
5167,0
5168,0
5169,0


In [25]:
df.shape

(5171, 4)

In [26]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
label,0
text,0
label_num,0


In [28]:
pip install nltk



In [31]:
import nltk
nltk.download('stopwords')

import string
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

# Assuming 'text' is the column name in your DataFrame
df['text'] = df['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [32]:
# Convert text to vectors
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

In [34]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [35]:
y_pred = model.predict(X_test)

In [36]:
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))

✅ Accuracy: 0.978743961352657

📊 Confusion Matrix:
 [[731  11]
 [ 11 282]]

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       742
           1       0.96      0.96      0.96       293

    accuracy                           0.98      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [37]:
def predict_mail(mail):
    mail_vec = vectorizer.transform([mail])
    result = model.predict(mail_vec)[0]
    return "SPAM" if result == 1 else "HAM"

In [None]:
while True:
    mail = input("\nEnter an email message to classify (or type 'exit'): ")
    if mail.lower() == 'exit':
        break
    print("Prediction:", predict_mail(mail))


Enter an email message to classify (or type 'exit'): this is smap email
Prediction: HAM

Enter an email message to classify (or type 'exit'): smap alert
Prediction: SPAM

Enter an email message to classify (or type 'exit'): ham alert
Prediction: SPAM

Enter an email message to classify (or type 'exit'): ham email
Prediction: SPAM

Enter an email message to classify (or type 'exit'): ham
Prediction: HAM

Enter an email message to classify (or type 'exit'): an email is electronic 
Prediction: HAM
