Spam email classifier using Naive Bayes

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [5]:
# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')


dataset information

In [6]:
print(df.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [7]:
print(df.tail)

<bound method NDFrame.tail of      Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham              Will Ã¼ b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
df.shape

(5572, 2)

In [10]:
X = df['Message']
y = df['Category']

In [11]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ã¼ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [13]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5572, dtype: object

In [16]:
# Split the data
df.columns = ['label', 'message']
df['label'] = df['label'].map({'ham': 0, 'spam': 1})


In [24]:
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
# Vectorize the data
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [26]:
# Train the Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)


In [27]:

# Transform the test data
X_test_counts = count_vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)


In [28]:
# Make predictions
y_pred = model.predict(X_test_tfidf)


In [29]:
#results
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
print('Classification Report: \n', classification_report(y_test, y_pred))


Accuracy:  0.9632286995515695
Confusion Matrix: 
 [[966   0]
 [ 41 108]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [30]:
## Test the model with some example emails
example_emails = [
    "Congratulations! You've won a free ticket to the Bahamas. Call now to claim your prize.",
    "Hey, are we still meeting for lunch tomorrow?",
    "Limited-time offer! Get 50% off your next purchase at our store.",
    "Please review the attached document and let me know if you have any questions."
]

example_counts = count_vectorizer.transform(example_emails)
example_tfidf = tfidf_transformer.transform(example_counts)
predictions = model.predict(example_tfidf)

for email, prediction in zip(example_emails, predictions):
    print(f'\nEmail: "{email}"\nPrediction: {"Spam" if prediction == 1 else "Ham"}')



Email: "Congratulations! You've won a free ticket to the Bahamas. Call now to claim your prize."
Prediction: Spam

Email: "Hey, are we still meeting for lunch tomorrow?"
Prediction: Ham

Email: "Limited-time offer! Get 50% off your next purchase at our store."
Prediction: Ham

Email: "Please review the attached document and let me know if you have any questions."
Prediction: Ham
