# Spam mail detection using Naive Bayes

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:
# Toy corpus: 20 hand-written messages stored as [text, label] pairs.
# Labels alternate between "spam" and "not spam", giving 10 of each.
data = [
    ["Congratulations! You've won a $1000 gift card.", "spam"],
    ["Meeting agenda for tomorrow", "not spam"],
    ["Limited time offer! Buy now and get 50% off.", "spam"],
    ["Your Amazon order has been shipped", "not spam"],
    ["Update your account information or risk suspension!", "spam"],
    ["Join us for the webinar on AI trends next week", "not spam"],
    ["You are pre-approved for a loan of up to $5000!", "spam"],
    ["Your invoice for last month is attached", "not spam"],
    ["Claim your free vacation to Hawaii now!", "spam"],
    ["Reminder: Your dentist appointment is tomorrow", "not spam"],
    ["Urgent! Your account has been compromised", "spam"],
    ["Weekly newsletter: Industry insights and updates", "not spam"],
    ["Congratulations! You've been selected for a free iPhone", "spam"],
    ["Your subscription is about to expire, renew now", "not spam"],
    ["Act now and get a free trial of our service!", "spam"],
    ["Invitation to our annual company retreat", "not spam"],
    ["Final notice! Your car warranty is about to expire!", "spam"],
    ["Can we reschedule our meeting for Friday?", "not spam"],
    ["Get rich quick with this simple investment strategy!", "spam"],
    ["Project update: See the latest changes to the report", "not spam"],
]

In [6]:
# Wrap the [text, label] pairs in a DataFrame. No column names are passed,
# so the columns get the default integer labels 0 and 1.
df = pd.DataFrame(data)

In [7]:
# Preview the first rows to check the frame was built as expected.
df.head()

Unnamed: 0,0,1
0,Congratulations! You've won a $1000 gift card.,spam
1,Meeting agenda for tomorrow,not spam
2,Limited time offer! Buy now and get 50% off.,spam
3,Your Amazon order has been shipped,not spam
4,Update your account information or risk suspen...,spam


In [8]:
# Give the two columns descriptive names; this relabels the existing frame
# rather than creating a new one.
df.columns = ["Email", "Label"]

In [9]:
# Display the frame to confirm the new column names.
df

Unnamed: 0,Email,Label
0,Congratulations! You've won a $1000 gift card.,spam
1,Meeting agenda for tomorrow,not spam
2,Limited time offer! Buy now and get 50% off.,spam
3,Your Amazon order has been shipped,not spam
4,Update your account information or risk suspen...,spam
5,Join us for the webinar on AI trends next week,not spam
6,You are pre-approved for a loan of up to $5000!,spam
7,Your invoice for last month is attached,not spam
8,Claim your free vacation to Hawaii now!,spam
9,Reminder: Your dentist appointment is tomorrow,not spam


In [10]:
# Bag-of-words vectorizer, constructed with its default settings.
vec = CountVectorizer()

In [15]:
# Learn the vocabulary from the e-mail texts and build the sparse
# document-term count matrix.
# NOTE(review): the vocabulary is fit on every row, before the train/test
# split performed later, so words that occur only in held-out e-mails still
# shape the feature space. Consider splitting first and fitting the
# vectorizer on the training texts only.
X = vec.fit_transform(df["Email"])

In [16]:
# Convert the sparse matrix to a dense array purely to inspect the counts.
X.toarray()

array([[1, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# Shape is (number of e-mails, vocabulary size).
X.toarray().shape

(20, 102)

In [19]:
# Tokens that make up the learned vocabulary, one per column of X.
vec.get_feature_names_out()

array(['1000', '50', '5000', 'about', 'account', 'act', 'agenda', 'ai',
       'amazon', 'and', 'annual', 'appointment', 'approved', 'are',
       'attached', 'been', 'buy', 'can', 'car', 'card', 'changes',
       'claim', 'company', 'compromised', 'congratulations', 'dentist',
       'expire', 'final', 'for', 'free', 'friday', 'get', 'gift', 'has',
       'hawaii', 'industry', 'information', 'insights', 'investment',
       'invitation', 'invoice', 'iphone', 'is', 'join', 'last', 'latest',
       'limited', 'loan', 'meeting', 'month', 'newsletter', 'next',
       'notice', 'now', 'of', 'off', 'offer', 'on', 'or', 'order', 'our',
       'pre', 'project', 'quick', 'reminder', 'renew', 'report',
       'reschedule', 'retreat', 'rich', 'risk', 'see', 'selected',
       'service', 'shipped', 'simple', 'strategy', 'subscription',
       'suspension', 'the', 'this', 'time', 'to', 'tomorrow', 'trends',
       'trial', 'up', 'update', 'updates', 'urgent', 'us', 'vacation',
       've', 'warranty', 'we', 'webinar', 'week', 'weekly', 'with', 'won',
       'you', 'your'], dtype=object)

In [22]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# partition reproducible across runs.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    df["Label"],
    train_size=0.75,
    random_state=43,
)

In [23]:
# Inspect the training partition of the count matrix.
X_tr

<15x102 sparse matrix of type '<class 'numpy.int64'>'
	with 105 stored elements in Compressed Sparse Row format>

In [24]:
# Inspect the held-out partition of the count matrix.
X_te

<5x102 sparse matrix of type '<class 'numpy.int64'>'
	with 43 stored elements in Compressed Sparse Row format>

In [25]:
# Multinomial Naive Bayes classifier, constructed with its default settings.
model = MultinomialNB()

In [26]:
# Train the classifier on the training partition only.
model.fit(X_tr, y_tr)

In [27]:
# Accuracy on the same rows the model was trained on; this says nothing
# about performance on unseen e-mails.
model.score(X_tr, y_tr)

1.0

In [28]:
# Predicted labels for the held-out rows.
y_pr = model.predict(X_te)

In [29]:
# Accuracy on the held-out rows.
# NOTE(review): the test partition has only 5 rows, so a single
# misclassification moves this figure by 0.2; treat it as a rough indication.
accuracy_score(y_te, y_pr)

0.8

In [30]:
# First unseen message used to probe the trained classifier.
new_mail1 = np.array(
    ["You are eligible for a cashback of $500! Click here to claim now."]
)

In [33]:
def predictor(mail, *, vectorizer=None, classifier=None):
    """Predict a label for one or more e-mail texts.

    Parameters
    ----------
    mail : str or iterable of str
        A single message, or any iterable of messages (list, tuple,
        NumPy array of strings, ...).
    vectorizer : object with a ``transform`` method, optional
        Fitted text vectorizer. Defaults to the notebook-level ``vec``.
    classifier : object with a ``predict`` method, optional
        Fitted classifier. Defaults to the notebook-level ``model``.

    Returns
    -------
    The result of ``classifier.predict`` on the vectorized texts: one
    predicted label per input message, in input order.
    """
    # A bare string is rejected by CountVectorizer.transform (it expects an
    # iterable of documents), so promote it to a one-element list.
    if isinstance(mail, str):
        mail = [mail]

    # Fall back to the notebook globals only when no override is supplied,
    # so existing calls such as predictor(new_mail1) behave as before.
    if vectorizer is None:
        vectorizer = vec
    if classifier is None:
        classifier = model

    mail_vec = vectorizer.transform(mail)
    return classifier.predict(mail_vec)

In [34]:
# Classify the first unseen message.
predictor(new_mail1)

array(['spam'], dtype='<U8')

In [35]:
# Second unseen message: an ordinary meeting reminder.
new_mail2 = np.array(
    ["Don't forget about the team meeting at 2 PM tomorrow."]
)

In [36]:
# Classify the second unseen message.
predictor(new_mail2)

array(['not spam'], dtype='<U8')

In [37]:
# Third unseen message: a discount promotion.
new_mail3 = np.array(
    ["Hurry! Last chance to get a 70% discount on our new product line."]
)

In [38]:
# Classify the third unseen message.
# NOTE(review): this discount promotion comes back as 'not spam'. Several of
# its most telling words (hurry, chance, discount) are absent from the
# vocabulary learned from the 20-message corpus, so they contribute nothing
# to the prediction. A larger training set is likely needed.
predictor(new_mail3)

array(['not spam'], dtype='<U8')