In [135]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [136]:
# Load the spam / not-spam CSV and, in the same expression, swap every
# missing value (NaN) for an empty string.
df = pd.read_csv(
    '../input/spam-or-not-spam-dataset/spam_or_not_spam.csv'
).replace(np.nan, '')

In [137]:
# Preview the first five rows to sanity-check the loaded columns
df.head(n=5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [138]:
# Bag-of-words features: learn a vocabulary from the email text and count how
# often each token occurs in each email.
# NOTE(review): the vocabulary is fitted on every row, before the train/test
# split performed in the next cell, so held-out emails influence the feature
# space — consider fitting on the training rows only.
vectorizer = CountVectorizer()
# toarray() turns the sparse count matrix into a dense NumPy array.
X = vectorizer.fit_transform(df['email']).toarray()

In [139]:
# Hold out 20% of the emails for evaluation. Pinning random_state makes the
# split -- and therefore the accuracy and predictions reported below --
# repeatable across kernel restarts instead of changing on every run.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, df['label'], test_size=0.2, random_state=RANDOM_STATE
)

# Fit Gaussian Naive Bayes on the training rows, then label the held-out rows.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [140]:
from sklearn.metrics import accuracy_score

# Fraction of held-out emails whose predicted label equals the true label
test_accuracy = accuracy_score(y_test, y_pred)
test_accuracy

0.9516666666666667

In [141]:
# Classify a hand-written, legitimate-looking email (Not Spam example)
new_email = """Hi,

I hope this finds you well. I just wanted to remind you about our meeting tomorrow at 10 am in the conference room. Please bring any materials that you have prepared so far.

Best regards,
sara"""
# Encode the text with the already-fitted vocabulary, then ask the model
email_counts = vectorizer.transform([new_email]).toarray()
prediction = gnb.predict(email_counts)

# Show the raw predicted label, followed by its human-readable meaning
predicted_label = prediction[0]
print(predicted_label)
if predicted_label:
    print("Spam")
else:
    print("Not Spam")

0
Not Spam


In [142]:
# Classify a hand-written, promotional-looking email (Spam example)
new_email = """Dear Friend,

I hope you are doing well. I wanted to let you know about a great opportunity to make money online. All you have to do is click this link and follow the simple instructions. You can earn thousands of dollars in just a few short weeks!

Don't miss out on this chance to change your life. Click the link now!

Sincerely,
Mark"""
# Encode the text with the already-fitted vocabulary, then ask the model
email_counts = vectorizer.transform([new_email]).toarray()
prediction = gnb.predict(email_counts)

# Show the raw predicted label, followed by its human-readable meaning
predicted_label = prediction[0]
print(predicted_label)
if predicted_label:
    print("Spam")
else:
    print("Not Spam")

1
Spam
