# Email Spam Detection
Email spam consists of unsolicited messages sent in bulk by email (spamming). In most cases, spammers use this technique to scam their targets. In this project, I build a model that detects spam emails using Python, combining Natural Language Processing (to turn each email into token counts) with a Machine Learning classifier (Multinomial Naive Bayes).


In [14]:
#Import the libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [15]:
# Load the labelled emails into a DataFrame and preview the first rows.
# NOTE(review): the path looks like a Google Colab Drive mount - adjust it when running elsewhere.
DATA_PATH = '/content/drive/MyDrive/ML/emails.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [16]:
# Explore the data: number of rows and columns as a (rows, columns) tuple
df.shape

(5728, 2)

In [17]:
# List the column names in the data set
df.columns

Index(['text', 'spam'], dtype='object')

In [18]:
# Remove fully duplicated rows, then confirm how many rows remain.
# `df` is rebound to the de-duplicated frame rather than mutated with
# inplace=True, so the step stays chainable and never alters an object
# that another cell may still reference.
df = df.drop_duplicates()
df.shape

(5695, 2)

In [19]:
# Count the missing values in each column
df.isna().sum()

text    0
spam    0
dtype: int64

In [20]:
# Download NLTK's stopwords corpus; `stopwords.words('english')` is used
# later in the notebook to filter tokens.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
# Filled on first use so the stop-word corpus is only read once it is needed.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def process(text):
    """Tokenize one email for the bag-of-words model.

    Removes every character found in ``string.punctuation``, splits what is
    left on whitespace, and drops tokens whose lowercase form is an English
    stop word. Surviving tokens keep their original case and order.

    Parameters
    ----------
    text : str
        Raw email text.

    Returns
    -------
    list of str
        The remaining tokens.
    """
    # Delete all punctuation characters in a single pass over the string.
    nopunc = text.translate(str.maketrans('', '', string.punctuation))

    # Fetch the stop words once per call (and load them once overall) instead
    # of evaluating stopwords.words('english') for every token; a frozenset
    # also makes each membership test constant-time.
    stop_words = _english_stopwords()
    return [word for word in nopunc.split() if word.lower() not in stop_words]
# Preview the tokenizer's output on the first few emails
sample_emails = df['text'].head()
sample_emails.apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [12]:
# Convert the text into a sparse matrix of token counts, using `process`
# as the analyzer (so it handles punctuation stripping and stop-word removal).
from sklearn.feature_extraction.text import CountVectorizer

# Keep the fitted vectorizer: its learned vocabulary is required to turn new,
# unseen emails into the same feature space via vectorizer.transform(...).
# NOTE(review): the vocabulary is fitted on every row of df['text'], including
# rows that are later held out for testing - consider fitting on the training
# portion only.
vectorizer = CountVectorizer(analyzer=process)
message = vectorizer.fit_transform(df['text'])

In [13]:
# Hold out 20% of the emails for testing; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(
    message,
    df['spam'],
    test_size=0.20,
    random_state=0,
)
# Shape of the full token-count matrix before splitting
print(message.shape)

(5695, 37229)


In [23]:
# Build a Multinomial Naive Bayes classifier and fit it on the training split
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
# fit() returns the estimator itself, so the cell still displays the fitted model
classifier.fit(xtrain, ytrain)

MultinomialNB()

In [27]:
# Predict the spam label for every email in the test split
predictions = classifier.predict(xtest)
# Display the predicted labels themselves (not the training labels), so the
# cell's output reflects what this cell computed.
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
# Per-class precision, recall and F1-score on the test split
from sklearn.metrics import classification_report
report = classification_report(y_true=ytest, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139



In [30]:
# Overall accuracy: the fraction of test emails whose predicted label matches the true label
from sklearn.metrics import accuracy_score
accuracy_score(y_true=ytest, y_pred=predictions)

0.9920983318700615