# EMAIL SPAM DETECTION WITH MACHINE LEARNING

In [12]:
import numpy as np
import pandas as pd

df=pd.read_csv("archive (2)/spam.csv", encoding = "ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [13]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [15]:
df.shape

(5572, 5)

In [16]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [17]:
df.drop(columns=df[["Unnamed: 2","Unnamed: 3","Unnamed: 4"]],axis=1,inplace=True)

In [18]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
df.shape

(5572, 2)

In [20]:
df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [21]:
df['spam']=df['v1'].apply(lambda x:1 if x=='spam' else 0)
df.head()

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [22]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(df.v2, df.spam)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

v= CountVectorizer()
x_train_count=v.fit_transform(x_train.values)
x_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train_count,y_train)

In [33]:
emails=['hey mohan, can we get together to watch fotball game tomorrow?',
        'upto 20% discount on parking exclusive offer just for you, dont miss this reward']
emails_count=v.transform(emails)
print("1 - Spam\n0 - Not Spam\nResults:")
print(model.predict(emails_count))

1 - Spam
0 - Not Spam
Results:
[0 1]


In [34]:
from sklearn.pipeline import Pipeline
clf=Pipeline([('vectorizer',CountVectorizer()),
             ('nb',MultinomialNB())])

In [35]:
clf.fit(x_train,y_train)

In [40]:
y_pred = clf.predict(x_test)
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score

cm = confusion_matrix(y_test, y_pred)
print(cm,"\n")
print("Accuracy: ", accuracy_score(y_test, y_pred).round(3))
print("Precision: ", precision_score(y_test, y_pred, average='macro').round(3))
print("Recall: ", recall_score(y_test, y_pred, average='macro').round(3)) 
print("F1-Score: ", f1_score(y_test, y_pred, average='macro').round(3))

[[1203    7]
 [  11  172]] 

Accuracy:  0.987
Precision:  0.976
Recall:  0.967
F1-Score:  0.971


In [41]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1210
           1       0.96      0.94      0.95       183

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393

