# **EMAIL SPAM DETECTION WITH MACHINE LEARNING**

In [1]:
import pandas as pd
import numpy as np

In [2]:
email_data = pd.read_csv("spam.csv",encoding='ISO-8859-1')
email_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm 

In [8]:
Y = email_data["v1"]
X = email_data["v2"]
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.25)

In [9]:
cv = CountVectorizer()
features = cv.fit_transform(x_train)
print(features.shape)


(4179, 7414)


In [10]:
#Support Vector Machine
model1 = svm.SVC()
model1.fit(features,y_train)

SVC()

In [11]:
features_test = cv.transform(x_test)


In [14]:
from sklearn import metrics
pred1 = model1.predict(features_test)
print('Accuracy:',metrics.accuracy_score(pred1,y_test))

Accuracy: 0.9820531227566404


In [15]:
#Confusion matrix
from sklearn.metrics import confusion_matrix,classification_report
confusion_mat = confusion_matrix(pred1,y_test)
print("Confusion matrix: \n",confusion_mat)
print(classification_report(pred1,y_test))

Confusion matrix: 
 [[1213   25]
 [   0  155]]
              precision    recall  f1-score   support

         ham       1.00      0.98      0.99      1238
        spam       0.86      1.00      0.93       155

    accuracy                           0.98      1393
   macro avg       0.93      0.99      0.96      1393
weighted avg       0.98      0.98      0.98      1393



In [16]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

model2 = KNeighborsClassifier(n_neighbors=5)

model2.fit(features,y_train)

y_pred2 = model2.predict(features_test)

from sklearn.metrics import accuracy_score

print("Accuracy:",accuracy_score(y_pred2,y_test))

Accuracy: 0.927494615936827


In [18]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

model3 = DecisionTreeClassifier(criterion='entropy',random_state=6)

model3.fit(features,y_train)

y_pred3 = model3.predict(features_test)

from sklearn.metrics import accuracy_score

print("Accuracy:",accuracy_score(y_pred3,y_test))

Accuracy: 0.9734386216798278


In [22]:
#Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB

model4 = GaussianNB()

features2 = features.todense()

model4.fit(features2,y_train)

features_test2 = features_test.todense()

y_pred4 = model4.predict(features_test2)

from sklearn.metrics import accuracy_score

print("Accuracy:",accuracy_score(y_pred4,y_test))



Accuracy: 0.8923187365398421


#### **We can clearly notice that SVM is giving better performance of 98% and next to SVM is Decision Tree with 97% Accuracy**