# Email Spam Detection 

## Antriksh Tomer 2K18/SE/030 , Anubhav Gupta 2K18/SE/031

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Loading the dataset in dataframe df
df=pd.read_csv('emails.csv')

df.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
df.shape

(5728, 2)

In [4]:
df.columns 

Index(['text', 'spam'], dtype='object')

In [5]:
# Permanently deleting the redundant data
df.drop_duplicates(inplace=True)


In [6]:
#New shape of dataset
df.shape

(5695, 2)

In [7]:
#divide the dataset into 2 equal i.e. equal spam and equal ham emails
df['spam'].value_counts()


0    4327
1    1368
Name: spam, dtype: int64

In [8]:
# to remove 4327-1368=2959 rows from the end of dataframe
df.drop(df.tail(2959).index, inplace = True)
df['spam'].value_counts()

1    1368
0    1368
Name: spam, dtype: int64

In [9]:
#removing the first word 'subject ' from all the emails as it is useless
df['text']=df['text'].map(lambda text: text[8:])

df.head(5)
# displays now only the starting of an email 

Unnamed: 0,text,spam
0,naturally irresistible your corporate identit...,1
1,the stock trading gunslinger fanny is merril...,1
2,unbelievable new homes made easy im wanting ...,1
3,4 color printing special request additional ...,1
4,"do not have money , get software cds from her...",1


In [10]:
df.isnull().sum() #check if null values

text    0
spam    0
dtype: int64

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\ANUBHAV
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [13]:
def process_text(text):
    # remove all punctuations  
    punc_less=[ch for ch in text if ch not in string.punctuation]
    punc_less=''.join(punc_less)
    
    #remove all numerical values
    res = ''.join([i for i in punc_less if not i.isdigit()])
    
    # remove all stopwords
    new_words=[word for word in res.split() if word.lower() not in stopwords.words('english')]
    
    #apply stemming 
    final_words=[ps.stem(w) for w in new_words]
    
    return final_words


In [14]:
#Example to show working of our function
process_text('hello 3344 [phone] visited ,visits 112 worlds: 345661 on a this is number prograMIng program')

['hello', 'phone', 'visit', 'visit', 'world', 'number', 'program', 'program']

# Splitting the Data

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df['text'],df['spam'],test_size=0.30,random_state=18)

# BoW Model

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
temp_train_bow=CountVectorizer(analyzer=process_text).fit(X_train)

In [17]:
train_bow=temp_train_bow.transform(X_train)

In [18]:
train_bow.shape

(1915, 16392)

In [19]:
temp_test_bow=CountVectorizer(analyzer=process_text).fit(X_train)#need to remove not needed can use temp_train_bow only

In [20]:
test_bow=temp_test_bow.transform(X_test)

In [21]:
test_bow.shape

(821, 16392)

In [22]:
# Creating and Training of the Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB


In [23]:
clf_bow=MultinomialNB().fit(train_bow,y_train)

In [24]:
# Measuring the performance of our model on train data
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [25]:
pred_train_bow=clf_bow.predict(train_bow)

print('Confusion Matrix -> \n',confusion_matrix(y_train,pred_train_bow))
print('\nAccuracy-> ',accuracy_score(y_train,pred_train_bow))

Confusion Matrix -> 
 [[944   3]
 [  1 967]]

Accuracy->  0.997911227154047


In [26]:
# Measuring the performance of our model on test data
pred_test_bow=clf_bow.predict(test_bow)

print('Confusion Matrix -> \n',confusion_matrix(y_test,pred_test_bow))
print('\nAccuracy-> ',accuracy_score(y_test,pred_test_bow))

Confusion Matrix -> 
 [[416   5]
 [  6 394]]

Accuracy->  0.9866017052375152


In [27]:
ex_docs=['hi! call me ','free money call']
#for pre processing we can here use the process_text function and then transform the message
abc=temp_test_bow.transform(ex_docs)
ex_pred=clf_bow.predict(abc)
print(ex_pred)

[0 1]


In [28]:
abc.shape

(2, 16392)

In [29]:
print(abc)

  (0, 2032)	1
  (0, 6532)	1
  (1, 2032)	1
  (1, 5549)	1
  (1, 9377)	1


In [30]:
# to show how our program is working
# with preprocessing
a=['this is the most frequent 9999 place places placed visited to visit','health is health + wealth']
a1=CountVectorizer(analyzer=process_text).fit(a)
a1.vocabulary_

{'frequent': 0, 'place': 2, 'visit': 3, 'health': 1, 'wealth': 4}

In [31]:
atemp=['most visited visit place 9999 places place placed visits frequent','most frequent','health visits visited is wealth']
a2=a1.transform(atemp)
print(a2)

  (0, 0)	1
  (0, 2)	4
  (0, 3)	3
  (1, 0)	1
  (2, 1)	1
  (2, 3)	2
  (2, 4)	1


In [32]:
#without preprocessing
a=['this is the most frequent 9999 place places placed visited to visit','health is health + wealth']
a1=CountVectorizer().fit(a)
a1.vocabulary_

{'this': 9,
 'is': 3,
 'the': 8,
 'most': 4,
 'frequent': 1,
 '9999': 0,
 'place': 5,
 'places': 7,
 'placed': 6,
 'visited': 12,
 'to': 10,
 'visit': 11,
 'health': 2,
 'wealth': 13}

In [33]:
atemp=['most visited visit place 9999 places place placed visits frequent','most frequent','health visits visited is wealth']
a2=a1.transform(atemp)
print(a2)

  (0, 0)	1
  (0, 1)	1
  (0, 4)	1
  (0, 5)	2
  (0, 6)	1
  (0, 7)	1
  (0, 11)	1
  (0, 12)	1
  (1, 1)	1
  (1, 4)	1
  (2, 2)	1
  (2, 3)	1
  (2, 12)	1
  (2, 13)	1


# TF-IDF Model

In [32]:
from sklearn.feature_extraction.text import TfidfTransformer

In [33]:
train_tfidf=TfidfTransformer().fit_transform(train_bow)

In [34]:
train_tfidf.shape

(1915, 16392)

In [35]:
test_tfidf=TfidfTransformer().fit_transform(test_bow)

In [36]:
test_tfidf.shape

(821, 16392)

In [37]:
clf_tfidf=MultinomialNB().fit(train_tfidf,y_train)

In [38]:
pred_train_tfidf=clf_tfidf.predict(train_tfidf)

print('Confusion Matrix -> \n',confusion_matrix(y_train,pred_train_tfidf))
print('\nAccuracy-> ',accuracy_score(y_train,pred_train_tfidf))

Confusion Matrix -> 
 [[942   5]
 [  1 967]]

Accuracy->  0.9968668407310705


In [39]:
pred_test_tfidf=clf_tfidf.predict(test_tfidf)

print('Confusion Matrix -> \n',confusion_matrix(y_test,pred_test_tfidf))
print('\nAccuracy-> ',accuracy_score(y_test,pred_test_tfidf))

Confusion Matrix -> 
 [[416   5]
 [  8 392]]

Accuracy->  0.9841656516443362
