### Spam Classfication using SVM

In [1]:
import re
import numpy as np
from nltk.stem import PorterStemmer
from scipy.io import loadmat

#### Andrew Ng Coursera data

In [2]:
file_contents = open("emailSample1.txt","r").read()
vocabList = open("vocab.txt","r").read()

#### Viewing Vocab list

In [3]:
vocabList

'1\taa\n2\tab\n3\tabil\n4\tabl\n5\tabout\n6\tabov\n7\tabsolut\n8\tabus\n9\tac\n10\taccept\n11\taccess\n12\taccord\n13\taccount\n14\tachiev\n15\tacquir\n16\tacross\n17\tact\n18\taction\n19\tactiv\n20\tactual\n21\tad\n22\tadam\n23\tadd\n24\taddit\n25\taddress\n26\tadministr\n27\tadult\n28\tadvanc\n29\tadvantag\n30\tadvertis\n31\tadvic\n32\tadvis\n33\tae\n34\taf\n35\taffect\n36\taffili\n37\tafford\n38\tafrica\n39\tafter\n40\tag\n41\tagain\n42\tagainst\n43\tagenc\n44\tagent\n45\tago\n46\tagre\n47\tagreement\n48\taid\n49\tair\n50\tal\n51\talb\n52\talign\n53\tall\n54\tallow\n55\talmost\n56\talon\n57\talong\n58\talreadi\n59\talsa\n60\talso\n61\taltern\n62\talthough\n63\talwai\n64\tam\n65\tamaz\n66\tamerica\n67\tamerican\n68\tamong\n69\tamount\n70\tamp\n71\tan\n72\tanalysi\n73\tanalyst\n74\tand\n75\tani\n76\tanim\n77\tannounc\n78\tannual\n79\tannuiti\n80\tanoth\n81\tanswer\n82\tanti\n83\tanumb\n84\tanybodi\n85\tanymor\n86\tanyon\n87\tanyth\n88\tanywai\n89\tanywher\n90\taol\n91\tap\n92\tapolog\

#### Split Vocab list by new line character (\n)

In [4]:
vocabList = vocabList.split("\n")[:-1]

In [5]:
vocabList

['1\taa',
 '2\tab',
 '3\tabil',
 '4\tabl',
 '5\tabout',
 '6\tabov',
 '7\tabsolut',
 '8\tabus',
 '9\tac',
 '10\taccept',
 '11\taccess',
 '12\taccord',
 '13\taccount',
 '14\tachiev',
 '15\tacquir',
 '16\tacross',
 '17\tact',
 '18\taction',
 '19\tactiv',
 '20\tactual',
 '21\tad',
 '22\tadam',
 '23\tadd',
 '24\taddit',
 '25\taddress',
 '26\tadministr',
 '27\tadult',
 '28\tadvanc',
 '29\tadvantag',
 '30\tadvertis',
 '31\tadvic',
 '32\tadvis',
 '33\tae',
 '34\taf',
 '35\taffect',
 '36\taffili',
 '37\tafford',
 '38\tafrica',
 '39\tafter',
 '40\tag',
 '41\tagain',
 '42\tagainst',
 '43\tagenc',
 '44\tagent',
 '45\tago',
 '46\tagre',
 '47\tagreement',
 '48\taid',
 '49\tair',
 '50\tal',
 '51\talb',
 '52\talign',
 '53\tall',
 '54\tallow',
 '55\talmost',
 '56\talon',
 '57\talong',
 '58\talreadi',
 '59\talsa',
 '60\talso',
 '61\taltern',
 '62\talthough',
 '63\talwai',
 '64\tam',
 '65\tamaz',
 '66\tamerica',
 '67\tamerican',
 '68\tamong',
 '69\tamount',
 '70\tamp',
 '71\tan',
 '72\tanalysi',
 '73\tan

#### Store the Vocab in a dictionary

In [6]:
vocab ={}

for i in vocabList:
    value,key = i.split("\t")
    vocab[key] = value

In [7]:
# Checking the index
vocab['how']

'794'

#### PreProcessing Email

In [8]:
def PreProcessing_email(email_content):
    """
    Preprocesses the body of an email. 
    """
    
    #Converting to lower case
    email_content = email_content.lower()
    #Handling numbers
    email_content = re.sub("[\d]+","number",email_content)
    #Handling URLs
    email_content = re.sub("https?://[^\s]+","httpaddr",email_content)
    #Handling EmailAddresses
    email_content = re.sub("[^\s]+@[^\s]+","emailaddr",email_content)
    #Handling Special Characters
    email_content = re.sub("[^A-Za-z0-9\s]+","",email_content)
    # Handling new lines
    email_content = re.sub("\n{2,2}","",email_content)
    email_content = re.sub("\s\n"," ",email_content)
    email_content = re.sub("\n"," ",email_content)
    #Handling extra space at the beginning
    email_content = re.sub("^\s","",email_content)
    
    return email_content

#### Word Stemming:- 
Words are reduced to their stemmed form. For example, “discount”, “discounts”, “discounted” and “discounting” are all replaced with “discount”. Sometimes, the Stemmer actually strips off additional characters from the end, so “include”, “includes”, “included”, and “including” are all replaced with “includ”.

In [24]:
def Word_Stemming(file_contents):
    """ Stemming email body """
    email_content = PreProcessing_email(file_contents)
    ps = PorterStemmer()
    email_content = [ps.stem(token) for token in email_content.split(" ")]
    email_content= " ".join(email_content)
    return email_content

In [16]:
def Check_Word_In_Vocablist(vocab):
    """Returns a list of indices of the words contained in vocab list"""
    
    CleanedText = Word_Stemming(file_contents)
    word_index = []
    for i in CleanedText.split():
        if len(i)>0 and i in vocab:
            word_index.append(int(vocab[i]))
    return word_index

In [17]:
word_index = Check_Word_In_Vocablist(vocab)

### Extracting features from email

In [20]:
def extractfeatures(vocab):
    word_index = Check_Word_In_Vocablist(vocab)
    n =len(vocab)
    features = np.zeros((n,1))
    for i in word_index:
        features[i] = 1
    return features

In [21]:
features = extractfeatures(vocab)

In [22]:
print("length of feature vectors:",len(features))
print("Number of Non-Zero entities:",np.count_nonzero(features))

length of feature vectors: 1899
Number of Non-Zero entities: 42


## Training SVM for Spam classification

In [23]:
from sklearn.svm import SVC
import pandas as pd

spam_mat = loadmat("spamTrain.mat")
X_train =spam_mat["X"]
y_train = spam_mat["y"]

In [None]:
C =0.1
spam_svc = SVC(C=0.1,kernel ="linear")
spam_svc.fit(X_train,y_train.ravel())
print("Training Accuracy:",(spam_svc.score(X_train,y_train.ravel()))*100,"%")

In [None]:
spam_mat_test = loadmat("spamTest.mat")
X_test = spam_mat_test["Xtest"]
y_test =spam_mat_test["ytest"]

spam_svc.predict(X_test)
print("Test Accuracy:",(spam_svc.score(X_test,y_test.ravel()))*100,"%")

### Top Predictors of Spam

In [None]:
weights = spam_svc.coef_[0]
data = np.hstack((np.arange(1,1900).reshape(1899,1),weights.reshape(1899,1)))

dataframe = pd.DataFrame(data)

dataframe.sort_values([1],ascending = False,inplace = True)

In [None]:
Top10Predictors = dataframe.reset_index().iloc[0:10]

In [None]:
# list out keys and values separately
key_list = list(vocab.keys())
val_list = list(vocab.values())

In [None]:
word = []
for i in Top10Predictors[0].values:
        position = val_list.index(str(int(i)))
        word.append(key_list[position])

In [None]:
word

In [None]:
Top10Predictors["Word"] = word

In [None]:
Top10Predictors[[0,'Word']]