In [None]:
import os
import zipfile
import numpy
from pandas import DataFrame


zip_path = 'dataset.zip'
# Extract the archive into the current working directory.
# This creates a dataset folder which contains three sub folders -
# a) easy_ham
# b) hard_ham
# c) spam
# The `with` block closes the archive even when extraction raises.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
NEWLINE_CHARACTER = '\n'

# the two class labels used throughout the notebook
HAM, SPAM = 'ham', 'spam'

# pairs every dataset folder with the label shared by all emails stored in it
SOURCES = [
    ('dataset/spam', SPAM),
    ('dataset/easy_ham', HAM),
    ('dataset/hard_ham', HAM),
]

# file names that must not be treated as emails
SKIP_FILES = {'cmds'}

In [None]:
# yields the file path and the body text of every email found under `path`
# skips the header of the content and keeps the body only
def filename_filecontent(path):
    """Walk `path` and yield ``(file_path, body)`` for each email file.

    The header is everything up to and including the first empty line of the
    file; only the lines after it are kept. Files whose name is listed in
    SKIP_FILES are ignored. A file without any empty line yields an empty body.

    Parameters
    ----------
    path : str
        Root folder to scan.

    Yields
    ------
    tuple of (str, str)
        Path of the file and the text of its body.
    """
    # os.walk already descends into every sub-directory, so no explicit
    # recursion is needed here.
    for root_directory, _directory_names, f_names in os.walk(path):
        for f_name in f_names:
            if f_name in SKIP_FILES:
                continue
            f_path = os.path.join(root_directory, f_name)
            if not os.path.isfile(f_path):
                continue

            header_past_flag, list_lines = False, []
            # latin-1 maps every byte to a character, so decoding never fails;
            # the `with` block closes the file even if reading raises.
            with open(f_path, encoding="latin-1") as f:
                for line in f:
                    if header_past_flag:
                        list_lines.append(line)
                    elif line == NEWLINE_CHARACTER:
                        header_past_flag = True

            # NOTE: every kept line still ends with its own newline, so this
            # join leaves an empty line between consecutive lines. It is kept
            # as-is so the stored text stays identical to what was produced
            # before.
            content = NEWLINE_CHARACTER.join(list_lines)
            yield f_path, content

In [None]:
# builds the labelled data frame for the emails stored under one folder
def create_data_frame(l, path, classification):
    """Collect every email under `path` into a DataFrame.

    Parameters
    ----------
    l : int
        Accepted for the caller's convenience; the value is not used.
    path : str
        Folder handed to filename_filecontent.
    classification : str
        Label written into the 'class' column of every row.

    Returns
    -------
    tuple
        The DataFrame (columns 'text' and 'class', indexed by file path) and
        the number of rows it holds.
    """
    file_names, records = [], []
    for file_name, text in filename_filecontent(path):
        file_names.append(file_name)
        records.append({'text': text, 'class': classification})

    return DataFrame(records, index=file_names), len(records)

In [None]:
# loads data and randomly shuffles the dataset
def load_data(seed=None):
    """Load every folder listed in SOURCES and return the rows in random order.

    Parameters
    ----------
    seed : int, optional
        When given, the shuffle uses its own ``numpy.random.RandomState(seed)``
        and is therefore reproducible. When omitted, numpy's global random
        state is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'text' and 'class', indexed by file path, rows shuffled.
    """
    # Imported locally so this cell does not rely on a new top-level import.
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    from pandas import concat

    frames = []
    rows_so_far = 0
    for path, classification in SOURCES:
        data_frame, nrows = create_data_frame(rows_so_far, path, classification)
        if nrows:
            frames.append(data_frame)
        rows_so_far += nrows

    if not frames:
        # nothing was found: keep returning an empty frame with both columns
        return DataFrame({'text': [], 'class': []})

    # concatenate once instead of growing the frame inside the loop
    data = concat(frames)

    rng = numpy.random if seed is None else numpy.random.RandomState(seed)
    return data.reindex(rng.permutation(data.index))

In [None]:
# loads the whole dataset into a pandas dataframe
data=load_data()
len(data)

6849

In [None]:
# we have 6849 rows
# data is a pandas dataframe, it has 6849 rows and
# two columns - text and class; each row is indexed by the path of its email file
data.describe()

Unnamed: 0,text,class
count,6849,6849
unique,5969,2
top,------000000000000000000000\n\nContent-Type: t...,ham
freq,8,4451


In [None]:
data.iloc[2]      # retrieves the content of a particular row

text     Isn't eBays trust the feedback?\n\n\n\n----- O...
class                                                  ham
Name: dataset/easy_ham/00652.be6b3138d3d7304c73ebba1ba3f687d1, dtype: object

In [None]:
data.iloc[2]['text']      # an example email from the dataset

'Isn\'t eBays trust the feedback?\n\n\n\n----- Original Message -----\n\nFrom: "Chip Paswater" <turk182@chipware.net>\n\nTo: "Patrick" <patrick@stealthgeeks.net>\n\nCc: <razor-users@example.sourceforge.net>\n\nSent: Wednesday, August 14, 2002 9:51 PM\n\nSubject: Re: [Razor-users] Re: What\'s wrong with the Razor servers now?\n\n\n\n\n\n> > > It\'s not my desire to second-guess you Vipul (however much my missives\n\n> > > may appear otherwise) or question the hard work you and the other\n\n> > > developers have put into the system, however it seems that every\n\nrequest\n\n> > > for such information has been met with silence.\n\n> >\n\n> > There are no plans for releasing details about TeS. Before the\n\n> > release of Razor2, I\'d pointed out that Razor2 backend (specially\n\n> > TeS) will be closed.\n\n>\n\n> Thanks for the clarification. Guess it\'s time to find something that is\n\n> open.\n\n>\n\n> Good luck.\n\n\n\nWhy do the details of the backend need to be open?\n\n\n\nI don\'t

In [None]:
data.iloc[2]['class']       # corresponding label 

'ham'

In [None]:
import re                                           # Python has a module named re to work with Regular Expressions
from string import punctuation                      # Gives us a collection of all the punctuations
from nltk.stem.snowball import SnowballStemmer      # For the purpose of word stemming

In [None]:
# vocabulary.txt stores the most frequently occurring words in our dataset
# this function reads all the words from the vocabulary file and creates a dictionary
def vocabulary_dictionary(path='vocabulary.txt'):
    """Read the vocabulary file and map every word to its integer index.

    Each non-blank line must contain an index followed by a word, separated
    by whitespace. Blank lines are skipped.

    Parameters
    ----------
    path : str, optional
        Location of the vocabulary file (defaults to 'vocabulary.txt').

    Returns
    -------
    dict
        ``{word: index}`` with the index converted to ``int``.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly two fields, or its index is
        not an integer.
    """
    vocab_dict = {}
    with open(path, 'r') as vocabulary:
        # iterate over the file directly instead of loading every line first
        for line_number, line in enumerate(vocabulary, start=1):
            fields = line.split()
            if not fields:
                # tolerate blank lines, e.g. a trailing newline at the end
                continue
            if len(fields) != 2:
                raise ValueError(
                    'line {0} of {1}: expected "<index> <word>", got {2!r}'.format(
                        line_number, path, line))
            index, word = fields
            vocab_dict[word] = int(index)

    return vocab_dict

In [None]:
vocab = vocabulary_dictionary()
print(vocab)

{'aa': 1, 'ab': 2, 'abil': 3, 'abl': 4, 'about': 5, 'abov': 6, 'absolut': 7, 'abus': 8, 'ac': 9, 'accept': 10, 'access': 11, 'accord': 12, 'account': 13, 'achiev': 14, 'acquir': 15, 'across': 16, 'act': 17, 'action': 18, 'activ': 19, 'actual': 20, 'ad': 21, 'adam': 22, 'add': 23, 'addit': 24, 'address': 25, 'administr': 26, 'adult': 27, 'advanc': 28, 'advantag': 29, 'advertis': 30, 'advic': 31, 'advis': 32, 'ae': 33, 'af': 34, 'affect': 35, 'affili': 36, 'afford': 37, 'africa': 38, 'after': 39, 'ag': 40, 'again': 41, 'against': 42, 'agenc': 43, 'agent': 44, 'ago': 45, 'agre': 46, 'agreement': 47, 'aid': 48, 'air': 49, 'al': 50, 'alb': 51, 'align': 52, 'all': 53, 'allow': 54, 'almost': 55, 'alon': 56, 'along': 57, 'alreadi': 58, 'alsa': 59, 'also': 60, 'altern': 61, 'although': 62, 'alwai': 63, 'am': 64, 'amaz': 65, 'america': 66, 'american': 67, 'among': 68, 'amount': 69, 'amp': 70, 'an': 71, 'analysi': 72, 'analyst': 73, 'and': 74, 'ani': 75, 'anim': 76, 'announc': 77, 'annual': 78, '

In [None]:
print(vocab.keys())

dict_keys(['aa', 'ab', 'abil', 'abl', 'about', 'abov', 'absolut', 'abus', 'ac', 'accept', 'access', 'accord', 'account', 'achiev', 'acquir', 'across', 'act', 'action', 'activ', 'actual', 'ad', 'adam', 'add', 'addit', 'address', 'administr', 'adult', 'advanc', 'advantag', 'advertis', 'advic', 'advis', 'ae', 'af', 'affect', 'affili', 'afford', 'africa', 'after', 'ag', 'again', 'against', 'agenc', 'agent', 'ago', 'agre', 'agreement', 'aid', 'air', 'al', 'alb', 'align', 'all', 'allow', 'almost', 'alon', 'along', 'alreadi', 'alsa', 'also', 'altern', 'although', 'alwai', 'am', 'amaz', 'america', 'american', 'among', 'amount', 'amp', 'an', 'analysi', 'analyst', 'and', 'ani', 'anim', 'announc', 'annual', 'annuiti', 'anoth', 'answer', 'anti', 'anumb', 'anybodi', 'anymor', 'anyon', 'anyth', 'anywai', 'anywher', 'aol', 'ap', 'apolog', 'app', 'appar', 'appear', 'appl', 'appli', 'applic', 'appreci', 'approach', 'approv', 'apt', 'ar', 'archiv', 'area', 'aren', 'argument', 'arial', 'arm', 'around', '

In [None]:
print(vocab.values())

dict_values([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 

In [None]:
# this function takes an email from the dataset as input and returns a word vector
# this word vector indicates which vocabulary words are present in the email
def email_preprocessing(email, vocab_dict=None):
    """Normalise an email and return the vocabulary indices of its words.

    The text is lower-cased, HTML tags are replaced by a space, digit runs by
    'number', URLs by 'httpaddr', email addresses by 'emailaddr' and dollar
    signs by 'dollar'. Punctuation is then removed, the text is split on
    whitespace and every token is stemmed. Stems found in the vocabulary
    contribute their index to the result; all other tokens are dropped.

    Parameters
    ----------
    email : str
        Text of the email.
    vocab_dict : dict, optional
        ``{word: index}`` mapping to look stems up in. When omitted it is
        loaded with ``vocabulary_dictionary()``; pass it explicitly to avoid
        re-reading the file when many emails are processed.

    Returns
    -------
    list of int
        Vocabulary indices in order of appearance; a word that occurs several
        times contributes its index several times.
    """
    if vocab_dict is None:
        vocab_dict = vocabulary_dictionary()

    # converting the entire email into lower case
    email = email.lower()

    # Raw strings keep escapes such as \s intact for the regex engine and
    # avoid Python's invalid-escape-sequence warning.

    # removing all the HTML tags from the email
    email = re.sub(r'<[^<>]+>', ' ', email)

    # replacing numbers with the text 'number'
    email = re.sub(r'[0-9]+', 'number', email)

    # replacing URLs with the text 'httpaddr'
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)

    # replacing email addresses with the text 'emailaddr'
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)

    # replacing dollar sign($) with the text "dollar".
    email = re.sub(r'[$]+', 'dollar', email)

    # removing punctuations
    email = email.translate(str.maketrans('', '', punctuation))

    # one stemmer serves every token; building it per token is wasted work
    stemmer = SnowballStemmer("english")

    word_vector = []
    for token in email.split():
        # drop whatever non-alphanumeric characters survived the steps above
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        token = stemmer.stem(token.strip())

        if len(token) < 1:
            continue

        if token in vocab_dict:
            word_vector.append(vocab_dict[token])

    return word_vector

In [None]:
word_vector = email_preprocessing(data.iloc[2]['text'])

WE NEED HELP.  We are a 14 year old fortune 500 company, and we have 

grown 1000%!  We cannot keep up.  We are looking for individuals who 

want to work at home, and make a good living.



So if you are looking to be employed from home with a career that has

vast opportunities, then go:



http://www.lotsonet.com/opportunity



and fill out our info form.  NO EXPERIENCE REQUIRED, WE WILL TRAIN YOU.

NO COMMITTMENT IS REQUIRED BY FILLING OUT THE FORM, IT IS FOR INFO

ONLY.



http://www.lotsonet.com/opportunity



You want to be independent?  THEN MAKE IT HAPPEN!

HAPPEN!



SIMPLY CLICK ON THE LINK BELOW FOR FREE, NO OBLIGATED INFORMATION!

GUARANTEED!



http://www.lotsonet.com/opportunity





To be removed from our link simple go to:



http://www.lotsonet.com/opportunity/remove.html





4171uCAC8-021gCco3337qYKc4-050NVjZ1161Zl37





we need help.  we are a number year old fortune number company, and we have 

grown number%!  we cannot keep up.  we are looking for individuals w

In [None]:
import numpy as np
print(np.shape(word_vector))

(108,)


In [None]:
print(word_vector)

[1819, 1093, 771, 1819, 1120, 1889, 1170, 675, 1120, 321, 74, 1819, 756, 1120, 1819, 240, 904, 1760, 1819, 976, 666, 837, 1844, 1809, 1699, 1869, 124, 787, 74, 997, 724, 964, 1538, 810, 1893, 976, 1699, 162, 534, 688, 787, 1860, 248, 1665, 1178, 1671, 718, 799, 74, 640, 1192, 1191, 839, 672, 1106, 594, 1406, 1819, 1852, 1720, 1893, 1106, 877, 1406, 227, 640, 1192, 1666, 672, 883, 877, 666, 839, 1173, 799, 1893, 1809, 1699, 162, 832, 1671, 997, 883, 750, 750, 1518, 298, 1171, 1666, 959, 174, 666, 681, 1106, 1156, 840, 739, 799, 1699, 162, 1398, 688, 1191, 959, 1517, 718, 1699, 799]


In [None]:
import numpy as np

# this function takes a word vector as input and returns a feature vector
def create_feature_vector(word_vector, num_of_words=1899):
    """Turn a list of 1-based vocabulary indices into a binary column vector.

    Vocabulary index ``k`` (1 <= k <= num_of_words) sets row ``k - 1`` of the
    result to 1, so the first word uses row 0 and the last word uses the last
    row. Repeated indices have no additional effect.

    Parameters
    ----------
    word_vector : iterable of int
        1-based vocabulary indices.
    num_of_words : int, optional
        Size of the vocabulary, i.e. the number of rows of the result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_of_words, 1)`` holding 0.0 / 1.0.

    Raises
    ------
    ValueError
        If an index lies outside ``1..num_of_words``.
    """
    indices = np.asarray(list(word_vector), dtype=int)

    # Reject out-of-range values explicitly: index 0 would otherwise wrap
    # around to the last row once shifted to 0-based.
    if indices.size and (indices.min() < 1 or indices.max() > num_of_words):
        raise ValueError(
            'vocabulary indices must lie in 1..{0}'.format(num_of_words))

    feature_vector = np.zeros((num_of_words, 1))
    # shift from 1-based vocabulary indices to 0-based array rows
    feature_vector[indices - 1] = 1

    return feature_vector

In [None]:
features = create_feature_vector(word_vector)
print(np.shape(features))

(1899, 1)


In [None]:
# creating train_x and train_y
# the number of rows is taken from the loaded data instead of being hard-coded,
# so this cell keeps working when the dataset size changes
num_emails = len(data)
num_features = 1899      # length of the vector returned by create_feature_vector
train_x = np.zeros((num_emails, num_features))
train_y = np.zeros((num_emails, 1))
for i, (text, label) in enumerate(zip(data['text'], data['class'])):
    word_vector = email_preprocessing(text)
    features = create_feature_vector(word_vector)
    # features is a column vector; flatten it to fill one row of train_x
    train_x[i] = features.ravel()
    # spam is encoded as 1, everything else stays 0
    if label == SPAM:
        train_y[i] = 1

In [None]:
print(np.shape(train_x))
print(np.shape(train_y))

(6849, 1899)
(6849, 1)


In [None]:
print(train_y[10:20, :])

[[0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]]


In [None]:
for i in range(10, 20):
  print(data.iloc[i]['class'])

ham
spam
spam
ham
ham
spam
spam
spam
spam
spam


In [None]:
# The spam classifier is a linear SVM.
# As per the question "The algorithms (except SVM) need to be coded from scratch",
# so "sklearn" is used for building the SVM classifier.

from sklearn import svm

C = 0.1
X = train_x
# sklearn expects the labels as a flat vector rather than a column
y = train_y.ravel()

svc = svm.SVC(kernel='linear', C=C)
svc.fit(X, y)

# predictions on the very data the model was fitted on
p = svc.predict(X)

print('Training Accuracy: {0:.2f}%'.format((p == y).astype(int).mean() * 100))

Training Accuracy: 99.77%


In [36]:
# this code extracts the test.zip archive into the current working directory
# if your test folder is not a zip archive then do not run this block
zip_path = 'test.zip'
# the `with` block closes the archive even when extraction raises
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall()

In [40]:
import os
directory = 'test'

# this function reads the emails of the 'test' folder
# generates prediction for each of the test emails
# and returns a prediction vector
def read_and_predict(test_dir=None):
    """Classify every ``.txt`` email in the test folder with the fitted `svc`.

    Parameters
    ----------
    test_dir : str, optional
        Folder holding the test emails. Defaults to the module-level
        `directory`.

    Returns
    -------
    numpy.ndarray
        One prediction per ``.txt`` file. The number embedded in the file name
        (1-based) selects the slot, so slot ``k - 1`` holds the prediction for
        email number ``k``.
    """
    if test_dir is None:
        test_dir = directory

    # list the folder once and reuse the result for sizing and for the loop
    filenames = [name for name in os.listdir(test_dir) if name.endswith(".txt")]
    predictions = np.zeros((len(filenames), ))

    for filename in filenames:
        # Drop the ".txt" suffix and the first five characters; the rest is
        # parsed as the email number.
        # NOTE(review): assumes names look like "emailN.txt" with N running
        # from 1 to the number of test files - confirm against the test set.
        email_number = int(filename[:-4][5:])

        # Read from `test_dir` (not a hard-coded 'test/') and decode as
        # latin-1, the same encoding used when the training emails were read.
        with open(os.path.join(test_dir, filename), encoding="latin-1") as email:
            file_contents = email.read()

        word_vector = email_preprocessing(file_contents)
        feature_vector = create_feature_vector(word_vector)
        pr = svc.predict(feature_vector.transpose())
        # predict returns a one-element array; store the scalar explicitly
        predictions[email_number - 1] = pr[0]

    return predictions

In [42]:
read_and_predict()       

array([0., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0.])