In [1]:
import pandas as pd
from Preprocessing import Preprocessing

## Prepare the data for features extraction

In [33]:
# read email body only
def get_body(mail):
    '''
    param  -- generator object from file reader
    return -- list of text lines
    '''
    body = []
    for i, line in enumerate(mail):
        # mail body in the third line:
        if i == 2:
            body.append(line)
    return body

def prepare_text_data(files_lst):
    '''
    param  -- list of directory to be opened
    return -- dictionary contains index and message of every email
    '''
    data = {'index': [], 'msg': []}
    for i, email in enumerate(files_lst):
        # open every file and read the contents
        with open(email) as mail:
            data['index'].append(i)
            data['msg'].append(' '.join(line.rstrip('\n') for line in get_body(mail)))
    return data

# empty dict that will be used later for pandas
train_data = {'index': [], 'msg': []}
test_data  = {'index': [], 'msg': []}

# get list of all emails text files from the given directory
train_preprocessor = Preprocessing()
test_preprocessor = Preprocessing()

# training/test folder
train_preprocessor.set_directory('train-mails/')
test_preprocessor.set_directory('test-mails/')

# now we have all email files
train_files = train_preprocessor.get_emails()
test_files = test_preprocessor.get_emails()

# get training data and text data in form of dictionary
train_data = prepare_text_data(train_files)
test_data = prepare_text_data(test_files)

# set training and test labels(spam vs non-spam)
train_data['spam'] = np.zeros(train_preprocessor.get_emails_size(), dtype=int)
test_data['spam'] = np.zeros(test_preprocessor.get_emails_size(), dtype=int)

# label=1 is spam label=0 non-spam
train_data['spam'][351:] = 1
test_data['spam'][130:] = 1

## Let's take a look at the data 

In [34]:
train_set = pd.DataFrame.from_dict(train_data)
test_set = pd.DataFrame.from_dict(test_data)
train_set.head(5)

Unnamed: 0,index,msg,spam
0,0,- - - - - swiss linguistic society organize su...,0
1,1,"> deat : sun , 15 dec 91 2 : 25 : 2 est > : mi...",0
2,2,"discussion s - > np + np remind ago read , sou...",0
3,3,". . . 's much restrictive s - > np np . 's "" ""...",0
4,4,""" listserv "" international conference 1992 sec...",0


In [35]:
test_set.head(5)

Unnamed: 0,index,msg,spam
0,0,"shigeru kiritanus , hajime hirose hiroya fujis...",0
1,1,susanne winkler focus secondary predication 19...,0
2,2,johan elsness perfect preterite contemporary e...,0
3,3,"alan c . harri , ph . d . telnos : main off : ...",0
4,4,la jeune equipe syntaxe anglaise et syntaxe co...,0


In [36]:
# full text sample
print(train_set['msg'][60])
print('---------------------------')
print(test_set['msg'][60])

next 's nels conference jointly host harvard university mit . hop set conference date conflict major , nearby conference . host conference next fall , already set date , please send e-mail wednesday , nov . 16 . martha jo mcginni , mit
---------------------------
abstract due : 15 . february esslli-98 workshop current topics constraint-based theories germanic syntax august 17 - 21 , 1998 workshop hold part 10th european summer school logic , language information ( esslli-98 ) august 17 - 28 , 1998 , saarbrueken , germany * * second call papers * * organizers : tibor kiss detmar meurer ( ibm germany univ . tuebingen ) web site : http : / / www . dc . warwick . ac . uk / ~ esslli98 / workshop . html background : number approach germanic language ( exclude english ) develop constraint-base theory hpsg lfg . apart issue empirical adequacy , formal issue raise , among : - nature complex predicate mechanism formalize - linearization versus movement analysis various phenomenon - nature functi

## build bag of word model 

In [37]:
def remove_punctuation(text):
    '''
    param  -- string text to be cleaned
    return -- filtered text without punctuations
    '''
    import string
    for punc in string.punctuation:
        text = text.replace(punc, "")
    return text

# apply for all rows in the dataset
train_set['msg'] = train_set['msg'].apply(lambda row: remove_punctuation(row))
test_set['msg'] = test_set['msg'].apply(lambda row: remove_punctuation(row))
# here is sample after removing punctuations
train_set['msg'][60]

'next s nels conference jointly host harvard university mit  hop set conference date conflict major  nearby conference  host conference next fall  already set date  please send email wednesday  nov  16  martha jo mcginni  mit'

## Build bag of word vector

In [39]:
# import sklearn preprocessing library
from sklearn.feature_extraction.text import CountVectorizer
# max features is 3000 words count
count_vect = CountVectorizer(max_features=3000)
# count all words and build matrix features from words counts
train_count = count_vect.fit_transform(train_set['msg'])
test_count = count_vect.fit_transform(test_set['msg'])
# get sparse matrix
train_features = count_vect.transform(train_set['msg']).toarray()
test_features = count_vect.transform(test_set['msg']).toarray()
# now we have sparse matrix in the form [n_samples, n_features]
print(train_count.shape)
print(test_count.shape)
# labels
train_labels = train_set['spam']
test_labels = test_set['spam']

(702, 3000)
(260, 3000)


## it's time to train the model

In [41]:
# import SVC library
from sklearn.svm import LinearSVC
# initialize SVM classifier
svm_classifier = LinearSVC()
# fit the model to the training data
svm_classifier.fit(train_features, train_labels)

# predict the result
result = svm_classifier.predict(test_features)

# make confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_labels, result))

[[118  12]
 [ 11 119]]
