# Spam Classifier using Naive Bayes

Use sklearn.naive_bayes to train a spam classifier! 

Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [0]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Method to iterate through files, build path name, skip header and return path, msg
def readFiles(path):
    
    # walk through currently directory
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            
            # build path name
            path = os.path.join(root, filename)

            # identify if in email body
            inBody = False
            
            # line array
            lines = []
            
            # open file
            f = io.open(path, 'r', encoding='latin1')
            
            # for every line in the file
            for line in f:
                
                # if in body, append to line array
                if inBody:
                    lines.append(line)
                    
                # if line is empty, signal as in body
                elif line == '\n':
                    inBody = True
            f.close()
            
            # join all lines followed up next line
            message = '\n'.join(lines)
            
            # return constructed path name and body of email
            yield path, message

# Method to build dataframe by appending message
def dataFrameFromDirectory(path, classification):
    
    # rows and index to store data
    rows = []
    index = []
    
    # for every email found
    for filename, message in readFiles(path):
        
        # append to dictionary
        rows.append({'message': message, 'class': classification})
        
        # add index key
        index.append(filename)

    # final dataframe
    return DataFrame(rows, index=index)

# create DF obj with initial construction
data = DataFrame({'message': [], 'class': []})

# read in files
data = data.append(dataFrameFromDirectory('emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('emails/ham', 'ham'))


Let's have a look at that DataFrame:

In [2]:
# explore dataframe
data.head()

Unnamed: 0,message,class
emails/spam/00164.8536500ed9cadc8397a63b697d043c0b,Help wanted. We are a 14 year old fortune 500...,spam
emails/spam/00217.43b4ef3d9c56cf42be9c37b546a19e78,"<html><xbody>\n\n<hr width = ""100%"">\n\n<cente...",spam
emails/spam/00080.5a7386cb47846dfef68429241ad80354,This is a multi-part message in MIME format.\n...,spam
emails/spam/00037.21cc985cc36d931916863aed24de8c27,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont...",spam
emails/spam/00085.f63a9484ac582233db057dbb45dc0eaf,HELLO...By reading the short Summary just belo...,spam


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [3]:
#count how many times each word is found within each email
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

# classify spam or ham by number of words found 
classifier = MultinomialNB()
targets = data['class'].values

# fit model with actual data
classifier.fit(counts, targets)

MultinomialNB()

Let's try it out:

In [4]:
# sample emails
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]

# to count words
example_counts = vectorizer.transform(examples)

# to use classifier
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'], dtype='<U4')

Apply train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [8]:
from sklearn.model_selection import train_test_split

# split up datasets 80/20
X_train, X_test, y_train, y_test = train_test_split(counts, targets, test_size=0.2)

# train model
model = classifier.fit(X_train, y_train)

# use model to make predictions
predictions = classifier.predict(X_test)

# generate score
model.score(X_test, y_test)

0.955