# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [122]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        #parses through all files in path
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            #read each line of the file
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            #this way it keeps on returning rows back to parent function "dataFrameFromDirectory"
            #and hence keeps the parents for loop running
            #EXCELLENT STUFF BY PYTHON! eh! 
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})
dataTest = DataFrame({'message': [], 'class': []})


data = data.append(dataFrameFromDirectory('E:/2- GoogleDrive (Total 15GB)/Interests/Machine Learning/Udemy - Data Science and Machine Learning with Python/DataScience/emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('E:/2- GoogleDrive (Total 15GB)/Interests/Machine Learning/Udemy - Data Science and Machine Learning with Python/DataScience/emails/ham', 'ham'))

dataHam = dataTest.append(dataFrameFromDirectory('E:/2- GoogleDrive (Total 15GB)/Interests/Machine Learning/Udemy - Data Science and Machine Learning with Python/DataScience/emails/ham', 'ham'))
dataSpam = dataTest.append(dataFrameFromDirectory('E:/2- GoogleDrive (Total 15GB)/Interests/Machine Learning/Udemy - Data Science and Machine Learning with Python/DataScience/emails/spam', 'spam'))

trainData = data[80:]
testData = data[:80]


Let's have a look at that DataFrame:

In [42]:
trainData.head()

Unnamed: 0,class,message
E:/2- GoogleDrive (Total 15GB)/Interests/Machine Learning/Udemy - Data Science and Machine Learning with Python/DataScience/emails/spam\00081.123b29a781b2e8c83763e5d440e672a3,spam,=================================\n\n\n\nGuara...
E:/2- GoogleDrive (Total 15GB)/Interests/Machine Learning/Udemy - Data Science and Machine Learning with Python/DataScience/emails/spam\00082.0341a767bbaca01fd89b6236ef681257,spam,This message is in MIME format. Since your mai...
E:/2- GoogleDrive (Total 15GB)/Interests/Machine Learning/Udemy - Data Science and Machine Learning with Python/DataScience/emails/spam\00083.c1891c507954e5b75b72b16712e799bf,spam,<HTML><HEAD><TITLE>Lowest Rate Services</TITLE...
E:/2- GoogleDrive (Total 15GB)/Interests/Machine Learning/Udemy - Data Science and Machine Learning with Python/DataScience/emails/spam\00084.a9f5b3a9b7feb7070f25ae76320c8ec6,spam,"Dear Sir,\n\n\n\nI am Dr James Alabi, the chai..."
E:/2- GoogleDrive (Total 15GB)/Interests/Machine Learning/Udemy - Data Science and Machine Learning with Python/DataScience/emails/spam\00085.f63a9484ac582233db057dbb45dc0eaf,spam,HELLO...By reading the short Summary just belo...


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [32]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(trainData['message'].values)
targets = trainData['class'].values

classifier = MultinomialNB()
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's try it out:

In [56]:
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'], 
      dtype='|S4')

----------
Now we are going to input all HAM/SPAM emails and see how many are correctly marked as SPAM/HAM
----------

In [123]:
ham_counts = vectorizer.transform(dataHam['message'])
predictions_ham = classifier.predict(ham_counts)


from collections import Counter
c = Counter(predictions_ham)
#c.items() #this simply gives you count of distinct values in the array

results = DataFrame(sorted(c.items()))
print str(100-(results[1][1]*100)/(results[1][0]+results[1][1])) + "% HAM emails are correctly identified"



100% HAM emails are correctly identified


In [125]:
spam_counts = vectorizer.transform(dataSpam['message'])
predictions_spam = classifier.predict(spam_counts)


from collections import Counter
c = Counter(predictions_spam)
#c.items() #this simply gives you count of distinct values in the array

results = DataFrame(sorted(c.items()))
print str(100-(results[1][0]*100)/(results[1][0]+results[1][1])) + "% SPAM emails are correctly identified"




83% SPAM emails are correctly identified


## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [39]:
testData.count()

class      80
message    80
dtype: int64