# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [5]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})

path_to_resourse = '/home/bobsira/Documents/Python/udemy AI course/DataScience-Python3/emails'

data = data.append(dataFrameFromDirectory(path_to_resourse + '/spam', 'spam'))
data = data.append(dataFrameFromDirectory(path_to_resourse + '/ham', 'ham'))


Let's have a look at that DataFrame:

In [12]:
data.head()

Unnamed: 0,class,message
/home/bobsira/Documents/Python/udemy AI course/DataScience-Python3/emails/spam/00196.dd21040c7757d477c967ae71b537810e,spam,"<html>\n\n\n\n<head>\n\n<meta http-equiv=3D""Co..."
/home/bobsira/Documents/Python/udemy AI course/DataScience-Python3/emails/spam/00498.48c3098854d339353f1a28a13b196017,spam,This is an HTML email message. If you see thi...
/home/bobsira/Documents/Python/udemy AI course/DataScience-Python3/emails/spam/00053.d88d8b162ca1b7108221fb338cd7d0a5,spam,<html>\n\n<body>\n\n<p>SEE US FOR FREE! <br>\n...
/home/bobsira/Documents/Python/udemy AI course/DataScience-Python3/emails/spam/00376.f4ed5f002f9b6b320a67f1da9cacbe72,spam,"<html>\n\n<head>\n\n <meta http-equiv=3D""Con..."
/home/bobsira/Documents/Python/udemy AI course/DataScience-Python3/emails/spam/00247.4f7c67c9792706fa90fe218d4b092b7a,spam,MR MARTIN FRANCIS\n\nABIDJAN COTE DIVOIRE\n\n...


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [7]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's try it out:

In [8]:
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'], 
      dtype='<U4')

In [10]:
examples_test = [
    'Please do not hesitate to contact us for possible business co-operation if you are interested',
    "Entrance to the club is free"
]
example_counts_tests = vectorizer.transform(examples)
predictions_tests = classifier.predict(example_counts)
predictions_tests

array(['spam', 'ham'], 
      dtype='<U4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [None]:
#SMS Spam Classifier

In [15]:
import pandas as pd
# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table('smsspamcollection/SMSSpamCollection',
                   sep='\t',
                   header=None,
                   names=['label', 'sms_message'])

df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#1.1 Process the data set
#We need to transform the labels to binary values so we can run the regression. Here 1 = "spam" and 0 = "ham"

In [16]:
#Map applies a function to all the items in an input list or df column.
df['label'] = df.label.map({'ham':0, 'spam':1})
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#2.1 Enter Bag of Words

In [19]:
import string #punctuation
import pprint
from collections import Counter #frequencies

#Bag of Words from scratch
documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

lower_case_documents = []

for i in documents:
    lower_case_documents.append(i.lower())
print("lower case:", lower_case_documents)

# Remove punctuation.
sans_punctuation_documents = []

for i in lower_case_documents:
    sans_punctuation_documents = ["".join( j for j in i if j not in string.punctuation) for i in  lower_case_documents]
print("no punctuation:", (sans_punctuation_documents))

#Break each word
preprocessed_documents = []

for i in sans_punctuation_documents:
    preprocessed_documents.append(i.split(' ')) #split on space
print("break words:", (preprocessed_documents))

#Count frequency of words using counter
frequency_list = []

for i in preprocessed_documents:
    frequency_counts = Counter(i)
    frequency_list.append(frequency_counts)
print("tokenized counts:", pprint.pprint(frequency_list))

lower case: ['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']
no punctuation: ['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']
break words: [['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]
[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]
tokenized counts: None


In [None]:
#2.2 SciKit-Learn Feature Extraction

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer() #set the variable

count_vector.fit(documents) #fit the function
count_vector.get_feature_names() #get the outputs

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [21]:
doc_array = count_vector.transform(documents).toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]])

In [22]:
frequency_matrix = pd.DataFrame(doc_array,
                                columns = count_vector.get_feature_names()
                               )
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [None]:
#3.1 Training & Testing Sets
#We'll split our dataset using scikit's train_test_split method into 
#training and testing sets so we can make inferences about the model's 
#accuracy on data it hasn't been trained on.

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                    df['label'],
                                                    random_state=1)

print("Our original set contains", df.shape[0], "observations")
print("Our training set contains", X_train.shape[0], "observations")
print("Our testing set contains", X_test.shape[0], "observations")

Our original set contains 5572 observations
Our training set contains 4179 observations
Our testing set contains 1393 observations


In [None]:
#Fit the training & testing data to the CountVectorizer() method and return a matrix

In [28]:
train = count_vector.fit_transform(X_train)
test = count_vector.transform(X_test)

In [None]:
#4.1 Implementing Baye's Theorem from Scratch

In [29]:
#performing calculations:


p_hiv = .015 #P(HIV) assuming 1.5% of the population has HIV

p_no_hiv = .98 # P(~HIV)

p_positive_hiv = .95 #sensitivity

p_negative_hiv = .95#specificity

#P(Positive)
p_positive = (p_hiv * p_positive_hiv) + (p_no_hiv * (1-p_negative_hiv))
print("The probability of getting a positive test result is:", p_positive, "this is our prior")

The probability of getting a positive test result is: 0.06325000000000004 this is our prior


In [31]:
#P(HIV | Positive)
p_hiv_positive = (p_hiv * p_positive_hiv) / p_positive

print ("The probability of a person having HIV, given a positive test result is:", p_hiv_positive)

#P(~HIV | Positive)
p_positive_no_hiv = 1 - p_positive_hiv
p_no_hiv_positive = (p_no_hiv * p_positive_no_hiv) / p_positive

print ("The probability of an individual not having HIV given getting a positive test result is:", p_no_hiv_positive)

The probability of a person having HIV, given a positive test result is: 0.22529644268774687
The probability of an individual not having HIV given getting a positive test result is: 0.7747035573122532


In [32]:
posterior_sum = p_no_hiv_positive + p_hiv_positive
posterior_sum #sum to 1, looks good!

1.0

In [33]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB() #call the method
naive_bayes.fit(train, y_train) #train the classifier on the training set
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
predictions = naive_bayes.predict(test) #predic using the model on the testing set

In [36]:
from sklearn.metrics import accuracy_score, precision_score,f1_score

print(('accuracy score: '),format(accuracy_score(y_test,predictions)))
print(('precision score: '),format(precision_score(y_test,predictions)))

accuracy score:  0.9885139985642498
precision score:  0.9720670391061452
