In [82]:
import numpy as np
import pandas as pd
import string
import pprint
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [48]:
# import data, label
data_source = 'smsspamcollection/SMSSpamCollection'
data = pd.read_table(data_source, sep='\t', names=['label','sms_message'])
data.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
# convert ham/spam labels to 0s and 1s for use in scikitlearn
data['label'] = data.label.map({'ham':0, 'spam':1})
print(data.shape)
data.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [50]:
# implement bag of words from scratch for understanding purposes

documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

lower_case_documents = [x.lower() for x in documents] # convert all characters to lower case

# remove all punctuation
trans_punc = str.maketrans(dict.fromkeys(string.punctuation))
sans_punctuation_documents = [x.translate(trans_punc) for x in lower_case_documents] 

preprocessed_documents = [x.split() for x in sans_punctuation_documents] # tokenize the documents

frequency_list = [Counter(x) for x in preprocessed_documents]
frequency_list

[Counter({'are': 1, 'hello': 1, 'how': 1, 'you': 1}),
 Counter({'from': 1, 'home': 1, 'money': 1, 'win': 2}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'call': 1, 'hello': 2, 'tomorrow': 1, 'you': 1})]

In [62]:
# use count vectorizer to implement Bag of Words
count_vector = CountVectorizer()
count_matrix = count_vector.fit(documents)
feature_names = count_matrix.get_feature_names()
doc_array = count_vector.transform(documents).toarray()
frequency_matrix = pd.DataFrame(doc_array, columns=feature_names)
frequency_matrix


Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [80]:
X_train, X_test, y_train, y_test = train_test_split(data['sms_message'], data['label'], random_state=1)

# bag of words applied on the testing and training data
spam_training_data = count_vector.fit_transform(X_train)
spam_testing_data = count_vector.transform(X_test)

In [83]:
# naive bayes classifier for spam detection based on SMS messages
naive_bayes = MultinomialNB()
naive_bayes.fit(spam_training_data, y_train)
predictions = naive_bayes.predict(spam_testing_data)

print('Accuracy: ' + str(accuracy_score(y_test, predictions)))
print('Precision: ' + str(precision_score(y_test, predictions)))
print('Recall: ' + str(recall_score(y_test, predictions)))
print('F1 score: ' + str(f1_score(y_test, predictions)))

Accuracy: 0.988513998564
Precision: 0.972067039106
Recall: 0.940540540541
F1 score: 0.956043956044
