In [1]:
#import required libraries
import pandas as pd

In [7]:
#Get the spam data collection
spam_collection = pd.read_csv('SpamCollection.csv', delimiter='\t',header=None, names = ['response','message'])

In [8]:
#Pull data information 
spam_collection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   response  5572 non-null   object
 1   message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
#Describe the data frequencies and counts
spam_collection.describe()

Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [10]:
#View the first five records
spam_collection.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
#View response using group by and describe method 
spam_collection.groupby('response').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [18]:
#Verify length of the messages and also add a new column
spam_collection['message_length'] = spam_collection['message'].apply(lambda x: len(x))

In [19]:
#View first 6 records with new column
spam_collection.head(6)

Unnamed: 0,response,message,message_length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147


In [26]:
#Define a function to get rid of stopwords present in the messages 
import nltk
import string
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/palombaa/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/palombaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
#Remove stop words and punctuations
def remove_stopwords(message):
    message = message.lower()
    chars = [char for char in message if char not in string.punctuation]
    message = ''.join(chars)
    words = nltk.tokenize.word_tokenize(message)
    return [word for word in words if word not in stop_words]

In [28]:
#Verify whether the function is working
spam_collection.message.head().apply(remove_stopwords)

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, 2, wkly, comp, win, fa, cup, fin...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [29]:
#Start text processing with vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
#Use bag of words by applying the function and fit the data into it 
bag_of_words_transformer = CountVectorizer(analyzer=remove_stopwords).fit(spam_collection.message)

In [33]:
#Print length of bag of words stored in the vocabulary_attribute
print(len(bag_of_words_transformer.vocabulary_))

9506


In [34]:
#Store bag of words for message using transform methods 
bag_of_words_sparse_matrix = bag_of_words_transformer.transform(spam_collection.message)
bag_of_words_sparse_matrix

<5572x9506 sparse matrix of type '<class 'numpy.int64'>'
	with 50198 stored elements in Compressed Sparse Row format>

In [37]:
#Apply tfidf transformer and fit the bag of words into it (transformed version)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(bag_of_words_sparse_matrix)

In [38]:
message_tfidf = tfidf_transformer.transform(bag_of_words_sparse_matrix)

In [39]:
#Print shape of the tfidf
message_tfidf.shape

(5572, 9506)

In [40]:
#Choose Naive BAyes Model to detect the spam and fit the tfidf data into it 
from sklearn.naive_bayes import MultinomialNB
spam_detection_model = MultinomialNB().fit(message_tfidf, spam_collection.response)

In [41]:
#Check model for the predicted and expected value for top 10 messages
spam_collection['prediction'] = spam_detection_model.predict(message_tfidf)

In [42]:
for i in range(0,10):
    print('prediction of message', i, ' : ', spam_collection.prediction[i], 'expected: ', spam_collection.response[i])

prediction of message 0  :  ham expected:  ham
prediction of message 1  :  ham expected:  ham
prediction of message 2  :  spam expected:  spam
prediction of message 3  :  ham expected:  ham
prediction of message 4  :  ham expected:  ham
prediction of message 5  :  ham expected:  spam
prediction of message 6  :  ham expected:  ham
prediction of message 7  :  ham expected:  ham
prediction of message 8  :  spam expected:  spam
prediction of message 9  :  spam expected:  spam
