In [1]:
#import required libraries
import pandas as pd
import string
from nltk.corpus import stopwords

In [4]:
# Load the SMS spam collection with pandas. The file is tab-separated and has
# no header row, so the two column names are supplied explicitly.
# The path is a raw string so the backslashes are never read as escape
# sequences (the plain literal relied on '\d' and '\S' being invalid escapes).
df_spamCollection = pd.read_csv(r'C:\dataset\SMSSpamCollection', sep='\t', names=['response', 'message'])

In [5]:
# preview the first five rows of the freshly loaded frame
df_spamCollection.head(n=5)

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Summarise the spam data with describe(); for these text columns the result
# lists count, number of unique values, the most frequent value (top) and how
# often it occurs (freq).
df_spamCollection.describe()

Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
# Group the rows by their 'response' label and run describe() on each group,
# giving the same summary statistics separately per label.
df_spamCollection.groupby('response').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,message
response,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,count,4825
ham,unique,4516
ham,top,"Sorry, I'll call later"
ham,freq,30
spam,count,747
spam,unique,653
spam,top,Please call our customer service representativ...
spam,freq,4


In [8]:
# Add the length of every message as a new 'length' column (feature).
# The vectorised .str.len() accessor replaces the per-row apply(len) and
# leaves a missing message as NaN instead of raising a TypeError.
df_spamCollection['length'] = df_spamCollection['message'].str.len()

In [9]:
# preview the first five rows again, now including the 'length' column
df_spamCollection.head(n=5)

Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [10]:
# define a function that strips punctuation and removes English stopwords
# from a message
def message_text_process(mess):
    """Split a message into words, dropping punctuation and stopwords.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    list of str
        The whitespace-separated words of ``mess`` after every character in
        ``string.punctuation`` has been removed, excluding any word whose
        lower-cased form is in NLTK's English stopword list. The original
        casing of the kept words is preserved.
    """
    # Build both lookup sets once per call: the stopword list was previously
    # re-read and scanned linearly for every single word of the message.
    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words('english'))

    # Drop punctuation characters and re-form the sentence.
    no_punctuation = ''.join(char for char in mess if char not in punctuation)

    # Keep only the words that are not stopwords (case-insensitive check).
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [11]:
# sanity-check the tokenizer on the first five messages
df_spamCollection['message'].iloc[:5].apply(message_text_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [12]:
# start text processing with vectorizer 
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Build the bag-of-words vectorizer around the custom analyzer function and
# fit it on the message column to learn the vocabulary.
bag_of_words_transformer = (
    CountVectorizer(analyzer=message_text_process)
    .fit(df_spamCollection['message'])
)



In [14]:
# print the size of the bag-of-words vocabulary stored in the fitted
# vectorizer's vocabulary_ attribute
# (single-argument print() prints the same text on Python 2 and Python 3)
print(len(bag_of_words_transformer.vocabulary_))

11425


In [15]:
# Convert every message into its bag-of-words counts with the fitted
# vectorizer's transform method and keep the result for the tf-idf step.
message_bagofwords = bag_of_words_transformer.transform(df_spamCollection['message'])



In [16]:
# Create a tf-idf transformer and fit it on the bag-of-words counts of the
# messages.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = (
    TfidfTransformer()
    .fit(message_bagofwords)
)

In [17]:
# transform the bag-of-words counts into tf-idf weights, then print the shape
# of the result
message_tfidf = tfidf_transformer.transform(message_bagofwords)
# single-argument print() prints the same text on Python 2 and Python 3
print(message_tfidf.shape)

(5572, 11425)


In [18]:
# Use a multinomial naive Bayes classifier as the spam detector: the tf-idf
# matrix is the feature input and the 'response' column holds the labels.
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(
    X=message_tfidf,
    y=df_spamCollection['response'],
)

In [21]:
# Check the model on a single message (row label 4, i.e. the fifth record):
# run it through the same bag-of-words and tf-idf steps, then compare the
# predicted label with the expected one.
# NOTE(review): this message is part of the data the model was fitted on, so
# this is a sanity check rather than a measure of accuracy on unseen messages.
message = df_spamCollection['message'][4]
bag_of_words_for_message = bag_of_words_transformer.transform([message])
tfidf = tfidf_transformer.transform(bag_of_words_for_message)

# %-formatting inside a single-argument print() yields the same output line
# on Python 2 and Python 3 (a multi-argument print() would print a tuple on 2).
print('predicted %s' % spam_detect_model.predict(tfidf)[0])
print('expected %s' % df_spamCollection.response[4])

predicted ham
expected ham
