In [1]:
# Import Libraries
import pandas as pd
import string
from nltk.corpus import stopwords


In [5]:
# Load the spam collection dataset: a tab-separated file fetched over HTTP from GitHub
df_spamCollection = pd.read_csv('https://raw.githubusercontent.com/ammishra08/MachineLearning/master/Datasets/SpamCollection',
                               sep = '\t', names = ['response','message'])

# `names` supplies the column names: 'response' (the ham/spam label) and 'message' (the text)

In [6]:
# view 5 records
df_spamCollection.head()


Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# view more information about the spam data using describe method
df_spamCollection.describe()


Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [11]:
# view response using group by and describe method
df_spamCollection.groupby('response').describe().T


Unnamed: 0,response,ham,spam
message,count,4825,747
message,unique,4516,653
message,top,"Sorry, I'll call later",Please call our customer service representativ...
message,freq,30,4


In [13]:
# Compute the length of each message and store it as a new column (feature)
df_spamCollection['length'] = df_spamCollection['message'].apply(len)
df_spamCollection['length']

0       111
1        29
2       155
3        49
4        61
       ... 
5567    160
5568     36
5569     57
5570    125
5571     26
Name: length, Length: 5572, dtype: int64

In [14]:
# view first 5 messages with length
df_spamCollection.head()


Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [15]:
# Define a function that strips punctuation and stop words from a message
def message_text_process(message, stop_words=None):
    """Tokenise a message, dropping punctuation characters and stop words.

    Parameters
    ----------
    message : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the punctuation-free message, in
        their original order and original case, excluding every token whose
        lower-cased form is a stop word.
    """
    if stop_words is None:
        # Load the list once per call rather than once per token
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests instead of a list scan per token
    stop_words = set(stop_words)
    # Drop every character listed in string.punctuation and rebuild the text
    no_punctuation = ''.join(char for char in message if char not in string.punctuation)
    # Split on whitespace and eliminate the stop words (compared case-insensitively)
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [16]:
df_spamCollection['message'].head(5).apply(message_text_process)


0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [19]:
# start text processing with vectorizer 
from sklearn.feature_extraction.text import CountVectorizer


In [20]:
# bag of words by applying the function and fit the data (message) into it
bag_of_words_transformer = CountVectorizer(analyzer=message_text_process).fit(df_spamCollection['message'])


In [21]:
# print length of bag of words stored in the vocabulary_ attribute
len(bag_of_words_transformer.vocabulary_)


11425

In [22]:
# Vectorize the entire bag of words using transform method
message_bagofwords = bag_of_words_transformer.transform(df_spamCollection['message'])


In [23]:
# NOTE(review): .toarray() materialises the whole sparse matrix as a dense array
# (5572 x 11425 per the shape shown in the next cell) only for NumPy to print a
# truncated view; slicing a few rows first, e.g. message_bagofwords[:5].toarray(),
# would give a comparable preview without the large dense allocation.
print(message_bagofwords.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [24]:
message_bagofwords.shape

(5572, 11425)

#### Term Frequency & Inverse Document Frequency (TF-IDF)


In [25]:
# Fit a TF-IDF transformer on the bag-of-words count matrix
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(message_bagofwords)


In [26]:
# Convert the counts to TF-IDF weights and print the resulting sparse matrix
# (each output line is "(row, column)  weight" for one stored entry)
message_tfidf = tfidf_transformer.transform(message_bagofwords)
print(message_tfidf)


  (0, 11163)	0.23026685592418913
  (0, 10965)	0.19073428545061483
  (0, 8917)	0.24704652376837993
  (0, 8336)	0.17046869292195632
  (0, 7668)	0.26403384065473806
  (0, 7555)	0.31253856260694546
  (0, 6937)	0.1834692413608692
  (0, 6906)	0.15158474664662352
  (0, 6217)	0.18915557732842803
  (0, 5769)	0.24984711892976424
  (0, 5218)	0.26870593862526665
  (0, 5217)	0.29835184088197164
  (0, 4653)	0.31253856260694546
  (0, 2060)	0.24203960256420656
  (0, 1483)	0.31253856260694546
  (0, 1110)	0.2882862016308418
  (1, 11072)	0.40061560982443056
  (1, 10698)	0.2063637481323008
  (1, 8590)	0.5043405901305854
  (1, 7701)	0.3767401070812794
  (1, 3064)	0.2911995411244838
  (1, 2451)	0.561988811929381
  (2, 11123)	0.19104387220509106
  (2, 11084)	0.15898145347176754
  (2, 10686)	0.13995540820792943
  :	:
  (5568, 6882)	0.31367469776242124
  (5568, 6691)	0.47781076401785183
  (5568, 6354)	0.5575721048646767
  (5568, 4880)	0.3853122086093004
  (5569, 10199)	0.520467167163554
  (5569, 8252)	0.432829

### Naive Bayes' Algorithm

In [42]:
# choose naive Bayes model to detect the spam and fit the tfidf data into it
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(message_tfidf, df_spamCollection['response'])


In [45]:
# Check model for the predicted & expected value 
message = df_spamCollection['message'][1678]
bag_of_words_for_message = bag_of_words_transformer.transform([message])
tfidf = tfidf_transformer.transform(bag_of_words_for_message)


In [46]:
# model predicted response
spam_detect_model.predict(tfidf)[0]


'ham'

In [47]:
# actual response
df_spamCollection['response'][1678]


'ham'

In [49]:
# Mean accuracy of the model on the same data it was fitted on.
# NOTE(review): there is no held-out test split, so this is training accuracy and
# is likely an optimistic estimate of performance on unseen messages.
spam_detect_model.score(message_tfidf, df_spamCollection['response'])


0.9793610911701364