In [1]:
import pandas as pd
import string 
import re
    # re is regular expression
    # it is used here by cleaning input by clearing all non-alphanumeric characters

In [2]:
sms_spam = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names = ['Lable', 'SMS'])

In [3]:
# you use sept='t' due to the lay out of the data, it is set up in tabs
# header= None due to the data not having a header row
# lastly you use names = ['Lable', 'SMS'] to lable the columns

In [4]:
print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Lable,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
sms_spam['Lable'].value_counts(normalize=True)

Lable
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64

In [6]:
#next we train and test
#we will take 80% of it to use to train and the last to test what we trained
#first we have to scramble the dataset, to make sure that everything is properly spread out

In [7]:
#randomize
# .sample is used to randomly sample rows from a DataFrame
# frac specifies the fraction of rows to return in the sample, when it =1 that means you want to shuffle all rows
# random_fraction is a seed value for the random number generator, providing a value makes sure it is able ot be reproduced
random = sms_spam.sample(frac=1, random_state=1)

In [8]:
# the index for the split
training_test_index = round(len(random) * 0.80)

In [9]:
# splitting the train and test
training_set = random[:training_test_index].reset_index(drop= True)
test_set = random[training_test_index:].reset_index(drop= True)

In [10]:
print(training_set.shape)
print(test_set.shape)

(4458, 2)
(1114, 2)


In [11]:
training_set['Lable'].value_counts(normalize=True)

Lable
ham     0.86541
spam    0.13459
Name: proportion, dtype: float64

In [12]:
test_set['Lable'].value_counts(normalize=True)

Lable
ham     0.868043
spam    0.131957
Name: proportion, dtype: float64

In [13]:
#data cleaning
# when we get a new message, it will read the message and decide if it is spam or ham based on what the subject line is
# words like winner, secret, prize, and party will be marked 

In [14]:
#before cleaning
training_set.head()

Unnamed: 0,Lable,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [15]:
#after training
# it removes the punctuation and makes everything lowercase
training_set['SMS'] = training_set['SMS'].str.translate(str.maketrans('', '', string.punctuation))
training_set['SMS'] = training_set['SMS'].str.lower()
training_set.head(3)

Unnamed: 0,Lable,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired


In [16]:
# we make each column a list byt spliting it 
training_set['SMS'] = training_set['SMS'].str.split()

In [17]:
# we make an empty list that will hold each unique word in the set

In [18]:
vocabulary = []
for sms in training_set['SMS']:
   for word in sms:
      vocabulary.append(word)

uniqueVocab = list(set(vocabulary))

In [19]:
# this shows us how many words are in the list
len(uniqueVocab)

8515

In [20]:
#we need to build a dictionary so we can later create a dataframe that we will need

In [21]:
word_counts_sms =  {'secret': [2,1,1],
                       'prize': [2,0,1],
                       'claim': [1,0,1],
                       'now': [1,0,1],
                       'coming': [0,1,0],
                       'to': [0,1,0],
                       'my': [0,1,0],
                       'party': [0,1,0],
                       'winner': [0,0,1]
                      }

word_counts = pd.DataFrame(word_counts_sms)
word_counts.head()

Unnamed: 0,secret,prize,claim,now,coming,to,my,party,winner
0,2,2,1,1,0,0,0,0,0
1,1,0,0,0,1,1,1,1,0
2,1,1,1,1,0,0,0,0,1


In [22]:
# to create a dictionary for our set, we start by creating a dictionary where each key is a unique word from our vocabuarly 
# each value is a list of the length of the training set
# we loop through training_set using enumerate() function to get the index and the SMS message 

In [23]:
word_counts_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
   for word in sms:
      word_counts_sms[word][index] += 1

In [24]:
word_counts = pd.DataFrame(word_counts_sms)
word_counts.head()

Unnamed: 0,yep,by,the,pretty,sculpture,yes,princess,are,you,going,...,beauty,hides,secrets,n8,jewelry,related,trade,arul,bx526,wherres
0,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
#we have no lable columns, so we use pd.concate 
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Lable,SMS,yep,by,the,pretty,sculpture,yes,princess,are,...,beauty,hides,secrets,n8,jewelry,related,trade,arul,bx526,wherres
0,ham,"[yep, by, the, pretty, sculpture]",1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, theres, a, c...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
#now we start to build the filter

In [27]:
#isolating the spam and not spam (ham) message
spam_messages = training_set_clean[training_set_clean['Lable'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Lable'] == 'ham']

In [28]:
# using the multinomial Naive Bayes algorithm to calcualte the constant terms
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# N_Spam
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# laplace smoothing
# this is a techinique used to handle zero probability issues in algorithms
alpha = 1

In [29]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
   n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
   p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
   p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
   parameters_ham[word] = p_word_given_ham

In [30]:
# the parameters are classified 
# the filter takes the header of the message, calucatles the number of spam and ham words, then compairs the number
    # if spam > ham then it is spam
    # if spam < ham then it is not spam
    # if spam = ham then the program may need human help to decide

In [31]:
def classify(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message) # this takes all not letters and replaces then with a space
   message = message.lower().split() # makes everything lowercase and splits the string to individual words

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham: 
         p_ham_given_message *= parameters_ham[word]

   print('P(Spam|message):', p_spam_given_message)
   print('P(Ham|message):', p_ham_given_message)

   if p_ham_given_message > p_spam_given_message:
      print('Label: Ham')
   elif p_ham_given_message < p_spam_given_message:
      print('Label: Spam')
   else:
      print('Equal proabilities, have a human classify this!')

In [32]:
# now to test and obvious spam and an obvious ham
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.2180230330772012e-30
P(Ham|message): 5.402341610870019e-30
Label: Ham


In [33]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 1.8792477080146232e-29
P(Ham|message): 2.869845573570632e-23
Label: Ham


In [34]:
# to make it more readable to everyone, this returns the classification instead of printing them
def classify_test_set(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   if p_ham_given_message > p_spam_given_message:
      return 'ham'
   elif p_spam_given_message > p_ham_given_message:
      return 'spam'
   else:
      return 'needs human classification'

In [35]:
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Lable,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [37]:
# to measure how accurate this is
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
   row = row[1]
   if row['Lable'] == row['predicted']:
      correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1069
Incorrect: 45
Accuracy: 0.9596050269299821


In [38]:
# it has a 95% accuracy 