# ANAS MASROOR

## Spam Message Classifier Model

In [1]:
import pandas as pd
sms_spam = pd.read_csv("SMSSpamCollection", sep = "\t", header=None, names = ["Label","SMS"])

print(sms_spam.shape)
sms_spam.head()


(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
sms_spam["Label"].value_counts(normalize = True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

## Training and Test Set

We're now going to split our dataset into a training and a test set, where the training set accounts for 80% of the data, and the test set for the remaining 20%.

In [3]:
data_randomized = sms_spam.sample(frac=1, random_state=1)


training_test_index = round(len(data_randomized) * 0.8)
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)


training_set.shape, test_set.shape

((4458, 2), (1114, 2))

In [4]:
training_set["Label"].value_counts(normalize = True)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

In [5]:
test_set["Label"].value_counts(normalize = True)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

## Data Cleaning

In [6]:
training_set["SMS"] = training_set["SMS"].str.replace("\W"," ").str.lower()
training_set.head()

  training_set["SMS"] = training_set["SMS"].str.replace("\W"," ").str.lower()


Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


### Creating the Vocabulary

In [7]:
training_set["SMS"] = training_set["SMS"].str.split()

training_set["SMS"]

0                       [yep, by, the, pretty, sculpture]
1       [yes, princess, are, you, going, to, make, me,...
2                         [welp, apparently, he, retired]
3                                                [havent]
4       [i, forgot, 2, ask, ü, all, smth, there, s, a,...
                              ...                        
4453    [sorry, i, ll, call, later, in, meeting, any, ...
4454    [babe, i, fucking, love, you, too, you, know, ...
4455    [u, ve, been, selected, to, stay, in, 1, of, 2...
4456    [hello, my, boytoy, geeee, i, miss, you, alrea...
4457                              [wherre, s, my, boytoy]
Name: SMS, Length: 4458, dtype: object

In [8]:
vocabulary = []

for sms in training_set["SMS"]:
    for word in sms:
        vocabulary.append(word)
        
vocabulary = list(set(vocabulary))

In [9]:
len(vocabulary)

7783

### The Final Training Set

In [10]:
pd.DataFrame({"yep": [0] * len(training_set["SMS"]),"to": [0] * len(training_set["SMS"])})

Unnamed: 0,yep,to
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
4453,0,0
4454,0,0
4455,0,0
4456,0,0


In [11]:
training_set["SMS"].head(2)

0                    [yep, by, the, pretty, sculpture]
1    [yes, princess, are, you, going, to, make, me,...
Name: SMS, dtype: object

In [12]:
word_counts_per_sms = {unique_word: [0] * len(training_set["SMS"]) for unique_word in vocabulary}
for index, sms in enumerate(training_set["SMS"]):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [13]:
word_counts_per_sms["retired"][2]

1

In [14]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,astoundingly,midnight,195,she,throwin,thedailydraw,afraid,oli,ain,borderline,...,highest,87077,ola,rubber,help,m39m51,frndshp,true,basq,cost
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
word_counts["yep"]

0       1
1       0
2       0
3       0
4       0
       ..
4453    0
4454    0
4455    0
4456    0
4457    0
Name: yep, Length: 4458, dtype: int64

In [16]:
training_set_clean = pd.concat([training_set, word_counts], axis = 1)
training_set_clean.head()

Unnamed: 0,Label,SMS,astoundingly,midnight,195,she,throwin,thedailydraw,afraid,oli,...,highest,87077,ola,rubber,help,m39m51,frndshp,true,basq,cost
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Calculating Constants First

In [17]:
spam_message = training_set_clean[training_set_clean["Label"] == "spam"]
ham_message = training_set_clean[training_set_clean["Label"] == "ham"]

p_spam = len(spam_message)/len(training_set_clean)
p_ham = len(ham_message)/len(training_set_clean)

n_word_per_spam_message = spam_message["SMS"].apply(len)
n_spam = n_word_per_spam_message.sum()

n_word_per_ham_message = ham_message["SMS"].apply(len)
n_ham = n_word_per_ham_message.sum()

n_vocabulary = len(vocabulary)

alpha = 1

## Calculating Parameters

In [18]:
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}


for word in vocabulary:
    n_word_given_spam = spam_message[word].sum()
    p_word_given_spam = (n_word_given_spam + alpha)/(n_spam + alpha * n_vocabulary)
    parameters_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham_message[word].sum()
    p_word_given_ham = (n_word_given_ham + alpha)/(n_ham + alpha * n_vocabulary)
    parameters_ham[word] = p_word_given_ham

## Classifying A New Message

In [19]:
import re

def classify(message):
    """
    message: a string
    """
    
    message = re.sub("\W"," ", message)
    message = message.lower().split()
    
    p_spam_given_messeage = p_spam
    p_ham_given_messeage = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_messeage *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_messeage *= parameters_ham[word]
            
    if p_ham_given_messeage > p_spam_given_messeage:
        return "ham"
    elif p_ham_given_messeage < p_spam_given_messeage:
        return "spam"
    else:
        return "needs human classification"

## Measuring the Spam Filter's Accuracy

In [20]:
test_set["predicted"] = test_set["SMS"].apply(classify)
test_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


Now, we'll write a function to measure the accuracy of our spam filter to find out how well our spam filter does.

In [21]:
correct = 0
total = test_set.shape[0]

for index, row in test_set.iterrows():
    if row["Label"] == row["predicted"]:
        correct += 1
        
print("Correct: ", correct)
print("Incorrect: ", total - correct)
print("Accuracy: ", (correct/total) * 100)

Correct:  1100
Incorrect:  14
Accuracy:  98.74326750448833


The accuracy is close to 98.74%, which is really good. 

In [22]:
import pickle
pickle.dump(training_set_clean,open('words.pkl','wb'))
pickle.dump(vocabulary,open('vocab.pkl','wb'))