# Spam filter

### Goal: Develop a program to determine if SMS message is spam or ham (not spam) using Naive Bayes algorithm

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("SMSSpamCollection", sep= "\t", header= None, names= ["Label", "SMS"])
data

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
data.describe()

Unnamed: 0,Label,SMS
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
data["Label"].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

In [6]:
# percentage of messages labelled "spam" in dataset

(747/5572)*100

13.406317300789663

In [7]:
data["SMS"].value_counts()

Sorry, I'll call later                                                    30
I cant pick the phone right now. Pls send a message                       12
Ok...                                                                     10
Okie                                                                       4
Ok                                                                         4
                                                                          ..
That sucks. I'll go over so u can do my hair. You'll do it free right?     1
Excellent, I'll see what riley's plans are                                 1
Dai what this da.. Can i send my resume to this id.                        1
I know that my friend already told that.                                   1
Good evening! How are you?                                                 1
Name: SMS, Length: 5169, dtype: int64

### Initial comments on dataset

- 5572 rows, 2 columns
- No missing values
- 13.4% of messages are Spam
- Most common SMS is "Sorry, I'll call later" at 30 instances

### Revised goal: Create a spam filter with > 80% accuracy

In [8]:
# splits data into train and test splits (80:20)

train = data.sample(frac= 1, random_state= 1)[round(5522*0.2):].reset_index()
test = data.sample(frac= 1, random_state= 1)[:round(5522*0.2)].reset_index()

In [9]:
# percentage of spam messages in train split

train["Label"].value_counts()["spam"]/ len(train) *100

13.473589973142344

In [10]:
# percentage of spam messages in test split

test["Label"].value_counts()["spam"]/ len(test) *100

13.134057971014492

In [11]:
train

Unnamed: 0,index,Label,SMS
0,1540,ham,You're not sure that I'm not trying to make xa...
1,3017,ham,"&lt;#&gt; is fast approaching. So, Wish u a v..."
2,2677,ham,* Am on a train back from northampton so i'm a...
3,4834,spam,"New Mobiles from 2004, MUST GO! Txt: NOKIA to ..."
4,5283,ham,"Yeah, probably here for a while"
...,...,...,...
4463,905,ham,"We're all getting worried over here, derek and..."
4464,5192,ham,Oh oh... Den muz change plan liao... Go back h...
4465,3980,ham,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
4466,235,spam,Text & meet someone sexy today. U can find a d...


In [12]:
# removes punctuation marks & symbols and sets all text to lower case

train["SMS"] = train["SMS"].str.replace("'", "")
train["SMS"] = train["SMS"].str.replace("\W", " ")
train["SMS"] = train["SMS"].str.lower()
train["SMS"]

  train["SMS"] = train["SMS"].str.replace("\W", " ")


0       youre not sure that im not trying to make xavi...
1        lt   gt   is fast approaching  so  wish u a v...
2         am on a train back from northampton so im af...
3       new mobiles from 2004  must go  txt  nokia to ...
4                         yeah  probably here for a while
                              ...                        
4463    were all getting worried over here  derek and ...
4464    oh oh    den muz change plan liao    go back h...
4465    ceri u rebel  sweet dreamz me little buddy   c...
4466    text   meet someone sexy today  u can find a d...
4467                              k k   sms chat with me 
Name: SMS, Length: 4468, dtype: object

In [13]:
# turns "SMS" string object into list of strings (i.e list of words)

train["SMS"] = train["SMS"].str.split()

In [14]:
# creates list of all words from all SMS messages
vocabulary = []

for list_ in train["SMS"]:
    for word in list_:
        vocabulary.append(word)

In [15]:
# drop duplicates from "vocabulary" list

vocabulary = set(vocabulary)
vocabulary = list(vocabulary)

vocabulary

['canceled',
 'permanent',
 'fast',
 'gnarls',
 'clarification',
 'ready',
 'jaykwon',
 'leading',
 'progress',
 'hi',
 'supplies',
 'select',
 'maybe',
 'badrith',
 'spaces',
 'thatll',
 'careless',
 '80155',
 'heard',
 'adrink',
 'buzz',
 'hehe',
 'representative',
 'tok',
 'sky',
 '2b',
 'little',
 'pap',
 '50perwksub',
 'mentionned',
 '2309',
 'detailed',
 'barring',
 'soz',
 'dreams',
 'fated',
 'boytoy',
 'stifled',
 'coulda',
 'teenager',
 'ppm',
 'gailxx',
 'specialise',
 'town',
 'b4',
 'bin',
 'far',
 'emily',
 'frens',
 'finest',
 'din',
 'wasted',
 'cheap',
 'chat80155',
 'vewy',
 'go',
 '5k',
 'omw',
 '87077',
 'listed',
 'hearted',
 'bsnl',
 'aaooooright',
 'looked',
 'sw7',
 '09094646899',
 '22',
 'dom',
 'nikiyu4',
 'temple',
 'individual',
 'whole',
 'hun',
 'mobiles',
 'timing',
 'hour',
 'tick',
 'holla',
 'abj',
 'lennon',
 '09065989180',
 'lovely',
 'music',
 'uks',
 '07734396839',
 'fredericksburg',
 'b4190604',
 'box385',
 'tired',
 'grinule',
 'edhae',
 'trying'

In [16]:
base_lib = {}

for word in vocabulary:
    base_lib[word] = 0

In [17]:
messages = []

for message in train["SMS"]:
    new_lib = base_lib.copy()
    
    for word in message:
        new_lib[word] += 1
        
    messages.append(new_lib)

In [18]:
binary_columns = pd.DataFrame(messages)
binary_columns

Unnamed: 0,canceled,permanent,fast,gnarls,clarification,ready,jaykwon,leading,progress,hi,...,njan,compare,jump,operator,rael,improved,mac,eyed,catches,storming
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4464,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4465,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
train = pd.concat([train, binary_columns], axis= 1)
train

Unnamed: 0,index,Label,SMS,canceled,permanent,fast,gnarls,clarification,ready,jaykwon,...,njan,compare,jump,operator,rael,improved,mac,eyed,catches,storming
0,1540,ham,"[youre, not, sure, that, im, not, trying, to, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3017,ham,"[lt, gt, is, fast, approaching, so, wish, u, a...",0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2677,ham,"[am, on, a, train, back, from, northampton, so...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4834,spam,"[new, mobiles, from, 2004, must, go, txt, noki...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5283,ham,"[yeah, probably, here, for, a, while]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4463,905,ham,"[were, all, getting, worried, over, here, dere...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4464,5192,ham,"[oh, oh, den, muz, change, plan, liao, go, bac...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4465,3980,ham,"[ceri, u, rebel, sweet, dreamz, me, little, bu...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4466,235,spam,"[text, meet, someone, sexy, today, u, can, fin...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
p_spam = train["Label"].value_counts()["spam"] / len(train)
p_ham = train["Label"].value_counts()["ham"] / len(train)

In [41]:
n_spam = 0

for message in train[train["Label"] == "spam"]["SMS"]:
    for word in message:
        n_spam += 1
        
n_spam

15242

In [42]:
n_ham = 0

for message in train[train["Label"] == "ham"]["SMS"]:
    for word in message:
        n_ham += 1
        
n_ham

55569

In [43]:
n_vocabulary = len(vocabulary)
n_vocabulary

7833