# Classification of messages as spam or not spam using Naive Bayes algorithm 

In [97]:
import pandas as pd

In [98]:
# Import Dataset
df = pd.read_table('SMS', sep='\t', header=None,names=['label', 'sms_message'])
df

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [99]:
# map the 'ham' value to 0 and the 'spam' value to 1.
df['label'] = df.label.map({'ham':0,'spam':1})

In [100]:
# Randomize the dataset
df = df.sample(frac=1, random_state=1)
df

Unnamed: 0,label,sms_message
1078,0,"Yep, by the pretty sculpture"
4028,0,"Yes, princess. Are you going to make me moan?"
958,0,Welp apparently he retired
4642,0,Havent.
4674,0,I forgot 2 ask ü all smth.. There's a card on ...
...,...,...
905,0,"We're all getting worried over here, derek and..."
5192,0,Oh oh... Den muz change plan liao... Go back h...
3980,0,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235,1,Text & meet someone sexy today. U can find a d...


In [101]:
# Split into training and test sets
training_test_index = round(len(df) * 0.8)

training = df[:training_test_index].reset_index(drop=True)
test = df[training_test_index:].reset_index(drop=True)

print(training.shape)
print(test.shape)

(4458, 2)
(1114, 2)


In [102]:
#  data cleaning 
training['sms_message'] = training['sms_message'].str.replace('\W', ' ') # Removes punctuation
training['sms_message'] = training['sms_message'].str.lower() ### making all the words lowercase

test['sms_message'] = test['sms_message'].str.replace('\W', ' ') # Removes punctuation
test['sms_message'] = test['sms_message'].str.lower() ### making all the words lowercase
training

  training['sms_message'] = training['sms_message'].str.replace('\W', ' ') # Removes punctuation
  test['sms_message'] = test['sms_message'].str.replace('\W', ' ') # Removes punctuation


Unnamed: 0,label,sms_message
0,0,yep by the pretty sculpture
1,0,yes princess are you going to make me moan
2,0,welp apparently he retired
3,0,havent
4,0,i forgot 2 ask ü all smth there s a card on ...
...,...,...
4453,0,sorry i ll call later in meeting any thing re...
4454,0,babe i fucking love you too you know fuck...
4455,1,u ve been selected to stay in 1 of 250 top bri...
4456,0,hello my boytoy geeee i miss you already a...


In [103]:
### creating vocabulary from training data
training['sms_message'] = training['sms_message'].str.split()
vocabulary = []

# Step 2
n_ham = 0
n_spam = 0

# Step 3
n_w_ham = {}
n_w_spam = {}

# Commandeering this nested for loop to do the things we want it to do :)
i = 0
for sms in training['sms_message']:
    for word in sms:
        vocabulary.append(word)
        
        # Check if given sms is ham
        if training['label'][i] == 0:
            # Calculating the number of words in a spam message
            n_ham += 1
            
            # Check if the current word is in the n_w_ham dictionary
            # If so, increment its value
            if word in n_w_ham.keys():
                n_w_ham[word] += 1
            # Otherwise, append it to the dictionary with a value of 1
            else:
                n_w_ham[word] = 1
        # Otherwise, given sms is spam
        else:
            # Calculating the number of words in a spam message
            n_spam += 1
            
            # Check if the current word is in the n_w_spam dictionary
            # If so, increment its value
            if word in n_w_spam.keys():
                n_w_spam[word] += 1
            # Otherwise, append it to the dictionary with a value of 1
            else:
                n_w_spam[word] = 1
    i += 1

vocabulary = list(set(vocabulary))  ### only count the number of unique words

In [104]:
word_counts_per_sms = {unique_word: [0] * len(training['sms_message']) for unique_word in vocabulary}

for index, sms in enumerate(training['sms_message']):
    for word in sms:
        word_counts_per_sms[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts

Unnamed: 0,whens,yourjob,gary,hundreds,address,increase,cme,veggie,jus,fantastic,...,sleepwell,ava,theirs,floor,an,ironing,good,946,pro,totes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
training_new = pd.concat([training, word_counts], axis=1)

training_new

Unnamed: 0,label,sms_message,whens,yourjob,gary,hundreds,address,increase,cme,veggie,...,sleepwell,ava,theirs,floor,an,ironing,good,946,pro,totes
0,0,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,"[sorry, i, ll, call, later, in, meeting, any, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,"[babe, i, fucking, love, you, too, you, know, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4455,1,"[u, ve, been, selected, to, stay, in, 1, of, 2...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,"[hello, my, boytoy, geeee, i, miss, you, alrea...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
# Laplace smoothing
alpha = 1

# You will start from here.

In [109]:
# Hints:
# Step 1: calculate P(Spam) and P(Ham)
spam_count = 0
ham_count = 0
label_col = test['label']
sms_col = test['sms_message']

#label_col.shape[0]
for i in range(label_col.shape[0]):
    # ham row
    if label_col[i] == 0:
        ham_count += 1
                
    else: #spam row 
        spam_count +=1

p_spam = spam_count / len(label_col)
p_ham = ham_count / len(label_col)


# Step 2: count N_Spam, N_Ham, N_Vocabulary
n_vocabulary = len(vocabulary)

# Step 4: p(w|spam)=(N_w_spam+alpha)/(N_Spam+alpha*N_Vocabulary)
#         p(w|Ham)=(N_w_ham+alpha)/(N_Ham+alpha*N_Vocabulary)

p_w_given_spam = {}
p_w_given_ham = {}

# Calculates probability for each unique word in vocabulary
for word in vocabulary:
    if word in n_w_spam:
        p_w_given_spam[word] = (n_w_spam.get(word)+alpha)/(n_spam+alpha*n_vocabulary)
    
    if word in n_w_ham:
         p_w_given_ham[word] = (n_w_ham.get(word)+alpha)/(n_ham+alpha*n_vocabulary)

prediction_col = []
# calculates if an SMS_Message is spam or ham
for i in range(label_col.shape[0]):
    sms_message = sms_col[i]
    tmp_spam = 1
    tmp_ham = 1
    for word in sms_message.lower().split():
        if word in p_w_given_spam:
            tmp_spam *= p_w_given_spam[word]
        if word in p_w_given_ham:
            tmp_ham *= p_w_given_ham[word]

    if tmp_spam > tmp_ham:
        prediction_col.append(1)
    else:
        prediction_col.append(0)

later
i
guess
i
needa
do
mcat
study
too
but
i
haf
enuff
space
got
like
4
mb
had
your
mobile
10
mths
update
to
latest
orange
camera
video
phones
for
free
save
s
with
free
texts
weekend
calls
text
yes
for
a
callback
orno
to
opt
out
all
sounds
good
fingers
makes
it
difficult
to
type
all
done
all
handed
in
don
t
know
if
mega
shop
in
asda
counts
as
celebration
but
thats
what
i
m
doing
but
my
family
not
responding
for
anything
now
am
in
room
not
went
to
home
for
diwali
but
no
one
called
me
and
why
not
coming
it
makes
me
feel
like
died
u
too
boo
what
time
u
get
out
u
were
supposed
to
take
me
shopping
today
genius
what
s
up
how
your
brother
pls
send
his
number
to
my
skype
i
liked
the
new
mobile
for
my
family
happiness
if
i
let
you
do
this
i
want
you
in
the
house
by
8am
do
you
know
why
god
created
gap
between
your
fingers
so
that
one
who
is
made
for
you
comes
amp
fills
those
gaps
by
holding
your
hand
with
love
k
and
you
re
sure
i
don
t
have
to
have
consent
forms
to
do
it
v
try
neva
mate
haha
th

babe
my
friend
had
to
cancel
still
up
for
a
visit
no
let
me
do
the
math
your
not
good
at
it
miss
call
miss
call
khelate
kintu
opponenter
miss
call
dhorte
lage
thats
d
rule
one
with
great
phone
receiving
quality
wins
i
m
not
smoking
while
people
use
wylie
smokes
too
much
to
justify
ruining
my
shit
eatin
my
lunch
am
also
doing
in
cbe
only
but
have
to
pay
i
m
there
and
i
can
see
you
but
you
can
t
see
me
maybe
you
should
reboot
ym
i
seen
the
buzz
i
call
you
later
don
t
have
network
if
urgnt
sms
me
dunno
he
jus
say
go
lido
same
time
930
or
ü
go
buy
wif
him
then
i
meet
ü
later
can
got
but
got
2
colours
lor
one
colour
is
quite
light
n
e
other
is
darker
lor
actually
i
m
done
she
s
styling
my
hair
now
no
thank
you
you
ve
been
wonderful
urgent
we
are
trying
to
contact
u
todays
draw
shows
that
you
have
won
a
800
prize
guaranteed
call
09050001808
from
land
line
claim
m95
valid12hrs
only
please
protect
yourself
from
e
threats
sib
never
asks
for
sensitive
information
like
passwords
atm
sms
pin
thru


only
take
2
out
shopping
at
once
anything
lor
but
toa
payoh
got
place
2
walk
meh
its
just
the
effect
of
irritation
just
ignore
it
then
u
going
ikea
str
aft
dat
only
if
you
promise
your
getting
out
as
soon
as
you
can
and
you
ll
text
me
in
the
morning
to
let
me
know
you
made
it
in
ok
that
means
get
the
door
sorry
i
din
lock
my
keypad
i
like
you
peoples
very
much
but
am
very
shy
pa
the
world
s
most
happiest
frnds
never
have
the
same
characters
dey
just
have
the
best
understanding
of
their
differences
no
on
the
way
home
so
if
not
for
the
long
dry
spell
the
season
would
have
been
over
there
are
some
nice
pubs
near
here
or
there
is
frankie
n
bennys
near
the
warner
cinema
get
ur
1st
ringtone
free
now
reply
to
this
msg
with
tone
gr8
top
20
tones
to
your
phone
every
week
just
1
50
per
wk
2
opt
out
send
stop
08452810071
16
and
my
man
carlos
is
definitely
coming
by
mu
tonight
no
excuses
yup
hey
then
one
day
on
fri
we
can
ask
miwa
and
jiayin
take
leave
go
karaoke
i
hope
you
that
s
the
result
of
be

keep
it
from
getting
slippery
over
there
k
fyi
i
m
back
in
my
parents
place
in
south
tampa
so
i
might
need
to
do
the
deal
somewhere
else
i
wonder
if
your
phone
battery
went
dead
i
had
to
tell
you
i
love
you
babe
xy
trying
smth
now
u
eat
already
we
havent
finished
class
where
are
you
urgent
we
are
trying
to
contact
u
todays
draw
shows
that
you
have
won
a
2000
prize
guaranteed
call
09066358361
from
land
line
claim
y87
valid
12hrs
only
aight
sounds
good
when
do
you
want
me
to
come
down
with
my
sis
lor
we
juz
watched
italian
job
i
love
to
cuddle
i
want
to
hold
you
in
my
strong
arms
right
now
as
a
sim
subscriber
you
are
selected
to
receive
a
bonus
get
it
delivered
to
your
door
txt
the
word
ok
to
no
88600
to
claim
150p
msg
exp
30apr
trust
me
even
if
isn
t
there
its
there
i
love
you
you
set
my
soul
on
fire
it
is
not
just
a
spark
but
it
is
a
flame
a
big
rawring
flame
xoxo
tunji
how
s
the
queen
how
are
you
doing
this
is
just
wishing
you
a
great
day
abiola
raji
pls
do
me
a
favour
pls
convey
my
b

ur
fortune
to
love
the
one
who
loves
u
but
its
a
miracle
to
love
a
person
who
can
t
love
anyone
except
u
gud
nyt
umma
my
life
and
vava
umma
love
you
lot
dear
aight
should
i
just
plan
to
come
up
later
tonight
anyway
i
don
t
think
i
can
secure
anything
up
here
lemme
know
if
you
want
me
to
drive
down
south
and
chill
no
dude
its
not
fake
my
frnds
got
money
thts
y
i
m
reffering
u
if
u
member
wit
my
mail
link
u
vl
be
credited
lt
gt
rs
and
il
be
getiing
lt
gt
rs
i
can
draw
my
acc
wen
it
is
lt
gt
rs
i
think
it
s
all
still
in
my
car
hi
baby
im
sat
on
the
bloody
bus
at
the
mo
and
i
wont
be
home
until
about
7
30
wanna
do
somethin
later
call
me
later
ortxt
back
jess
xx
wat
makes
u
thk
i
ll
fall
down
but
actually
i
thk
i
m
quite
prone
2
falls
lucky
my
dad
at
home
i
ask
him
come
n
fetch
me
already
tell
me
whos
this
pls
fuck
babe
i
miss
you
already
you
know
can
t
you
let
me
send
you
some
money
towards
your
net
i
need
you
i
want
you
i
crave
you
have
a
safe
trip
to
nigeria
wish
you
happiness
and
very
s

t
be
answering
everyones
calls
if
i
get
one
more
call
i
m
not
babysitting
on
monday
yep
the
great
loxahatchee
xmas
tree
burning
of
lt
gt
starts
in
an
hour
wat
s
da
model
num
of
ur
phone
ok
i
am
a
gentleman
and
will
treat
you
with
dignity
and
respect
if
you
hear
a
loud
scream
in
about
lt
gt
minutes
its
cause
my
gyno
will
be
shoving
things
up
me
that
don
t
belong
hmv
bonus
special
500
pounds
of
genuine
hmv
vouchers
to
be
won
just
answer
4
easy
questions
play
now
send
hmv
to
86688
more
info
www
100percent
real
com
neva
mind
it
s
ok
good
afternoon
my
boytoy
how
are
you
feeling
today
better
i
hope
are
you
being
my
good
boy
are
you
my
obedient
slave
do
you
please
your
queen
yes
i
m
small
kid
and
boost
is
the
secret
of
my
energy
ringtoneking
84484
even
i
cant
close
my
eyes
you
are
in
me
our
vava
playing
umma
d
is
that
seriously
how
you
spell
his
name
will
have
two
more
cartons
off
u
and
is
very
pleased
with
shelves
thanks
for
your
ringtone
order
ref
number
k718
your
mobile
will
be
charged
4
5

# Calculate accuracy, precision, recall and F1_score. 

In [108]:
# Model Evaluation 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#step 1
print('Accuracy score: {}'.format(accuracy_score(label_col, prediction_col)))
print('Precision score: {}'.format(precision_score(label_col, prediction_col)))
print('Recall score: {}'.format(recall_score(label_col, prediction_col)))
print('F1 score: {}'.format(f1_score(label_col, prediction_col)))

# step 2
from sklearn.naive_bayes import MultinomialNB
X, y = training_new.iloc[:,2:], training_new.loc[:,"label"]
clf = MultinomialNB().fit(X, y)

mult_predict = clf.predict(training_new.iloc[:,2:])
                           
print("\nMultinomialNB Metrics")
print('Accuracy score: {}'.format(accuracy_score(training_new.loc[:,"label"], mult_predict)))
print('Precision score: {}'.format(precision_score(training_new.loc[:,"label"], mult_predict)))
print('Recall score: {}'.format(recall_score(training_new.loc[:,"label"], mult_predict)))
print('F1 score: {}'.format(f1_score(training_new.loc[:,"label"], mult_predict)))

Accuracy score: 0.18940754039497307
Precision score: 0.03217821782178218
Recall score: 0.17687074829931973
F1 score: 0.05445026178010472

MultinomialNB Metrics
Accuracy score: 0.9925975773889637
Precision score: 0.9780775716694773
Recall score: 0.9666666666666667
F1 score: 0.972338642078793
