### SPAM Ham Detection

In [29]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [30]:
## Reading the given dataset
spam = pd.read_csv("SMSSpamCollection.txt", sep = "\t", names=["label", "message"])

In [31]:
print(spam.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [32]:
## Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label
data_set = []
for index,row in spam.iterrows():
    data_set.append((row['message'], row['label']))

In [33]:
print(data_set[:5])

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'ham'), ('Ok lar... Joking wif u oni...', 'ham'), ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'spam'), ('U dun say so early hor... U c already then say...', 'ham'), ("Nah I don't think he goes to usf, he lives around here though", 'ham')]


In [34]:
print(len(data_set))

5572


### Preprocessing

In [35]:
## initialise the inbuilt Stemmer and the Lemmatizer
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [36]:
def preprocess(document, stem=True):
    'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)

    return document

In [37]:
## - Performing the preprocessing steps on all messages
messages_set = []
for (message, label) in data_set:
    words_filtered = [e.lower() for e in preprocess(message, stem=False).split() if len(e) >= 3]
    messages_set.append((words_filtered, label))

In [38]:
print(messages_set[:5])

[(['jurong', 'point', 'crazy..', 'available', 'bugis', 'great', 'world', 'buffet', '...', 'cine', 'get', 'amore', 'wat', '...'], 'ham'), (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'), (['free', 'entry', 'wkly', 'comp', 'win', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', '87121', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply', '08452810075over18'], 'spam'), (['dun', 'say', 'early', 'hor', '...', 'already', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'usf', 'live', 'around', 'though'], 'ham')]


### Preparing to create features

In [39]:
## - creating a single list of all words in the entire dataset for feature list creation

def get_words_in_messages(messages):
    all_words = []
    for (message, label) in messages:
      all_words.extend(message)
    return all_words

In [40]:
## - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words
## Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.

def get_word_features(wordlist):

    #print(wordlist[:10])
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [42]:
## - creating the word features for the entire dataset
word_features = get_word_features(get_words_in_messages(messages_set))
print(len(word_features))

8393


In [47]:
wl = nltk.FreqDist(get_words_in_messages(messages_set))

for k in wl.keys():
    print(k, wl[k])

jurong 1
point 33
crazy.. 1
available 16
bugis 7
great 112
world 35
buffet 2
... 1261
cine 7
get 702
amore 1
wat 102
lar 37
joke 16
wif 27
oni 4
free 275
entry 26
wkly 14
comp 11
win 82
cup 9
final 18
tkts 4
21st 3
may 51
2005. 2
text 189
87121 4
receive 45
question 35
std 9
txt 169
rate 23
apply 33
08452810075over18 2
dun 55
say 228
early 31
hor 2
already 90
nah 10
n't 365
think 211
usf 11
live 55
around 61
though 26
freemsg 14
hey 111
darling 5
week 117
word 59
back 152
like 256
fun 31
still 155
xxx 34
chgs 1
send 274
£1.50 18
rcv 2
even 91
brother 20
speak 45
treat 19
aid 2
patent 1
per 56
request 11
'melle 3
melle 3
oru 4
minnaminunginte 3
nurungu 3
vettam 3
set 22
callertune 10
callers 5
press 13
copy 11
friends 50
winner 16
value 17
network 34
customer 58
select 31
receivea 2
£900 7
prize 94
reward 12
claim 115
call 655
09061701461. 2
code 30
kl341 2
valid 23
hours 21
mobile 138
months 10
entitle 8
update 24
latest 39
colour 23
mobiles 12
camera 36
08002986030 2
gon 59
home 160
s

sport 9
fit 4
funky 2
82468 2
garage 3
key 7
bookshelf 1
accept 9
day..u 3
sister 16
dear1 3
best1 3
clos1 3
lvblefrnd 3
jstfrnd 3
cutefrnd 3
lifpartnr 3
belovd 4
swtheart 3
bstfrnd 3
enemy 3
smart 3
£200 9
weekly 24
85222 1
winnersclub 1
m26 3
3uz 3
gbp1.50/week 1
2day 8
normal 9
rest 11
mylife 1
wot 25
k.k 4
advance 3
pongal 5
hmmm 10
power 8
yoga 7
dunno 31
tahan 2
anot 2
dude 23
afraid 4
december 5
11mths+ 5
08002986906 4
cake 9
merry 10
christmas 13
*kisses* 2
cud 4
ppl 5
gona 3
waitin 4
pete 11
guild 1
bristol 2
problem 33
track 6
record 9
read 28
women 3
light 16
apo 2
k.good 1
movie 20
return 16
immediately 10
germany 3
line 47
via 11
access 9
0844 2
861 2
85. 2
prepayment 2
evaporate 1
violate 2
privacy 2
steal 3
employer 1
paperwork 2
report 6
supervisor 3
valentines 9
lifetime 3
83600 3
150p/msg 15
rcvd 7
custcare:08718720201 2
ta-daaaaa 1
din 10
sleeping..and 3
surf 6
post 27
1000s 4
texts 26
wiv 8
caroline 2
favourite 3
stranger 3
interest 11
two 33
round 9
gudnite 3
.tc 2

dled 1
imp 2
smokin 1
boooo 1
bec 2
ugh 12
s.nervous 1
costume 2
gift 28
yowifes 1
hint 3
£100 22
borrow 2
bag 8
outbid 1
simonwatson5120 1
shinco 1
plyr 1
bid 9
ac/smsrewards 1
notifications 1
ones 8
model 6
youi 1
galileo 2
dobby 2
boo 6
enjoyin 2
yourjob 1
gettin 10
hunny 6
illspeak 1
soonlots 1
loveme 2
xxxx 5
starshine 1
ache 4
*sips 1
cappuccino* 1
*teasing 3
kiss* 12
smsservices 1
yourinclusive 1
www.comuk.net 5
login= 4
3qxj9 4
extra 12
08702840625.comuk 3
220-cm2 3
9ae 4
alfie 3
moon 8
m8s 3
charity 9
8007 21
nokias 3
polys 8
zed 6
08701417012 3
profit 4
bits 1
hahaha..use 3
brain 4
green 5
burial 1
mojibiola 2
reveal 10
09065174042. 2
cust 4
07821230901 2
mate 14
hol 2
havent 2
rvx 1
skyped 2
sura 5
ultimatum 2
countin 2
aburo 2
surely 6
successful 3
five 6
08002888812 2
inconsiderate 2
nag 2
recession 2
hence 2
wed 9
vouchers-text 1
savamob-member 1
soo 2
understand 17
2morrow 3
comprehensive 1
prashanthettan 1
mother 6
09066350750 2
ibiza 4
10,000 2
await 28
collection 26

prone 1
07801543489 1
latests 1
no:83355 1
tc-llc 1
ny-usa 1
150p/mt 1
msgrcvd18+ 1
permission 1
lucy 2
hubby 3
meetins 1
cumin 1
09099726395 1
calls£1/minmobsmorelkpobox177hp51fl 2
7th 4
6th 4
5th 4
4th 6
personality 4
nature 6
dose 1
tablet 1
incomm 1
blake 3
doesn\'t 2
lotr 3
google 5
modules 2
browsin 1
compulsory 1
purity 4
message..its 4
musthu 2
gautham.. 1
investigate 1
receive.. 1
80488 2
www.80488.biz 1
vomit 7
aint 5
.how 1
sha 4
vitamin 1
gals.. 1
centre 5
crucial 1
jsco 2
2channel 1
leadership 1
skills 1
psychic 1
w/question 1
host-based 1
idps 1
linux 1
systems 1
0870 6
total 4
converter 1
along 4
karaoke 3
sayy 1
except 7
nit 2
leanne.what 1
eight 3
format 3
disc 1
internet 6
hows 16
champ 1
glasgow 1
lovin 1
arrive 8
shortly 4
install 1
browse 1
artists 1
corect 1
speling 1
i.e 1
sarcasm 1
1/08/03 1
0871-4719-523 1
forum 2
ron 4
cts 1
employee 1
aunt 2
nike 1
sooo 1
shouting.. 1
dang 1
unfortunately 2
airport 5
7250i 4
auction 17
86021 6
row/w1jhl 4
mr. 3
difficult 10
e

whilltake 1
zogtorius 1
ive 1
whether 5
financial 1
problem.i 1
alian 1
b/tooth 1
free-nokia 1
or2optout/hv9d 1
posible 1
century 1
frwd 1
unnecessarily 2
affectionate 1
ringtones 5
restrictions 1
buddys 1
80082 3
08712402902 1
*possessive 1
no..he 1
clarification 1
spook 5
halloween 3
08701417012150p 2
logo/pic 2
issue 7
football 4
sky 7
gamestar 2
active 2
£250k 2
88088 2
coimbatore 1
re-sub 1
monoc 1
monos 1
polyc 1
stream 1
0871212025016 1
opinions 2
categories 1
measure 3
ethnicity 1
census 1
transcribe 1
propsd 2
lttrs 2
thm 3
aproach 2
truck 2
'hw 2
instantly 3
thy 2
happily 2
2gthr 2
evrydy 2
paragon 2
arent 2
bluff 2
impossible 3
goodmate 1
sary 2
asusual 1
franyxxxxx 1
batt 1
www.fullonsms.com 2
me. 1
pussy 6
becausethey 1
09058098002. 1
pobox1 1
w14rg 1
message..no 3
responce..what 3
happend 4
wiskey 2
brandy 2
rum 2
gin 2
vodka 3
scotch 2
shampain 2
kudi 2
yarasu 2
dhina 2
vaazhthukkal 2
gain 3
sugar 4
pressure 1
limit 5
dumb 2
thangam 3
okey 3
doke 1
dress 5
neshanth..tel

crisis 2
monster 1
____ 2
ths 3
ias 2
obey 1
re-send 1
uhhhhrmm 1
enuff 2
gbp/sms 1
600 1
400 1
deltomorrow 1
09066368470 1
e.g 1
24m 1
1-month 1
smartcall 1
68866. 1
subscriptn3gbp/wk 1
08448714184 1
landlineonly 1
mths 5
camera/video 2
texts/weekend 1
callback 3
orno 1
fink 1
09099726553 1
carlie 1
calls£1/minmobsmore 1
lkpobox177hp51fl 1
youphone 1
athome 1
youwanna 1
jack 1
say/ask 1
helpful 1
pretend 1
hypotheticalhuagauahahuagahyuhagga 1
brainy 1
occasion 1
reflection 1
desire 1
affections 1
traditions 1
cantdo 1
anythingtomorrow 1
myparents 1
aretaking 1
outfor 1
katexxx 1
level 1
gate 1
89105. 1
lingerie 1
www.bridal.petticoatdreams.co.uk 1
weddingfriend 1
board 1
overheat 1
reslove 1
inst 1
pending 2
western 1
incident.. 1
outta 4
notixiquating 1
laxinorficated 1
bambling 1
entropication 1
oblisingately 1
masteriastering 1
amplikater 1
fidalfication 1
champlaxigating 1
atrocious.. 1
wotz 1
junna 1
knickers 1
01223585236 1
nikiyu4.net 1
accident 3
a30 1
divert 1
wadebridge.i 1


njan 1
vilikkam 1
sudn 1
maths 1
chapter 1
chop 1
noooooooo 1
09065171142-stopsms-08718727870150ppm 1
firsg 1
split 1
wasnt 1
heat 1
sumfing 1
86888 2
subscribe6gbp/mnth 2
3hrs 2
txtstop 2
hopeso 1
amnow 1
ithink 1
tonsolitusaswell 1
layin 1
bedreal 1
lotsof 1
hiphop 1
oxygen 1
resort 1
roller 1
4.30 1
recorder 1
canname 1
capital 2
australia 1
mquiz 1
82277. 1
showr 1
upon 1
ceiling 1
presnts 1
bcz 1
mis 2
jeevithathile 1
irulinae 1
neekunna 1
prakasamanu 1
sneham 1
prakasam 1
ennal 1
prabha 2
'that 1
mns 1
is'love 1
7634 1
7684 1
firmware 1
vijaykanth 1
comedy 3
tv..he 1
anythiing 1
www.clubmoby.com 1
08717509990 1
poly/true/pix/ringtones/games 1
keypad 1
btwn 1
hands.. 1
happened.. 1
decades.. 1
goverment 1
loyal 2
customers 2
09066380611 2
spice 1
prasanth 1
ettans 1
08718738002 1
48922 1
21/11/04 1
appy 1
fizz 1
contain 1
genus 1
robinson 1
not..tel 1
name.. 1
out 1
soz 1
imat 1
mums 1
freinds 1
msg..sometext 1
07099833605 1
ref:9280114 1
smth.. 2
chloe 1
150p/text 1
wewa 1
130. 1

### Preparing to create a train and test set

In [14]:
## - creating slicing index at 80% threshold
sliceIndex = int((len(messages_set)*.8))

In [15]:
## - shuffle the pack to create a random and unbiased split of the dataset
random.shuffle(messages_set)

In [16]:
train_messages, test_messages = messages_set[:sliceIndex], messages_set[sliceIndex:]

In [17]:
print(len(train_messages))
print(len(test_messages))

4457
1115


### Preparing to create feature maps for train and test data

In [18]:
## creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [19]:
print(extract_features)

<function extract_features at 0x10ae8f400>


In [20]:
## - creating the feature map of train and test data

training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [21]:
print(training_set[:5])

[({'contains(jurong)': False, 'contains(point)': False, 'contains(crazy..)': False, 'contains(available)': False, 'contains(bugis)': False, 'contains(great)': False, 'contains(world)': False, 'contains(buffet)': False, 'contains(...)': False, 'contains(cine)': False, 'contains(get)': False, 'contains(amore)': False, 'contains(wat)': False, 'contains(lar)': False, 'contains(joke)': False, 'contains(wif)': False, 'contains(oni)': False, 'contains(free)': False, 'contains(entry)': False, 'contains(wkly)': False, 'contains(comp)': False, 'contains(win)': False, 'contains(cup)': False, 'contains(final)': False, 'contains(tkts)': False, 'contains(21st)': False, 'contains(may)': False, 'contains(2005.)': False, 'contains(text)': False, 'contains(87121)': False, 'contains(receive)': False, 'contains(question)': False, 'contains(std)': False, 'contains(txt)': False, 'contains(rate)': False, 'contains(apply)': False, 'contains(08452810075over18)': False, 'contains(dun)': False, 'contains(say)': 

In [22]:
print('Training set size : ', len(training_set))
print('Test set size : ', len(testing_set))

Training set size :  4457
Test set size :  1115


### Training

In [23]:
## Training the classifier with NaiveBayes algorithm
spamClassifier = nltk.NaiveBayesClassifier.train(training_set)

### Evaluation

In [24]:
## - Analyzing the accuracy of the test set
print(nltk.classify.accuracy(spamClassifier, training_set))

0.9916984518734575


In [25]:
## Analyzing the accuracy of the test set
print(nltk.classify.accuracy(spamClassifier, testing_set))

0.9820627802690582


In [26]:
## Testing a example message with our newly trained classifier
m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
print('Classification result : ', spamClassifier.classify(extract_features(m.split())))

Classification result :  spam


In [27]:
## Priting the most informative features in the classifier
print(spamClassifier.show_most_informative_features(50))

Most Informative Features
         contains(award) = True             spam : ham    =    190.2 : 1.0
       contains(service) = True             spam : ham    =    150.8 : 1.0
        contains(urgent) = True             spam : ham    =    137.7 : 1.0
          contains(code) = True             spam : ham    =    102.7 : 1.0
         contains(await) = True             spam : ham    =     89.6 : 1.0
      contains(delivery) = True             spam : ham    =     76.5 : 1.0
      contains(landline) = True             spam : ham    =     74.8 : 1.0
         contains(nokia) = True             spam : ham    =     74.0 : 1.0
           contains(txt) = True             spam : ham    =     73.4 : 1.0
          contains(rate) = True             spam : ham    =     72.1 : 1.0
       contains(private) = True             spam : ham    =     67.8 : 1.0
          contains(club) = True             spam : ham    =     67.8 : 1.0
        contains(latest) = True             spam : ham    =     64.3 : 1.0

In [28]:
## storing the classifier on disk for later usage
import pickle
f = open('nb_spam_classifier.pickle', 'wb')
pickle.dump(spamClassifier,f)
print('Classifier stored at ', f.name)
f.close()

Classifier stored at  nb_spam_classifier.pickle
