In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Styles
plt.style.use('ggplot')
sns.set_style('whitegrid')

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 10
plt.rcParams['axes.labelsize'] = 10
plt.rcParams['xtick.labelsize'] = 8
plt.rcParams['ytick.labelsize'] = 8
plt.rcParams['legend.fontsize'] = 10
plt.rcParams['figure.titlesize'] = 12
plt.rcParams['patch.force_edgecolor'] = True

# Text Preprocessing
import nltk
# nltk.download("all")
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize

messages = pd.read_csv("./data/spam.csv", encoding = 'latin-1')

# Drop the extra columns and rename columns

messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["category", "text"]
display(messages.head(n = 10))


Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [9]:
topMessages = messages.groupby("text")["category"].agg([len, np.max]).sort_values(by = "len", ascending = False).head(n = 10)
display(topMessages)

Unnamed: 0_level_0,len,amax
text,Unnamed: 1_level_1,Unnamed: 2_level_1
"Sorry, I'll call later",30,ham
I cant pick the phone right now. Pls send a message,12,ham
Ok...,10,ham
Your opinion about me? 1. Over 2. Jada 3. Kusruthi 4. Lovable 5. Silent 6. Spl character 7. Not matured 8. Stylish 9. Simple Pls reply..,4,ham
"Wen ur lovable bcums angry wid u, dnt take it seriously.. Coz being angry is d most childish n true way of showing deep affection, care n luv!.. kettoda manda... Have nice day da.",4,ham
Please call our customer service representative on FREEPHONE 0808 145 4742 between 9am-11pm as you have WON a guaranteed å£1000 cash or å£5000 prize!,4,spam
Okie,4,ham
"Say this slowly.? GOD,I LOVE YOU &amp; I NEED YOU,CLEAN MY HEART WITH YOUR BLOOD.Send this to Ten special people &amp; u c miracle tomorrow, do it,pls,pls do it...",4,ham
"7 wonders in My WORLD 7th You 6th Ur style 5th Ur smile 4th Ur Personality 3rd Ur Nature 2nd Ur SMS and 1st \Ur Lovely Friendship\""... good morning dear""",4,ham
Ok.,4,ham


In [15]:
spam_messages = messages[messages["category"] == "spam"]["text"]
ham_messages = messages[messages["category"] == "ham"]["text"]

spam_words = []
ham_words = []

# Since this is just classifying the message as spam or ham, we can use isalpha(). 
# This will also remove the not word in something like can't etc. 
# In a sentiment analysis setting, its better to use 
# sentence.translate(string.maketrans("", "", ), chars_to_remove)

def extractSpamWords(spamMessages):
    global spam_words
    
    words = [word.lower() for word in word_tokenize(spamMessages) if word.lower() not in stopwords.words("english") and word.lower().isalpha()]
    print(words)
    spam_words = spam_words + words
    
def extractHamWords(hamMessages):
    global ham_words
    words = [word.lower() for word in word_tokenize(hamMessages) if word.lower() not in stopwords.words("english") and word.lower().isalpha()]
    ham_words = ham_words + words

spam_messages.apply(extractSpamWords)
ham_messages.apply(extractHamWords)

['free', 'entry', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', 'may', 'text', 'fa', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'c', 'apply']
['freemsg', 'hey', 'darling', 'week', 'word', 'back', 'like', 'fun', 'still', 'tb', 'ok', 'xxx', 'std', 'chgs', 'send', 'rcv']
['winner', 'valued', 'network', 'customer', 'selected', 'receivea', 'prize', 'reward', 'claim', 'call', 'claim', 'code', 'valid', 'hours']
['mobile', 'months', 'u', 'r', 'entitled', 'update', 'latest', 'colour', 'mobiles', 'camera', 'free', 'call', 'mobile', 'update', 'co', 'free']
['six', 'chances', 'win', 'cash', 'pounds', 'txt', 'send', 'cost', 'tsandcs', 'apply', 'reply', 'hl', 'info']
['urgent', 'week', 'free', 'membership', 'prize', 'jackpot', 'txt', 'word', 'claim', 'c', 'lccltd', 'pobox']
['xxxmobilemovieclub', 'use', 'credit', 'click', 'wap', 'link', 'next', 'txt', 'message', 'click', 'http']
['england', 'v', 'macedonia', 'dont', 'miss', 'news', 'txt', 'ur', 'national', 'team', 'eg', 'england', 

['okmail', 'dear', 'dave', 'final', 'notice', 'collect', 'tenerife', 'holiday', 'cash', 'award', 'call', 'landline', 'tcs', 'sae']
['want', 'get', 'laid', 'tonight', 'want', 'real', 'dogging', 'locations', 'sent', 'direct', 'ur', 'mob', 'join', 'uk', 'largest', 'dogging', 'network', 'txting', 'moan']
['free', 'message', 'activate', 'free', 'text', 'messages', 'replying', 'message', 'word', 'free', 'terms', 'conditions', 'visit']
['congrats', 'year', 'special', 'cinema', 'pass', 'call', 'c', 'suprman', 'v', 'etc', 'free', 'dont', 'miss']
['congratulations', 'week', 'competition', 'draw', 'u', 'prize', 'claim', 'call', 'sms']
['guaranteed', 'latest', 'nokia', 'phone', 'ipod', 'player', 'prize', 'txt', 'word', 'collect', 'ibhltd']
['boltblue', 'tones', 'reply', 'poly', 'mono', 'eg', 'cha', 'cha', 'slide', 'yeah', 'slow', 'jamz', 'toxic', 'come', 'stop', 'tones', 'txt']
['credits', 'topped', 'http', 'renewal', 'pin', 'tgxxrz']
['urgent', 'mobile', 'awarded', 'bonus', 'caller', 'prize', 'at

['reminder', 'downloaded', 'content', 'already', 'paid', 'goto', 'http', 'mymoby', 'collect', 'content']
['free', 'ringtone', 'waiting', 'collected', 'simply', 'text', 'password', 'verify', 'get', 'usher', 'britney', 'fml']
['lastest', 'stereophonics', 'marley', 'dizzee', 'racal', 'libertines', 'strokes', 'win', 'nookii', 'games', 'flirt', 'click', 'themob', 'wap', 'bookmark', 'text', 'wap']
['january', 'male', 'sale', 'hot', 'gay', 'chat', 'cheaper', 'call', 'national', 'rate', 'cheap', 'peak', 'stop', 'texts', 'call']
['money', 'r', 'lucky', 'winner', 'claim', 'prize', 'text', 'money', 'give', 'away', 'text', 'rate']
['dear', 'matthew', 'please', 'call', 'landline', 'complimentary', 'tenerife', 'holiday', 'cash', 'await', 'collection', 'sae', 'cs']
['urgent', 'call', 'landline', 'complimentary', 'tenerife', 'holiday', 'cash', 'await', 'collection', 'sae', 'cs', 'box']
['getting', 'touch', 'folks', 'waiting', 'company', 'txt', 'back', 'name', 'age', 'opt', 'enjoy', 'community']
['ur',

['hi', 'lucy', 'hubby', 'meetins', 'day', 'fri', 'b', 'alone', 'hotel', 'u', 'fancy', 'cumin', 'pls', 'leave', 'msg', 'lucy', 'x']
['account', 'credited', 'free', 'text', 'messages', 'activate', 'txt', 'word', 'credit', 'cs']
['sms', 'ac', 'jsco', 'energy', 'high', 'u', 'may', 'know', 'ur', 'leadership', 'skills', 'r', 'strong', 'psychic', 'reply', 'ans', 'end', 'reply', 'end', 'jsco']
['hot', 'live', 'fantasies', 'call', 'per', 'min', 'ntt', 'ltd', 'po', 'box', 'croydon', 'national', 'rate', 'call']
['thanks', 'vote', 'sing', 'along', 'stars', 'karaoke', 'mobile', 'free', 'link', 'reply', 'sing']
['brand', 'new', 'mobile', 'music', 'service', 'live', 'free', 'music', 'player', 'arrive', 'shortly', 'install', 'phone', 'browse', 'content', 'top', 'artists']
['urgent', 'mobile', 'awarded', 'bonus', 'caller', 'prize', 'attempt', 'contact', 'call', 'bt', 'national', 'rate']
['nokia', 'get', 'win', 'free', 'auction', 'take', 'part', 'send', 'nokia']
['hello', 'orange', 'month', 'free', 'acc

['call', 'use', 'ur', 'mins', 'calls', 'cast', 'mob', 'vary', 'service', 'provided', 'aom', 'aom', 'u', 'stop', 'ages']
['urgent', 'mobile', 'bonus', 'caller', 'prize', 'attempt', 'reach', 'call', 'asap']
['eerie', 'nokia', 'tones', 'rply', 'tone', 'title', 'eg', 'tone', 'dracula', 'titles', 'ghost', 'addamsfa', 'munsters', 'exorcist', 'twilight']
['sexy', 'singles', 'waiting', 'text', 'age', 'followed', 'gender', 'wither', 'f', 'gay', 'men', 'text', 'age', 'followed']
['freemsg', 'claim', 'ur', 'sms', 'ok', 'use', 'ur', 'mates', 'etc', 'join', 'c', 'remove', 'txtx', 'stop']
['free', 'ringtone', 'reply', 'real']
['well', 'done', 'england', 'get', 'official', 'poly', 'ringtone', 'colour', 'flag', 'yer', 'mobile', 'text', 'tone', 'flag', 'txt', 'eng', 'stop']
['final', 'chance', 'claim', 'ur', 'worth', 'discount', 'vouchers', 'today', 'text', 'yes', 'savamob', 'member', 'offers', 'mobile', 'cs', 'savamob', 'subs']
['private', 'account', 'statement', 'shows', 'unredeemed', 'bonus', 'point

['free', 'tarot', 'texts', 'find', 'love', 'life', 'try', 'free', 'text', 'chance', 'free', 'msgs']
['join', 'uk', 'horniest', 'dogging', 'service', 'u', 'sex', 'sign', 'follow', 'instructions', 'txt', 'entry']
['sunshine', 'quiz', 'wkly', 'q', 'win', 'top', 'sony', 'dvd', 'player', 'u', 'know', 'country', 'liverpool', 'played', 'mid', 'week', 'txt', 'ansr', 'sp', 'tyrone']
['knock', 'knock', 'txt', 'whose', 'enter', 'r', 'weekly', 'draw', 'gift', 'voucher', 'store', 'yr', 'choice', 'cs']
['forwarded', 'hi', 'mailbox', 'messaging', 'sms', 'alert', 'matches', 'please', 'call', 'back', 'retrieve', 'messages', 'matches']
['free', 'ring', 'tone', 'text', 'every', 'week', 'get', 'new', 'tone']
['urgent', 'mobile', 'bonus', 'caller', 'prize', 'attempt', 'reach', 'call', 'asap']
['guaranteed', 'latest', 'nokia', 'phone', 'ipod', 'player', 'prize', 'txt', 'word', 'collect', 'ibhltd']
['hello', 'darling', 'today', 'would', 'love', 'chat', 'dont', 'tell', 'look', 'like', 'sexy']
['free', 'week',

['sexy', 'sexy', 'cum', 'text', 'im', 'wet', 'warm', 'ready', 'porn', 'u', 'fun', 'msg', 'free', 'recd', 'msgs', 'inc', 'vat', 'cancel', 'text', 'stop']
['hard', 'live', 'chat', 'choose', 'girl', 'connect', 'live', 'call', 'cheap', 'chat', 'uk', 'biggest', 'live', 'service', 'vu']
['heard', 'call', 'rude', 'chat', 'private', 'line', 'cum', 'wan', 'pics', 'gettin', 'shagged', 'text', 'pix', 'send', 'stop', 'sam', 'xxx']
['time', 'tried', 'contact', 'u', 'prize', 'claim', 'call', 'sms']
['hot', 'live', 'fantasies', 'call', 'per', 'min', 'ntt', 'ltd', 'po', 'box', 'croydon']
['dear', 'voucher', 'holder', 'claim', 'weeks', 'offer', 'pc', 'please', 'go', 'http', 'ts', 'cs', 'apply']
['ur', 'going', 'bahamas', 'callfreefone', 'speak', 'live', 'operator', 'claim', 'either', 'bahamas', 'cruise', 'cash', 'opt', 'txt', 'x']
['time', 'tried', 'contact', 'u', 'pound', 'prize', 'claim', 'easy', 'call', 'per', 'min']
['ur', 'awarded', 'city', 'break', 'could', 'win', 'summer', 'shopping', 'spree', '

['hi', 'ur', 'lucky', 'night', 'uve', 'invited', 'xchat', 'uks', 'wildest', 'chat', 'txt', 'chat', 'ldn']
[]
['dear', 'voucher', 'holder', 'claim', 'class', 'airport', 'lounge', 'passes', 'using', 'holiday', 'voucher', 'call', 'booking', 'quote', 'class', 'x']
['bloomberg', 'center', 'wait', 'apply', 'future', 'http']
['yes', 'place', 'town', 'meet', 'exciting', 'adult', 'singles', 'uk', 'txt', 'chat']
['free', 'week', 'nokia', 'tone', 'ur', 'mob', 'every', 'week', 'txt', 'nokia', 'get', 'txting', 'tell', 'ur', 'mates', 'pobox']
['someone', 'u', 'know', 'asked', 'dating', 'service', 'contact', 'cant', 'guess', 'call', 'revealed', 'pobox']
['mila', 'blonde', 'new', 'uk', 'look', 'sex', 'uk', 'guys', 'u', 'like', 'fun', 'text', 'mtalk', 'increments']
['claim', 'shopping', 'spree', 'call']
['want', 'funk', 'ur', 'fone', 'weekly', 'new', 'tone', 'reply', 'text', 'original', 'n', 'best', 'tones', 'network', 'operator', 'rates', 'apply']
['twinks', 'bears', 'scallies', 'skins', 'jocks', 'cal

['double', 'mins', 'txts', 'free', 'bluetooth', 'orange', 'available', 'sony', 'nokia', 'motorola', 'phones', 'call']
['free', 'week', 'nokia', 'tone', 'ur', 'mob', 'every', 'week', 'txt', 'nokia', 'get', 'txting', 'tell', 'ur', 'mates', 'pobox']
['want', 'funk', 'ur', 'fone', 'weekly', 'new', 'tone', 'reply', 'text', 'original', 'n', 'best', 'tones', 'network', 'operator', 'rates', 'apply']
['cmon', 'babe', 'make', 'horny', 'txt', 'fantasy', 'babe', 'im', 'hot', 'sticky', 'need', 'replies', 'cost', 'cancel', 'send', 'stop']
['important', 'information', 'orange', 'user', 'today', 'ur', 'lucky', 'day', 'find', 'log', 'onto', 'http', 'fantastic', 'prizeawaiting']
['missed', 'call', 'alert', 'numbers', 'called', 'left', 'message']
['freemsg', 'records', 'indicate', 'may', 'entitled', 'pounds', 'accident', 'claim', 'free', 'reply', 'yes', 'msg', 'opt', 'text', 'stop']
['u', 'win', 'music', 'gift', 'vouchers', 'every', 'week', 'starting', 'txt', 'word', 'draw', 'tscs', 'skillgame']
['show',

['camera', 'awarded', 'sipix', 'digital', 'camera', 'call', 'fromm', 'landline', 'delivery', 'within', 'days']
['weekly', 'tones', 'ready', 'download', 'weeks', 'new', 'tones', 'include', 'crazy', 'f', 'black', 'p', 'info', 'n']
['get', 'lots', 'cash', 'weekend', 'dear', 'welcome', 'weekend', 'got', 'biggest', 'best', 'ever', 'cash', 'give', 'away']
['urgent', 'mobile', 'number', 'awarded', 'prize', 'guaranteed', 'call', 'land', 'line', 'claim', 'valid']
['thanks', 'continued', 'support', 'question', 'week', 'enter', 'u', 'draw', 'cash', 'name', 'new', 'us', 'president', 'txt', 'ans']
['unique', 'user', 'id', 'removal', 'send', 'stop', 'customer', 'services']
['urgent', 'landline', 'complimentary', 'ibiza', 'holiday', 'cash', 'await', 'collection', 'sae', 'cs', 'po', 'box']
['urgent', 'attempt', 'contact', 'u', 'prize', 'yesterday', 'still', 'awaiting', 'collection', 'claim', 'call']
['santa', 'calling', 'would', 'little', 'ones', 'like', 'call', 'santa', 'xmas', 'eve', 'call', 'book',

['private', 'account', 'statement', 'fone', 'shows', 'points', 'call', 'identifier', 'code', 'expires']
['chosen', 'receive', 'award', 'pls', 'call', 'claim', 'number', 'collect', 'award', 'selected', 'receive', 'valued', 'mobile', 'customer']
['someonone', 'know', 'trying', 'contact', 'via', 'dating', 'service', 'find', 'could', 'call', 'mobile', 'landline']
['urgent', 'please', 'call', 'landline', 'cash', 'holiday', 'await', 'collection', 'cs', 'sae', 'po', 'box']
['prize', 'go', 'another', 'customer', 'c', 'polo', 'ltd', 'suite', 'london', 'please', 'call', 'back', 'busy']
['urgent', 'mobile', 'number', 'awarded', 'prize', 'guaranteed', 'call', 'land', 'line', 'claim', 'valid']
['urgent', 'week', 'free', 'membership', 'prize', 'jackpot', 'txt', 'word', 'claim', 'c', 'lccltd', 'pobox']
['urgent', 'please', 'call', 'landline', 'cash', 'luxury', 'canary', 'islands', 'holiday', 'await', 'collection', 'cs', 'sae', 'po', 'box']
['xmas', 'iscoming', 'ur', 'awarded', 'either', 'cd', 'gift',

0       None
1       None
3       None
4       None
6       None
7       None
10      None
13      None
14      None
16      None
17      None
18      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
30      None
31      None
32      None
33      None
35      None
36      None
37      None
38      None
        ... 
5538    None
5539    None
5541    None
5542    None
5543    None
5544    None
5545    None
5546    None
5548    None
5549    None
5550    None
5551    None
5552    None
5553    None
5554    None
5555    None
5556    None
5557    None
5558    None
5559    None
5560    None
5561    None
5562    None
5563    None
5564    None
5565    None
5568    None
5569    None
5570    None
5571    None
Name: text, Length: 4825, dtype: object

In [13]:
spam_words


['free',
 'entry',
 'wkly',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 'may',
 'text',
 'fa',
 'receive',
 'entry',
 'question',
 'std',
 'txt',
 'rate',
 'c',
 'apply',
 'freemsg',
 'hey',
 'darling',
 'week',
 'word',
 'back',
 'like',
 'fun',
 'still',
 'tb',
 'ok',
 'xxx',
 'std',
 'chgs',
 'send',
 'rcv',
 'winner',
 'valued',
 'network',
 'customer',
 'selected',
 'receivea',
 'prize',
 'reward',
 'claim',
 'call',
 'claim',
 'code',
 'valid',
 'hours',
 'mobile',
 'months',
 'u',
 'r',
 'entitled',
 'update',
 'latest',
 'colour',
 'mobiles',
 'camera',
 'free',
 'call',
 'mobile',
 'update',
 'co',
 'free',
 'six',
 'chances',
 'win',
 'cash',
 'pounds',
 'txt',
 'send',
 'cost',
 'tsandcs',
 'apply',
 'reply',
 'hl',
 'info',
 'urgent',
 'week',
 'free',
 'membership',
 'prize',
 'jackpot',
 'txt',
 'word',
 'claim',
 'c',
 'lccltd',
 'pobox',
 'xxxmobilemovieclub',
 'use',
 'credit',
 'click',
 'wap',
 'link',
 'next',
 'txt',
 'message',
 'click',
 'http',
 'england