# Top ten "spammiest" words

In [1]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from textblob import TextBlob



In [2]:
# Read the tab-separated SMS ham/spam corpus. Column names are supplied
# explicitly, so pandas treats the first line of the file as data.
column_names = ["label", "message"]
df = pd.read_table(
    "../data/NLP_data/sms.tsv",
    encoding="utf-8",
    names=column_names,
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Null accuracy: relative frequency of each label; the largest share is the
# score obtained by always predicting the majority class
df["label"].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [4]:
# Features are the raw message texts; the target is the ham/spam label
X, y = df["message"], df["label"]

# Build the document-term matrix with a default-configured CountVectorizer
vect = CountVectorizer()
Xdtm = vect.fit_transform(X)

# Train multinomial Naive Bayes on the full matrix, then report its accuracy
# on that same (training) data
nb = MultinomialNB().fit(Xdtm, y)
nb.score(Xdtm, y)

0.99353912419239054

In [5]:
# Vocabulary learned by the vectorizer, as a plain Python list of strings.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (which returns an ndarray, hence .tolist());
# use whichever method this installation provides.
if hasattr(vect, "get_feature_names_out"):
    tokens = vect.get_feature_names_out().tolist()
else:
    tokens = vect.get_feature_names()
len(tokens)

8713

In [6]:
# Print a fixed slice of 50 consecutive vocabulary entries as a spot check.
# The parenthesised form is valid both as a Python 2 print statement and as
# a Python 3 function call.
print(tokens[3200:3250])

[u'fifteen', u'fifth', u'fifty', u'fight', u'fighting', u'fightng', u'fights', u'figure', u'figures', u'figuring', u'file', u'files', u'fill', u'filled', u'filling', u'fills', u'film', u'films', u'filth', u'filthy', u'filthyguys', u'final', u'finalise', u'finally', u'finance', u'financial', u'find', u'finding', u'finds', u'fine', u'finest', u'fingers', u'finish', u'finishd', u'finished', u'finishes', u'finishing', u'fink', u'finn', u'fire', u'fired', u'firefox', u'fireplace', u'fires', u'firmware', u'firsg', u'first', u'fish', u'fishhead', u'fishrman']


In [7]:
# How many times each token appears in each class, as accumulated by the
# fitted model: one row per class, one column per vocabulary token
# (the cells below read row 0 as "ham" and row 1 as "spam")
nb.feature_count_

array([[  0.,   0.,   1., ...,   1.,   0.,   1.],
       [ 10.,  29.,   0., ...,   0.,   1.,   0.]])

In [8]:
# Shape of the count matrix: (number of classes, number of vocabulary tokens)
nb.feature_count_.shape

(2, 8713)

In [9]:
# Row 0 of the count matrix: per-token counts over messages labelled "ham"
ham_token_count = nb.feature_count_[0]
ham_token_count

array([ 0.,  0.,  1., ...,  1.,  0.,  1.])

In [13]:
nb.feature_count_

array([[  0.,   0.,   1., ...,   1.,   0.,   1.],
       [ 10.,  29.,   0., ...,   0.,   1.,   0.]])

In [10]:
# Row 1 of the count matrix: per-token counts over messages labelled "spam"
spam_token_count = nb.feature_count_[1]
spam_token_count

array([ 10.,  29.,   0., ...,   0.,   1.,   0.])

In [11]:
# Tabulate the per-class counts with one row per token: the token text is
# the index (named 'token') and 'ham' / 'spam' are the count columns
token_index = pd.Index(tokens, name='token')
df_tokens = pd.DataFrame({'ham': ham_token_count,
                          'spam': spam_token_count},
                         index=token_index)

# Show a reproducible random sample of 10 rows
df_tokens.sample(10, random_state=12)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
weddin,1.0,0.0
gautham,3.0,0.0
lambda,1.0,0.0
salmon,1.0,0.0
live,17.0,29.0
memories,1.0,0.0
aproach,2.0,0.0
37819,0.0,1.0
algarve,0.0,2.0
versus,1.0,0.0


In [14]:
# Add 1 to the ham and spam counts to avoid dividing by 0 later on.
# The columns are rebuilt from the raw count arrays (which are in the same
# token order as df_tokens) rather than from df_tokens itself, so re-running
# this cell cannot add 1 a second time.
df_tokens['ham'] = ham_token_count + 1
df_tokens['spam'] = spam_token_count + 1
df_tokens.sample(10, random_state=12)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
weddin,2.0,1.0
gautham,4.0,1.0
lambda,2.0,1.0
salmon,2.0,1.0
live,18.0,30.0
memories,2.0,1.0
aproach,3.0,1.0
37819,1.0,2.0
algarve,1.0,3.0
versus,2.0,1.0


In [15]:
# Naive Bayes counts the number of observations (training messages) in each
# class; the cells below use entry 0 for ham and entry 1 for spam
nb.class_count_

array([ 4825.,   747.])

In [16]:
# Convert the add-one-smoothed ham and spam counts into per-class frequencies
# by dividing by the number of messages in the corresponding class.
# The values are recomputed from the raw count arrays (same token order as
# df_tokens) instead of dividing the existing columns in place, so re-running
# this cell does not divide a second time.
df_tokens['ham'] = (ham_token_count + 1) / nb.class_count_[0]
df_tokens['spam'] = (spam_token_count + 1) / nb.class_count_[1]
df_tokens.sample(10, random_state=12)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
weddin,0.000415,0.001339
gautham,0.000829,0.001339
lambda,0.000415,0.001339
salmon,0.000415,0.001339
live,0.003731,0.040161
memories,0.000415,0.001339
aproach,0.000622,0.001339
37819,0.000207,0.002677
algarve,0.000207,0.004016
versus,0.000415,0.001339


In [17]:
# Spam-to-ham ratio per token: how many times more frequent the token is in
# spam than in ham
df_tokens['spam_ratio'] = df_tokens['spam'].div(df_tokens['ham'])
df_tokens.sample(10, random_state=12)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
weddin,0.000415,0.001339,3.229585
gautham,0.000829,0.001339,1.614793
lambda,0.000415,0.001339,3.229585
salmon,0.000415,0.001339,3.229585
live,0.003731,0.040161,10.765283
memories,0.000415,0.001339,3.229585
aproach,0.000622,0.001339,2.153057
37819,0.000207,0.002677,12.91834
algarve,0.000207,0.004016,19.37751
versus,0.000415,0.001339,3.229585


In [18]:
# Rank tokens from highest to lowest spam_ratio and keep the first ten rows
df_tokens.sort_values(by='spam_ratio', ascending=False).iloc[:10]

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
claim,0.000207,0.15261,736.345382
prize,0.000207,0.125837,607.161981
150p,0.000207,0.096386,465.060241
tone,0.000207,0.08166,394.009371
18,0.000207,0.069612,335.876841
guaranteed,0.000207,0.068273,329.417671
500,0.000207,0.060241,290.662651
cs,0.000207,0.060241,290.662651
1000,0.000207,0.056225,271.285141
awarded,0.000207,0.052209,251.907631


# Above are the top ten "spammiest" words in the dataset, ranked by their spam-to-ham frequency ratio.