In [1]:
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
#fixed seed so the shuffle (and therefore the train/test split below) is reproducible
SHUFFLE_SEED = 42

#read dataset, keep the label (v1) and message (v2) columns, and give them readable names
#rename() returns a new frame here instead of mutating a column subset in place
spam_df = (
    pd.read_csv('spam.csv', encoding="ISO-8859-1")[['v1', 'v2']]
    .rename(columns={'v1': 'spam', 'v2': 'text'})
)

#translation table that deletes every punctuation character (built once, reused for all rows)
punctuation_table = str.maketrans('', '', string.punctuation)

spam_df = spam_df.assign(
    #convert spam column to binary: True for 'spam', False for anything else
    spam=spam_df['spam'] == 'spam',
    #lowercase everything and remove punctuation
    text=spam_df['text'].str.lower().str.translate(punctuation_table),
)

#shuffle all rows (frac=1 keeps every row, only the order changes)
spam_df = spam_df.sample(frac=1, random_state=SHUFFLE_SEED)

In [3]:
#display the cleaned, shuffled frame (the cell's last expression is rendered as its output)
spam_df

Unnamed: 0,spam,text
763,False,nothing but we jus tot u would ask cos u ba gu...
3365,False,i am waiting for your call sir
2582,True,3 free tarot texts find out about your love li...
721,False,sfine anytime all the best with it
2237,False,give her something to drink if she takes it an...
...,...,...
1396,False,shall i start from hear
4957,False,why didnt u call on your lunch
2534,False,ok enjoy r u there in home
4816,False,me too mark is taking forever to pick up my pr...


In [4]:
#show the first five spam messages, each followed by a separator line
for message in spam_df.loc[spam_df.spam == True, 'text'].head(5):
    print(message, '-------', sep='\n')

3 free tarot texts find out about your love life now try 3 for free text chance to 85555 16 only after 3 free msgs å£150 each
-------
22 146tf150p
-------
please call 08712402578 immediately as there is an urgent message waiting for you
-------
tddnewsletteremc1couk more games from thedailydraw dear helen dozens of free games  with great prizeswith
-------
you are being contacted by our dating service by someone you know to find out who it is call from a land line 09050000878 pobox45w2tg150p
-------


In [5]:
#show the first five non-spam messages, each followed by a separator line
for message in spam_df.loc[spam_df.spam == False, 'text'].head(5):
    print(message, '-------', sep='\n')

nothing but we jus tot u would ask cos u ba gua but we went mt faber yest yest jus went out already mah so today not going out jus call lor
-------
i am waiting for your call sir
-------
sfine anytime all the best with it
-------
give her something to drink if she takes it and doesnt vomit then you her temp might drop if she unmits however let me know
-------
cool ill text you in a few
-------


In [6]:
#get training set
train_spam_df = spam_df.iloc[:int(len(spam_df)*0.7)]

#get testing set
test_spam_df = spam_df.iloc[int(len(spam_df)*0.7):]

In [7]:
#prior probability of spam: the mean of the boolean label is the share of training rows marked spam
FRAC_SPAM_TEXTS = train_spam_df['spam'].mean()
print(FRAC_SPAM_TEXTS)

0.13282051282051283


In [8]:
#get all words from spam and non-spam datasets
train_spam_words = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_spam_df[train_spam_df.spam == False].text).split(' ')

common_words = set(train_spam_words).intersection(set(train_non_spam_words))

In [9]:
#relative frequency of each shared word among all spam training tokens
#count every token in a single pass instead of rescanning the whole list once per word
spam_word_counts = Counter(train_spam_words)
n_spam_words = len(train_spam_words)
train_spam_bow = {w: spam_word_counts[w] / n_spam_words for w in common_words}

In [10]:
#relative frequency of each shared word among all non-spam training tokens
#count every token in a single pass instead of rescanning the whole list once per word
non_spam_word_counts = Counter(train_non_spam_words)
n_non_spam_words = len(train_non_spam_words)
train_non_spam_bow = {w: non_spam_word_counts[w] / n_non_spam_words for w in common_words}

In [11]:
def predict_text(t, verbose=False):
    """Classify a message as spam or not spam from its word frequencies.

    Each class score is the sum of the log frequencies of the message's
    known words in that class, plus the log of the class prior
    (FRAC_SPAM_TEXTS for spam, 1 - FRAC_SPAM_TEXTS for non-spam).

    Parameters
    ----------
    t : list of str or str
        The message as a sequence of word tokens. A raw string is also
        accepted and is split on whitespace first. Tokens are looked up
        as-is, so they should be normalized the same way as the keys of
        train_spam_bow.
    verbose : bool, default False
        When True, print a per-word table of both frequencies and their
        ratio, followed by the two class scores.

    Returns
    -------
    bool
        True when the spam score is greater than or equal to the non-spam
        score (a tie is reported as spam), otherwise False.
    """
    #a bare string would otherwise be iterated character by character
    if isinstance(t, str):
        t = t.split()

    #only words with an entry in the spam BOW can be scored; anything else is disregarded
    #(the same words are then looked up in the non-spam BOW, so it must hold them too)
    valid_words = [w for w in t if w in train_spam_bow]

    #get the probabilities of each valid word showing up in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]

    #print probs if requested
    if verbose:
        data_df = pd.DataFrame({
            'word': valid_words,
            'spam_prob': spam_probs,
            'non_spam_prob': non_spam_probs,
            #guard against dividing by a zero non-spam frequency
            'ratio': [s/n if n > 0 else np.inf for s, n in zip(spam_probs, non_spam_probs)],
        })
        print(data_df)

    #calculate spam score as sum of logs for all probabilities, plus the log prior
    spam_score = sum(np.log(p) for p in spam_probs) + np.log(FRAC_SPAM_TEXTS)

    #calculate non-spam score the same way with the complementary prior
    non_spam_score = sum(np.log(p) for p in non_spam_probs) + np.log(1-FRAC_SPAM_TEXTS)

    #if verbose, report the two scores
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)

    #if spam score is higher (or tied), mark this as spam
    return (spam_score >= non_spam_score)

In [12]:
#tokenize every test message on whitespace and classify it
predictions = test_spam_df['text'].map(lambda message: predict_text(message.split()))

In [13]:
#of the test messages that really are spam, what share did the model flag as spam?
actual_spam = test_spam_df.spam == True
caught_spam = (predictions == True) & actual_spam
frac_spam_messages_correctly_detected = caught_spam.sum() / actual_spam.sum()
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected)

Fraction Spam Correctly Detected: 0.9039301310043668


In [14]:
#of the test messages that are not spam, what share did the model wrongly flag as spam?
actual_valid = test_spam_df.spam == False
false_alarms = (predictions == True) & actual_valid
frac_valid_sent_to_spam = false_alarms.sum() / actual_valid.sum()
print('Fraction Valid Messages Sent to Spam: %s'%frac_valid_sent_to_spam)

Fraction Valid Messages Sent to Spam: 0.02079002079002079


In [15]:
#score a hand-written message; verbose=True prints the per-word probabilities and both class scores
predict_text('urgent call this number'.split(), verbose=True)

     word  spam_prob  non_spam_prob       ratio
0  urgent   0.003223       0.000021  154.407285
1    call   0.019260       0.003570    5.395225
2    this   0.004755       0.003340    1.423442
3  number   0.001289       0.000981    1.314105
Spam Score: -23.70799659676579
Non-Spam Score: -29.183090969243914


True

In [16]:
#score a second hand-written message; verbose=True prints the per-word probabilities and both class scores
predict_text('hey do you want to go a movie tonight'.split(), verbose=True)

      word  spam_prob  non_spam_prob     ratio
0      hey   0.000242       0.001733  0.139525
1       do   0.001370       0.005094  0.268947
2      you   0.016762       0.025594  0.654909
3     want   0.001612       0.002296  0.701851
4       to   0.039568       0.022275  1.776335
5       go   0.001853       0.003758  0.493245
6        a   0.020953       0.015449  1.356280
7    movie   0.000081       0.000251  0.321682
8  tonight   0.000161       0.000939  0.171564
Spam Score: -59.003489101551054
Non-Spam Score: -50.342750499402634


False