In [1]:
import re
import data_analysis_tools as da
def tokenize(message: str):
    """Return the set of distinct lowercase alphanumeric tokens in ``message``.

    The text is lowercased first; every maximal run of ASCII letters or
    digits then becomes one token, and any other character acts as a
    separator. Duplicates collapse because the result is a set.
    """
    return {match.group() for match in re.finditer("[a-z0-9]+", message.lower())}

# Sanity check: tokens come back lowercased and de-duplicated, with
# punctuation and whitespace dropped.
tokenize("Hello world! Today is August 4 2021.") 

{'2021', '4', 'august', 'hello', 'is', 'today', 'world'}

In [2]:
# Load the labelled e-mails and keep only the (text, label) pair of each row.
data = [
    (row['text'], row['label'])
    for row in da.file_ops.read_csv('./files/spam_ham_dataset.csv')
]
# Show one raw example, a separator line, and the token set of its text.
print(
    data[4],
    '---------------------------------------------------------------------',
    tokenize(data[4][0]),
    sep='\n',
)

("Subject: re : indian springs\nthis deal is to book the teco pvr revenue . it is my understanding that teco\njust sends us a check , i haven ' t received an answer as to whether there is a\npredermined price associated with this deal or if teco just lets us know what\nwe are giving . i can continue to chase this deal down if you need .", 'ham')
---------------------------------------------------------------------
{'us', 'understanding', 'know', 'whether', 'the', 'predermined', 'check', 'this', 'teco', 'revenue', 'subject', 'or', 'we', 're', 't', 'associated', 'down', 'springs', 'can', 'there', 'is', 'received', 'that', 'giving', 'you', 'i', 'indian', 'deal', 'lets', 'what', 'to', 'chase', 'haven', 'need', 'as', 'are', 'sends', 'it', 'my', 'an', 'book', 'answer', 'just', 'price', 'pvr', 'with', 'if', 'continue', 'a'}


In [3]:
# Placeholder class totals; both are reassigned from the training split
# further down in this cell.
spam_count = ham_count = 0

def spam_ham_count_from_data(data):
    """Count the spam and non-spam examples in ``data``.

    Args:
        data: iterable of ``(message, label)`` pairs.

    Returns:
        ``(spam_count, ham_count)``. A pair counts as spam only when its
        label equals ``'spam'``; every other label is counted as ham.
    """
    labels = [label for _, label in data]
    n_spam = sum(1 for label in labels if label == 'spam')
    return n_spam, len(labels) - n_spam

# Split the labelled messages into train and test sets, then estimate the
# class frequencies from the training split only.
train, test = da.random.split_data(data, 0.2)
print(f'train: {len(train)} test {len(test)}')
spam_count, ham_count = spam_ham_count_from_data(train)

# Every training example is counted as exactly one of spam or ham, so the
# two counts add up to len(train).
p_ham = ham_count / len(train)
p_spam = spam_count / len(train)
print(f'ham: {ham_count}, spam: {spam_count}\nham p= {p_ham}, spam p= {p_spam}')

train: 4136 test 1035
ham: 2944, spam: 1192
ham p= 0.7117988394584139, spam p= 0.28820116054158607


In [4]:
from collections import defaultdict
def count_words(training_set):
    """Count, per word, how many spam and non-spam messages contain it.

    Args:
        training_set: iterable of ``(message, label)`` pairs. A message is
            treated as spam only when its label equals ``'spam'``.

    Returns:
        A ``defaultdict`` mapping each word to the two-element list
        ``[spam_messages, non_spam_messages]``. Words never seen map to
        ``[0, 0]`` on first access.
    """
    SPAM, NON_SPAM = 0, 1
    per_word = defaultdict(lambda: [0, 0])

    for text, label in training_set:
        column = SPAM if label == 'spam' else NON_SPAM
        # tokenize() yields distinct tokens, so a message adds at most one
        # to any given word.
        for token in tokenize(text):
            per_word[token][column] += 1

    return per_word

# Per-word message counts over the training split. Despite the variable
# name, each value is ordered [spam, ham] (index 0 is spam in count_words).
ham_spam_counts = count_words(train)

In [5]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Convert per-word message counts into smoothed conditional probabilities.

    Args:
        counts: mapping ``word -> (spam_count, non_spam_count)``.
        total_spams: number of spam messages the counts were taken from.
        total_non_spams: number of non-spam messages the counts were taken from.
        k: pseudocount added to every count (``2 * k`` is added to each total).

    Returns:
        A list of ``(word, p(word | spam), p(word | ~spam))`` triplets, in the
        iteration order of ``counts``.
    """
    spam_denominator = total_spams + 2 * k
    non_spam_denominator = total_non_spams + 2 * k

    triplets = []
    for word, (in_spam, in_non_spam) in counts.items():
        triplets.append((word,
                         (in_spam + k) / spam_denominator,
                         (in_non_spam + k) / non_spam_denominator))
    return triplets

def spam_probability(word_probs, message):
    """Estimate the probability that ``message`` is spam.

    Every vocabulary word contributes to both class likelihoods: its
    conditional probability when it occurs in the message, and the
    complement of that probability when it does not. No class prior is
    applied, so both classes are weighted equally.

    Args:
        word_probs: iterable of ``(word, p(word | spam), p(word | ~spam))``
            triplets, e.g. the output of ``word_probabilities``.
        message: raw message text; it is split into tokens with ``tokenize``.

    Returns:
        A float in ``[0, 1]``. With an empty ``word_probs`` there is no
        evidence for either class and the result is ``0.5``.

    Raises:
        ValueError: if a probability in ``word_probs`` is exactly 0 or 1, so
            that the logarithm of zero would be needed.
    """
    from math import exp, log

    message_words = tokenize(message)
    log_prob_spam = log_prob_nospam = 0.0

    for word, prob_spam, prob_nospam in word_probs:
        if word in message_words:
            log_prob_spam += log(prob_spam)
            log_prob_nospam += log(prob_nospam)
        else:
            log_prob_spam += log(1.0 - prob_spam)
            log_prob_nospam += log(1.0 - prob_nospam)

    # With a large vocabulary both log-likelihoods are hugely negative, so
    # exponentiating them separately underflows to 0.0 and the ratio becomes
    # 0/0. Work with their difference instead:
    #   p(spam) = e^ls / (e^ls + e^ln) = 1 / (1 + e^(ln - ls))
    # and only ever exponentiate a non-positive number, so exp() can
    # underflow harmlessly to 0.0 but never overflow.
    log_odds_against = log_prob_nospam - log_prob_spam
    if log_odds_against >= 0:
        ratio = exp(-log_odds_against)
        return ratio / (1.0 + ratio)
    return 1.0 / (1.0 + exp(log_odds_against))

# Smoothed per-word probabilities (default k=0.5) built from the training
# counts and the training class totals.
word_probs = word_probabilities(ham_spam_counts, spam_count, ham_count)

In [6]:
# Inspect one held-out example: print its text and true label, then let the
# cell display the spam probability estimated for it.
test_index = 1
message, label = test[test_index][:2]
print(f'{message}\n{label} ')
spam_probability(word_probs, message)

Subject: wed unify mtg 2 : 00 - 2 allen center , 12 th floor conf room
attached is the presentation that jeff reviewed last week in the meeting .
- - - - - - - - - - - - - - - - - - - - - - forwarded by brenda f herod / hou / ect on 06 / 27 / 2000
08 : 39 pm - - - - - - - - - - - - - - - - - - - - - - - - - - -
jeff johnson @ enron
06 / 19 / 2000 05 : 34 pm
to : beth perlman / hou / ect @ ect , bryce baxter / hou / ect @ ect , brent a
price / hou / ect @ ect , leslie reeves / hou / ect @ ect , laura e scott / cal / ect @ ect ,
sheila glover / hou / ect @ ect , jefferson d sorenson / hou / ect @ ect , robert
superty / hou / ect @ ect , brenda f herod / hou / ect @ ect , kenneth m
harmon / hou / ect @ ect , dave nommensen / hou / ect @ ect , regan m smith / hou / ect @ ect ,
laray . odum @ luminant . com , nadeem . abbasi @ luminant . com , scott
williamson / hou / ect @ ect , paul f poellinger / hou / ect @ ect , john
simmons / na / enron @ enron , chris hanz / corp / enron @ enron , sa

0.0

In [7]:
# Compare the true label with the estimated spam probability for the first
# 50 held-out examples.
for test_index in range(50):
    message, label = test[test_index][:2]
    print(f'actual: {label}, predicted spam prob: {spam_probability(word_probs, message)}')

actual: ham, predicted spam prob: 4.074643458321017e-34
actual: ham, predicted spam prob: 0.0
actual: ham, predicted spam prob: 1.3309044680619976e-68
actual: ham, predicted spam prob: 3.0118928083352554e-32
actual: ham, predicted spam prob: 3.805504854071566e-25
actual: ham, predicted spam prob: 0.0
actual: ham, predicted spam prob: 1.9254460668151395e-61
actual: ham, predicted spam prob: 0.0
actual: spam, predicted spam prob: 1.0
actual: ham, predicted spam prob: 0.0
actual: spam, predicted spam prob: 4.2040825553278734e-14
actual: ham, predicted spam prob: 4.0926219152236524e-22
actual: ham, predicted spam prob: 8.674727768735842e-38
actual: spam, predicted spam prob: 1.9302380206633338e-10
actual: spam, predicted spam prob: 7.300563740446168e-11
actual: ham, predicted spam prob: 0.0
actual: ham, predicted spam prob: 4.550775546608689e-35
actual: ham, predicted spam prob: 0.0
actual: ham, predicted spam prob: 2.482139125519755e-23
actual: ham, predicted spam prob: 2.648495674547708e

In [8]:
# Switch to the packaged classifier from data_analysis_tools and fit it on
# the same training split.
# NOTE(review): `k` is presumably the same additive-smoothing pseudocount as
# in word_probabilities (which defaults to 0.5) — confirm against da.ml.
classifier = da.ml.NaiveBayesClassifier(k=0.01)
classifier.train(train)

In [9]:

# Compare the true label with the packaged classifier's output for
# held-out examples 60 through 69.
for test_index in range(60, 70):
    message, label = test[test_index][:2]
    print(f'actual: {label}, predicted spam prob: {classifier.classify(message)}')

actual: spam, predicted spam prob: 1.0
actual: spam, predicted spam prob: 4.3062193849647404e-13
actual: ham, predicted spam prob: 8.0099593110892e-40
actual: ham, predicted spam prob: 1.3126400865957259e-80
actual: ham, predicted spam prob: 0.0
actual: ham, predicted spam prob: 1.860959326964966e-103
actual: ham, predicted spam prob: 1.6594643156190965e-48
actual: ham, predicted spam prob: 1.2524650911525044e-20
actual: ham, predicted spam prob: 3.467234431841853e-99
actual: ham, predicted spam prob: 1.2459195050807694e-63


In [10]:
# Score an ad-hoc, hand-written message; the value is shown as the cell output.
classifier.classify("I am new to this company, so because of that i need your assistance sir!")

8.150060847152339e-12

In [11]:
# Score an ad-hoc message built from spam-flavoured words.
# NOTE(review): the recorded output is about 1.9e-08, far below the 0.7
# cut-off used when testing the model — worth investigating.
classifier.classify("Free viagra, click now!")

1.882828309290226e-08

In [13]:
# Evaluate the classifier on the held-out split.
# NOTE(review): 0.7 is presumably the spam-probability cut-off — confirm
# against da.ml.
# The counts are unpacked as (tp, fp, tn, fn) but handed to the metric
# helpers as (tp, fp, fn, tn); that argument order is kept exactly as is.
tp, fp, tn, fn = classifier.test(test, 0.7)
print(
    f'accuracy score: {da.ml.accuracy(tp, fp, fn, tn)}',
    f'precision score: {da.ml.precision(tp, fp, fn, tn)}',
    f'recall score: {da.ml.recall(tp, fp, fn, tn)}',
    f'f1 score: {da.ml.f1_score(tp, fp, fn, tn)}',
    sep='\n',
)

accuracy score: 0.8753623188405797
precision score: 0.9587628865979382
recall score: 0.6058631921824105
f1 score: 0.7425149700598803


In [18]:
# Re-run the evaluation with a much smaller second argument (1e-9).
# The counts are unpacked as (tp, fp, tn, fn) but handed to the metric
# helpers as (tp, fp, fn, tn); that argument order is kept exactly as is.
tp, fp, tn, fn = classifier.test(test, 1e-9)
print(
    f'accuracy score: {da.ml.accuracy(tp, fp, fn, tn)}',
    f'precision score: {da.ml.precision(tp, fp, fn, tn)}',
    f'recall score: {da.ml.recall(tp, fp, fn, tn)}',
    f'f1 score: {da.ml.f1_score(tp, fp, fn, tn)}',
    sep='\n',
)

accuracy score: 0.9091787439613527
precision score: 0.941908713692946
recall score: 0.739413680781759
f1 score: 0.8284671532846715


In [21]:
def p_spam_given_word(word_prob):
    """Return p(spam | message contains word) for one word triplet.

    Args:
        word_prob: ``(word, p(word | spam), p(word | ~spam))``.

    Returns:
        ``p(word | spam) / (p(word | spam) + p(word | ~spam))``, i.e. the
        posterior with both classes weighted equally.
    """
    _word, given_spam, given_nospam = word_prob
    evidence = given_spam + given_nospam
    return given_spam / evidence

# Rank the classifier's vocabulary in ascending order of p(spam | word):
# the head of the list is the most ham-like, the tail the most spam-like.
words = list(classifier.word_probs)
words.sort(key=p_spam_given_word)

hammiest_words, spammiest_words = words[:10], words[-10:]

print(
    f'spammiest words: {spammiest_words}\n',
    f'hammiest words: {hammiest_words}',
    sep='\n',
)

"""
    Improvement ideas: 

    1- This classifier takes into account every word that appears in the training set, even words
    that appear only once. Modify the classifier to accept an optional min_count threshold
    and ignore tokens that don’t appear at least that many times.

    2- Use a stemmer algorithm for words. https://tartarus.org/martin/PorterStemmer/index-old.html

"""

spammiest words: [('cheap', 0.04111508196171205, 3.3967160549181054e-06), ('xp', 0.042792906159292626, 3.3967160549181054e-06), ('biz', 0.046987466653244075, 3.3967160549181054e-06), ('cialis', 0.046987466653244075, 3.3967160549181054e-06), ('drugs', 0.04782637875203436, 3.3967160549181054e-06), ('paliourg', 0.052859851344776096, 3.3967160549181054e-06), ('php', 0.05621549973993725, 3.3967160549181054e-06), ('viagra', 0.05957114813509841, 3.3967160549181054e-06), ('meds', 0.06292679653025957, 3.3967160549181054e-06), ('prescription', 0.06628244492542072, 3.3967160549181054e-06)]

hammiest words: [('enron', 8.389120987902887e-06, 0.3994572047744241), ('hpl', 8.389120987902887e-06, 0.30536817005319256), ('daren', 8.389120987902887e-06, 0.2768357551918805), ('meter', 8.389120987902887e-06, 0.209580777304502), ('nom', 8.389120987902887e-06, 0.15693167845327138), ('mmbtu', 8.389120987902887e-06, 0.14198612781163172), ('xls', 8.389120987902887e-06, 0.13960842657318903), ('volumes', 8.3891209