# Importing libraries

In [2]:
import re
import string
import pandas as pd
import os
from numpy import *
from sklearn.model_selection import train_test_split

# Parsing text
this func turns a given string into a list of lowercase words without punctuation

In [3]:
def textParse(email: str):
    return re.sub("[^\w\s]", "", email).replace("Subject", "", 1).lower().split()

# Creates a list of distinct words used in an emails dataset

In [13]:
def createVocabList(dataSet: list):
    vocab_words_spam = []
    for sentence in dataSet:
        for word in textParse(sentence):
            vocab_words_spam.append(word)

    return list(dict.fromkeys(vocab_words_spam))

createVocabList(['send us your password', 'review our website', 'send your password', 'send us your account'])

['send', 'us', 'your', 'password', 'review', 'our', 'website', 'account']

# Importing the dataset (CSV)

In [22]:
sppams = []
for filename in os.listdir('spam-ham/spam'):
     f = open(filename,'r')
     sppams.append(f.read())

dfs = pd.read_csv('spam_ham_dataset.csv')
Y = dfs['label_num']

# TODO: install sklearn to split the data
X_train, X_test, Y_train, Y_test = train_test_split(dfs, Y, test_size=0.2, random_state=1, stratify=Y)

spams_train = X_train['text'][X_train['label'] == 'spam']
hams_train = X_train['text'][X_train['label'] == 'ham']

FileNotFoundError: [Errno 2] No such file or directory: '1.txt'

# Smoothing

In [14]:
spam_emails = ['send us your password', 'review our website', 'send your password', 'send us your account']
ham_emails = ['Your activity report','benefits physical activity', 'the importance vows']

def wordsProb(emails: list):
    dict_prob = {}
    for w in createVocabList(emails):
        emails_with_w = 0     # counter
        for sentence in emails:
            if w in sentence:
                emails_with_w+=1

        print(f"Number of emails with the word {w}: {emails_with_w}")
        prob = (emails_with_w+1)/(len(emails)+2)
        print(f"Prob of the word '{w}': {prob} \n")
        dict_prob[w.lower()] = prob
    return dict_prob

dict_spamicity = wordsProb(spam_emails)
dict_hamicity = wordsProb(ham_emails)


prob_spam = len(spam_emails) / (len(spam_emails)+(len(ham_emails)))
print(prob_spam)
print(dict_spamicity)
prob_ham = len(ham_emails) / (len(spam_emails)+(len(ham_emails)))
print(prob_ham)
print(dict_hamicity)

Number of emails with the word send: 3
Prob of the word 'send': 0.6666666666666666 

Number of emails with the word us: 2
Prob of the word 'us': 0.5 

Number of emails with the word your: 3
Prob of the word 'your': 0.6666666666666666 

Number of emails with the word password: 2
Prob of the word 'password': 0.5 

Number of emails with the word review: 1
Prob of the word 'review': 0.3333333333333333 

Number of emails with the word our: 4
Prob of the word 'our': 0.8333333333333334 

Number of emails with the word website: 1
Prob of the word 'website': 0.3333333333333333 

Number of emails with the word account: 1
Prob of the word 'account': 0.3333333333333333 

Number of emails with the word your: 0
Prob of the word 'your': 0.2 

Number of emails with the word activity: 2
Prob of the word 'activity': 0.6 

Number of emails with the word report: 1
Prob of the word 'report': 0.4 

Number of emails with the word benefits: 1
Prob of the word 'benefits': 0.4 

Number of emails with the word p

# Tockenizing test data

In [15]:
tests = ['renew your password', 'renew your vows', 'benefits of our account', 'the importance of physical activity']

distinct_words_as_sentences_test = [textParse(sentence) for sentence in tests]

print(distinct_words_as_sentences_test)

test_spam_tokenized = [distinct_words_as_sentences_test[0], distinct_words_as_sentences_test[1]]
test_ham_tokenized = [distinct_words_as_sentences_test[2], distinct_words_as_sentences_test[3]]

# test_spam_tokenized = X_test['text'][X_test['label'] == 'spam']
# test_ham_tokenized = X_test['text'][X_test['label'] == 'ham']

[['renew', 'your', 'password'], ['renew', 'your', 'vows'], ['benefits', 'of', 'our', 'account'], ['the', 'importance', 'of', 'physical', 'activity']]


In [None]:
def classifyNB(vec2Classify, p0Vect, p1Vect, pC1):
    # Calcul de probabilité selon la loi de Bernoulli
    p1 = sum(vec2Classify*log(p1Vect)+(1-vec2Classify)*log(1-p1Vect))+log(pC1)
    p0 = sum(vec2Classify*log(p0Vect)+(1-vec2Classify)*log(1-p0Vect))+log(1-pC1)
    if p1 > p0:
        return 1
    else:
        return 0

In [None]:
def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
        p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
        testEntry = ['love', 'my', 'dalmation']
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print (testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)
        testEntry = ['stupid', 'garbage']
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print (testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)

# Predicting

In [20]:
def mult(list_):
    total_prob = 1
    for i in list_:
         total_prob = total_prob * i
    return total_prob

def Bayes(email):
    probs = []
    for word in email:
        Pr_S = prob_spam
        try:
            pr_WS = dict_spamicity[word]
        except KeyError:
            pr_WS = 1/(len(spam_emails)+2)

        Pr_H = prob_ham
        try:
            pr_WH = dict_hamicity[word]
        except KeyError:
            pr_WH = (1/(len(ham_emails)+2))

        prob_word_is_spam_BAYES = (pr_WS*Pr_S)/((pr_WS*Pr_S)+(pr_WH*Pr_H))
        probs.append(prob_word_is_spam_BAYES)
    print(f"All word probabilities for this sentence: {probs}")
    final_classification = mult(probs)
    if final_classification >= 0.5:
        print(f'email is SPAM: with spammy confidence of {final_classification*100}%')
    else:
        print(f'email is HAM: with spammy confidence of {final_classification*100}%')
    return final_classification
for email in test_spam_tokenized:
    print('')
    print(f"           Testing stemmed SPAM email {email} :")
    print('                 Test word by word: ')
    all_word_probs = Bayes(email)
    print(all_word_probs)

for email in test_ham_tokenized:
    print('')
    print(f"           Testing stemmed HAM email {email} :")
    print('                 Test word by word: ')
    all_word_probs = Bayes(email)
    print(all_word_probs)


           Testing stemmed SPAM email ['renew', 'your', 'password'] :
                 Test word by word: 
All word probabilities for this sentence: [0.5263157894736842, 0.8163265306122448, 0.7692307692307692]
email is HAM: with spammy confidence of 33.04965710980748%
0.3304965710980748

           Testing stemmed SPAM email ['renew', 'your', 'vows'] :
                 Test word by word: 
All word probabilities for this sentence: [0.5263157894736842, 0.8163265306122448, 0.35714285714285715]
email is HAM: with spammy confidence of 15.344483658124902%
0.15344483658124902

           Testing stemmed HAM email ['benefits', 'of', 'our', 'account'] :
                 Test word by word: 
All word probabilities for this sentence: [0.35714285714285715, 0.5263157894736842, 0.847457627118644, 0.689655172413793]
email is HAM: with spammy confidence of 10.985968720749856%
0.10985968720749856

           Testing stemmed HAM email ['the', 'importance', 'of', 'physical', 'activity'] :
               