# Email classification based on Bayesian analysis

## 1 Prepare

### 1.1. Import package

In [64]:
import os  # I/O
import re  # regular expression

import numpy as np # linear algebra
import pandas as pd # data processing and CSV file I/O
import plotly.graph_objects as go    # for plot
from matplotlib import pyplot as plt # for plot

from sklearn.model_selection import train_test_split, cross_validate # split train and test
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

RANDOM_SEED = 2022

### 1.2. Show datasets in project 

In [39]:
for dirname, _, filenames in os.walk(f'.\data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

.\data\SMSSpamCollection
.\data\spam_ham_dataset.csv


You only need to run one of cells following.

#### 1.2.1. DataSet 1

In [79]:
df=pd.read_csv(f'.\data\spam_ham_dataset.csv')
df.drop([df.columns[0]], axis=1,inplace=True)
df.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


#### 1.2.2. DataSet 2

In [66]:
df = pd.read_csv('./data/SMSSpamCollection',sep='\t')
df.head()
label = []
for w in range(len(df.text)):
    if df['label'][w] == 'ham':
        label.append(0)
    else:
        label.append(1)
df['label_num'] = label
df.head()

Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### 1.3 Data Clean

In [89]:
clean_txt = []
for w in range(len(df.text)):
   desc = df['text'][w].lower()
   #remove punctuation
   desc = re.sub('[^a-zA-Z]', ' ', desc)
   #remove tags
   desc = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",desc)
   #remove digits and special chars
   desc = re.sub("(\\d|\\W)+"," ",desc)
   clean_txt.append(desc)
df['clean'] = clean_txt
df.head()

Unnamed: 0,label,text,label_num,clean
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter this is a follow ...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom for january see attached file ...
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho we re around to ...
3,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...
4,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject re indian springs this deal is to book...


## 2 Split Training data and Test data

**CountVectorizer** is a great tool provided by the scikit-learn library in Python. It is used to transform a given text into a **vector on the basis of the frequency (count) of each word** that occurs in the entire text.

In [116]:
text_train, text_test, label_train, label_test = train_test_split(df['clean'], df['label_num'], test_size=0.2, random_state=RANDOM_SEED)
vectorizer_text = CountVectorizer(max_df=0.9, min_df=10)
x_train = vectorizer_text.fit_transform(text_train)
x_test = vectorizer_text.transform(text_test)
# print(x_train.toarray())
print(label_train.values)

[0 0 0 ... 0 0 0]


## 3 Build in implementation and it's performance

In [83]:
for NB in [BernoulliNB(), MultinomialNB(), ComplementNB()]:
    NB.fit(x_train, label_train)
    p_test = NB.predict(x_test)
    test_acc = accuracy_score(label_test, p_test)        # accuracy
    test_precision = precision_score(label_test, p_test) # (spam and label as spam)/(labeled as spam)
    test_recall = recall_score(label_test, p_test)       # (spam and label as spam)/(all spam)
    print(NB)
    print('accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}'.format(test_acc, test_precision, test_recall)) # print performace

BernoulliNB()
accuracy: 0.9785, precision: 0.9618, recall: 0.8935
MultinomialNB()
accuracy: 0.9776, precision: 0.9390, recall: 0.9112
ComplementNB()
accuracy: 0.9614, precision: 0.8351, recall: 0.9290


## 4 Our Result

In [123]:
def fit(X_train, label_train):
    spam_num = len(label_train[label_train==1])
    ham_num = len(label_train) - spam_num
    p_spam = spam_num/len(label_train)
    x_data = x_train.toarray()
    p_spam_cond = np.zeros(x_data.shape[1])
    p_ham_cond = np.zeros(x_data.shape[1])
    for j in range(x_data.shape[1]):
        for i in range(x_data.shape[0]):
            if x_data[i,j]==1 and label_train[i]==1:
                p_spam_cond[j] = p_spam_cond[j]+1/spam_num
            if x_data[i,j]==1 and label_train[i]==0:
                p_ham_cond[j] = p_spam_cond[j]+1/ham_num
    return (p_spam, p_spam_cond, p_ham_cond)

In [137]:
def predict(x_test, p_spam, p_spam_cond, p_ham_cond):
    label_pre = np.zeros(x_test.shape[0])
    for i in range(x_test.shape[0]):
        px_spam = p_spam
        px_ham = 1-p_spam
        for j in range(len(p_spam_cond)):
            if x_test[i,j] == 1:
                px_spam = px_spam*p_spam_cond[j]
                px_ham = px_ham*p_ham_cond[j]
            else:
                px_spam = px_spam*(1-p_spam_cond[j])
                px_ham = px_ham*(1-p_ham_cond[j])
        label_pre[i] = int(px_spam>px_ham)
    return label_pre

In [138]:
p_spam, p_spam_cond, p_ham_cond = fit(x_train, label_train.values)
label_pre = predict(x_test, p_spam, p_spam_cond, p_ham_cond)

In [145]:
test_acc = accuracy_score(label_test, label_pre)
test_precision = precision_score(label_test, label_pre)
test_recall = recall_score(label_test, label_pre)
print('accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}'.format(test_acc, test_precision, test_recall)) # print performace
# print(len(label_pre[label_pre==1]))

accuracy: 0.8657, precision: 0.9820, recall: 0.5467


In [146]:
label_pre = predict(x_test, p_spam, p_spam_cond, p_ham_cond)