### link: https://viblo.asia/p/ly-thuyet-ve-mang-bayes-va-ung-dung-vao-bai-toan-loc-thu-rac-07LKXzkelV4


### https://www.kaggle.com/pablovargas/naive-bayes-svm-spam-filtering

In [1]:
from pyvi import ViTokenizer #For split vietnamese words
import pandas as pd #For reading xlsx file
from gensim.parsing.preprocessing import strip_non_alphanum, strip_multiple_whitespaces,preprocess_string, split_alphanum, strip_short, strip_numeric
import re 

In [2]:
with open('spam.txt', 'r', encoding = 'utf-8') as f:
    
    for line in f:
#         line = re.sub(r"", "", line)
        line = strip_non_alphanum(line).lower().strip()
        ## Tách từ vô nghĩa
        line = split_alphanum(line)
        
        ## Loại bỏ các từ đứng 1 mình
        line = strip_short(line, minsize=2)
        
        ## Loại bỏ hết các số trong văn bản
        line = strip_numeric(line)
        
        ## Ghép từ tiếng việt
        line = ViTokenizer.tokenize(line)        

In [3]:
def raw_text_preprocess(raw):
#     raw = re.sub(r"http\S+", "", raw)
    raw = strip_non_alphanum(raw).lower().strip()
#     raw = split_alphanum(raw)
    raw = strip_short(raw, minsize=2)
    raw = strip_numeric(raw)
    raw = ViTokenizer.tokenize(raw)

    return raw

### Đọc  dữ liệu đã thu nhập rồi xử lý

In [4]:
document = []
label = []
with open('spam.txt', 'r', encoding = 'utf-8') as f:
    for line in f:
        document.append(line[6: ])
        label.append(1)
with open('nonspam_email.txt', 'r', encoding = 'utf-8') as f:
    for line in f:
        document.append(line[9: ])
        label.append(0)
print(len(document))

62


In [5]:
document = [raw_text_preprocess(d) for d in document]
document_test = document[10: 30]
label_test = label[10: 30]

### Xây dựng Bag_of_words
    - Chứa tất cả các từ có trong tập dữ liệu
    - Tách các văn bản thành các từ sau đó loại bỏ tất cả các những từ trùng nhau.

In [6]:
set_words = []

for doc in document:
    words = doc.split(' ')
    set_words += words
    set(set_words)
print(len(set_words))
# print(set_words)

9945


### Chuyển văn bản sang các vector
    - Với mỗi văn bản trong tập dữ liệu, tạo thành 1 vecto 0 với số thuộc tính là chiều dài cảu bag of words.
    - Sau đó kiêm tra xem từng từ trong bag of words có nằm trong văn bản không, nếu có thì gán thuộc tính đó bằng 1.

In [7]:
import numpy as np
vectors = []

for doc in document:
    vector = np.zeros(len(set_words))
    for i, word in enumerate(set_words):
        if word in doc:
            vector[i] = 1
    vectors.append(vector)
np.shape(vectors)

(62, 9945)

### Tính các xác suất cần thiết của Naive Bayes

In [8]:
def smoothing(a, b):
    return float((a + 1)/(b + 1))

In [9]:
spam = 0
non_spam = 0

for l in label:
    if l == 1:
        spam += 1
    else:
        non_spam += 1
print(spam, non_spam)

39 23


In [10]:
spam_coef = smoothing(spam, (spam + non_spam))
non_spam_coef = smoothing(non_spam, (spam + non_spam))

### Các xác suất thành phần

In [11]:
bayes_matrix = np.zeros((len(set_words), 4))
##app/spam, app/nonspam, nonapp/spam, nonapp/nonspam

In [12]:
for i, word in enumerate(set_words):
    app_spam = 0
    app_nonspam = 0
    nonapp_spam = 0
    nonapp_nonspam = 0
    for k, v in enumerate(vectors):
        if v[i] == 1:
            if label[k] == 1:
                app_spam += 1
            else:
                app_nonspam += 1
        else:
            if label[k] == 1:
                nonapp_spam += 1
            else:
                nonapp_nonspam += 1
                
    bayes_matrix[i][0] = smoothing(app_spam, spam)
    bayes_matrix[i][1] = smoothing(app_nonspam, non_spam)
    bayes_matrix[i][2] = smoothing(nonapp_spam, spam)
    bayes_matrix[i][3] = smoothing(nonapp_nonspam, non_spam)

### Phân loại thư mới

In [13]:
new_document = document[30] 
new_document = raw_text_preprocess(new_document)

#Vectorizer
vector = np.zeros(len(set_words))
for i, word in enumerate(set_words):
    if word in new_document:
        vector[i] = 1

In [14]:
log = np.zeros(2)

predict_spam = spam_coef #P(spam)
predict_non_spam = non_spam_coef #P(non_spam)

index = 0

for i, v in enumerate(vector):
    if v == 0:
        predict_spam *= bayes_matrix[i][2] #P(xi|cj)
        predict_non_spam *= bayes_matrix[i][3]
    else:
        predict_spam *= bayes_matrix[i][0]
        predict_non_spam *= bayes_matrix[i][1]
    
    if predict_spam < 1e-10:
        predict_spam *= 1000
        log[0] += 1
    
    if predict_non_spam < 1e-10:
        predict_non_spam *= 1000
        log[1] +=1

In [15]:
def compare(predict_spam, predict_non_spam, log):
    while (log[0] > log[1]):
        predict_spam /= 10
        log[0] -=1
        if predict_spam > predict_non_spam:
            return True
        
    while(log[1] > log[0]):
        predict_non_spam /= 10
        log[1] -= 1
        if predict_non_spam > predict_spam:
            return False
        
    if predict_spam > predict_non_spam:
        return True
    return False

In [16]:
def predict(mail):
    mail = raw_text_preprocess(mail)
    
    vector = np.zeros(len(set_words))
    for i, word in enumerate(set_words):
        if word in mail:
            vector[i] = 1
    log = np.zeros(2)

    predict_spam = spam_coef
    predict_non_spam = non_spam_coef

    for i, v in enumerate(vector):
        if v == 0:
            predict_spam *= bayes_matrix[i][2]
            predict_non_spam *= bayes_matrix[i][3]
        else:
            predict_spam *= bayes_matrix[i][0]
            predict_non_spam *= bayes_matrix[i][1]

        if predict_spam < 1e-10:
            predict_spam *= 1000
            log[0] += 1

        if predict_non_spam < 1e-10:
            predict_non_spam *= 1000
            log[1] +=1
            
    if compare(predict_spam, predict_non_spam, log):
        return 1
    return 0

In [17]:
from sklearn.metrics import accuracy_score
pred = [predict(d) for d in document_test]
accuracy_score(label_test, pred)

0.8