# 使用朴素贝叶斯模型检测垃圾邮件

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import warnings
warnings.filterwarnings("ignore")

## 数据预处理

In [4]:
data = pd.read_csv('web前端拉勾北京_预处理.csv')

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,job,low_wages,hight_wages,area,ink,staff,jobDes,inkDes
0,0,前端开发实习生-E方向,5,10,海淀区,字节跳动,2000.0,"六险一金,弹性工作,免费三餐,租房补贴","内容资讯,短视频,D轮及以上"
1,1,AI Lab前端开发实习生,4,8,海淀区,字节跳动,2000.0,"下午茶,团队氛围好,用户过亿,大牛带队","内容资讯,短视频,D轮及以上"
2,2,前端工程师,20,35,中关村,希悦,150.0,"技术前沿,代码规范,校园办公,周末双休","软件服务,咨询,不需要融资"
3,3,高级前端开发工程师,25,50,朝阳区,建信金科,2000.0,"福利待遇优厚,大平台,前景广阔","科技金融,不需要融资"
4,4,web前端教研讲师-少儿编程,15,30,海淀区,达内集团,2000.0,"高薪,上市企业,金股奖励,六险一金","教育,培训,上市公司"


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   450 non-null    int64  
 1   job          450 non-null    object 
 2   low_wages    450 non-null    int64  
 3   hight_wages  450 non-null    int64  
 4   area         450 non-null    object 
 5   ink          377 non-null    object 
 6   staff        441 non-null    float64
 7   jobDes       449 non-null    object 
 8   inkDes       423 non-null    object 
dtypes: float64(1), int64(3), object(5)
memory usage: 31.8+ KB


In [8]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18650 entries, 0 to 18650
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    18650 non-null  object
 1   Label   18650 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 437.1+ KB


In [9]:
# 去除网站
data["Body"] = [re.sub(r"http\S+", "", text) for text in data["Body"]]

In [10]:
# 去除单词和字母
data["Body"] = [re.sub("[^a-zA-Z0-9]"," ",text) for text in data["Body"]]

In [11]:
# 全部小写
data["Body"] = [text.lower() for text in data["Body"]]

In [12]:
train, test = train_test_split(data, train_size=0.90, test_size=0.10)

In [13]:
print(f"训练集长度: {len(train)}\n测试集长度: {len(test)}")
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(f"训练集预览：\n{train.head()}")

训练集长度: 16785
测试集长度: 1865
训练集预览：
                                                Body  Label
0  url   date  2002 10 06t18 12 56 01 00in the se...      0
1  subject  failure notice  this is the mail deli...      1
2   have you checked your personal credit reports...      1
3  subject  california litigation team   weekly c...      0
4  url   date  not supplied img   an alberta teen...      0


In [14]:
train_body = list(train["Body"])
train_label = list(train["Label"])
test_body = list(test["Body"])
test_label = list(test["Label"])

## 基于词袋的多项式朴素贝叶斯

### 使用CountVectorizer构建训练集词袋

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
X_train_vec = vec.fit_transform(train_body)
X_test_vec = vec.transform(test_body)

y_train_vec = train_label
y_test_vec = test_label

vec_result = pd.DataFrame(X_train_vec.toarray(), columns = vec.get_feature_names())
vec_result

Unnamed: 0,00,000,0000,000000,00000000,000000000,000000000003619,000000000005168,000000000005409,000000000005412,...,zzxtfeerekvwkug,zzyudgpd,zzzason,zzzglvaa,zzzickeletto,zzzlist,zzzz,zzzzason,zzzzcc,zzzzteana
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16780,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16783,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 使用多项式朴素贝叶斯进行建模和预测

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
mnb_vec = MultinomialNB()
mnb_vec.fit(X_train_vec,y_train_vec)
y_predict_vec = mnb_vec.predict(X_test_vec)
print(f"预测准确率为:{mnb_vec.score(X_test_vec,y_test_vec)}\n")
print(classification_report(y_test_vec,y_predict_vec))

预测准确率为:0.9689008042895443

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1136
           1       0.95      0.97      0.96       729

    accuracy                           0.97      1865
   macro avg       0.97      0.97      0.97      1865
weighted avg       0.97      0.97      0.97      1865



In [24]:
# 选取邮件测试
mail_index = np.random.randint(len(test_body));

print(f"预测邮件信息为 test_body[{mail_index}]: \n{test_body[mail_index]}\n")
one_mail = vec.transform([test_body[mail_index]])
print(f"邮件信息转换为词袋为:\n{one_mail.toarray()[0]}\n")
print(f"预测邮件的所属类别为: {test_label[mail_index]}")
print(f"朴素贝叶斯模型预测的结果为: {mnb_vec.predict(one_mail)[0]}\n")
if test_label[mail_index] == mnb_vec.predict(one_mail)[0]:
    print("模型预测准确")
else:
    print("模型预测不准确")

预测邮件信息为 test_body[1328]: 
subject  weekend noms                                              forwarded by ami chokshi   corp   enron on 09   08   2000  09   46 am                                                        royal   b   edmondson   reliantenergy   com on 09   07   2000 02   50   28 pm  to   ami   chokshi   enron   com  cc    subject   weekend noms    see attached file   hpl   sept   xls      hpl   sept   xls

邮件信息转换为词袋为:
[0 0 0 ... 0 0 0]

预测邮件的所属类别为: 0
朴素贝叶斯模型预测的结果为: 0

模型预测准确


### 使用TF-IDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit(train_body)
X_train = tfidf.transform(train_body)
X_test = tfidf.transform(test_body)

y_train = train_label
y_test = test_label
# 由于数据量过大导致内存溢出,所以下面语句不执行
# tfidf_result = pd.DataFrame(X_train.toarray(), columns = vec.get_feature_names())
# tfidf_result

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_train,y_train)
y_predict = mnb_tfidf.predict(X_test)
print('Accuracy of Native Bayes Classifier is :{} \n'.format(mnb_tfidf.score(X_test,y_test)))
print(classification_report(y_test,y_predict))

Accuracy of Native Bayes Classifier is :0.9131367292225201 

              precision    recall  f1-score   support

           0       0.89      0.99      0.93      1136
           1       0.97      0.80      0.88       729

    accuracy                           0.91      1865
   macro avg       0.93      0.89      0.91      1865
weighted avg       0.92      0.91      0.91      1865



In [27]:
# 随机选取邮件测试
mail_index = np.random.randint(len(test_body));
print(f"预测邮件信息为 test_body[{mail_index}]: \n{test_body[mail_index]}\n")
one_mail = tfidf.transform([test_body[mail_index]])
print(f"邮件信息转换为tf-idf特征为:\n{one_mail.toarray()}\n")
print(f"预测邮件的所属类别为: {test_label[mail_index]}")
print(f"tfidf朴素贝叶斯模型预测的结果为: {mnb_tfidf.predict(one_mail)[0]}\n")
if test_label[mail_index] == mnb_tfidf.predict(one_mail)[0]:
    print("模型预测准确")
else:
    print("模型预测不准确")

预测邮件信息为 test_body[1600]: 
1  fight the risk of cancer   slim down   guaranteed to lose 10 12 lbs in 30 days  get the child support you deserve   free legal advice  join the web s fastest growing singles community  start your private photo album online   a wonderful day  offer manager prizemamaif you wish to leave this list please use the link below  

邮件信息转换为tf-idf特征为:
[[0. 0. 0. ... 0. 0. 0.]]

预测邮件的所属类别为: 1
tfidf朴素贝叶斯模型预测的结果为: 1

模型预测准确


## 中文

In [37]:
import jieba

sentences = [
    ['我来到北京清华大学'],
    ['我来到天津天津大学'],
    ['我到河北省来']
]

label = [1,1,0]

words = []
for i in sentences:
    seg_list = jieba.cut(i[0])
    words.append(" ".join(seg_list))
print(f"分词之后的结果为:\n{words}")

分词之后的结果为:
['我 来到 北京 清华大学', '我 来到 天津 天津大学', '我 到 河北省 来']


In [40]:
def words_bag(sentences):
    # 分词, 需要导入jieba
    words = []
    for i in sentences:
        seg_list = jieba.cut(i[0])
        words.append("/".join(seg_list).split("/"))
    print(words)
    words_set = set([])
    # 创建两个集合的并集
    for i in words:
        words_set = words_set.union(set(i))
    words_set = list(words_set)
    print(words_set)
    train_matrix = np.zeros((len(sentences),len(words_set)))
    for i, k in enumerate(words):
        print("第{}个文本为：\n {}".format(i+1, k))
        vec = [0] * len(words_set)
        for word in k:
            if word in words_set:
                vec[words_set.index(word)] += 1
            else:
                print("the word: %s is not in my vocabulary!" % word)        
        train_matrix[i,:] = vec
        print("第{}个文本的向量化表示为：\n {} \n".format(i+1,train_matrix[i]))
    print(f"训练后的词袋:\n{train_matrix}")
    return train_matrix

In [41]:
train = pd.DataFrame(words_bag(sentences))
label = [1,1,0]

[['我', '来到', '北京', '清华大学'], ['我', '来到', '天津', '天津大学'], ['我', '到', '河北省', '来']]
['来', '天津', '到', '我', '天津大学', '北京', '来到', '河北省', '清华大学']
第1个文本为：
 ['我', '来到', '北京', '清华大学']
第1个文本的向量化表示为：
 [0. 0. 0. 1. 0. 1. 1. 0. 1.] 

第2个文本为：
 ['我', '来到', '天津', '天津大学']
第2个文本的向量化表示为：
 [0. 1. 0. 1. 1. 0. 1. 0. 0.] 

第3个文本为：
 ['我', '到', '河北省', '来']
第3个文本的向量化表示为：
 [1. 0. 1. 1. 0. 0. 0. 1. 0.] 

训练后的词袋:
[[0. 0. 0. 1. 0. 1. 1. 0. 1.]
 [0. 1. 0. 1. 1. 0. 1. 0. 0.]
 [1. 0. 1. 1. 0. 0. 0. 1. 0.]]


### 训练

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
mnb1 = MultinomialNB()
mnb1.fit(train,label)
y_predict = mnb1.predict(train)
print(f"模型准确度: {mnb1.score(train,label)}")
print(classification_report(label,y_predict))

模型准确度: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



### idf

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
train = tfidf.fit_transform(words)
train = pd.DataFrame(train.toarray())
train

Unnamed: 0,0,1,2,3,4,5
0,0.622766,0.0,0.0,0.47363,0.0,0.622766
1,0.0,0.622766,0.622766,0.47363,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0


In [44]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
mnb = MultinomialNB()
mnb.fit(train,label)
y_predict = mnb.predict(train)
print('Accuracy of Native Bayes Classifier is :{} \n'.format(mnb.score(train,label)))
print(classification_report(label,y_predict))

Accuracy of Native Bayes Classifier is :1.0 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

