# 使用朴素贝叶斯模型根据技能关键词预测招聘薪资

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import warnings
warnings.filterwarnings("ignore")

## 数据预处理

In [10]:
# Load the job-posting table from CSV (the file name indicates preprocessed
# Lagou web front-end postings for Beijing).
data = pd.read_csv('web前端拉勾北京_预处理.csv')

In [11]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,job,low_wages,hight_wages,area,ink,staff,skill,jobDes,inkDes
0,0,前端开发实习生-E方向,5,10,海淀区,字节跳动,2000.0,"JavaScript,Angular,React,web","六险一金,弹性工作,免费三餐,租房补贴","内容资讯,短视频,D轮及以上"
1,1,AI Lab前端开发实习生,4,8,海淀区,字节跳动,2000.0,"JavaScript,HTML,CSS,React,Vue","下午茶,团队氛围好,用户过亿,大牛带队","内容资讯,短视频,D轮及以上"
2,2,前端工程师,20,35,中关村,希悦,150.0,"TypeScript,React","技术前沿,代码规范,校园办公,周末双休","软件服务,咨询,不需要融资"
3,3,高级前端开发工程师,25,50,朝阳区,建信金科,2000.0,"JavaScript,Html,React,Vue,Web","福利待遇优厚,大平台,前景广阔","科技金融,不需要融资"
4,4,web前端教研讲师-少儿编程,15,30,海淀区,达内集团,2000.0,,"高薪,上市企业,金股奖励,六险一金","教育,培训,上市公司"


In [12]:
# Drop every row that has a missing value in any column.
data = data.dropna()

In [13]:
# Summarize column dtypes and non-null counts of the remaining rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335 entries, 0 to 448
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   335 non-null    int64  
 1   job          335 non-null    object 
 2   low_wages    335 non-null    int64  
 3   hight_wages  335 non-null    int64  
 4   area         335 non-null    object 
 5   ink          335 non-null    object 
 6   staff        335 non-null    float64
 7   skill        335 non-null    object 
 8   jobDes       335 non-null    object 
 9   inkDes       335 non-null    object 
dtypes: float64(1), int64(3), object(6)
memory usage: 28.8+ KB


In [14]:
# Lowercase the skill keywords so one skill is not counted under several
# spellings (e.g. "Html" vs "HTML").
data["skill"] = data["skill"].str.lower()

In [15]:
# Hold out 10% of the postings for evaluation. A fixed random_state makes the
# split, and therefore every score computed from it, reproducible across runs.
train, test = train_test_split(
    data, train_size=0.90, test_size=0.10, random_state=42
)

In [16]:
# Report the split sizes, then renumber both frames from 0 so their row
# labels no longer carry the shuffled labels of the original table.
print(f"训练集长度: {len(train)}\n测试集长度: {len(test)}")
train, test = (frame.reset_index(drop=True) for frame in (train, test))
print(f"训练集预览：\n{train.head()}")

训练集长度: 301
测试集长度: 34
训练集预览：
   Unnamed: 0                     job  low_wages  hight_wages area     ink  \
0         206  前端开发工程师-目的地 (MJ004864)         25           35  海淀区    去哪儿网   
1         219               资深前端开发工程师         40           80  海淀区  欧科云链集团   
2         407                   前端开发岗         13           25  丰台区    政务科技   
3         190                 前端开发工程师         20           30  四季青      美摄   
4          97        高级前端工程师(J413369)         15           25  朝阳区  我爱我家集团   

    staff                         skill               jobDes  \
0  2000.0   javascript,typescript,react        大平台,福利多,发展空间大   
1  2000.0  javascript,react,vue,node,js  六险一金,分布式办公,管晚餐,作息正常   
2   150.0         it,html,css,redux,web             五险一金,资源好   
3   150.0        it,javascript,html,vue   音视频行业领先,技术大牛,弹性不打卡   
4  2000.0            javascript,angular                 上市公司   

                    inkDes  
0              旅游,出行,不需要融资  
1                 区块链,上市公司  
2  软件服务,咨询,IT技术服务,咨询,不需要融资  
3 

In [17]:
# Model input is the skill text; the label is the `low_wages` column
# (presumably the lower bound of the advertised salary — confirm).
train_skill, train_target = list(train["skill"]), list(train["low_wages"])
test_skill, test_target = list(test["skill"]), list(test["low_wages"])

## 基于词袋的多项式朴素贝叶斯

### 使用CountVectorizer构建训练集词袋

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words vocabulary from the training skills only, then reuse
# that vocabulary for the test skills so both matrices share the same columns.
# NOTE(review): the default token pattern keeps only tokens of two or more
# word characters, so single-letter skills such as "c" would be dropped —
# confirm that this is acceptable for this data.
vec = CountVectorizer()
X_train_vec = vec.fit_transform(train_skill)
X_test_vec = vec.transform(test_skill)

y_train_vec = train_target
y_test_vec = test_target

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# Dense view of the training word counts, one column per vocabulary term.
vec_result = pd.DataFrame(X_train_vec.toarray(), columns=feature_names)
vec_result

Unnamed: 0,android,angular,cocos,css,es,golang,hlml,html,hybrid,hybridapp,...,struts,tcp,tomcat,typescript,ugui,unity,vue,web,windows,xml
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
297,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
298,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
299,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


### 使用多项式朴素贝叶斯进行建模和预测

In [20]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the word counts; every distinct
# value in the training labels is treated as its own class.
mnb_vec = MultinomialNB().fit(X_train_vec, y_train_vec)
y_predict_vec = mnb_vec.predict(X_test_vec)

# Overall accuracy first, then the per-class precision/recall breakdown.
print(f"预测准确率为:{mnb_vec.score(X_test_vec,y_test_vec)}\n")
print(classification_report(y_test_vec, y_predict_vec))

预测准确率为:0.14705882352941177

              precision    recall  f1-score   support

           8       0.00      0.00      0.00         1
          12       1.00      0.50      0.67         2
          13       1.00      1.00      1.00         1
          14       0.00      0.00      0.00         1
          15       0.08      0.40      0.13         5
          16       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         4
          20       0.25      0.10      0.14        10
          22       0.00      0.00      0.00         1
          25       0.00      0.00      0.00         6
          30       0.00      0.00      0.00         2

    accuracy                           0.15        34
   macro avg       0.21      0.18      0.18        34
weighted avg       0.17      0.15      0.13        34



In [75]:
# Spot-check the model on one randomly chosen test posting.
test_index = np.random.randint(len(test_skill))

print(f"需要预测的 test_skill[{test_index}]: \n{test_skill[test_index]}\n")
# Vectorize with the already-fitted vocabulary so the columns match the model.
wordbag = vec.transform([test_skill[test_index]])
print(f"预测信息转换为词袋为:\n{wordbag.toarray()[0]}\n")
print(f"预测信息的实际薪资为: {test_target[test_index]}")
print(f"朴素贝叶斯模型预测的薪资结果为: {mnb_vec.predict(wordbag)[0]}\n")

需要预测的 test_skill[27]: 
javascript,typescript

预测信息转换为词袋为:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0]

预测信息的实际薪资为: 15
朴素贝叶斯模型预测的薪资结果为: 20

