In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('./finalData/finalData.csv')
tagInfo = pd.read_csv("./finalData/tagInfo.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,tag,job,low_wages,hight_wages,year_wages,city,ink,staff,skill,jobDes,inkDes
0,0,0,运维工程师（数据库方向）,9.0,15.0,144.0,上海,嘉扬,500,"SQL,SQL,Server,ORACLE,MySQL,SQLServer,DBA,DB",,"移动互联网,不需要融资"
1,1,0,dba数据库工程师,6.0,10.0,96.0,上海,领途网络,50,"SQL,Server,MySQL,ORACLE,PostgreSQL,PSQL","技术总监亲自带队,五险一金,补贴等,组织旅游","人工智能服务,物联网,网络通信,不需要融资"
2,2,0,数据库工程师dba,8.0,9.0,102.0,广州,巨杉数据库,500,"JAVA,Python,Shell,Linux",公司正规发展空间大注重培养五险一金,"数据服务,咨询,D轮及以上"
3,3,0,数据库工程师dba,8.0,9.0,102.0,北京,巨杉数据库,500,"JAVA,Shell,Hadoop,Spark,Linux",公司正规发展空间大注重培养五险一金,"数据服务,咨询,D轮及以上"
4,4,0,图数据库开发工程师,10.0,15.0,150.0,北京,赛思信安,500,"JAVA,Titan,janusGraph","五险一金,培养,员工体检,不加班,年底奖金","数据服务,咨询,不需要融资"


In [4]:
data = data.dropna()

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2074 entries, 1 to 2691
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   2074 non-null   int64  
 1   tag          2074 non-null   int64  
 2   job          2074 non-null   object 
 3   low_wages    2074 non-null   float64
 4   hight_wages  2074 non-null   float64
 5   year_wages   2074 non-null   float64
 6   city         2074 non-null   object 
 7   ink          2074 non-null   object 
 8   staff        2074 non-null   int64  
 9   skill        2074 non-null   object 
 10  jobDes       2074 non-null   object 
 11  inkDes       2074 non-null   object 
dtypes: float64(3), int64(3), object(6)
memory usage: 210.6+ KB


In [6]:
# 全部小写
data["skill"] = [text.lower() for text in data["skill"]]

In [7]:
train, test = train_test_split(data, train_size=0.90, test_size=0.10)

In [8]:
print(f"训练集长度: {len(train)}\n测试集长度: {len(test)}")
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(f"训练集预览：\n{train.head()}")

训练集长度: 1866
测试集长度: 208
训练集预览：
   Unnamed: 0  tag         job  low_wages  hight_wages  year_wages city   ink  \
0        2436    7     业务运维工程师       15.0         30.0       270.0   深圳    领星   
1         495    2  web前端开发工程师       10.0         15.0       150.0   西安  杭州世平   
2        1459    4       测试工程师        8.0         11.0       114.0   广州   赛博威   
3        2057    6     移动开发工程师       13.0         18.0       186.0   成都   NCS   
4        1614    4       测试工程师        7.0          8.0        90.0   上海  速强信息   

   staff                                        skill                jobDes  \
0   2000  java,python,mysql,kubernetes,k,s,prometheus  高速增长,扁平管理,团队氛围好,文化活泼   
1    500                            html,html,css,vue        五险一金,交通补助,话费补助   
2   2000                             sql,server,mysql             周末双休,五险一金   
3   2000                                      it,java   出国工作机会,跨国同事合作,团队氛围好   
4    500                                      it,java       平时项目组不加班,加班有加班费   

        

In [9]:
train_skill = list(train["skill"])
train_target = list(train["tag"])
test_skill = list(test["skill"])
test_target = list(test["tag"])

## 基于词袋的多项式朴素贝叶斯

### 使用CountVectorizer构建训练集词袋

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
X_train_vec = vec.fit_transform(train_skill)
X_test_vec = vec.transform(test_skill)

y_train_vec = train_target
y_test_vec = test_target

vec_result = pd.DataFrame(X_train_vec.toarray(), columns = vec.get_feature_names())
vec_result

Unnamed: 0,activemq,activity,ad,adams,adb,ai,aiops,aix,allegro,allure,...,vxlan,waf,web,webgl,weblogic,webpack,windows,xhtml,xml,zabbix
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1861,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1862,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1863,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1864,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 使用多项式朴素贝叶斯进行建模和预测

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
mnb_vec = MultinomialNB()
mnb_vec.fit(X_train_vec,y_train_vec)
y_predict_vec = mnb_vec.predict(X_test_vec)
print(f"预测准确率为:{mnb_vec.score(X_test_vec,y_test_vec)}\n")
print(classification_report(y_test_vec,y_predict_vec))

预测准确率为:0.6875

              precision    recall  f1-score   support

           0       0.68      0.72      0.70        18
           1       0.67      0.18      0.29        11
           2       0.82      0.87      0.84        46
           3       0.81      0.60      0.69        35
           4       0.50      0.64      0.56        33
           5       1.00      0.60      0.75        20
           6       0.80      0.50      0.62         8
           7       0.58      0.81      0.67        37

    accuracy                           0.69       208
   macro avg       0.73      0.62      0.64       208
weighted avg       0.72      0.69      0.68       208



In [15]:
print(f"类别参考标准:\n{tagInfo}\n")
test_index = np.random.randint(len(test_skill));
print(f"需要预测的 test_skill[{test_index}]: \n{test_skill[test_index]}\n")
wordbag = vec.transform([test_skill[test_index]])
print(f"预测信息转换为词袋为:\n{wordbag.toarray()[0]}\n")
print(f"预测信息的实际职位为: {test_target[test_index]}")
print(f"朴素贝叶斯模型预测的职位结果为: {mnb_vec.predict(wordbag)[0]}\n")
if test_target[test_index] == mnb_vec.predict(wordbag)[0]:
    print("模型预测准确")
else:
    print("模型预测不准确")

类别参考标准:
   Unnamed: 0     class  tag
0           0   DBA.csv    0
1           1  人工智能.csv    1
2           2  前端开发.csv    2
3           3  后端开发.csv    3
4           4    测试.csv    4
5           5  硬件开发.csv    5
6           6  移动开发.csv    6
7           7    运维.csv    7

需要预测的 test_skill[78]: 
it,python,mysql

预测信息转换为词袋为:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0]

预测信息的实际职位为: 4
朴素贝叶斯模型预测的职位结果为: 4

模型预测准确


In [13]:
# 模型保存
with open('model/predict.pickle','wb') as f:
    pickle.dump(mnb_vec,f)
with open('model/wordbag.pickle','wb') as f:
    pickle.dump(vec,f)