In [1]:
import numpy as np
import time
import os
import gensim
import jieba
import warnings
import pandas as pd
import jieba.posseg as pseg
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim
from tqdm import tqdm_notebook
from gensim.models.doc2vec import Doc2Vec
from gensim import corpora, models
from pyspark.sql import SparkSession

# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

# Suppress all Python warnings for the remainder of the session.
warnings.filterwarnings(action='ignore')



In [2]:
jieba.load_userdict('../model/backup_cc/user_dict.txt') # load the custom user dictionary into jieba

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ianxu\AppData\Local\Temp\jieba.cache
Loading model cost 0.627 seconds.
Prefix dict has been built succesfully.


In [3]:
def CreateSparkSession(appName="ResumeBehavior"):
    """Return a SparkSession configured for this notebook.

    The session uses the ``local[*]`` master and the given application name.
    ``getOrCreate`` is used, so an already existing session is returned
    instead of building a second one.

    Args:
        appName: Application name passed to the session builder.

    Returns:
        The object returned by ``SparkSession.builder...getOrCreate()``.
    """
    session_builder = (
        SparkSession.builder
        .master("local[*]")
        .appName(appName)
    )
    return session_builder.getOrCreate()

def getSparkSessionInstance():
    """Return the session cached in this module's globals, creating it on first use.

    The session is stored under the global name
    ``sparkSessionResumeBehaviorInstance``; ``CreateSparkSession()`` is only
    called when that name is not yet present.
    """
    module_globals = globals()
    if "sparkSessionResumeBehaviorInstance" in module_globals:
        return module_globals["sparkSessionResumeBehaviorInstance"]
    # First call: build the session lazily and remember it for later calls.
    module_globals["sparkSessionResumeBehaviorInstance"] = CreateSparkSession()
    return module_globals["sparkSessionResumeBehaviorInstance"]

In [4]:
# Parquet dataset that is later read into `resumeDF` (the "resumes" table).
parquet_file = '../../IRecruit_proj/trunk/IResume/ResumeAnalyse/model/ParquetResume'
# Parquet dataset that is later read into `postDF` (the "post" table).
post_file = '../../IRecruit_proj/trunk/IResume/ResumeAnalyse/model/ParquetPostSim'

In [5]:
# Get the shared SparkSession, read the resume Parquet data and expose it
# to Spark SQL under the name "resumes".
sqlCxt = getSparkSessionInstance()
resumeDF = sqlCxt.read.parquet(parquet_file)
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0 (the SparkSession API used here is 2.0+).
resumeDF.createOrReplaceTempView("resumes")

In [6]:
# Read the post-similarity Parquet data and expose it to Spark SQL as "post".
postDF = sqlCxt.read.parquet(post_file)
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0 (the SparkSession API used here is 2.0+).
postDF.createOrReplaceTempView("post")

In [7]:
# For every PostType 1..11 fetch the 6000 rows of group 78 with the highest
# Sim value. The per-type frames are collected in a list and concatenated
# once: growing a DataFrame with pd.concat inside the loop re-copies all
# previous rows on each iteration and starts from an empty, untyped frame.
post_frames = []
for i in tqdm_notebook(range(1, 12)):
    temp_post = sqlCxt.sql(
        "select * from (SELECT ResumeId, PostType, Sim, "
        "Row_Number() OVER (order by Sim desc) as rank "
        f"FROM post WHERE GroupId = 78 AND PostType = {i}) as temp "
        "where rank <= 6000"
    ).toPandas()
    # `rank` was only needed for the top-N filter; drop it without mutating in place.
    post_frames.append(temp_post.drop('rank', axis=1))

# Row indices are kept as returned per query (not renumbered), as before.
high_proba_resumes = pd.concat(post_frames)

A Jupyter Widget




In [8]:
# Fetch the text content of every selected resume.
resumeIds = high_proba_resumes.ResumeId.tolist()

# Build the IN (...) list explicitly instead of interpolating tuple(...):
# the repr of a one-element tuple is "(5,)" and of an empty tuple "()",
# both of which are invalid SQL. The same ResumeId may also occur more than
# once in the list, so duplicates are removed to keep the statement small.
unique_resume_ids = sorted(set(resumeIds))
if not unique_resume_ids:
    raise ValueError("high_proba_resumes contains no ResumeId values; nothing to query")
# int() guarantees that only numeric literals are placed into the statement.
resume_id_sql = ", ".join(str(int(resume_id)) for resume_id in unique_resume_ids)

resume_df = sqlCxt.sql(
    f"SELECT ResumeId, Content FROM resumes WHERE GroupId = 78 and ResumeId in ({resume_id_sql})"
).toPandas()

In [9]:
# Attach the resume text to each (ResumeId, PostType, Sim) row; the inner join
# keeps only rows whose ResumeId is present in both frames.
high_proba_resumes = pd.merge(
    high_proba_resumes,
    resume_df,
    how='inner',
    on='ResumeId',
)

In [11]:
# Print a summary of the merged frame (index range, columns, dtypes, non-null counts, memory usage).
high_proba_resumes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60581 entries, 0 to 60580
Data columns (total 4 columns):
ResumeId    60581 non-null int64
PostType    60581 non-null int64
Sim         60581 non-null float64
Content     60581 non-null object
dtypes: float64(1), int64(2), object(1)
memory usage: 2.3+ MB


In [16]:
label2pos = {'1': 'php', '2': 'java', '3': 'c++', '4': 'web前端', '5': 'ios',\
            '6': 'android', '7': '系统测试', '8': '产品策划', '9': '产品运营', '10': '基础研究', '11': '游戏客户端'}

lda_model_path = '../model/post_lda_models/'
if not os.path.exists(lda_model_path):
    os.makedirs(lda_model_path)
    
topic_model_list = []
corpus_list = []
dictionary_list = []
    
num_topics = 6
for i in range(1, 12):
    print(label2pos[str(i)], ':')
    resume_corpus = high_proba_resumes[high_proba_resumes.PostType==i].Content.apply(lambda x: x.split(',')).tolist()
    dictionary_LDA = corpora.Dictionary(resume_corpus)
    dictionary_LDA.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in resume_corpus]
    
    %time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                      id2word=dictionary_LDA, \
                                      passes=50, alpha=[1]*num_topics, \
                                      eta=[0.01]*len(dictionary_LDA.keys()))
    
    lda_model.save(lda_model_path + 'LDA_%s'% i )
    topic_model_list.append(lda_model)
    corpus_list.append(corpus)
    dictionary_list.append(dictionary_LDA)
    
    for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=50):
        print(str(i)+": "+ topic)
        print('-------------------------------------------------------------------------')
    print('===========================================================================================================================\n')

php :
Wall time: 1min 38s
0: 0.019*"业绩" + 0.018*"职责" + 0.016*"有限公司" + 0.016*"中国" + 0.013*"简介" + 0.010*"北京" + 0.009*"企业" + 0.008*"情况" + 0.007*"人数" + 0.007*"产品" + 0.007*"下属" + 0.007*"研发" + 0.007*"雇主" + 0.007*"名称" + 0.006*"薪酬" + 0.006*"建设" + 0.006*"安全" + 0.005*"集团" + 0.005*"科技" + 0.005*"发展" + 0.005*"保密" + 0.005*"客户" + 0.005*"员工" + 0.004*"相关" + 0.004*"实施" + 0.004*"参与" + 0.004*"上海" + 0.004*"全球" + 0.004*"完成" + 0.004*"维护" + 0.004*"中心" + 0.004*"需求" + 0.004*"网络" + 0.004*"学历" + 0.004*"团队" + 0.004*"生产" + 0.004*"部门" + 0.004*"汇报" + 0.004*"业务" + 0.004*"对象" + 0.004*"培训" + 0.004*"包括" + 0.004*"组织" + 0.004*"国际" + 0.003*"学校名称" + 0.003*"国家" + 0.003*"招聘" + 0.003*"数据中心" + 0.003*"学院" + 0.003*"沟通"
-------------------------------------------------------------------------
1: 0.014*"业务" + 0.009*"优化" + 0.008*"百度" + 0.008*"redis" + 0.008*"架构" + 0.007*"接口" + 0.006*"完成" + 0.006*"监控" + 0.006*"产品" + 0.006*"支持" + 0.005*"广告" + 0.005*"存储" + 0.005*"框架" + 0.005*"mysql" + 0.005*"php" + 0.005*"团队" + 0.005*"数据库" + 0.005*"接入" 

4: 0.032*"环境" + 0.024*"软件" + 0.018*"开发工具" + 0.017*"oracle" + 0.011*"java" + 0.009*"硬件" + 0.009*"eclipse" + 0.008*"linux" + 0.008*"报表" + 0.007*"设备" + 0.007*"采用" + 0.006*"应用" + 0.006*"监控" + 0.006*"维护" + 0.006*"服务器" + 0.005*"hibernate" + 0.005*"sql" + 0.005*"tomcat" + 0.005*"相关" + 0.005*"windows" + 0.004*"编写" + 0.004*"移动" + 0.004*"struts" + 0.004*"jsp" + 0.004*"运行" + 0.004*"采集" + 0.004*"测试" + 0.004*"处理" + 0.003*"工具" + 0.003*"web" + 0.003*"完成" + 0.003*"人员" + 0.003*"weblogic" + 0.003*"myeclipse" + 0.003*"server" + 0.003*"部门" + 0.003*"综合" + 0.003*"子系统" + 0.003*"电信" + 0.003*"集成" + 0.003*"过程" + 0.003*"查询" + 0.003*"文档" + 0.003*"参与" + 0.003*"部分" + 0.003*"统计" + 0.003*"简介" + 0.003*"ibm" + 0.003*"终端" + 0.002*"j2ee"
-------------------------------------------------------------------------
5: 0.026*"银行" + 0.019*"支付" + 0.016*"交易" + 0.011*"金融" + 0.010*"处理" + 0.010*"资金" + 0.009*"核心" + 0.009*"报表" + 0.009*"平安" + 0.008*"账户" + 0.007*"贷款" + 0.007*"产品" + 0.007*"结算" + 0.006*"风险" + 0.006*"查询" + 0.006*"流程" + 0.0

2: 0.026*"网站" + 0.025*"完成" + 0.020*"交互" + 0.019*"css" + 0.014*"代码" + 0.014*"编写" + 0.014*"jquery" + 0.013*"html" + 0.012*"产品" + 0.012*"优化" + 0.012*"需求" + 0.012*"javascript" + 0.012*"js" + 0.011*"效果" + 0.011*"布局" + 0.010*"浏览器" + 0.010*"运用" + 0.010*"设计" + 0.009*"web" + 0.009*"html5" + 0.009*"css3" + 0.009*"网页" + 0.008*"移动" + 0.008*"科技" + 0.008*"ajax" + 0.008*"维护" + 0.006*"配合" + 0.006*"学院" + 0.006*"利用" + 0.006*"熟悉" + 0.006*"制作" + 0.006*"体验" + 0.006*"了解" + 0.006*"界面" + 0.005*"熟练" + 0.005*"学习" + 0.005*"解决" + 0.005*"ui" + 0.005*"bootstrap" + 0.005*"相关" + 0.005*"性能" + 0.005*"响应" + 0.005*"静态" + 0.005*"div" + 0.004*"应用" + 0.004*"学历" + 0.004*"参与" + 0.004*"模块" + 0.004*"调试" + 0.004*"学校名称"
-------------------------------------------------------------------------
3: 0.055*"设计" + 0.026*"app" + 0.025*"产品" + 0.017*"网站" + 0.011*"制作" + 0.010*"活动" + 0.010*"移动" + 0.009*"提供" + 0.008*"官网" + 0.008*"pc" + 0.008*"ui" + 0.007*"服务" + 0.007*"企业" + 0.007*"交互" + 0.007*"需求" + 0.007*"客户" + 0.007*"科技" + 0.007*"商城" + 0.0

0: 0.023*"游戏" + 0.021*"环境" + 0.020*"软件" + 0.018*"sdk" + 0.018*"客户端" + 0.015*"开发工具" + 0.010*"管理" + 0.009*"支付" + 0.009*"eclipse" + 0.008*"下载" + 0.007*"后台" + 0.007*"视频" + 0.007*"硬件" + 0.007*"信息" + 0.007*"服务器" + 0.007*"包括" + 0.006*"广告" + 0.006*"公司" + 0.006*"支持" + 0.006*"http" + 0.005*"接入" + 0.005*"登录" + 0.005*"ios" + 0.005*"采用" + 0.005*"windows" + 0.005*"版本" + 0.004*"java" + 0.004*"聊天" + 0.004*"数据库" + 0.004*"消息" + 0.004*"移动" + 0.004*"pc" + 0.004*"接口" + 0.004*"com" + 0.004*"语音" + 0.004*"助手" + 0.004*"推送" + 0.003*"编写" + 0.003*"文件" + 0.003*"发送" + 0.003*"展示" + 0.003*"协议" + 0.003*"好友" + 0.003*"地图" + 0.003*"web" + 0.003*"studio" + 0.003*"微信" + 0.003*"上传" + 0.003*"图片" + 0.003*"中心"
-------------------------------------------------------------------------
1: 0.023*"产品" + 0.018*"设计" + 0.016*"公司" + 0.016*"职责" + 0.013*"研发" + 0.013*"团队" + 0.012*"参与" + 0.012*"需求" + 0.011*"移动" + 0.011*"完成" + 0.010*"客户端" + 0.009*"业绩" + 0.008*"业务" + 0.008*"维护" + 0.008*"简介" + 0.008*"雇主" + 0.008*"名称" + 0.008*"版本" + 0.007*"科技"

-------------------------------------------------------------------------
5: 0.020*"app" + 0.020*"bug" + 0.016*"用户" + 0.016*"模块" + 0.012*"提交" + 0.011*"后台" + 0.010*"评审" + 0.010*"测试报告" + 0.009*"跟踪" + 0.008*"参与" + 0.007*"文档" + 0.007*"信息" + 0.007*"功能测试" + 0.007*"用例" + 0.007*"订单" + 0.007*"web" + 0.007*"缺陷" + 0.007*"商品" + 0.006*"熟悉" + 0.006*"提供" + 0.006*"接口" + 0.006*"软件" + 0.006*"回归" + 0.006*"测试计划" + 0.006*"测试环境" + 0.006*"性能" + 0.005*"工具" + 0.005*"包括" + 0.005*"管理系统" + 0.005*"科技" + 0.005*"搭建" + 0.005*"学院" + 0.005*"手机" + 0.004*"流程" + 0.004*"商城" + 0.004*"环境" + 0.004*"前台" + 0.004*"登录" + 0.004*"服务" + 0.004*"结果" + 0.003*"操作" + 0.003*"网站" + 0.003*"兼容性" + 0.003*"总结" + 0.003*"查看" + 0.003*"职责" + 0.003*"设置" + 0.003*"注册" + 0.003*"linux" + 0.003*"了解"
-------------------------------------------------------------------------

产品策划 :
Wall time: 4min 31s
0: 0.013*"研发" + 0.012*"计划" + 0.012*"问题" + 0.010*"组织" + 0.008*"生产" + 0.007*"实施" + 0.007*"销售" + 0.007*"项目管理" + 0.007*"跟进" + 0.007*"过程" + 0.007*"质量" + 0.007*"培

3: 0.017*"宣传" + 0.017*"撰写" + 0.010*"公关" + 0.010*"文案" + 0.010*"企业" + 0.008*"相关" + 0.007*"集团" + 0.007*"设计" + 0.007*"组织" + 0.007*"沟通" + 0.007*"客户" + 0.006*"行业" + 0.006*"主要" + 0.006*"中国" + 0.006*"新闻" + 0.006*"计划" + 0.006*"制作" + 0.006*"广告" + 0.005*"文化" + 0.005*"现场" + 0.005*"关系" + 0.005*"协助" + 0.005*"发布会" + 0.005*"创意" + 0.005*"稿件" + 0.005*"统筹" + 0.005*"发布" + 0.005*"深圳" + 0.005*"对接" + 0.005*"大型" + 0.004*"上海" + 0.004*"物料" + 0.004*"年度" + 0.004*"报告" + 0.004*"定位" + 0.004*"国际" + 0.004*"部门" + 0.004*"效果" + 0.004*"规划" + 0.003*"论坛" + 0.003*"展会" + 0.003*"媒介" + 0.003*"主题" + 0.003*"跟进" + 0.003*"前期" + 0.003*"对外" + 0.003*"危机" + 0.003*"协调" + 0.003*"会议" + 0.003*"职责"
-------------------------------------------------------------------------
4: 0.052*"游戏" + 0.012*"赛事" + 0.011*"直播" + 0.010*"手游" + 0.009*"玩家" + 0.009*"主播" + 0.007*"腾讯" + 0.006*"主要" + 0.006*"社区" + 0.006*"宣传" + 0.005*"相关" + 0.005*"校园" + 0.005*"网易" + 0.005*"跟进" + 0.005*"计划" + 0.005*"期间" + 0.005*"版本" + 0.005*"体育" + 0.004*"发行" + 0.004*"素材" + 0.004*"视频" 

1: 0.019*"ui" + 0.017*"科技" + 0.016*"参与" + 0.014*"学校名称" + 0.012*"动作" + 0.012*"端游" + 0.012*"学院" + 0.011*"三国" + 0.010*"界面" + 0.010*"上线" + 0.010*"深圳" + 0.010*"大学" + 0.009*"卡牌" + 0.009*"专业" + 0.009*"网络" + 0.009*"网易" + 0.009*"风格" + 0.009*"广州" + 0.008*"学历" + 0.008*"英雄" + 0.008*"mmorpg" + 0.008*"特效" + 0.007*"技能" + 0.007*"rpg" + 0.007*"题材" + 0.007*"角色" + 0.006*"arpg" + 0.006*"西游" + 0.006*"网络科技" + 0.006*"类型" + 0.005*"回合制" + 0.005*"传奇" + 0.005*"u3d" + 0.005*"深圳市" + 0.005*"业绩" + 0.005*"怪物" + 0.005*"职业" + 0.005*"世界" + 0.004*"手机游戏" + 0.004*"棋牌" + 0.004*"武侠" + 0.004*"moba" + 0.004*"天下" + 0.004*"横版" + 0.004*"部分" + 0.004*"大型" + 0.004*"仙侠" + 0.004*"期间" + 0.004*"战斗" + 0.004*"研发"
-------------------------------------------------------------------------
2: 0.044*"系统" + 0.022*"策划" + 0.020*"玩法" + 0.016*"数值" + 0.016*"战斗" + 0.012*"进行" + 0.011*"内容" + 0.011*"关卡" + 0.010*"活动" + 0.010*"相关" + 0.009*"版本" + 0.009*"技能" + 0.009*"功能" + 0.009*"完成" + 0.008*"需求" + 0.008*"运营" + 0.008*"玩家" + 0.007*"美术" + 0.007*"核心" + 0.007*"

### 可视化表示各岗位主题：

- 左侧圆圈中心距离表示主题之间的差距，越近则该两个主题越接近
- 左侧圆圈的大小表示该主题占该岗位所有简历的多少，越大则该主题的简历越多

<font color=red>**右侧关键词条状图的选择标准公式：**</font> $r(w,k\mid \lambda)=\lambda \log(\phi_{kw})+(1-\lambda)\log\left(\frac{\phi_{kw}}{p_{w}}\right)$
> 看一个主题的关键词，不仅要看该主题下这个词出现的频率（因素1），还要看相比于其他主题，这个词对该主题的独有性（因素2）。$\lambda$ 用来调节因素1和因素2的权重（取值范围为0到1），等于1时只看因素1，等于0时只看因素2。$\lambda = 0$ 更能看出各主题之间出现的特定词，所以建议将右侧$\lambda$ 先调节到0看主题特性，然后调节$\lambda$到1做进一步参考。

<font color=red>*NOTE：*</font>
每个岗位的6个主题可能有部分多余和重叠，去掉多余无用的主题， 合并相似重叠的主题。

#### <font color=blue>岗位1：PHP</font>

In [17]:
# Interactive topic view for PostType 1 (php): entry 0 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[0],
    corpus=corpus_list[0],
    dictionary=dictionary_list[0],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位2：JAVA</font>

In [18]:
# Interactive topic view for PostType 2 (java): entry 1 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[1],
    corpus=corpus_list[1],
    dictionary=dictionary_list[1],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位3：C++</font>

In [19]:
# Interactive topic view for PostType 3 (c++): entry 2 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[2],
    corpus=corpus_list[2],
    dictionary=dictionary_list[2],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位4：Web前端</font>

In [21]:
# Interactive topic view for PostType 4 (web front end): entry 3 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[3],
    corpus=corpus_list[3],
    dictionary=dictionary_list[3],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位5：IOS</font>

In [23]:
# Interactive topic view for PostType 5 (ios): entry 4 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[4],
    corpus=corpus_list[4],
    dictionary=dictionary_list[4],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位6：Android</font>

In [24]:
# Interactive topic view for PostType 6 (android): entry 5 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[5],
    corpus=corpus_list[5],
    dictionary=dictionary_list[5],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位7：系统测试</font>

In [26]:
# Interactive topic view for PostType 7 (system testing): entry 6 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[6],
    corpus=corpus_list[6],
    dictionary=dictionary_list[6],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位8：产品策划</font>

In [27]:
# Interactive topic view for PostType 8 (product planning): entry 7 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[7],
    corpus=corpus_list[7],
    dictionary=dictionary_list[7],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位9：产品运营</font>

In [29]:
# Interactive topic view for PostType 9 (product operations): entry 8 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[8],
    corpus=corpus_list[8],
    dictionary=dictionary_list[8],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位10：基础研究</font>

In [30]:
# Interactive topic view for PostType 10 (basic research): entry 9 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[9],
    corpus=corpus_list[9],
    dictionary=dictionary_list[9],
)
pyLDAvis.display(vis)

#### <font color=blue>岗位11：游戏客户端</font>

In [31]:
# Interactive topic view for PostType 11 (game client): entry 10 of the per-position lists.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=topic_model_list[10],
    corpus=corpus_list[10],
    dictionary=dictionary_list[10],
)
pyLDAvis.display(vis)