In [1]:
"""
加载LTP工具:
"""
import os
# Directory holding the LTP model files. The original hard-coded location is
# kept as the default; set the LTP_DATA_DIR environment variable to point the
# notebook at a different copy without editing this cell.
LTP_DATA_DIR = os.environ.get('LTP_DATA_DIR', 'F:/MyDownloads/ltp_data_v3.4.0/ltp_data_v3.4.0')
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # path of the word-segmentation model
from pyltp import Segmentor # 分词器
from pyltp import Postagger # 词性标注器
from pyltp import Parser # 句法分析器

# Remaining model files inside LTP_DATA_DIR: `pos.model` for part-of-speech
# tagging and `parser.model` for dependency parsing.
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

# Word segmenter, loaded through load_with_lexicon with 'data/userword.txt'.
# (Alternative without the extra lexicon file: segmentor.load(cws_model_path))
segmentor = Segmentor()
segmentor.load_with_lexicon(cws_model_path, 'data/userword.txt')

# Part-of-speech tagger.
postagger = Postagger()
postagger.load(pos_model_path)

# Dependency parser.
parser = Parser()
parser.load(par_model_path)

def get_words_and_tags(sentence):
    """Segment ``sentence`` into words and tag each word's part of speech.

    Relies on the module-level ``segmentor`` and ``postagger`` instances.

    Args:
        sentence: Text to analyse.

    Returns:
        A two-element list ``[words, postags]``; both elements are plain
        Python lists built from the segmenter and tagger results.
    """
    tokens = [*segmentor.segment(sentence)]
    tags = [*postagger.postag(tokens)]
    return [tokens, tags]

# Lookup table from dependency-relation label to its Chinese description.
relation_dic = {
    'SBV': '主谓关系',
    'VOB': '动宾关系',
    'IOB': '间宾关系',
    'FOB': '前置宾语',
    'DBL': '兼语',
    'ATT': '定中关系',
    'ADV': '状中结构',
    'CMP': '动补结构',
    'COO': '并列关系',
    'POB': '介宾关系',
    'LAD': '左附加关系',
    'RAD': '右附加关系',
    'IS': '独立结构',
    'HED': '核心关系',
    'WP': '标点',
}

In [28]:
# Load the stop-word lists (one entry per line in each file).
def _read_stopword_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` with every '\\n' removed."""
    with open(path,'r',encoding='utf-8') as f:
        return [x.replace('\n','') for x in f]


# The two source lists stay available under their original names; `stopwords`
# is their concatenation (duplicates, if any, are kept).
stopwords1 = _read_stopword_lines(r'C:\Users\x1c\Desktop\猎上顺丰简历项目\stopwords.txt')
stopwords2 = _read_stopword_lines(r'C:\Users\x1c\Desktop\猎上顺丰简历项目\stopwords_additional.txt')
stopwords = stopwords1+stopwords2

In [31]:
"""
抽取句子中指定关系的动词：
"""
import re
# Custom clause splitting.
# Delimiters: full-width and ASCII sentence/clause punctuation, newline and tab.
# Written as a raw-string character class so that no escape sequence is left
# for the Python string parser to reject (the previous "\.", "\?" and "\!" in a
# normal string literal are invalid escapes and trigger a warning).
_CLAUSE_DELIMITERS = re.compile(r"[。，？！：；.,?!:;\n\t]")


def beyondSplit(x):
    """Split text ``x`` at every clause delimiter.

    Args:
        x: Text to split.

    Returns:
        List of the pieces between delimiters, in order. Adjacent or trailing
        delimiters produce empty strings, exactly as ``re.split`` does.
    """
    return _CLAUSE_DELIMITERS.split(x)
# Dependency-relation labels whose verbs are kept by default.
selected_relations = [
    'SBV', 'VOB', 'IOB', 'FOB',
    'ATT', 'COO', 'HED',
]


def getImportantVerbs(sentence,selected_relations=selected_relations,max_len=200):
    """Extract the verbs of ``sentence`` that sit in one of the selected dependency relations.

    A word is kept when its dependency relation is in ``selected_relations``,
    its POS tag is ``'v'``, it is longer than one character and it is not in
    the module-level ``stopwords``.

    Args:
        sentence: Text to analyse.
        selected_relations: Relation labels to accept.
        max_len: Texts longer than this many characters are first cut into
            clauses with ``beyondSplit``. Per the original author's note, LTP
            can crash on very long input without reporting why.

    Returns:
        List of matching verbs in order of appearance (duplicates kept).
    """
    if len(sentence) > max_len:
        fragments = beyondSplit(sentence)
    else:
        fragments = [sentence]

    important_verbs = []
    for fragment in fragments:
        # Splitting on punctuation leaves empty pieces (e.g. for "。\n").
        # They cannot contain a verb, so keep them away from the LTP models.
        if not fragment.strip():
            continue
        words, postags = get_words_and_tags(fragment)
        arcs = parser.parse(words, postags)
        for word, tag, arc in zip(words, postags, arcs):
            if (arc.relation in selected_relations
                    and tag == 'v'
                    and len(word) > 1
                    and word not in stopwords):
                important_verbs.append(word)
    return important_verbs

In [32]:
# Sanity check on two sample phrases: print the segmentation / POS output
# first, then the verbs extracted from the same phrase.
for sample in ('开发机器学习系统', '进行系统的设计、研发工作'):
    print(get_words_and_tags(sample))
    print(getImportantVerbs(sample))

[['开发', '机器学习', '系统'], ['v', 'n', 'n']]
['开发']
[['进行', '系统', '的', '设计', '、', '研发', '工作'], ['v', 'a', 'u', 'v', 'wp', 'v', 'v']]
['设计', '研发']


In [33]:
"""
对CV的动词词频统计：
"""
import os
import pandas as pd

# os.listdir returns entries in arbitrary order; sort so that the category
# order (and every chart / output that follows it) is the same on each run.
fs = sorted(os.listdir('data/CV_data'))
# Category name = file name without its extension. splitext only strips the
# final extension, so a name containing dots is not truncated at the first one.
categories = [os.path.splitext(x)[0] for x in fs]
# With the data set used for the recorded outputs:
# ['Algorithms', 'Embedded', 'HR', 'JAVA', 'Reserve_Cads', 'Test']
# Per-category list that later collects every extracted verb.
CV_v_dict = {cate: [] for cate in categories}

In [34]:
# Report the shape of each category's spreadsheet as a quick sanity check.
for fname, cate in zip(fs, categories):
    df = pd.read_excel('data/CV_data/'+fname)
    print(cate, df.shape)

Algorithms (300, 12)
Embedded (1000, 12)
HR (1000, 12)
JAVA (1000, 12)
Reserve_Cads (1000, 12)
Test (1000, 11)


In [90]:
from collections import Counter
# index = 5
# f = fs[index]
# cate = categories[index]
# Free-text columns of each CV spreadsheet that are mined for verbs.
CV_TEXT_COLUMNS = ['项目-描述', '项目-职责', '工作内容']

# The per-category result files are written below; make sure the folder exists.
os.makedirs('results', exist_ok=True)

for f,cate in zip(fs,categories):
    print("正在处理文件：",f)
    df = pd.read_excel('data/CV_data/'+f)
    # Collect into a fresh list and assign it afterwards, so re-running this
    # cell replaces the category's verbs instead of appending to the previous
    # run's results (which silently inflated every count).
    verbs = []
    for row in zip(*(df[col] for col in CV_TEXT_COLUMNS)):
        for text in row:
            # Non-string cells (e.g. NaN for empty cells) are skipped.
            if isinstance(text,str):
                verbs.extend(getImportantVerbs(text))
    CV_v_dict[cate] = verbs
    print(f,'共包含动词数：',len(verbs))
    counted_dic = Counter(verbs)
    # (verb, count) pairs, most frequent first.
    sorted_dic = counted_dic.most_common()
    print("正在将排序后的数据写入文件......")
    with open('results/ImportantVerbs_'+cate+'.txt','w',encoding='utf-8') as txt:
        # One "verb,count" line per distinct verb.
        txt.writelines('{},{}\n'.format(verb, count) for verb, count in sorted_dic)
    print("写入完毕！")

正在处理文件： Algorithms.xlsx
Algorithms.xlsx 共包含动词数： 10672
正在将排序后的数据写入文件......
写入完毕！
正在处理文件： Embedded.xlsx
Embedded.xlsx 共包含动词数： 30086
正在将排序后的数据写入文件......
写入完毕！
正在处理文件： HR.xlsx
HR.xlsx 共包含动词数： 29380
正在将排序后的数据写入文件......
写入完毕！
正在处理文件： JAVA.xls
JAVA.xls 共包含动词数： 57584
正在将排序后的数据写入文件......
写入完毕！
正在处理文件： Reserve_Cads.xlsx
Reserve_Cads.xlsx 共包含动词数： 9652
正在将排序后的数据写入文件......
写入完毕！
正在处理文件： Test.xlsx
Test.xlsx 共包含动词数： 47006
正在将排序后的数据写入文件......
写入完毕！


In [103]:
"""
CV词频分布：
"""
from pyecharts import Line
from pyecharts import WordCloud
# Rank highlighted on the cumulative-coverage chart ("top N" most frequent verbs).
TOP_N = 200

# The word-cloud pages are rendered into this folder; make sure it exists.
os.makedirs('results/wordcloud', exist_ok=True)

# For every category: cumulative share of all verb occurrences covered by the
# top-1, top-2, ... most frequent verbs, plus a word-cloud page.
Percents = []
for cate in categories:
    total_num = len(CV_v_dict[cate])
    # (verb, count) pairs, most frequent first.
    sorted_dic = Counter(CV_v_dict[cate]).most_common()
    unique_num = len(sorted_dic)
    print(total_num,unique_num)
    percents = []
    accumulated_num = 0
    for _, count in sorted_dic:
        accumulated_num += count
        percents.append(accumulated_num/total_num)
    Percents.append(percents)
    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("",[verb for verb, _ in sorted_dic], [count for _, count in sorted_dic], word_size_range=[20, 100])
    wordcloud.render('results/wordcloud/'+cate+'.html')

# Pad every series with 1 (= 100 % coverage) up to the longest one so all
# categories share one x axis. The length is derived from the data instead of
# the previously hard-coded 2371, which only matched one particular data set.
max_len = max((len(p) for p in Percents), default=0)
pad_Percents = [p + [1] * (max_len - len(p)) for p in Percents]
for each in pad_Percents:
    print(len(each))

# Line chart of the cumulative verb-frequency distribution.
attr = [x+1 for x in range(max_len)]  # x axis: rank 1..max_len
line = Line("CV动词词频分布(不含停用词、非重要关系词)",width=1500,height=600)
for cate,v in zip(categories,pad_Percents):
    # v[i] is the coverage of the top (i + 1) verbs, so "top N" lives at
    # index N - 1 (clamped for series shorter than TOP_N). ECharts documents
    # a numeric coord on a category axis as an index into the axis data, so
    # the same index is used for x. The previous [200, v[201]] pointed at
    # rank 201 with the value of rank 202.
    mark_idx = min(TOP_N, len(v)) - 1
    line.add(cate, attr, v,
             mark_point=[{"coord": [mark_idx, v[mark_idx]], "name": "top {}".format(mark_idx + 1)}],
             mark_point_textcolor='#333333',
             is_fill=True,
             area_opacity=0.3,
             is_smooth=True,
             legend_selectedmode='single')
line.render('results/CV动词词频分布.html')

10672 1145
30086 1833
29380 1761
57584 2286
9652 1366
47006 2371
2371
2371
2371
2371
2371
2371


In [42]:
"""
JD动词分布统计：
"""
from collections import Counter
import pandas as pd
# Job descriptions: keep only the postings titled '储备干部'.
df = pd.read_csv('data/position.csv',header=0)
reserve_df = df[df['position_title']=='储备干部']

# Gather verbs row by row, description first and requirements second;
# non-string cells are skipped.
reserve_verbs = []
for row in zip(reserve_df['description'],reserve_df['job_requirements']):
    for text in row:
        if isinstance(text,str):
            reserve_verbs.extend(getImportantVerbs(text))

# (verb, count) pairs ordered by descending count.
counted_dic = Counter(reserve_verbs)
sorted_dic = counted_dic.most_common()

print("正在写入....")
with open('ImportantVerbs_Reserve_JD.txt','w',encoding='utf-8') as txt:
    # One "verb,count" line per distinct verb.
    txt.writelines('{},{}\n'.format(verb, count) for verb, count in sorted_dic)
print("写入完毕！")

正在写入....
写入完毕！


In [104]:
# Release the three LTP model instances, in the order they were loaded.
for ltp_model in (segmentor, postagger, parser):
    ltp_model.release()