#### 实体标准化表
<img src='images/标准化表1.jpg'></img>
<img src='images/标准化表3.jpg'></img>

## 1.定义各处理类型的函数

In [1]:
# Load the pre-trained Word2Vec model that the similarity functions below rely on.
from gensim.models import Word2Vec
import numpy as np
# NOTE: hard-coded local path; point it at wherever the model files are stored.
wvmodel = Word2Vec.load('F:/Jupyter/--NLP/big_things/models/wikibaikeWV250/wikibaikewv250')
wvdim = 250  # vector length used when building zero/averaged vectors; matches the (250,) shape checked below
import jieba
from sklearn.metrics.pairwise import cosine_similarity
vocab = wvmodel.wv.vocab  # used for `word in vocab` membership tests; NOTE(review): `.vocab` presumably requires gensim < 4 — confirm
wvmodel['哈哈'].shape  # sanity check: shape of the vector returned for one word

  


(250,)

In [74]:
"""Process type : 1
wordList_similarity
1. 把实体列表中的每一个实体表示成一个向量
2. 两两计算词向量相似度
3. 设定阈值，高于阈值的就拿出来计算平均分
4. 可将匹配矩阵导出用于可视化
"""
import copy
def e2v(entity):
    """Return a vector of length ``wvdim`` for an entity of any length.

    Lookup order:
      1. If the whole entity is in ``vocab``, return its word vector.
      2. Otherwise segment it with ``jieba.lcut`` and return the mean of the
         vectors of the segments that are in ``vocab``.
      3. If no segment is in ``vocab``, return a zero vector.

    Any exception raised while doing this (for example a non-string entity
    that jieba cannot segment) is reported with ``print`` and a zero vector is
    returned, so callers can always call ``.reshape`` on the result instead of
    receiving ``None``.

    Args:
        entity: the entity text to embed.

    Returns:
        A 1-D numpy array with ``wvdim`` elements.
    """
    try:
        if entity in vocab:
            # `.wv[...]` is the non-deprecated form of `wvmodel[...]`.
            return wvmodel.wv[entity]
        known_words = [word for word in jieba.lcut(entity) if word in vocab]
        if not known_words:
            return np.zeros((wvdim,))
        wv = np.zeros((wvdim,))
        for word in known_words:
            wv += wvmodel.wv[word]
        return wv / len(known_words)
    except Exception as e:
        # Best-effort: report the problem but keep the pipeline running.
        print('==error==:',e)
        print('==error entity==:',entity)
        return np.zeros((wvdim,))

def wordList_similarity(entity_list1,entity_list2,threshold = 0.2):
    """Score the similarity of two entity lists.

    Every entity is embedded with ``e2v``; the pairwise cosine similarities
    form a matrix with one row per entity of ``entity_list1`` and one column
    per entity of ``entity_list2``. Entries below ``threshold`` are zeroed,
    and the score is the mean of the remaining entries that are greater
    than 0.

    The similarity matrix, the thresholded matrix, its flattened form and
    both the overall and the highlighted averages are printed.

    Args:
        entity_list1: first list of entity strings.
        entity_list2: second list of entity strings.
        threshold: similarities below this value are ignored.

    Returns:
        The mean of the kept similarities, or 0 when none is kept.
    """
    def as_rows(entities):
        # One (1, wvdim) row per entity.
        return [e2v(item).reshape(1,wvdim) for item in entities]

    rows_a = as_rows(entity_list1)
    rows_b = as_rows(entity_list2)
    # Each stacked matrix has shape (n, wvdim), n being the list length.
    similarity_matrix = cosine_similarity(np.concatenate(rows_a),np.concatenate(rows_b))

    # Work on a copy so the raw matrix can still be printed and averaged.
    highlight_matrix = similarity_matrix.copy()
    highlight_matrix[highlight_matrix < threshold] = 0
    print('similarity_matrix:\n',similarity_matrix)
    print('highlight_matrix:\n',highlight_matrix)

    flat_items = highlight_matrix.ravel()
    print('highlight_items:',flat_items)
    kept = [value for value in flat_items if value > 0]
    highlight_score = sum(kept)/len(kept) if kept else 0
    print('Overall similarity:',np.average(similarity_matrix))
    print('Highlight similarity:',highlight_score)
    return highlight_score

# Demo: compare two four-word lists using the default threshold.
wordList_similarity(['北京','是','中国','首都'],['巴黎','乃','法国','都城'])

similarity_matrix:
 [[0.4310978  0.17273347 0.2793395  0.24811262]
 [0.08069427 0.24177119 0.15040708 0.07349762]
 [0.17280224 0.14655209 0.31007016 0.20673734]
 [0.4315029  0.11927645 0.19731879 0.5148311 ]]
highlight_matrix:
 [[0.4310978  0.         0.2793395  0.24811262]
 [0.         0.24177119 0.         0.        ]
 [0.         0.         0.31007016 0.20673734]
 [0.4315029  0.         0.         0.5148311 ]]
highlight_items: [0.4310978  0.         0.2793395  0.24811262 0.         0.24177119
 0.         0.         0.         0.         0.31007016 0.20673734
 0.4315029  0.         0.         0.5148311 ]
Overall similarity: 0.23604654
Highlight similarity: 0.33293282985687256


  if sys.path[0] == '':


0.33293282985687256

In [98]:
"""Process type : 2
number_compare
JD和CV的一些标签按照数值大小进行比较：
1. 若CV的数字大于等于JD的要求，就匹配度为1；
2. 若CV的数字小于JD的要求，则计算gap，按照某种函数映射到0~1内，然后用1减去该值，得到匹配度；
3. 若JD的要求为空，则匹配度为1；
4. 若JD又不为空，而CV为空，则匹配度为0；
5. 此处可以设置硬卡控，则不满足要求直接匹配度为0；
"""
import math
import re
def sigmoid(x):
    """Logistic function ``1 / (1 + e**-x)``.

    For very negative ``x`` the term ``e**-x`` exceeds the float range and
    Python raises OverflowError. The limit of the function there is 0, so
    0.0 is returned instead of letting the error propagate.

    Args:
        x: a real number.

    Returns:
        The logistic value of ``x``.
    """
    try:
        return 1/(1+pow(math.e,-x))
    except OverflowError:
        return 0.0

# Chinese numeral -> value lookup; '两' is the colloquial form of 2.
chinese2num = {'一':1,'二':2,'两':2,'三':3,'四':4,'五':5,'六':6,'七':7,'八':8,'九':9,'十':10,
              '十一':11,'十二':12,'十三':13,'十四':14,'十五':15}
chineseNums = ''.join(list(chinese2num.keys()))

_CN_UNIT_CHARS = '一二两三四五六七八九'
# Either "<digit>?十<digit>?" (10..99) or a single digit (1..9). The tens form
# is listed first so that e.g. "十二" is taken as one number, not as "十".
_CN_NUM_PATTERN = re.compile(
    '[' + _CN_UNIT_CHARS + ']?十[' + _CN_UNIT_CHARS + ']?|[' + _CN_UNIT_CHARS + ']')
# Integer or decimal written with Arabic digits ("3", "3.5").
_AB_NUM_PATTERN = re.compile(r'[0-9]+(?:[.][0-9]*)?')


def _cn_to_number(token):
    """Convert a token matched by ``_CN_NUM_PATTERN`` to its integer value."""
    if token in chinese2num:
        return chinese2num[token]
    # Remaining tokens contain '十' with a tens and/or ones digit, e.g. "二十五".
    tens_part, _, ones_part = token.partition('十')
    tens = chinese2num[tens_part] if tens_part else 1
    ones = chinese2num[ones_part] if ones_part else 0
    return tens*10 + ones


def get_num(sentence):
    """Extract the first number mentioned in ``sentence``.

    A Chinese numeral (1..99) takes priority over an Arabic number, wherever
    either appears in the text. Arabic numbers may be integers or decimals.

    NOTE(review): characters such as '一' also occur inside ordinary words,
    which can produce false matches — confirm inputs are short numeric phrases.

    Args:
        sentence: any object; it is converted with ``str`` first.

    Returns:
        The number as a float, or -1.0 when no number is found.
    """
    s = str(sentence)
    cn_match = _CN_NUM_PATTERN.search(s)
    if cn_match:
        return float(_cn_to_number(cn_match.group()))
    ab_match = _AB_NUM_PATTERN.search(s)
    if ab_match:
        return float(ab_match.group())
    return -1.0

def number_compare_score(JD_entity_list,CV_entity_list,isHardRule = False):
    """Score a numeric requirement (JD) against a numeric value (CV).

    The first number found in each entity list is compared:
      * JD has no number            -> 1 (nothing is required)
      * CV has no number            -> 0
      * CV number >= JD number      -> 1
      * CV number <  JD number      -> 0 if ``isHardRule``, otherwise
        ``1 - sigmoid(gap)`` where ``gap`` is the shortfall.

    Both lists and the extracted numbers are printed.

    Args:
        JD_entity_list: entities of the job description; items may be of any
            type and are converted with ``str``.
        CV_entity_list: entities of the CV, same convention.
        isHardRule: when True, any shortfall scores 0.

    Returns:
        A score between 0 and 1.
    """
    print(JD_entity_list,CV_entity_list)
    # Join with a separator so neighbouring entities cannot fuse into a
    # different number (e.g. [3, 5] must not be read as "35").
    s1 = '|'.join(str(x) for x in JD_entity_list)
    s2 = '|'.join(str(x) for x in CV_entity_list)
    num1 = get_num(s1)
    num2 = get_num(s2)
    print('jd num:',num1,'cv num:',num2)
    if num1 == -1:
        return 1
    if num2 == -1:
        return 0
    if num1 <= num2:
        return 1
    if isHardRule:
        return 0
    return 1-sigmoid(abs(num1-num2))

# Demo: the JD asks for four years, the CV offers 3.5 years, soft rule.
jd = ['四年工作经验']
cv = ['工作3.5年']
number_compare_score(jd,cv,isHardRule=False)

['四年工作经验'] ['工作3.5年']
jd num: 4.0 cv num: 3.5


0.3775406687981454

In [136]:
"""Process type : 3
level_compare

"""
# Education levels mapped to an ordinal rank.
xueli_level = {'小学':1,'初中':2,'高中':3,'大专':4,'本科':5,'硕士':6,'博士':7}
def level_compare(JD_entity_list,CV_entity_list,tag,isHardRule=False):
    """Score an ordered-level requirement (JD) against the CV's level.

    Only the '学历' (education level) tag is compared, using the first entity
    of each list looked up in ``xueli_level``:
      * JD list empty           -> 1 (nothing is required)
      * CV list empty           -> 0
      * CV level >= JD level    -> 1
      * CV level <  JD level    -> 0 if ``isHardRule``, otherwise
        ``1 - sigmoid(gap)``.
    Every other tag (including '学校') has no comparison implemented and
    scores 1.

    Args:
        JD_entity_list: entities of the job description.
        CV_entity_list: entities of the CV.
        tag: name of the tag being compared.
        isHardRule: when True, any shortfall scores 0.

    Returns:
        A score between 0 and 1.

    Raises:
        KeyError: if a '学历' entity is not a key of ``xueli_level``.
    """
    if tag != '学历':
        # No level table exists for other tags yet, so they always match.
        return 1
    num1 = xueli_level[JD_entity_list[0]] if JD_entity_list else -1
    num2 = xueli_level[CV_entity_list[0]] if CV_entity_list else -1
    print('jd xueli:',num1,'cv xueli:',num2)
    if num1 == -1:
        return 1
    if num2 == -1:
        return 0
    if num1 <= num2:
        return 1
    if isHardRule:
        return 0
    return 1-sigmoid(abs(num1-num2))

"""Process type : 4
bool_compare

"""
def bool_compare(JD_entity_list,CV_entity_list):
    """Return 1 if the two entity lists share at least one entity, else 0.

    Args:
        JD_entity_list: entities of the job description.
        CV_entity_list: entities of the CV.

    Returns:
        1 when the lists overlap, 0 otherwise.
    """
    shared = set(JD_entity_list) & set(CV_entity_list)
    return 1 if shared else 0

In [137]:
def matchScore(JD_entity_list,CV_entity_list,process_type,tag=None):
    """Score one JD entity list against one CV entity list.

    An empty JD list scores 1 and an empty CV list scores 0 without looking
    at ``process_type``. Otherwise the comparison is dispatched on
    ``process_type``:
      1 -> ``wordList_similarity`` (text similarity)
      2 -> ``number_compare_score`` (numeric comparison)
      3 -> ``level_compare`` (ordered levels)
      4 -> ``bool_compare`` (any shared entity)

    Args:
        JD_entity_list: entities of the job description (must be a list).
        CV_entity_list: entities of the CV (must be a list).
        process_type: 1, 2, 3 or 4, as an int or a numeric string; see the
            standardization table.
        tag: tag name, forwarded to ``level_compare``.

    Returns:
        The score produced by the selected method.

    Raises:
        AssertionError: if either entity argument is not a list.
        ValueError: if ``process_type`` is not one of 1, 2, 3, 4. An error
            string used to be returned here, which only failed later inside
            the callers' arithmetic.
    """
    # Raised explicitly so the checks also run under `python -O`.
    if not isinstance(JD_entity_list,list):
        raise AssertionError('当前输入的JD_entity_list不是list类型！')
    if not isinstance(CV_entity_list,list):
        raise AssertionError('当前输入的CV_entity_list不是list类型！')

    if len(JD_entity_list) == 0:
        print('JD here is empty! Return socre 1.')
        return 1
    if len(CV_entity_list) == 0:
        print('CV here is empty! Return score 0.')
        return 0

    method = int(process_type)
    if method == 1: # text similarity
        print("Using wordList_similarity method.")
        return wordList_similarity(JD_entity_list,CV_entity_list,threshold=0.2)
    if method == 2: # numeric comparison
        print("Using number_compare_score method.")
        return number_compare_score(JD_entity_list,CV_entity_list,isHardRule=False)
    if method == 3: # ordered-level comparison
        print("Using level_compare method.")
        return level_compare(JD_entity_list,CV_entity_list,tag)
    if method == 4: # boolean comparison
        print("Using bool_compare method.")
        return bool_compare(JD_entity_list,CV_entity_list)

    print("Error: Wrong process_type! Must be 1,2,3,4! Your silly input type is:",process_type)
    raise ValueError("Error: Wrong process_type! Must be 1,2,3,4!")
    

In [89]:
# Type-check demo: the first argument is a str instead of a list, so matchScore raises AssertionError.
matchScore('asdf',['规划','物流','技术'],1)

AssertionError: 当前输入的JD_entity_list不是list类型！

## 针对不同的维度，设计对应的计算逻辑
### 使用上面定义好的`matchScore`函数
```
matchScore(JD_entity_list, CV_entity_list, process_type)
```

In [116]:
import pandas as pd
import json
# Load the CV data and print its top-level keys.
with open('data/cv.json',encoding='utf-8') as f:
    cv_words = json.loads(f.read())
    for each in cv_words:
        print(each)
print('---------')
# Load the JD data and print its top-level keys.
with open('data/jd.json',encoding='utf-8') as f:
    jd_words = json.loads(f.read())
    for each in jd_words:
        print(each)
        
# Inspect the CV's "other experience" entry.
cv_words['经验']['其他经验'] 

id
教育
经验
技能
基本信息
其他标签
---------
id
name
教育
经验
技能
基本信息
其他标签


{'业务': ['1', []],
 '产品': ['1', []],
 '公司名': ['3', []],
 '年限': ['2', []],
 '所在城市': ['4', []],
 '职位': ['1', []],
 '行业': ['1', []]}

In [146]:
"""
需要修改！！！！！！
允许两个字典不一样，以JD的字典为准！
"""


## Compare two tag -> [process_type, entity_list] dictionaries in one call.
## The JD dictionary is the reference: each of its tags is looked up in the CV
## dictionary, and the per-tag scores are combined into a weighted average.
def dictMatch(JD_dict,CV_dict,tag_weights_dict):
    """Return the weighted average match score over the tags of ``JD_dict``.

    Each tag of ``JD_dict`` is scored with ``matchScore``. The tag name is
    forwarded so that process types which depend on the tag can use it. A tag
    missing from ``CV_dict`` is treated as an empty CV entity list. The
    average is normalized by the weights of the JD tags only, i.e. the tags
    that were actually scored. Each weighted tag score is printed.

    Args:
        JD_dict: ``{tag: [process_type, entity_list]}`` for the JD.
        CV_dict: same structure for the CV; it may lack some JD tags.
        tag_weights_dict: ``{tag: weight}``; needs an entry for every JD tag.

    Returns:
        1 when ``JD_dict`` has no tags, otherwise the weighted average score.

    Raises:
        AssertionError: if a tag has different process types in the two dicts.
        KeyError: if a JD tag has no entry in ``tag_weights_dict``.
    """
    tags = list(JD_dict.keys())
    if len(tags) == 0: # a JD without tags requires nothing, so it is fully met
        return 1

    final_score = 0 # running sum of weighted tag scores
    weights_sum = 0
    for tag in tags:
        process_type = JD_dict[tag][0]
        JD_entity_list = JD_dict[tag][1]
        cv_entry = CV_dict.get(tag)
        if cv_entry is None:
            CV_entity_list = []
        else:
            # Raised explicitly so the check also runs under `python -O`.
            if process_type != cv_entry[0]:
                raise AssertionError('老哥，两个标签的处理类型不一样呀，请检查！')
            CV_entity_list = cv_entry[1]
        weight = tag_weights_dict[tag]
        score = matchScore(JD_entity_list,CV_entity_list,process_type,tag=tag)
        score = score*weight
        print('current tag socre:','tag',tag,score)
        final_score += score
        weights_sum += weight
    return final_score/weights_sum # dividing by the weight total makes this an average

In [153]:
"""
计算“经验”维度的得分
"""
def softmax(x):
    """Softmax of a 1-D sequence, returned as an array of shape ``(1, len(x))``.

    The row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the result from becoming ``nan``) for large inputs.

    Args:
        x: a 1-D sequence of numbers.

    Returns:
        A ``(1, len(x))`` array whose entries sum to 1; an empty ``(1, 0)``
        array for empty input.
    """
    x = np.array(x).reshape(1,len(x))
    if x.size == 0:
        # Nothing to normalize; a max over an empty row would raise.
        return np.exp(x)
    exps = np.exp(x - np.max(x,axis=1,keepdims=True))
    return exps/np.sum(exps,axis=1)
def desc_weights(length):
    """Return ``length`` decreasing weights for multi-segment experience.

    The weights are the softmax of ``length, length-1, ..., 1``, so the first
    position receives the largest weight.

    Args:
        length: number of weights to produce.

    Returns:
        A list with ``length`` weights.
    """
    descending_ranks = list(range(length,0,-1))
    weight_row = softmax(descending_ranks)[0]
    return list(weight_row)

## Initial tag weights for the experience dimension, set up front (all equal).
exp_tag_weights = {'公司名':1, '所在城市':1, '业务':1, '产品':1, '职位':1, '年限':1, '行业':1}
def ExpScore(jd_exp,cv_exp,tag_weights):
    """Score the "experience" dimension of a CV against a JD.

    ``jd_exp`` is a flat ``{tag: [process_type, entity_list]}`` dict.
    ``cv_exp`` is nested: '工作经验' (work) and '项目经验' (project) each hold a
    list of such dicts, while '其他经验' (other) holds a single dict.

    For the work and project lists, each entry is scored with ``dictMatch``
    and the scores are combined with the decreasing weights from
    ``desc_weights`` (the first entry weighs most — presumably the most
    recent one; verify against the data producer). An empty or missing list
    scores 0. '其他经验' is scored directly with ``dictMatch`` and skipped when
    missing. The best of these scores is returned.

    Args:
        jd_exp: the JD's experience requirements.
        cv_exp: the CV's experience sections.
        tag_weights: ``{tag: weight}`` passed on to ``dictMatch``.

    Returns:
        The highest score among the experience types.
    """
    allType_scores = []
    for exp_type in ['工作经验','项目经验']:
        print("------------Now processing %s----------"%exp_type)
        exps = cv_exp.get(exp_type) or []
        if exps:
            time_weights = desc_weights(len(exps))
            allType_scores.append(
                sum(dictMatch(jd_exp,exp,tag_weights)*weight
                    for exp,weight in zip(exps,time_weights)))
        else:
            allType_scores.append(0)

    other_exp = cv_exp.get('其他经验')
    if other_exp is not None:
        # A single dict rather than a list, so it is scored directly.
        allType_scores.append(dictMatch(jd_exp,other_exp,tag_weights))

    return max(allType_scores)

# Score the experience dimension for the loaded JD/CV pair.
ExpScore(jd_words['经验'],cv_words['经验'],tag_weights=exp_tag_weights)

------------Now processing 工作经验----------
JD here is empty! Return socre 1.
current tag socre: tag 公司名 1
JD here is empty! Return socre 1.
current tag socre: tag 所在城市 1
Using wordList_similarity method.
similarity_matrix:
 [[0.56091617 0.57250099 0.60695407 0.2048798  0.35311877 0.66420528
  0.3411799  0.3701852 ]
 [0.46225103 0.40988464 0.37209713 0.29920428 0.40883243 0.36392237
  0.30105929 0.27676373]
 [0.51905503 0.81889691 0.55337567 0.14397976 0.225566   0.5206425
  0.38567083 0.23555425]]
highlight_matrix:
 [[0.56091617 0.57250099 0.60695407 0.2048798  0.35311877 0.66420528
  0.3411799  0.3701852 ]
 [0.46225103 0.40988464 0.37209713 0.29920428 0.40883243 0.36392237
  0.30105929 0.27676373]
 [0.51905503 0.81889691 0.55337567 0.         0.225566   0.5206425
  0.38567083 0.23555425]]
highlight_items: [0.56091617 0.57250099 0.60695407 0.2048798  0.35311877 0.66420528
 0.3411799  0.3701852  0.46225103 0.40988464 0.37209713 0.29920428
 0.40883243 0.36392237 0.30105929 0.27676373 0.51

  if sys.path[0] == '':


0.7382878833273336

In [134]:
# Inspect the JD's skill entry.
jd_words['技能']

['1', ['客户管理', '产品推广\n管理', '监督指定', '维修工程师', '英语说写能力', '微软办公软件']]

In [130]:
"""
计算“技能”维度的得分
这个最简单，就是一个list
"""
def SkillScore(jd_skill,cv_skill):
    """Score the "skill" dimension.

    Both arguments are ``[process_type, entity_list]`` pairs. The JD's
    process type, converted to int, selects the comparison method.

    Args:
        jd_skill: the JD's skill entry.
        cv_skill: the CV's skill entry.

    Returns:
        The score returned by ``matchScore``.
    """
    method = int(jd_skill[0])
    return matchScore(jd_skill[1],cv_skill[1],method)
# Score the skill dimension for the loaded JD/CV pair.
SkillScore(jd_words['技能'],cv_words['技能'])

Using wordList_similarity method.
similarity_matrix:
 [[0.50068142 0.52693534 0.39348812 0.56892239 0.43691496 0.56892239]
 [0.55542405 0.54296017 0.35086498 0.59353431 0.50519486 0.59353431]
 [0.40523667 0.36019251 0.35383354 0.37214733 0.31832191 0.37214733]
 [0.39784989 0.4236861  0.34062104 0.46300251 0.33657805 0.46300251]
 [0.33826722 0.24043145 0.38978132 0.48842965 0.56507221 0.48842965]
 [0.47926591 0.48732074 0.35796791 0.47098948 0.38160882 0.47098948]]
highlight_matrix:
 [[0.50068142 0.52693534 0.39348812 0.56892239 0.43691496 0.56892239]
 [0.55542405 0.54296017 0.35086498 0.59353431 0.50519486 0.59353431]
 [0.40523667 0.36019251 0.35383354 0.37214733 0.31832191 0.37214733]
 [0.39784989 0.4236861  0.34062104 0.46300251 0.33657805 0.46300251]
 [0.33826722 0.24043145 0.38978132 0.48842965 0.56507221 0.48842965]
 [0.47926591 0.48732074 0.35796791 0.47098948 0.38160882 0.47098948]]
highlight_items: [0.50068142 0.52693534 0.39348812 0.56892239 0.43691496 0.56892239
 0.55542405 0

  if sys.path[0] == '':


0.4417375140080262

In [132]:
# Inspect the CV's education entries.
cv_words['教育']

[{'专业': ['1', ['医药营销']], '学历': ['3', ['大专']], '学校': ['3', ['西安医学院']]}]

In [133]:
# Inspect the JD's education requirements.
jd_words['教育']

{'专业': ['1', []], '学历': ['3', ['大专']], '学校': ['3', []]}

In [150]:
# Initial tag weights for the education dimension (all equal).
edu_tag_weights = {'专业':1,'学历':1,'学校':1}
def EduScore(jd_edu,cv_edu,tag_weights):
    """Score the "education" dimension.

    Only the first entry of ``cv_edu`` is considered (presumably the most
    recent education; verify against the data producer). When the CV lists
    no education at all, it is compared as if every JD tag had an empty
    entity list, instead of failing on ``cv_edu[0]``.

    Args:
        jd_edu: ``{tag: [process_type, entity_list]}`` for the JD.
        cv_edu: list of dicts with the same structure, one per education.
        tag_weights: ``{tag: weight}`` passed on to ``dictMatch``.

    Returns:
        The score returned by ``dictMatch``.
    """
    if cv_edu:
        considered = cv_edu[0]
    else:
        # Mirror the JD tags and process types, with empty entity lists.
        considered = {tag: [entry[0], []] for tag, entry in jd_edu.items()}
    return dictMatch(jd_edu,considered,tag_weights)

# Score the education dimension for the loaded JD/CV pair.
EduScore(jd_words['教育'],cv_words['教育'],edu_tag_weights)

JD here is empty! Return socre 1.
current tag socre: tag 学校 1
Using level_compare method.
current tag socre: tag 学历 1
JD here is empty! Return socre 1.
current tag socre: tag 专业 1


1.0

In [162]:
def OverallScore(jd,cv,exp_tag_weights,edu_tag_weights):
    """Compute and print the overall JD/CV match score.

    The experience, skill and education scores are computed with
    ``ExpScore``, ``SkillScore`` and ``EduScore``, printed together with the
    JD name/id and the CV id, and added up without weighting.

    Args:
        jd: JD data with the keys 'name', 'id', '经验', '技能', '教育'.
        cv: CV data with the keys 'id', '经验', '技能', '教育'.
        exp_tag_weights: tag weights for the experience dimension.
        edu_tag_weights: tag weights for the education dimension.

    Returns:
        The sum of the three dimension scores.
    """
    exp_score = ExpScore(jd['经验'],cv['经验'],exp_tag_weights)
    skill_score = SkillScore(jd['技能'],cv['技能'])
    edu_score = EduScore(jd['教育'],cv['教育'],edu_tag_weights)

    print('======================Result:========================')
    print('JD name: ',jd['name'],' JD id: ',jd['id'])
    print(' CV id: ',cv['id'])
    report_lines = (
        ('经验维度得分：',exp_score),
        ('技能维度得分：',skill_score),
        ('教育维度得分：',edu_score),
    )
    for label,value in report_lines:
        print(label,value)
    total = exp_score+skill_score+edu_score
    print('总分：',total)
    return total

# Equal tag weights for the experience and education dimensions, then score the loaded JD/CV pair.
exp_tag_weights = {'公司名':1, '所在城市':1, '业务':1, '产品':1, '职位':1, '年限':1, '行业':1}
edu_tag_weights = {'专业':1,'学历':1,'学校':1}
OverallScore(jd_words,cv_words,exp_tag_weights,edu_tag_weights)

------------Now processing 工作经验----------
JD here is empty! Return socre 1.
current tag socre: tag 公司名 1
JD here is empty! Return socre 1.
current tag socre: tag 所在城市 1
Using wordList_similarity method.
similarity_matrix:
 [[0.56091617 0.57250099 0.60695407 0.2048798  0.35311877 0.66420528
  0.3411799  0.3701852 ]
 [0.46225103 0.40988464 0.37209713 0.29920428 0.40883243 0.36392237
  0.30105929 0.27676373]
 [0.51905503 0.81889691 0.55337567 0.14397976 0.225566   0.5206425
  0.38567083 0.23555425]]
highlight_matrix:
 [[0.56091617 0.57250099 0.60695407 0.2048798  0.35311877 0.66420528
  0.3411799  0.3701852 ]
 [0.46225103 0.40988464 0.37209713 0.29920428 0.40883243 0.36392237
  0.30105929 0.27676373]
 [0.51905503 0.81889691 0.55337567 0.         0.225566   0.5206425
  0.38567083 0.23555425]]
highlight_items: [0.56091617 0.57250099 0.60695407 0.2048798  0.35311877 0.66420528
 0.3411799  0.3701852  0.46225103 0.40988464 0.37209713 0.29920428
 0.40883243 0.36392237 0.30105929 0.27676373 0.51

  if sys.path[0] == '':


2.18002539733536

In [154]:
# List the top-level keys of the JD data.
jd_words.keys()

dict_keys(['id', 'name', '教育', '经验', '技能', '基本信息', '其他标签'])

In [155]:
# List the top-level keys of the CV data.
cv_words.keys()

dict_keys(['id', '教育', '经验', '技能', '基本信息', '其他标签'])