In [1]:
from gensim.models import Word2Vec
import numpy as np
# Load the pre-trained Word2Vec model from disk (path is relative to the notebook's directory).
wvmodel = Word2Vec.load('../big_things/models/wikibaikeWV250/wikibaikewv250')
# Take the embedding width from the model itself instead of hard-coding 250, so the
# zero vectors allocated later can never disagree with the vectors the model returns.
wvdim = wvmodel.vector_size



In [11]:
# Sanity-check the embeddings: the 20 nearest neighbours of one query word.
# Query through `.wv` (the KeyedVectors); calling `most_similar` directly on the
# Word2Vec model is a deprecated wrapper that emits a DeprecationWarning.
wvmodel.wv.most_similar('激素', topn=20)

  """Entry point for launching an IPython kernel.


[('雄激素', 0.8141771554946899),
 ('催乳素', 0.809292197227478),
 ('荷尔蒙', 0.7942413091659546),
 ('促性腺激', 0.7935157418251038),
 ('雌激素', 0.7930401563644409),
 ('神经递质', 0.7916024923324585),
 ('孕激素', 0.790703296661377),
 ('性激素', 0.7873826026916504),
 ('皮质激素', 0.7795277833938599),
 ('孕酮', 0.7776371240615845),
 ('皮质醇', 0.77471923828125),
 ('甲状腺', 0.7733747363090515),
 ('甲状腺素', 0.7732579112052917),
 ('睾酮', 0.7722375392913818),
 ('细胞生长', 0.7704721093177795),
 ('降钙素', 0.7698075771331787),
 ('乙酰胆碱', 0.7692161798477173),
 ('黄体素', 0.7682069540023804),
 ('GnRH', 0.7665923833847046),
 ('儿茶酚胺', 0.7660119533538818)]

In [102]:
import jieba
from sklearn.metrics.pairwise import cosine_similarity
# Vocabulary of the loaded model's keyed vectors; Wordlist_Wv tests word membership against it.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 removed it in favour of
# `wv.key_to_index` — confirm the installed gensim version before upgrading.
vocab = wvmodel.wv.vocab
"""
JD和CV的关键词都以字典的方式存储，按照标签分类，假设二者的标签是相同的。
"""
# Sample keyword dictionaries: one job description (JD1) and four CVs (CV1..CV4).
# Each maps a tag to a list of keywords; all five share the same four tags.
# Line-continuation backslashes are not needed inside braces and have been dropped
# (a stray character after one of them made the original cell a SyntaxError).
JD1_dic = {
    '行业': ['金融'],
    '学历': ['硕士'],
    '技能': ['Java', 'Hadoop', 'Android'],
    '业务': ['系统开发', '系统设计', '软件开发', '数据处理'],
}

CV1_dic = {
    '行业': ['经济', '互联网'],
    '学历': ['本科'],
    '技能': ['Java', 'Hadoop', 'C++'],
    '业务': ['编写软件', '大数据处理', '用户界面设计', '软件开发', '软件测试'],
}
CV2_dic = {
    '行业': ['经济', '互联网'],
    '学历': ['本科'],
    '技能': ['Hadoop', 'Android', 'C++'],
    '业务': ['编写软件', '大数据处理', '用户界面设计', '软件开发', '软件测试'],
}
CV3_dic = {
    '行业': ['能源', '石油'],
    '学历': ['硕士'],
    '技能': ['C', 'C++', 'Java'],
    '业务': ['项目管理', '数据分析', '需求发现', '编写文档'],
}
CV4_dic = {
    '行业': ['翻译', '语言', '外交'],
    '学历': ['硕士'],
    '技能': ['德语', '翻译'],
    '业务': ['文献翻译', '同声传译', '商业谈判'],
}
# Take a sequence of numbers and return their softmax-normalised values.
def softmax(x): 
    """Compute the softmax of a 1-D sequence of scores.

    Args:
        x: a sized sequence of numbers (anything supporting ``len``).

    Returns:
        numpy array of shape ``(1, len(x))`` whose entries are positive and sum to 1.
        An empty input yields an empty ``(1, 0)`` array.
    """
    x = np.array(x).reshape(1,len(x))
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty row.
        return np.exp(x)
    # Softmax is invariant to adding a constant to every score, so subtract the row
    # maximum first: the largest exponent becomes 0 and np.exp cannot overflow to
    # inf (which would turn the result into NaN).
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

# Given a list length, return a list of decreasing weights.
def desc_weights(length): 
    """Build `length` weights that decrease from the first position to the last.

    The ranks ``length, length-1, ..., 1`` are passed through ``softmax``, so the
    weights are positive and earlier positions receive larger values.

    Args:
        length: number of weights to produce.

    Returns:
        A plain list holding the single row returned by ``softmax``.
    """
    ranks = list(range(length, 0, -1))
    normalised = softmax(ranks)
    return list(normalised[0])

def Wordlist_Wv(wordlist,wvdim=wvdim,weights_descend=False):
    """Combine the word vectors of `wordlist` into one vector of shape (1, wvdim).

    Words found in ``vocab`` contribute their vector times their weight. A word that
    is not in ``vocab`` is segmented with ``jieba.lcut``; its weight is split evenly
    across the resulting tokens, and tokens that are still unknown are reported with
    a printed warning and skipped.

    The weighted sum is divided by the number of vectors that were actually added
    (not by the total weight). The callers in this notebook only feed the result to
    cosine similarity, which depends on direction and not on scale.

    Args:
        wordlist: sequence of keyword strings.
        wvdim: dimensionality of the word vectors.
        weights_descend: if True, weight words with ``desc_weights`` so earlier
            words count more; otherwise every word has weight 1.

    Returns:
        numpy array of shape ``(1, wvdim)``. If no word or token could be looked up,
        a zero vector is returned instead of dividing by zero.
    """
    if weights_descend:
        weights = desc_weights(len(wordlist))
    else:
        weights = [1 for _ in wordlist]
    found = 0
    wv = np.zeros((wvdim,))
    for word,weight in zip(wordlist,weights):
        if word in vocab:
            # Index through `.wv`; indexing the Word2Vec model itself is deprecated.
            wv += wvmodel.wv[word]*weight
            found += 1
            continue
        split_words = jieba.lcut(word)
        if not split_words:
            # e.g. an empty string: nothing to look up, and len() == 0 below would
            # otherwise cause a division by zero.
            print('* Warning：Word [',word,'] not in vocab!')
            continue
        split_weight = weight/len(split_words)
        for each in split_words:
            try:
                wv += wvmodel.wv[each]*split_weight
                found += 1
            except KeyError:
                # Only a missing key means "unknown token"; any other error
                # (e.g. a dimension mismatch) should surface instead of being
                # mislabelled as a vocabulary miss.
                print('* Warning：Word [',each,'] not in vocab!')
    if found == 0:
        # Nothing contributed: return the zero vector rather than 0/0 = NaN.
        return wv.reshape(1,wvdim)
    return (wv/found).reshape(1,wvdim)

In [103]:
"""
方法一(Baseline)：Simple AvgWV Similarity
对所有词语，进行词向量平均，然后计算cos相似度。
注：词典中没有的词，经过jieba分词后，再录入。
"""

def AvgWvSim(dic1,dic2):
    """Baseline similarity: pool every keyword of each dictionary and compare.

    All keyword lists of a dictionary are concatenated regardless of tag, turned
    into one vector with ``Wordlist_Wv`` (uniform word weights), and the two
    resulting vectors are compared with cosine similarity.

    Args:
        dic1: mapping of tag -> list of keywords.
        dic2: mapping of tag -> list of keywords.

    Returns:
        The cosine similarity of the two pooled vectors as a scalar.
    """
    pooled1 = [word for keywords in dic1.values() for word in keywords]
    pooled2 = [word for keywords in dic2.values() for word in keywords]
    return cosine_similarity(Wordlist_Wv(pooled1), Wordlist_Wv(pooled2))[0][0]

# Score JD1 against each sample CV with the baseline pooled-vector similarity.
for label, cv_dic in (
    ("JD1与CV1的匹配得分：", CV1_dic),
    ("JD1与CV2的匹配得分：", CV2_dic),
    ("JD1与CV3的匹配得分：", CV3_dic),
    ("JD1与CV4的匹配得分：", CV4_dic),
):
    print(label, AvgWvSim(JD1_dic, cv_dic))

JD1与CV1的匹配得分： 0.934870676153894
JD1与CV2的匹配得分： 0.9439874757803037
JD1与CV3的匹配得分： 0.8967426977192904
JD1与CV4的匹配得分： 0.668335534274794




In [107]:
"""
方法二：Focused-AvgW2V
对应的标签进行相似度计算，然后再按照对不同的标签的权重进行加权平均，得到总分。
（相当于添加了sentence-level attention）
"""
# Default importance of each tag in the overall score.
tag_weights = {'行业':0.1,'学历':0.25,'技能':0.3,'业务':0.35}
def FocusedAvgWvSim(dic1,dic2,tag_weights=tag_weights,weights1_desc=False,weights2_desc=False):
    """Tag-by-tag similarity, combined as a weighted sum over tags.

    For every tag in `dic1`, the keyword lists of both dictionaries are turned into
    vectors with ``Wordlist_Wv``, compared with cosine similarity, and the score is
    multiplied by that tag's weight before being added to the total.

    Args:
        dic1: mapping of tag -> list of keywords; its tags drive the iteration.
        dic2: mapping of tag -> list of keywords; must contain every tag of `dic1`.
        tag_weights: mapping of tag -> weight; must contain every tag of `dic1`.
            The default is the module-level dict as it was when this function was
            defined; rebinding the global later does not change the default.
        weights1_desc: use position-decreasing word weights for `dic1`.
        weights2_desc: use position-decreasing word weights for `dic2`.

    Returns:
        numpy array of shape (1, 1) holding the total score (callers read it with
        ``[0][0]``). An empty `dic1` yields ``[[0.]]``.

    Raises:
        KeyError: if a tag of `dic1` is missing from `dic2` or `tag_weights`.
    """
    # Start from a (1, 1) array instead of the int 0 so the return type is the same
    # even when dic1 has no tags; otherwise callers' [0][0] would fail on an int.
    total_score = np.zeros((1, 1))
    for cate in dic1:
        wv1 = Wordlist_Wv(dic1[cate],weights_descend=weights1_desc)
        wv2 = Wordlist_Wv(dic2[cate],weights_descend=weights2_desc)
        score = cosine_similarity(wv1,wv2)
        total_score = total_score + score*tag_weights[cate]
    return total_score

# Score JD1 against each sample CV with the per-tag weighted similarity
# (default tag weights, uniform word weights); [0][0] extracts the scalar.
for label, cv_dic in (
    ("JD1与CV1的匹配得分：", CV1_dic),
    ("JD1与CV2的匹配得分：", CV2_dic),
    ("JD1与CV3的匹配得分：", CV3_dic),
    ("JD1与CV4的匹配得分：", CV4_dic),
):
    print(label, FocusedAvgWvSim(JD1_dic, cv_dic)[0][0])

JD1与CV1的匹配得分： 0.8068824189903465
JD1与CV2的匹配得分： 0.8226672324602106
JD1与CV3的匹配得分： 0.8457817366796778
JD1与CV4的匹配得分： 0.5937559241738408




In [110]:
"""
方法三：
在二的基础上，添加对不同词的重要性。默认JD中，同一条中，越靠前的词重要性越强；CV则各词权重相同。
（相当于添加了word-level attention）
"""

# Same per-tag scoring, but JD keywords get position-decreasing weights
# (weights1_desc=True) while CV keywords stay uniformly weighted.
for label, cv_dic in (
    ("JD1与CV1的匹配得分：", CV1_dic),
    ("JD1与CV2的匹配得分：", CV2_dic),
    ("JD1与CV3的匹配得分：", CV3_dic),
    ("JD1与CV4的匹配得分：", CV4_dic),
):
    print(label, FocusedAvgWvSim(JD1_dic, cv_dic, weights1_desc=True)[0][0])

JD1与CV1的匹配得分： 0.8031052619376688
JD1与CV2的匹配得分： 0.7864576685133724
JD1与CV3的匹配得分： 0.8489762216010368
JD1与CV4的匹配得分： 0.5748876829998409




In [112]:
# Re-run the position-weighted scoring with a different tag weighting
# (the weights of '行业' and '学历' are swapped relative to the earlier dict).
# The dict is passed explicitly to each call.
tag_weights = {'行业':0.25,'学历':0.1,'技能':0.3,'业务':0.35}
for label, cv_dic in (
    ("JD1与CV1的匹配得分：", CV1_dic),
    ("JD1与CV2的匹配得分：", CV2_dic),
    ("JD1与CV3的匹配得分：", CV3_dic),
    ("JD1与CV4的匹配得分：", CV4_dic),
):
    print(label, FocusedAvgWvSim(JD1_dic, cv_dic, tag_weights=tag_weights, weights1_desc=True)[0][0])

JD1与CV1的匹配得分： 0.8125211012398271
JD1与CV2的匹配得分： 0.7958735078155307
JD1与CV3的匹配得分： 0.7820377095558115
JD1与CV4的匹配得分： 0.48981383108376303


