In [1]:
#======================Load the word2vec model==========================
from gensim.models import Word2Vec
# Load the previously trained Word2Vec model from a hard-coded local path.
wv = Word2Vec.load("F:/Jupyter/--NLP/big_things/models/wikibaikeWV250/wikibaikewv250")
# Vocabulary of the model; later cells use it to test whether a word has a vector.
vocab = wv.wv.vocab



In [2]:
# ================从长短语中获取排列组合的词对===========
import jieba
def wordsComp(phrase):
    """Segment `phrase` with jieba and return all of its word pairs.

    Every pair is a two-element list ``[earlier_word, later_word]``; pairs are
    emitted in order of the first word's position, then the second word's.
    A phrase that segments into fewer than two words yields an empty list.
    """
    tokens = jieba.lcut(phrase)
    total = len(tokens)
    return [
        [tokens[first], tokens[second]]
        for first in range(total)
        for second in range(first + 1, total)
    ]

# Quick demo: show every word pair produced for a sample phrase.
phrase = '自然语言处理相关技术'
wordsComp(phrase)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\x1c\AppData\Local\Temp\jieba.cache
Loading model cost 1.262 seconds.
Prefix dict has been built succesfully.


[['自然语言', '处理'],
 ['自然语言', '相关'],
 ['自然语言', '技术'],
 ['处理', '相关'],
 ['处理', '技术'],
 ['相关', '技术']]

In [3]:
# ==================加载百科词频文件===================
import json
# Read the wiki word-frequency table (JSON object: word -> count).
# The `with` block guarantees the file handle is closed once the text is read;
# the original `open(...).read()` left the handle open.
with open('F:/Jupyter/--NLP/big_things/wiki_word_freq.json','r',encoding='utf-8') as freq_file:
    wiki_word_freq_json = freq_file.read()
wiki_word_freq_dic = json.loads(wiki_word_freq_json)
# Spot-check two entries to confirm the table loaded as expected.
print(wiki_word_freq_dic['能力'])
print(wiki_word_freq_dic['java'])

27191
133


In [4]:
#==========================Utility functions===========================
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

"""
计算一组词的综合词向量，有若干方法：
1.简单地直接计算平均，没有的词直接忽略
2.同上，但是没有的词，拆成单字来计算词向量
3.在上一步的基础上，加上百科词频来赋予权重，然后加权平均
4.再加上位置权重
"""
# 根据百科词频计算词权重：
import math
def wiki_weight(word):
    """Return the frequency-based weight of `word`.

    The count is looked up in ``wiki_word_freq_dic``; a word missing from the
    table is treated as having a count of 1. The weight is
    ``1 / log10(count + 1)``, so rarer words receive larger weights.
    """
    freq = wiki_word_freq_dic.get(word, 1)
    return 1/(math.log10(freq+1))

# Reference word vector; Phrase_vec passes it to np.zeros_like() as the shape/dtype template.
shape_vec = wv["加油"]
# Filled by Phrase_vec: out-of-vocabulary word -> its character-based fallback vector.
unk_wv = {}

# 计算位置权重：
def loc_weight(w_list):
    """Return one position weight per element of `w_list`.

    The first weight is ``1 - (len(w_list) - 1) * 0.1`` and each following
    weight is the previous one plus 0.05, so later positions weigh more.
    """
    current = 1-(len(w_list)-1)*0.1
    weights = []
    for _ in range(len(w_list)):
        weights.append(current)
        # Keep the running addition (not start + k*0.05) so float results stay identical.
        current = current+0.05
    return weights
    
class Phrase_vec:
    """Combine the word vectors of a word list and measure how tightly they cluster.

    Averaging strategies, each building on the previous one:

    1. ``simpleAvgVec``    - plain mean; out-of-vocabulary words are skipped.
    2. ``avgVec``          - plain mean; an out-of-vocabulary word falls back to
                             a vector built from its single characters.
    3. ``weightedAvgVec1`` - as 2, with every word vector scaled by ``wiki_weight``.
    4. ``weightedAvgVec2`` - as 3, additionally scaled by ``loc_weight``.

    Cluster metrics:

    * ATM      - cosine similarity between one word and the phrase average.
    * Conicity - mean of all ATMs; larger means the words are closer together.
    * VS       - vector spread, the variance of the ATMs.

    The class holds no state of its own. It reads the module-level ``wv``,
    ``vocab`` and ``shape_vec`` and writes fallback vectors into ``unk_wv``.
    """

    def simpleAvgVec(self,words):
        """Return the mean vector of the in-vocabulary words of `words`.

        Words without a vector are ignored. If no word has a vector, a zero
        vector shaped like ``shape_vec`` is returned.
        """
        vec = np.zeros_like(shape_vec)
        n = 0
        for word in words:
            if word in vocab:
                vec += wv[word]
                n += 1
        return vec/n if n>0 else vec

    def _charFallbackVec(self,word):
        """Build the vector of an out-of-vocabulary word from its characters and cache it in ``unk_wv``."""
        # Mean of the known character vectors, scaled by the word length.
        unk_vec = self.simpleAvgVec(list(word))*len(word)
        unk_wv[word] = unk_vec
        return unk_vec

    def _scaledAvgVec(self,words,use_wiki_weight=False,loc_weights=None):
        """Shared loop behind avgVec / weightedAvgVec1 / weightedAvgVec2.

        Each word contributes its vector (or its character fallback), optionally
        multiplied by ``wiki_weight(word)`` and then by ``loc_weights[i]``. The
        sum is divided by the number of words, not by the sum of the weights.
        """
        vec = np.zeros_like(shape_vec)
        n = 0
        for idx,word in enumerate(words):
            word_vec = wv[word] if word in vocab else self._charFallbackVec(word)
            if use_wiki_weight:
                word_vec = word_vec*wiki_weight(word)
            if loc_weights is not None:
                word_vec = word_vec*loc_weights[idx]
            vec += word_vec
            n += 1
        return vec/n if n>0 else vec

    def avgVec(self,words):
        """Mean vector of `words`; out-of-vocabulary words use their character fallback."""
        return self._scaledAvgVec(words)

    def weightedAvgVec1(self,words):
        """Like ``avgVec``, with every word vector scaled by its ``wiki_weight``."""
        return self._scaledAvgVec(words,use_wiki_weight=True)

    def weightedAvgVec2(self,words):
        """Like ``weightedAvgVec1``, additionally scaled by the position weights from ``loc_weight``."""
        return self._scaledAvgVec(words,use_wiki_weight=True,loc_weights=loc_weight(words))

    def _cosine(self,vec_a,vec_b):
        """Cosine similarity of two 1-D vectors as a scalar."""
        # reshape(1,-1) instead of a fixed (1,250) so any embedding size works.
        return cosine_similarity(vec_a.reshape(1,-1),vec_b.reshape(1,-1))[0][0]

    def atm(self,word,words):
        """Return the cosine similarity between `word` and the average vector of `words`."""
        words_avg_vec = self.avgVec(words)
        current_vec = self.avgVec([word])
        return self._cosine(current_vec,words_avg_vec)

    def conicity_vs(self,words):
        """Return ``[10 * conicity, 1000 * vs]`` for `words`.

        Conicity is the mean ATM of all words and VS is the variance of those
        ATMs. An empty `words` returns ``[0.0, 0.0]`` instead of dividing by zero.
        """
        if len(words) == 0:
            return [0.0,0.0]
        # The phrase average is the same for every word, so compute it once.
        phrase_avg_vec = self.avgVec(words)
        atms = [self._cosine(self.avgVec([word]),phrase_avg_vec) for word in words]
        total = 0
        for score in atms:
            total += score
        conicity_score = total/len(words)
        squared_dev = 0
        for score in atms:
            squared_dev += pow((score-conicity_score),2)
        vs_score = squared_dev/len(words)
        return [10*conicity_score,1000*vs_score]
        

# Shared Phrase_vec instance used by getSimiScore() and the experiment cells.
p_vec = Phrase_vec()

def getSimiScore(words,target_words):
    """Return the cosine similarity between two word lists.

    Both lists are turned into a single vector with
    ``p_vec.weightedAvgVec1`` (frequency-weighted average) before comparison.
    """
    # reshape(1,-1) infers the embedding size instead of assuming 250 dimensions.
    words_vec = p_vec.weightedAvgVec1(words).reshape(1,-1)
    target_vec = p_vec.weightedAvgVec1(target_words).reshape(1,-1)
    return cosine_similarity(words_vec,target_vec)[0][0]



In [5]:
# Experiment with the two metrics, conicity and VS, on three hand-picked word groups.
trial_groups = [
    ['开心','高兴','兴奋','心情'],
    ['信息','管理','系统','开发'],
    ['昨天','电脑','物理','书籍'],
]
(conicity,vs),(conicity1,vs1),(conicity2,vs2) = [p_vec.conicity_vs(group) for group in trial_groups]
print(conicity,' ',conicity1,' ',conicity2)
print(vs,' ',vs1,' ',vs2)

7.799393385648727   7.039833068847656   6.8465059995651245
0.05617520905221163   0.5096023786546056   7.38286402438959




In [6]:

#=======================Sum up, get final results==================
"""
input: 
    sentence/短语
output:
    word_pairs---list
    simiscores---list
    (relations---list is not returned by the current getParserResult)
"""
def getParserResult(sentence):
    """Score every word pair of `sentence` against the whole sentence.

    Returns ``[word_comps, simiscores]``: the word pairs from ``wordsComp`` and,
    in the same order, each pair's ``getSimiScore`` against the full jieba
    segmentation of the sentence.
    """
    sentence_words = jieba.lcut(sentence)
    pairs = wordsComp(sentence)
    pair_scores = [getSimiScore(pair,sentence_words) for pair in pairs]
    return [pairs,pair_scores]

In [10]:
import re
import os
# Collect the annotated phrases of one resume category from its .ann files.
kind = 'Reserved_Employee'
base_path = 'F:/Jupyter/--NLP/big_things/resume_txt_ann_6kinds'
files = os.listdir(base_path+"/"+kind)
ann_filenames = [name for name in files if os.path.splitext(name)[-1]=='.ann']
phrases = []
for ann_filename in ann_filenames:
    with open(base_path+"/"+kind+"/"+ann_filename,encoding='utf-8') as ann_file:
        for raw_line in ann_file:
            # The phrase is the last tab-separated field of the line.
            phrase = raw_line.split('\t')[-1].strip()
            # NOTE(review): the filter segments the whole raw line, not `phrase`;
            # confirm whether "phrase has at least two words" was intended.
            if len(jieba.lcut(raw_line))>=2:
                phrases.append(phrase)

In [11]:
# Print the word pairs of phrases 100-199, best-scoring pair first.
for phrase in phrases[100:200]:
    word_comps,simiscores = getParserResult(phrase)

    # Sort (score, pair) tuples in descending order of score.
    Z = sorted(zip(simiscores,word_comps),reverse=True)
    # Skip phrases that yield no word pair or are at most 4 characters long.
    if len(word_comps)<1 or len(phrase)<=4:
        continue
    title = "标注短语：\n【"+phrase+'】'
    print(title)
    for simiscore_,word_comps_ in Z:
        print(word_comps_," || ",simiscore_)
    print("========================")


标注短语：
【客户满意度】
['客户', '满意度']  ||  1.0
标注短语：
【钢结构检测】
['钢结构', '检测']  ||  1.0
标注短语：
【钢筋保护层检测】
['钢筋', '保护层']  ||  0.9610065
['钢筋', '检测']  ||  0.94737816
['保护层', '检测']  ||  0.93412316
标注短语：
【结构荷载试验】
['荷载', '试验']  ||  0.96668375
['结构', '荷载']  ||  0.9378102
['结构', '试验']  ||  0.8408343
标注短语：
【工程质量鉴定】
['工程质量', '鉴定']  ||  1.0000001
标注短语：
【建筑物沉降观测】
['沉降', '观测']  ||  0.9560437
['建筑物', '沉降']  ||  0.9416058
['建筑物', '观测']  ||  0.88247716
标注短语：
【房屋抗震鉴定】
['抗震', '鉴定']  ||  0.94098055
['房屋', '抗震']  ||  0.9289453
['房屋', '鉴定']  ||  0.86611235
标注短语：
【数据分析整理】
['数据分析', '整理']  ||  0.99999994
标注短语：
【气化器的设计】
['气化', '器']  ||  0.9394505
['气化', '设计']  ||  0.8978141
['气化', '的']  ||  0.864365
['器', '设计']  ||  0.79427195
['器', '的']  ||  0.74281216
['的', '设计']  ||  0.6039327
标注短语：
【挖掘客户潜在需求】
['潜在', '需求']  ||  0.8872643
['客户', '潜在']  ||  0.8820095
['挖掘', '客户']  ||  0.86234796
['客户', '需求']  ||  0.86049145
['挖掘', '潜在']  ||  0.85461366
['挖掘', '需求']  ||  0.85448444
标注短语：
【维护和跟进现有买家资源】
['维护', '买家']  ||  0.859174
['跟进', '资源'] 



In [9]:
# Sanity check of jieba's segmentation on a sample phrase.
jieba.lcut("黑盒用例设计")

['黑盒', '用例', '设计']