In [1]:
#======================加载word2vec模型==========================
from gensim.models import Word2Vec
wv = Word2Vec.load("F:/Jupyter/--NLP/big_things/models/wikibaikeWV250/wikibaikewv250")
vocab = wv.wv.vocab



In [1]:
##=====================加载LTP模型 ========================
import os
LTP_DATA_DIR = 'F:/MyDownloads/ltp_data_v3.4.0/ltp_data_v3.4.0'  # ltp模型目录的路径
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # 分词模型路径
from pyltp import Segmentor # 分词器
from pyltp import Postagger # 词性标注器
from pyltp import Parser # 句法分析器

segmentor = Segmentor()  # 初始化实例
# segmentor.load(cws_model_path)  
segmentor.load_with_lexicon(cws_model_path, 'userword.txt')# 加载模型
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # 词性标注模型路径，模型名称为`pos.model`
postagger = Postagger() # 初始化实例
postagger.load(pos_model_path)  # 加载模型
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径，模型名称为`parser.model`
parser = Parser() # 初始化实例
parser.load(par_model_path)  # 加载模型

relation_dic = {'SBV':'主谓关系','VOB':'动宾关系','IOB':'间宾关系','FOB':'前置宾语',\
                'DBL':'兼语','ATT':'定中关系','ADV':'状中结构','CMP':'动补结构',\
                'COO':'并列关系','POB':'介宾关系','LAD':'左附加关系','RAD':'右附加关系','IS':'独立结构','HED':'核心关系','WP':'标点'}


In [3]:
# ==================加载百科词频文件===================
import json
wiki_word_freq_json = open('F:/Jupyter/--NLP/big_things/wiki_word_freq.json','r',encoding='utf-8').read()
wiki_word_freq_dic = json.loads(wiki_word_freq_json)
print(wiki_word_freq_dic['能力'])
print(wiki_word_freq_dic['java'])

27191
133


In [4]:
#==========================Utility funcitons===========================
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_words_and_tags(sentence):
    words = list(segmentor.segment(sentence))
    postags = list(postagger.postag(words))
    return [words,postags]

"""
计算一组词的综合词向量，有若干方法：
1.简单地直接计算平均，没有的词直接忽略
2.同上，但是没有的词，拆成单字来计算词向量
3.在上一步的基础上，加上百科词频来赋予权重，然后加权平均
4.再加上位置权重
"""
# 根据百科词频计算词权重：
import math
def wiki_weight(word):
    try:
        freq = wiki_word_freq_dic[word]
    except KeyError:
        freq = 1
    return 1/(math.log10(freq+1))

shape_vec = wv["加油"]
unk_wv = {}

# 计算位置权重：
def loc_weight(w_list):
    list_len=len(w_list)
    i =1-(list_len-1)*0.1
    loc_weight_list=[]
    for wd in w_list:
        loc_weight_list.append(i)
        i =i+0.05
    return (loc_weight_list)
    
class Phrase_vec:
    # 1.简单地直接计算平均，没有的词直接忽略
    def simpleAvgVec(self,words):
        vec = np.zeros_like(shape_vec)
        n = 0
        for word in words:
            if word in vocab.keys():
                vec += wv[word]
                n += 1
        if n>0:
            return vec/n
        else:
            return vec
    # 2.同上，但是没有的词，拆成单字来计算词向量
    def avgVec(self,words):
        vec = np.zeros_like(shape_vec)
        n = 0
        for word in words:
            if word in vocab.keys():
                vec += wv[word]
                n += 1
            else:
#                 print(word," NOT IN WV-DICT ! USE CHAR-VEC")
                chars = [c for c in word]
                unk_vec = Phrase_vec.simpleAvgVec(self,chars)*len(word)
                vec += unk_vec
                n += 1
                unk_wv[word] = unk_vec
        if n>0:
            return vec/n
        else:
            return vec
    
    # 3.在上一步的基础上，加上百科词频来赋予权重，然后加权平均
    def weightedAvgVec1(self,words):
        vec = np.zeros_like(shape_vec)
        n = 0
        for word in words:
            weight = wiki_weight(word)
            if word in vocab.keys():
                vec += wv[word]*weight
                n += 1
            else:
#                 print(word," NOT IN WV-DICT ! USE CHAR-VEC")
                chars = [c for c in word]
                unk_vec = Phrase_vec.simpleAvgVec(self,chars)*len(word)
                vec += unk_vec*weight
                n += 1
                unk_wv[word] = unk_vec
        if n>0:
            return vec/n
        else:
            return vec
    
    # 4.再加上位置权重
    def weightedAvgVec2(self,words):
        loc_weights = loc_weight(words)
        vec = np.zeros_like(shape_vec)
        n = 0
        for word,l_w in zip(words,loc_weights):
            weight = wiki_weight(word)
            if word in vocab.keys():
                vec += wv[word]*weight*l_w
                n += 1
            else:
#                 print(word," NOT IN WV-DICT ! USE CHAR-VEC")
                chars = [c for c in word]
                unk_vec = Phrase_vec.simpleAvgVec(self,chars)*len(word)
                vec += unk_vec*weight*l_w
                n += 1
                unk_wv[word] = unk_vec
        if n>0:
            return vec/n
        else:
            return vec
    
    """
    一些短语向量的衡量指标：
    ATM: 每个词和整个短语平均词向量的余弦距离
    Conicity：锥度，所有ATM的平均值，越大，说明各个词越相近/越聚集
    VS（vector spread）：向量散度，衡量各个词的分散性
    """
    def atm(self,word,words):
        words_avg_vec = Phrase_vec.avgVec(self,words).reshape(1,250)
        current_vec = Phrase_vec.avgVec(self,[word]).reshape(1,250)
        atm_score = cosine_similarity(current_vec,words_avg_vec)[0][0]
        return atm_score
    def conicity_vs(self,words):
        atms = [Phrase_vec.atm(self,word,words) for word in words]
        sum = 0
        for atm in atms:
            sum += atm
        conicity_score = sum/len(words)
        minus_square = 0
        for atm in atms:
            minus_square += pow((atm-conicity_score),2)
        vs_score = minus_square/len(words)
        return [10*conicity_score,1000*vs_score]
        

p_vec = Phrase_vec()

def getSimiScore(words,target_words):
    words_vec = p_vec.weightedAvgVec1(words).reshape(1,250)
    target_vec = p_vec.weightedAvgVec1(target_words).reshape(1,250)
    return cosine_similarity(words_vec,target_vec)[0][0]



In [5]:
"""
conicity,vs两个指标的试验：
"""
conicity,vs = p_vec.conicity_vs(['开心','高兴','兴奋','心情'])
conicity1,vs1 = p_vec.conicity_vs(['信息','管理','系统','开发'])
conicity2,vs2 = p_vec.conicity_vs(['昨天','电脑','物理','书籍'])
print(conicity,' ',conicity1,' ',conicity2)
print(vs,' ',vs1,' ',vs2)

7.799393385648727   7.039833068847656   6.8465059995651245
0.05617520905221163   0.5096023786546056   7.38286402438959




In [6]:

#=======================Sum up, get final results==================
"""
input: 
    sentence/短语
output:
    word_pairs---list
    relations---list
    simiscores---list
"""
def getParserResult(sentence):
    words,postags = get_words_and_tags(sentence)
    conicity,vs = p_vec.conicity_vs(words)
    arcs = parser.parse(words, postags)  # 句法分析

    results = ''
    
    word_pairs = []
    relations = []
    simiscores = []

    for index,arc in enumerate(arcs):
        word = words[index]
        relation_word = words[arc.head-1] if arc.head>0 else 'ROOT'
        relation = arc.relation
        combination = words[index]+'-'+relation_word if index<=arc.head-1 else relation_word+'-'+words[index]
        tag = postags[index]
        relation_tag = postags[arc.head-1] if arc.head>0 else 'ROOT'
        
        # 要输出的关系：
        if relation not in ['HED','LAD','RAD','POB','IS','WP','CMP'] :
            # 获取word_pair，并计算相似度。然后添加到相关的list中
            word_pair = set([word,relation_word])
            simiscore = getSimiScore(words,list(word_pair))
            word_pairs.append(word_pair)
            relations.append(relation)
            simiscores.append(simiscore)
            
            if relation == 'COO':
                # 必须新建一个list，再用新list来循环，否则会进入死循环！
                ori_word_pairs = word_pairs[:] 
                for the_pair in ori_word_pairs:
                    cross = word_pair&the_pair
                    if len(cross) == 1 and 'ROOT' not in the_pair: # 发现重合,但不能是其COO-pair本身
                        whole = word_pair|the_pair
                        new_pair = whole - cross
                        simiscore = getSimiScore(words,list(new_pair))
                        word_pairs.append(new_pair)
                        relations.append("New!")
                        simiscores.append(simiscore)
                        
    return [word_pairs,relations,simiscores]

In [11]:
import re
import jieba
ann_filenames = []
phrases = []
kind = 'TEST'
base_path = 'F:/Jupyter/--NLP/big_things/resume_txt_ann_6kinds'
files = os.listdir(base_path+"/"+kind)
ann_filenames = [f for f in files if os.path.splitext(f)[-1]=='.ann']
for index,ann_filename in enumerate(ann_filenames):
    with open(base_path+"/"+kind+"/"+ann_filename,encoding='utf-8') as f:
        lines = f.readlines()
        for l in lines:
            phrase = l.split('\t')[-1].strip()
#             phrase = re.sub('[a-zA-Z]','',phrase)
            if len(jieba.lcut(l))>=2:
#             if len(phrase)>4:
                phrases.append(phrase)

In [12]:
for phrase in phrases[100:200]:
    word_pairs,relations,simiscores = getParserResult(phrase)

    Z = zip(simiscores,word_pairs,relations)
    Z = sorted(Z,reverse=True)
    if len(word_pairs)>=1 and len(phrase)>4:
        title = "标注短语：\n【"+phrase+'】'
        print(title)
        for simiscore_,word_pair_,relation_ in Z:
    #         Z = zip(word_pair,relation,simiscore)
    #         Z = sorted(Z,reverse=True)
    #         word_pair_,relation_,simiscore_ = zip(*Z)
            print(word_pair_," || ",relation_," || ",simiscore_)
        print("========================")


标注短语：
【验证线上版本】
{'验证', '线'}  ||  ATT  ||  0.88802004
{'上', '版本'}  ||  ATT  ||  0.7902255
{'上', '线'}  ||  ATT  ||  0.7553108
标注短语：
【测试仪器的维护和点检】
{'仪器', '点检'}  ||  New!  ||  0.85611224
{'维护', '点检'}  ||  COO  ||  0.83868194
{'仪器', '维护'}  ||  ATT  ||  0.8062717
{'仪器', '测试'}  ||  ATT  ||  0.7709675
标注短语：
【产品的标识】
{'标识', '产品'}  ||  ATT  ||  0.97434413
标注短语：
【有效周期的维护和确认】
{'维护', '周期'}  ||  ATT  ||  0.8513347
{'确认', '周期'}  ||  New!  ||  0.8353963
{'有效', '周期'}  ||  ATT  ||  0.833671
{'维护', '确认'}  ||  COO  ||  0.8236501
标注短语：
【IQC日报表】
{'IQC', '日报表'}  ||  ATT  ||  1.0
标注短语：
【烟尘处理器】
{'处理器', '烟尘'}  ||  ATT  ||  1.0000001
标注短语：
【采集样品数据】
{'采集', '样品'}  ||  ATT  ||  0.95344424
{'数据', '样品'}  ||  ATT  ||  0.9237138
标注短语：
【FPC组装】
{'FPC', '组装'}  ||  SBV  ||  1.0
标注短语：
【FR4冲压】
{'冲压', 'FR4'}  ||  ATT  ||  1.0000001
标注短语：
【ISO14000质量】
{'质量', 'ISO14000'}  ||  ATT  ||  0.9999998
标注短语：
【团队建设和发展】
{'团队', '建设'}  ||  FOB  ||  0.9133179
{'团队', '发展'}  ||  New!  ||  0.8962901
{'发展', '建设'}  ||  COO  ||  0.8750663
标注短语：
【客诉原



In [None]:
segmentor.release()
postagger.release()
parser.release()