# 近义词挖掘
词 w 的词向量定义为：w 的上下文的词袋表示（以 TF-IDF 加权）

In [1]:
# Load the corpus table and the pickled vocabulary data
import pickle

import numpy as np
import pandas as pd

new_data = pd.read_excel('./var/new_data.xlsx')


def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# word2id maps a word to its integer id; TF and IDF are indexed by that id
word2id = _unpickle('./var/word2id')
word_num = len(word2id)
TF = _unpickle('./var/TF')
IDF = _unpickle('./var/IDF')

# Inverse lookup: integer id -> word
id2word = {word_id: word for word, word_id in word2id.items()}

In [2]:
# Ids of the words whose frequency (TF) exceeds 10
big_words = [word_id for word_id in range(word_num) if TF[word_id] > 10]
big_num = len(big_words)

# big2id: compact index among the frequent words -> original word id
big2id = dict(enumerate(big_words))
# id2big: original word id -> compact index (inverse of big2id)
id2big = {word_id: big_idx for big_idx, word_id in big2id.items()}

In [3]:
# Vocabulary size, i.e. the number of entries in word2id
word_num

19855

In [4]:
# context: one row per frequent word, one column per vocabulary word.
# Each row is the feature vector of a word: the TF-IDF style weight of the
# other words that occur in its context.
# Rationale: synonyms tend to appear in similar contexts.
context = np.zeros((big_num, word_num))

def cnt_context(poem):
    """Accumulate context weights for every frequent word of one poem.

    For each occurrence of a frequent word (a word whose id is a key of
    ``id2big``), every occurrence of a *different* word in the same poem adds
    ``IDF[con_id]`` to ``context[bigid][con_id]``. When both occurrences share
    the same ``line_number`` a further ``IDF[con_id] * 10`` is added, so
    same-line words weigh 11 times as much as words elsewhere in the poem.

    Parameters
    ----------
    poem : pandas.DataFrame
        Rows belonging to a single poem; the ``word`` and ``line_number``
        columns are read.

    Returns
    -------
    None
        The module-level ``context`` matrix is updated in place.

    Raises
    ------
    KeyError
        If a word of the poem is missing from ``word2id`` (assuming it is a
        plain dict).
    """
    # Resolve every row to (word id, line number) once, instead of running a
    # nested DataFrame.iterrows() that rebuilds a Series for each row pair.
    tokens = [(word2id[word], line_num)
              for word, line_num in zip(poem['word'], poem['line_number'])]

    for word_id, line_num in tokens:
        if word_id not in id2big:
            continue  # only frequent words own a row in `context`
        weights = context[id2big[word_id]]
        for con_id, con_line_num in tokens:
            if con_id == word_id:
                continue  # a word is not part of its own context
            weights[con_id] += IDF[con_id]
            if line_num == con_line_num:
                # Extra weight for words sharing the same line
                weights[con_id] += IDF[con_id] * 10
    
# Accumulate the context weights poem by poem. The groups are iterated
# explicitly instead of using groupby(...).apply because cnt_context works
# purely through side effects: older pandas versions evaluate the applied
# function twice on the first group, which would double-count that poem.
for _, poem in new_data.groupby('Poem_id'):
    cnt_context(poem)

# L2-normalise every row in place. Rows that stayed all-zero are skipped,
# since dividing by a zero norm would fill them with NaN.
for line in context:
    norm = np.linalg.norm(line)
    if norm > 0:
        line /= norm

context

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00358148],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00100208,
        0.00105585],
       [0.00758006, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [5]:
# Dimensionality reduction: project the context vectors onto their first
# `dec_to` principal components.
from sklearn.decomposition import PCA
dec_to = 128
# A fixed random_state keeps the projection reproducible across runs: for
# large inputs PCA may select its randomized SVD solver, whose result
# otherwise changes from one execution to the next.
dcontext = PCA(n_components = dec_to, random_state = 0).fit_transform(context)

In [6]:
# For every frequent word, keep its 3 nearest neighbours (Euclidean distance
# in the reduced space) as synonym candidates. synonym_list[k] holds the
# frequent-word indices of the neighbours of frequent word k.
synonym_list = []
for word_idx in range(big_num):
    # Distances from this word's vector to every row of dcontext at once
    distances = np.linalg.norm(dcontext - dcontext[word_idx], axis=1)
    ranked = np.argsort(distances)
    # Exclude the word itself explicitly rather than assuming it is ranked
    # first: with duplicated vectors (distance 0 ties) it may not be.
    neighbours = ranked[ranked != word_idx][:3]
    synonym_list.append(list(neighbours))

In [7]:
# Sanity check: print the synonym candidates stored for one query word
query_big = id2big[word2id['西风']]
for neighbour in synonym_list[query_big]:
    print(id2word[big2id[neighbour]])

北风
狂风
凉风


In [8]:
# Persist the synonym candidates (one list of neighbour indices per frequent
# word) to ./var/synonym_list
import pickle
with open('./var/synonym_list', 'wb') as f:
    pickle.dump(synonym_list, f)

In [19]:
# Inspect the 30 closest words to one query word, with their distances.
# Note: despite its name, `similarity` holds Euclidean distances
# (smaller means closer).
target_big = id2big[word2id['西风']]
target_vec = dcontext[target_big]
similarity = np.array(
    [np.linalg.norm(target_vec - vec) for vec in dcontext], dtype=float
)

st = np.argsort(similarity)
for rank in range(30):
    nearest = st[rank]
    print(id2word[big2id[nearest]], similarity[nearest])

西风 0.0
北风 0.3782612781137929
狂风 0.3919064063921832
凉风 0.40099835751388474
秋风 0.40305529328849954
南风 0.4566438310104346
吹 0.45973978949219096
东风 0.4609658908222566
微风 0.46427321900730334
萧萧 0.4645822903116324
松风 0.4674310446960449
塘 0.46821572753182733
动 0.4699403839971106
荷叶 0.47361226973421455
吹落 0.47503252228869847
一夜 0.4828543142079771
黄云 0.48333717429095197
江城 0.4835876568071024
妾 0.48751738580983234
日落 0.4890045993640968
塞上 0.4922096527940893
临风 0.49356731217515065
满城 0.49387178150527744
蒋 0.4978557069932768
清凉 0.4980829926009589
簟 0.4983622139458006
萧条 0.5007295150310298
不开 0.5041722552840748
楚客 0.5051921655093423
凤 0.5065234688187786


In [10]:
# Persist the index mappings for later use
for target_path, mapping in (
    ('./var/big2id', big2id),
    ('./var/id2big', id2big),
    ('./var/id2word', id2word),
):
    with open(target_path, 'wb') as f:
        pickle.dump(mapping, f)