# 词向量

In [1]:
from gensim.models import KeyedVectors
import time

In [2]:
wv_model = KeyedVectors.load_word2vec_format('/Users/zn-nlp/Documents/project1_auto_master_qa/datasets/w2v.bin',
                                             binary=True)

In [3]:
def timeit(f):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = f(*args, **kwargs)
        end_time = time.time()
        print("%s函数运行时间为：%.8f" %(f.__name__, end_time - start_time))
        return res
    return wrapper

In [4]:
@timeit
def run():
    wv_model.wv.most_similar("维修")
run()

  This is separate from the ipykernel package so we can avoid doing imports until


run函数运行时间为：0.22017193


使用gensim有以下缺点：

* gensim加载模型耗时很长

* 占用内存很大，会将所有的词向量加载进入内存，占用内存很大（>10g）

* most_similar函数耗时较长。gensim使用的算法似乎是暴力求解（待验证），耗时较长，0.17秒。

当词向量非常大的时候，如何提升搜索查找速度

In [5]:
import json

In [6]:
from collections import OrderedDict

In [7]:
word_index = OrderedDict()

In [8]:
for counter, key in enumerate(wv_model.vocab.keys()):
    word_index[key] = counter

In [9]:
with open('word_index.json', 'w') as fp:
    json.dump(word_index, fp)

In [10]:
from annoy import AnnoyIndex

In [11]:
wv_index = AnnoyIndex(256)

  """Entry point for launching an IPython kernel.


In [12]:
i = 0

for key in wv_model.vocab.keys():
    v = wv_model[key]
    wv_index.add_item(i, v)
    i += 1

In [13]:
wv_index.build(10)

True

In [14]:
wv_index.save('wv_index_build10.index')

True

In [15]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [16]:
for item in wv_index.get_nns_by_item(word_index[u'车'], 11):
    print(reverse_word_index[item])

车
这车
之光车
车能
19761
我车
门见
两伐
ko7
10899
15800


In [18]:
@timeit
def run():
    for item in wv_index.get_nns_by_item(word_index['维修'], 11):
        print(reverse_word_index[item])
run()

维修
当面交易
装好点
熟练程度
祁阳县
修锁店
专修
给付
艾特
4520
调就行
run函数运行时间为：0.00043082
