[toc]

# Gensim 的 Word2Vec 实践

In [38]:
import os

import gensim
from gensim.models import word2vec

# Toy corpus: two short English sentences, one string per sentence.
raw_sentences = [
    "the quick brown fox jumps over the lazy dogs",
    "yoyoyo you go home now to sleep",
]
# Tokenize on whitespace so each sentence becomes a list of words.
sentences = list(map(str.split, raw_sentences))

# Build a Word2Vec model from the tokenized toy corpus.
# min_count=1 keeps every word: min_count drops words whose total frequency
# is lower than the given value, and most words here occur exactly once.
model = word2vec.Word2Vec(sentences, min_count=1)

In [54]:
# IPython help syntax: a trailing `?` displays the docstring of Word2Vec.
word2vec.Word2Vec?

常用到的参数

- size : int, optional (renamed to `vector_size` in gensim 4.0+)
    - Dimensionality of the word vectors
- window : int, optional
    - Maximum distance between the current and predicted word within a sentence.
- min_count : int, optional
    - Ignores all words with total frequency lower than this.
- sg : {0, 1}, optional
    - Training algorithm: 1 for skip-gram; otherwise CBOW.
- hs : {0, 1}, optional
    - If 1, hierarchical softmax will be used for model training.
    - If 0, and `negative` is non-zero, negative sampling will be used.
- negative : int, optional
    - If > 0, negative sampling will be used; the int for `negative` specifies how many "noise words" should be drawn (usually between 5-20).
    - If set to 0, no negative sampling is used.

## 从多个文件中输入语料

In [13]:
class MySentences(object):
    """Re-iterable corpus that streams tokenized lines from the files in a directory.

    Lines are read lazily, so the corpus is never loaded into memory as a
    whole. Every call to ``iter()`` starts a fresh pass over the directory,
    which lets the same instance be iterated more than once.

    Parameters
    ----------
    dirname : str
        Path of the directory whose entries are opened as text files.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        """Yield each line of each file as a list of whitespace-separated tokens.

        Files are visited in the order returned by ``os.listdir``. Errors from
        ``os.listdir`` or ``open`` (missing directory, unreadable entry)
        propagate to the caller.
        """
        for fname in os.listdir(self.dirname):
            # `with` closes the handle as soon as the file is exhausted (or the
            # generator is closed early) instead of leaving it open until
            # garbage collection.
            with open(os.path.join(self.dirname, fname)) as corpus_file:
                for line in corpus_file:
                    yield line.split()

# '/some/directory' is presumably a placeholder: point it at a real corpus
# directory before running this cell.
# NOTE: this rebinds `sentences` and `model`, which the first cell also defines.
sentences = MySentences('/some/directory') # a memory-friendly iterator
model = gensim.models.Word2Vec(sentences)

NameError: name 'gensim' is not defined

## 模型保存与读取

In [17]:
# Save the model to the file 'test.model' (a path relative to the working directory).
model.save('test.model')

In [19]:
# Load a model back from 'test.model', the same filename the save cell writes.
restored_model = gensim.models.word2vec.Word2Vec.load("test.model")

### 保存成二进制文件

In [27]:
binary_save_path = "test.model.bin"
# Export through model.wv in word2vec format; binary=True selects the binary variant.
model.wv.save_word2vec_format(binary_save_path, binary=True)

# Read the same file back. The loader lives on KeyedVectors rather than Word2Vec,
# so the result is presumably word vectors only, not a trainable model (TODO confirm).
restored_binary_model = gensim.models.KeyedVectors.load_word2vec_format(binary_save_path, binary=True)

## 模型预测

In [52]:
# NOTE(review): the output below shows gensim 3.x `Vocab` objects; gensim 4.x
# reportedly replaced `wv.vocab` with `wv.key_to_index`. Confirm against the
# installed version.
model.wv.vocab # all words in the vocabulary

{'the': <gensim.models.keyedvectors.Vocab at 0x11641a208>,
 'quick': <gensim.models.keyedvectors.Vocab at 0x11641a240>,
 'brown': <gensim.models.keyedvectors.Vocab at 0x11641a2b0>,
 'fox': <gensim.models.keyedvectors.Vocab at 0x11641a2e8>,
 'jumps': <gensim.models.keyedvectors.Vocab at 0x11641a320>,
 'over': <gensim.models.keyedvectors.Vocab at 0x11641a358>,
 'lazy': <gensim.models.keyedvectors.Vocab at 0x11641a390>,
 'dogs': <gensim.models.keyedvectors.Vocab at 0x11641a3c8>,
 'yoyoyo': <gensim.models.keyedvectors.Vocab at 0x11641a400>,
 'you': <gensim.models.keyedvectors.Vocab at 0x11641a438>,
 'go': <gensim.models.keyedvectors.Vocab at 0x11641a470>,
 'home': <gensim.models.keyedvectors.Vocab at 0x11641a4a8>,
 'now': <gensim.models.keyedvectors.Vocab at 0x11641a4e0>,
 'to': <gensim.models.keyedvectors.Vocab at 0x11641a518>,
 'sleep': <gensim.models.keyedvectors.Vocab at 0x11641a550>}

### 获取词向量

In [30]:
# Index model.wv by a word to get that word's vector (displayed as an array below).
model.wv['you']

array([-8.46173847e-04, -1.18479526e-04,  3.21807084e-03,  4.99845808e-03,
        4.77842428e-03, -3.48846824e-03, -4.29400010e-03,  1.23152218e-04,
       -2.46726978e-03,  3.05278203e-03,  3.72302555e-03,  4.91536362e-03,
        2.10555480e-03,  3.03062989e-04, -1.67109061e-03, -3.17640114e-03,
       -3.49465734e-03, -2.93565402e-03, -2.12275982e-03,  4.75081988e-03,
       -4.52112732e-03,  1.27372611e-03, -1.47000610e-04,  4.88099689e-03,
       -1.28948118e-03,  2.26798770e-03,  1.00740278e-03, -4.67300368e-03,
        3.89396190e-03,  3.68328160e-03, -4.92732320e-03,  4.78392380e-04,
       -3.48263746e-03,  1.35074486e-03,  3.89290787e-03,  2.35763728e-03,
        1.96083076e-03, -4.07436350e-03,  1.82393834e-03, -2.06615823e-05,
       -2.42632627e-03, -8.64281319e-04, -2.38293223e-03,  1.37142965e-03,
       -7.16688577e-04,  3.39390291e-03, -2.94442616e-05,  3.78515688e-03,
       -5.90594078e-04,  2.93036830e-03, -7.90181337e-04, -9.00543644e-04,
       -2.27597402e-03,  

### 获取和某个单词最相似的前 topn 个单词

In [44]:
# The 3 words ranked most similar to "you", each paired with its similarity score.
model.wv.most_similar(["you"], topn=3)

[('over', 0.23280760645866394),
 ('sleep', 0.09017609059810638),
 ('home', 0.08646240830421448)]

### 做 xx 之于 xx = xx 之于 xx 的推理题

In [50]:
# Analogy query: 'woman' and 'king' are the positive terms, 'man' the negative one.
# None of these words occur in the toy corpus this notebook trains on, so the call
# raises the KeyError shown below; run it against a model trained on a larger corpus.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=3)

KeyError: "word 'woman' not in vocabulary"

### 计算两个单词之间的相似度

In [47]:
# Similarity score between the vectors of "fox" and "dogs".
model.wv.similarity("fox", "dogs")

0.059096213

# References
1. [基于 Gensim 的 Word2Vec 实践 - 知乎](https://zhuanlan.zhihu.com/p/24961011)