In [1]:
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api

In [4]:
def evaluate_wordsim(wordvec):
    """Print word-similarity benchmark scores for a set of word vectors.

    Calls ``wordvec.evaluate_word_pairs`` on three word-pair files (WS353,
    RW and SL999, in that order) and then prints one labelled score per
    benchmark. The reported score is element ``[1][0]`` of each result;
    per the gensim documentation this should be the Spearman rank-correlation
    coefficient (TODO confirm against the installed gensim version).

    Args:
        wordvec: object exposing ``evaluate_word_pairs(path)``, e.g. a gensim
            ``KeyedVectors`` instance.

    Returns:
        None. The scores are only printed.
    """

    def _score(pairs_file):
        # Pick the single number reported for a benchmark out of the result.
        return wordvec.evaluate_word_pairs(pairs_file)[1][0]

    # All three benchmarks are evaluated before anything is printed.
    # The RW file is resolved relative to the current working directory,
    # the other two through gensim's `datapath` helper.
    # Labels are reproduced exactly as printed before ("SL999" has no colon).
    results = [
        ("WS353:", _score(datapath('wordsim353.tsv'))),
        ("RW:", _score("word2vec/rw/rw_clean.txt")),
        ("SL999", _score(datapath('simlex999.txt'))),
    ]
    for label, score in results:
        print(label, score)

## Pretrained GoogleNews-300

In [8]:
# Load the pretrained GoogleNews vectors from a binary word2vec-format file,
# then report the vocabulary size and the vector dimensionality.
wv_google = KeyedVectors.load_word2vec_format(
    'dataset/GoogleNews-vectors-negative300.bin',
    binary=True,
)
print(f"vocab: {len(wv_google.vocab)}")
print(f"vec_dim: {wv_google.vector_size}")

vocab: 3000000
vec_dim: 300


In [11]:
evaluate_wordsim(wv_google)

WS353: 0.6589215888009288
RW: 0.5525559901031721
SL999 0.43607859778335434


## Train by gensim

In [9]:
# Fetch the text8 corpus through the gensim downloader and dump it to
# dataset/text8.txt: each item of the corpus becomes one line, with its
# elements joined by single spaces.
dataset = api.load("text8")

with open("dataset/text8.txt", "w") as f:
    for data in dataset:
        # print() appends the trailing newline for us.
        print(" ".join(data), file=f)

提取text8一部分用于测试：
```sh
$ head -n 100 text8.txt > text8_100.txt
```

In [8]:
# Train a gensim Word2Vec model on text8 without overriding any settings,
# then report the vocabulary size and the vector dimensionality.
dataset = api.load("text8")
model = Word2Vec(dataset)
print(f"vocab: {len(model.wv.vocab)}")
print(f"vec_dim: {model.vector_size}")

vocab: 71290
vec_dim: 100


In [9]:
evaluate_wordsim(model.wv)

0.623768051438205
0.3198280325425669
0.24994655821500755


## Train by SGNS-PyTorch

In [None]:
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec
import tqdm

from word2vec.trainer import Word2VecTrainer

%load_ext autoreload
%autoreload 2

# Build the SGNS trainer on the dumped text8 corpus and run training.
# Author's note on the run settings: emb_dim=100, vocab=50000, SparseAdam,
# lr=0.001 -- none of these are passed here, so they are presumably the
# trainer's own defaults (TODO confirm in word2vec.trainer).
wv = Word2VecTrainer(
    "dataset/text8.txt",
    saved_model_path="tmp/skipgram.epoch1.batch50000",
    output_file="sgns.vec",
)

wv.train()

In [10]:
from gensim.models import KeyedVectors

# Load the text-format vectors from sgns.vec (the trainer's output_file),
# report how many words were loaded, and run the word-similarity benchmarks.
wv = KeyedVectors.load_word2vec_format("sgns.vec", binary=False)
vocab = list(wv.vocab)
print(f"Loaded vocab size {len(vocab)}")
evaluate_wordsim(wv)

Loaded vocab size 50000
WS353: 0.660792652633121
RW: 0.3430154080998551
SL999 0.2649420256825831
