In [1]:
import pprint

import gensim.downloader as api
import tqdm
from gensim import utils
from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import datapath

# Module-level PrettyPrinter created with default settings.
pp = pprint.PrettyPrinter()

In [None]:
def evaluate_wordsim(wordvec, rw_path="word2vec/rw/rw_clean.txt"):
    """Print the Spearman correlation of ``wordvec`` on three word-pair files.

    Args:
        wordvec: Either an object that exposes ``evaluate_word_pairs(path)``
            directly (e.g. gensim ``KeyedVectors``), or a model that keeps such
            an object in its ``wv`` attribute.
        rw_path: Path to the "RW" pair file (presumably the Rare Words set --
            confirm). Defaults to the location previously hard-coded here.

    Returns:
        None. The three scores are printed, one per line.
    """
    # The previous version called `wordvec.evaluate_word_pairs` for one file and
    # `wordvec.wv.evaluate_word_pairs` for the other two, so it only worked for
    # objects that happened to offer both. Resolve the vectors once instead.
    vectors = getattr(wordvec, "wv", wordvec)

    # Per the gensim docs, evaluate_word_pairs returns
    # (pearson, spearman, oov_ratio); [1][0] is the Spearman coefficient.
    p_ws353 = vectors.evaluate_word_pairs(datapath('wordsim353.tsv'))[1][0]
    p_rw = vectors.evaluate_word_pairs(rw_path)[1][0]
    p_sl999 = vectors.evaluate_word_pairs(datapath('simlex999.txt'))[1][0]

    print("WS353:", p_ws353)
    print("RW:", p_rw)
    print("SL999", p_sl999)

## Pretrained GoogleNews-300

In [None]:
# Load the pretrained GoogleNews vectors from their binary word2vec file and
# report the vocabulary size and vector dimensionality.
# The previous `word2vec.KeyedVectors` referenced a module name that is never
# bound in this notebook (NameError); use the KeyedVectors class directly.
wv_google = KeyedVectors.load_word2vec_format('dataset/GoogleNews-vectors-negative300.bin', binary=True)
print("vocab:", len(wv_google.vocab))
print("vec_dim:", wv_google.vector_size)

## Train with gensim

In [2]:
# Show the metadata of the text8 dataset, then load it through the gensim downloader.
print(api.info("text8"))
dataset = api.load("text8")

# Dump the corpus to disk: every record (a list of tokens) becomes one
# space-separated line.
with open("dataset/text8.txt", "w") as f:
    f.writelines(" ".join(tokens) + "\n" for tokens in dataset)

{'num_records': 1701, 'record_format': 'list of str (tokens)', 'file_size': 33182058, 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/text8/__init__.py', 'license': 'not found', 'description': 'First 100,000,000 bytes of plain text from Wikipedia. Used for testing purposes; see wiki-english-* for proper full Wikipedia datasets.', 'checksum': '68799af40b6bda07dfa47a32612e5364', 'file_name': 'text8.gz', 'read_more': ['http://mattmahoney.net/dc/textdata.html'], 'parts': 1}


提取 text8 的前 300 行：
```sh
$ head -n 300 text8.txt > text8_300.txt
```

In [8]:
# Train gensim's Word2Vec on the text8 corpus without overriding any
# hyper-parameter, then report the vocabulary size and vector dimensionality.
# Disabled alternative input (reads the text8_300.txt subset as one raw string):
# dataset = open("dataset/text8_300.txt").read()
dataset = api.load("text8")
model = Word2Vec(dataset)
print("vocab:", len(model.wv.vocab))
print("vec_dim:", model.vector_size)

vocab: 71290
vec_dim: 100


In [9]:
# Print the word-similarity scores for the vectors of the gensim-trained model.
evaluate_wordsim(model.wv)

0.623768051438205
0.3198280325425669
0.24994655821500755


## Train with the DIY word2vec implementation

In [None]:
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec
import gensim.downloader as api
import pprint
import tqdm

from word2vec.trainer import Word2VecTrainer
pp = pprint.PrettyPrinter()

%load_ext autoreload
%autoreload 2

# Run the project's Word2VecTrainer on the text8 dump; "sgns.vec" is passed as
# its output file.
# NOTE(review): the settings listed in the trailing comment are not passed
# explicitly -- presumably they are Word2VecTrainer defaults; confirm. The
# vectors reloaded from sgns.vec later report 50000 entries, not 30000.
wv = Word2VecTrainer("dataset/text8.txt", output_file="sgns.vec") # emb_dim=100, vocab=30000, SparseAdam, lr=0.001
# Disabled alternative run on text8_100.txt, writing to sgns_mini.vec:
# wv = Word2VecTrainer("dataset/text8_100.txt", output_file="sgns_mini.vec")
wv.train()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
len data: 17005207
epoch= 1, batch=    0: sp=-0.093 0.005  pair/sec = 85904.12 loss=0.139
epoch= 1, batch=10000: sp=-0.032 -0.065  pair/sec = 10465.73 loss=0.938
epoch= 1, batch=20000: sp=0.081 -0.009  pair/sec = 10712.15 loss=0.882
epoch= 1, batch=30000: sp=0.107 0.080  pair/sec = 10695.68 loss=0.761
epoch= 1, batch=40000: sp=0.186 0.150  pair/sec = 10555.32 loss=0.857
epoch= 1, batch=50000: sp=0.230 0.188  pair/sec = 9650.68 loss=0.839
epoch= 1, batch=60000: sp=0.261 0.206  pair/sec = 10796.21 loss=0.850
epoch= 1, batch=70000: sp=0.290 0.215  pair/sec = 10748.83 loss=0.826
epoch= 1, batch=80000: sp=0.338 0.233  pair/sec = 10465.18 loss=0.888
epoch= 1, batch=90000: sp=0.366 0.242  pair/sec = 10534.43 loss=0.775
epoch= 1, batch=100000: sp=0.375 0.243  pair/sec = 9603.46 loss=0.817
epoch= 1, batch=110000: sp=0.382 0.251  pair/sec = 10622.22 loss=0.770
epoch= 1, batch=120000: sp=0.390 0.249  pair/sec 

In [7]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

# Reload the text-format vectors from "sgns.vec" and report how many words
# the file contains.
wv = KeyedVectors.load_word2vec_format("sgns.vec", binary=False)
vocab = list(wv.vocab)
print("Loaded vocab size %i" % len(vocab))

Loaded vocab size 50000


In [8]:
# Print the word-similarity scores for the vectors currently bound to `wv`.
evaluate_wordsim(wv)

0.5027689131705056
0.2358323572771211
0.183232016465508
