In [3]:
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
import pprint
import tqdm

# Pretty-printer instance for readable display of nested structures.
pp = pprint.PrettyPrinter()

In [4]:
def evaluate_wordsim(wordvec):
    """Print word-similarity benchmark scores for a set of word vectors.

    Calls ``wordvec.evaluate_word_pairs`` on three word-pair files -- the
    ``wordsim353.tsv`` and ``simlex999.txt`` files resolved through
    ``datapath``, and the RW file at the relative path
    ``word2vec/rw/rw_clean.txt`` -- and prints one labelled score for each.
    All three evaluations run before anything is printed.

    Args:
        wordvec: Object exposing ``evaluate_word_pairs(path)``, such as a
            gensim ``KeyedVectors`` instance.

    Returns:
        None. The scores are only printed.
    """
    def _score(pairs_file):
        # Element [1][0] of the evaluation result; per gensim's documented
        # (pearson, spearman, oov_ratio) return order this is the Spearman
        # correlation coefficient.
        outcome = wordvec.evaluate_word_pairs(pairs_file)
        return outcome[1][0]

    # The list literal is evaluated top to bottom, so the benchmarks are
    # scored in the order WS353, RW, SL999.
    labelled_scores = [
        ("WS353:", _score(datapath('wordsim353.tsv'))),
        ("RW:", _score("word2vec/rw/rw_clean.txt")),
        ("SL999", _score(datapath('simlex999.txt'))),
    ]
    for label, score in labelled_scores:
        print(label, score)

## Pretrained GoogleNews-300

In [8]:
# Load the GoogleNews vectors from their binary word2vec file, then report
# the vocabulary size and the embedding dimensionality.
wv_google = KeyedVectors.load_word2vec_format(
    'dataset/GoogleNews-vectors-negative300.bin',
    binary=True,
)
print("vocab:", len(wv_google.vocab))
print("vec_dim:", wv_google.vector_size)

vocab: 3000000
vec_dim: 300


In [11]:
evaluate_wordsim(wv_google)

WS353: 0.6589215888009288
RW: 0.5525559901031721
SL999 0.43607859778335434


## Train with gensim

In [2]:
# Show the metadata of the text8 corpus, then load it through gensim's
# downloader API.
print(api.info("text8"))
dataset = api.load("text8")

# Dump the corpus to a plain-text file: each record (a list of tokens)
# becomes one line of space-separated tokens.
with open("dataset/text8.txt", "w") as f:
    f.writelines(" ".join(tokens) + "\n" for tokens in dataset)

{'num_records': 1701, 'record_format': 'list of str (tokens)', 'file_size': 33182058, 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/text8/__init__.py', 'license': 'not found', 'description': 'First 100,000,000 bytes of plain text from Wikipedia. Used for testing purposes; see wiki-english-* for proper full Wikipedia datasets.', 'checksum': '68799af40b6bda07dfa47a32612e5364', 'file_name': 'text8.gz', 'read_more': ['http://mattmahoney.net/dc/textdata.html'], 'parts': 1}


提取text8部分：
```sh
$ head -n 300 text8.txt > text8_300.txt
```

In [8]:
# Train gensim's Word2Vec on the text8 corpus; no hyperparameters are passed,
# so the library defaults apply.
# Disabled alternative input:
# dataset = open("dataset/text8_300.txt").read()
dataset = api.load("text8")
model = Word2Vec(sentences=dataset)

# Report the learned vocabulary size and the embedding dimensionality.
print("vocab:", len(model.wv.vocab))
print("vec_dim:", model.vector_size)

vocab: 71290
vec_dim: 100


In [9]:
evaluate_wordsim(model.wv)

0.623768051438205
0.3198280325425669
0.24994655821500755


## Train with a DIY word2vec implementation

In [None]:
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec
import gensim.downloader as api
import pprint
import tqdm

from word2vec.trainer import Word2VecTrainer
# Pretty-printer instance for readable display of nested structures.
pp = pprint.PrettyPrinter()

%load_ext autoreload
%autoreload 2

# Build the DIY trainer on the full text8 dump and run training.
# NOTE(review): saved_model_path presumably names a checkpoint to resume from,
# and output_file the file the vectors are written to (sgns.vec is loaded in a
# later cell) -- confirm against word2vec.trainer.
# Author's configuration note: emb_dim=100, vocab=30000, SparseAdam, lr=0.001
wv = Word2VecTrainer(
    "dataset/text8.txt",
    saved_model_path="tmp/skipgram.epoch1.batch50000",
    output_file="sgns.vec",
)
# Disabled alternatives:
# wv = Word2VecTrainer("dataset/text8.txt", saved_model_path="", output_file="sgns.vec")
# wv = Word2VecTrainer("dataset/text8_100.txt", output_file="sgns_mini.vec")
wv.train()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
len data: 17005207
epoch= 1, batch=    0: sp=0.614 0.369  pair/sec = 91120.71 loss=0.073
epoch= 1, batch=10000: sp=0.610 0.366  pair/sec = 10931.75 loss=0.381
epoch= 1, batch=20000: sp=0.613 0.366  pair/sec = 10947.02 loss=0.708
epoch= 1, batch=30000: sp=0.609 0.368  pair/sec = 10948.62 loss=0.856
epoch= 1, batch=40000: sp=0.618 0.370  pair/sec = 11021.52 loss=0.850
epoch= 1, batch=50000: sp=0.622 0.374  pair/sec = 9802.78 loss=0.670
epoch= 1, batch=60000: sp=0.622 0.373  pair/sec = 10863.37 loss=0.778
epoch= 1, batch=70000: sp=0.624 0.376  pair/sec = 11011.65 loss=0.657
epoch= 1, batch=80000: sp=0.629 0.379  pair/sec = 10943.09 loss=0.730
epoch= 1, batch=90000: sp=0.623 0.382  pair/sec = 10665.93 loss=0.568
epoch= 1, batch=100000: sp=0.622 0.381  pair/sec = 9668.13 loss=0.707
epoch= 1, batch=110000: sp=0.620 0.385  pair/sec = 10997.14 loss=0.791
epoch= 1, batch=120000: sp=0.621 0.388  pair/sec = 10

In [1]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

# Load the text-format vectors from sgns.vec and report how many words
# were read.
wv = KeyedVectors.load_word2vec_format(
    "sgns.vec",
    binary=False,
)
vocab = [word for word in wv.vocab.keys()]
print("Loaded vocab size %i" % len(vocab))

Loaded vocab size 50000


In [5]:
evaluate_wordsim(wv)

WS353: 0.6060414633359121
RW: 0.2730176745649218
SL999 0.2319199528849217
