In [None]:
%run ./../various/_epoch-callback.ipynb

In [None]:
import os
import warnings
from pathlib import Path

import numpy as np
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm

In [None]:
# Word2Vec hyperparameters.
WV_DIM = 300      # handed to Word2Vec as `vector_size`
MODEL_ITERS = 30  # handed to Word2Vec as `epochs`; also the progress-bar total

# Where the trained model is cached (and looked up again on later runs).
W2V_MODEL_DIR = f'{PROC_DATA_DIR}/w2v'
W2V_MODEL_PATH = f'{W2V_MODEL_DIR}/w2v_model.model'

In [None]:
# CPython fixes its str/bytes hash seed when the interpreter starts, so writing
# PYTHONHASHSEED into os.environ here cannot re-seed the kernel that is already
# running; only processes spawned afterwards inherit the value.
if RANDOM_SEED is not None:
    hash_seed = str(RANDOM_SEED)
    if os.environ.get('PYTHONHASHSEED') != hash_seed:
        # The kernel was not launched with this seed: make that visible instead
        # of silently suggesting that hashing is now deterministic.
        warnings.warn(
            f"PYTHONHASHSEED was not set to {hash_seed} when this kernel started; "
            "hashing in the current process is not seeded. Export PYTHONHASHSEED "
            "before launching the kernel for fully reproducible runs.",
            RuntimeWarning,
        )
    # Kept for any subprocess started from this kernel.
    os.environ['PYTHONHASHSEED'] = hash_seed

In [None]:
# Train a Word2Vec model on the processed documents, or reuse the copy that an
# earlier run cached at W2V_MODEL_PATH.
corpus = df[proc_doc_col].tolist()
# One worker thread whenever a seed is requested, three otherwise
# (presumably because multi-threaded training is not reproducible).
workers = 3 if RANDOM_SEED is None else 1

if Path(W2V_MODEL_PATH).is_file():
    # Cache hit: skip training entirely.
    w2v_model = Word2Vec.load(W2V_MODEL_PATH)
else:
    logger.info("Training Word2Vec model...")
    with tqdm(total=MODEL_ITERS, disable=SILENT) as pbar:
        w2v_model = Word2Vec(
            vector_size=WV_DIM,
            min_count=1,
            epochs=MODEL_ITERS,
            seed=RANDOM_SEED,
            workers=workers,
        )

        # The vocabulary is taken from the prebuilt `dictionary` rather than
        # from a scan of `corpus`.
        # NOTE(review): if `dictionary` is a gensim Dictionary, `.dfs` holds
        # document frequencies, while build_vocab_from_freq expects word
        # counts -- confirm this is intended (build_vocab(corpus) would count
        # every occurrence instead).
        wf = {dictionary[token_id]: dictionary.dfs[token_id] for token_id in dictionary}
        w2v_model.build_vocab_from_freq(wf, corpus_count=len(corpus))

        # Both callbacks are invoked through GensimEpochCallback: one advances
        # the progress bar, the other samples the model's latest training loss.
        pbar_updater = GensimEpochCallback(end_func=pbar.update)
        loss_tracker = GensimEpochCallback(end_func=w2v_model.get_latest_training_loss)

        w2v_model.train(
            corpus,
            total_examples=w2v_model.corpus_count,
            epochs=w2v_model.epochs,
            compute_loss=True,
            callbacks=[pbar_updater, loss_tracker],
        )

    logger.debug("Logging training loss for each iteration")
    losses = loss_tracker.end_results
    # The recorded values are treated as a running total, so the figure logged
    # for each iteration is the increase over the previous record.
    prev_loss = 0
    for i, loss in enumerate(losses):
        epoch_no = i + 1
        epoch_loss = loss - prev_loss
        logger.debug(f"{epoch_no}: {epoch_loss}")
        prev_loss = loss

    # Make sure the target directory exists before persisting the model.
    Path(W2V_MODEL_DIR).mkdir(parents=True, exist_ok=True)
    logger.info("Storing Word2Vec model to disk...")
    w2v_model.save(W2V_MODEL_PATH)

In [None]:
# Export the learned embeddings as two plain-text files in W2V_MODEL_DIR:
#   w2v_wv.tsv  - the vector matrix, tab-separated, one vector per row
#   w2v_wv.meta - the vocabulary keys, one per line, in index order
# (presumably the layout expected by an embedding visualisation tool -- confirm)
np.savetxt(f'{W2V_MODEL_DIR}/w2v_wv.tsv', w2v_model.wv.vectors, delimiter='\t')
with open(f'{W2V_MODEL_DIR}/w2v_wv.meta', 'w', encoding='utf-8') as f:
    # Stream the lines straight to the file; no throwaway list of write()
    # return values is built.
    f.writelines(f'{key}\n' for key in w2v_model.wv.index_to_key)