In [None]:
%run ./../various/_epoch_callback.ipynb

In [None]:
import os
from pathlib import Path
from tqdm import tqdm
from gensim.models.word2vec import Word2Vec

In [None]:
# Training hyper-parameters for the Word2Vec model.
VEC_DIMS = 300     # passed as `vector_size` when the model is constructed
MODEL_ITERS = 50   # passed as `epochs`; also used as the progress-bar total

# On-disk location of the cached model, rooted at PROC_DATA_DIR.
W2V_MODEL_DIR = f'{PROC_DATA_DIR}/w2v'
W2V_MODEL_PATH = f'{W2V_MODEL_DIR}/w2v_dataset.model'

In [None]:
# Export the seed as PYTHONHASHSEED whenever a fixed seed was requested.
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only reaches processes spawned from this kernel, not
# the kernel itself -- confirm whether the kernel must be launched with the
# variable already set for fully reproducible runs.
if RANDOM_SEED is not None:
    os.environ.update(PYTHONHASHSEED=str(RANDOM_SEED))

In [None]:
# Train a Word2Vec model on the processed documents, or load the cached copy
# from W2V_MODEL_PATH when one already exists on disk.

# NOTE(review): assumes every entry of df[proc_doc_col] is a list of string
# tokens, which is what gensim expects of a corpus -- confirm upstream.
corpus = df[proc_doc_col]
# One worker thread when a seed is requested (avoids thread-scheduling jitter
# between runs); three worker threads otherwise.
workers = 1 if RANDOM_SEED is not None else 3

if not Path(W2V_MODEL_PATH).is_file():
    # Create the output directory *before* training so that an unwritable or
    # invalid location fails immediately instead of after every epoch has
    # already been computed and the result can no longer be saved.
    Path(W2V_MODEL_DIR).mkdir(parents=True, exist_ok=True)

    logger.info("Training Word2Vec model...")
    with tqdm(total=MODEL_ITERS, disable=SILENT) as pbar:
        # min_count=1 keeps every token in the vocabulary, however rare.
        w2v_model = Word2Vec(min_count=1, epochs=MODEL_ITERS, vector_size=VEC_DIMS,
                             workers=workers, seed=RANDOM_SEED)
        w2v_model.build_vocab(corpus)

        # EpochCallback is loaded from the helper notebook; presumably it calls
        # `end_func` at the end of each epoch and collects the return values in
        # `end_results` -- verify against its definition.
        pbar_updater = EpochCallback(end_func=pbar.update)
        loss_tracker = EpochCallback(end_func=w2v_model.get_latest_training_loss)

        w2v_model.train(corpus, total_examples=w2v_model.corpus_count,
                        epochs=w2v_model.epochs, compute_loss=True,
                        callbacks=[pbar_updater, loss_tracker])

    logger.debug("Logging training loss for each iteration")
    losses = loss_tracker.end_results
    # The recorded values are treated as a running total, so the loss of a
    # single epoch is the difference between consecutive readings.
    prev_loss = 0
    for i, loss in enumerate(losses):
        logger.debug(f"{i + 1}: {loss - prev_loss}")
        prev_loss = loss

    logger.info("Storing Word2Vec model to disk...")
    w2v_model.save(W2V_MODEL_PATH)
else:
    # Cache hit: reuse the previously trained model instead of retraining.
    w2v_model = Word2Vec.load(W2V_MODEL_PATH)