In [None]:
import hashlib
import os
from datetime import datetime

import gensim
import pandas as pd
import yaml
from gensim.models.word2vec import LineSentence
from gensim.utils import simple_preprocess

In [None]:
# Run configuration, read from environment variables.
# Settings the notebook cannot run without are read with os.environ[...], so a
# missing one fails right here with a KeyError naming the variable instead of
# an opaque TypeError from int(None) or None + str further down.

# train data
TRAIN_DATA_PATH = os.environ["train_data_path"]
TRAIN_DATA_DESCRIPTION = os.getenv("train_data_description")  # optional, None if unset

# model data
MODEL_PATH = os.environ["model_path"]
MODEL_ARCHITECTURE = os.getenv("model_architecture")  # optional, None if unset
MODEL_ID = os.getenv("model_id")  # optional, None if unset
# Plain string concatenation: no path separator is inserted between
# MODEL_PATH and "metadata.yaml".
# NOTE(review): MODEL_PATH is also used as the model file path itself - confirm
# that the resulting metadata file name is the intended one.
MODEL_METADATA_PATH = MODEL_PATH + "metadata.yaml"

# model hyperparameters (must parse as integers)
VECTOR_SIZE = int(os.environ["vector_size"])
WINDOW = int(os.environ["window"])
MIN_COUNT = int(os.environ["min_count"])

In [None]:
# Echo the resolved configuration, one "NAME: value" line per setting, so the
# executed notebook records exactly which parameters the run used.
print(
    *(
        f"{label}: {value}"
        for label, value in (
            ("TRAIN_DATA_PATH", TRAIN_DATA_PATH),
            ("TRAIN_DATA_DESCRIPTION", TRAIN_DATA_DESCRIPTION),
            ("MODEL_PATH", MODEL_PATH),
            ("MODEL_ARCHITECTURE", MODEL_ARCHITECTURE),
            ("MODEL_ID", MODEL_ID),
            ("MODEL_METADATA_PATH", MODEL_METADATA_PATH),
            ("VECTOR_SIZE", VECTOR_SIZE),
            ("WINDOW", WINDOW),
            ("MIN_COUNT", MIN_COUNT),
        )
    ),
    sep="\n",
)

In [None]:
# Wrap the training corpus file in gensim's LineSentence; the resulting object
# is handed to Word2Vec as its `sentences` argument in the training cell.
# NOTE(review): presumably the file holds one whitespace-tokenized sentence per
# line - confirm against the gensim LineSentence documentation.
sentences = LineSentence(TRAIN_DATA_PATH)

In [None]:
# Train the Word2Vec model with the configured hyperparameters, time the run,
# and persist the trained model to MODEL_PATH.
time_start = datetime.now()
model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=os.cpu_count()
)
# Training duration in minutes. total_seconds() covers the whole elapsed span;
# timedelta.seconds is only the sub-day remainder and would wrap around for a
# run that lasts longer than 24 hours.
duration = (datetime.now() - time_start).total_seconds() / 60
model.save(MODEL_PATH)

In [None]:
# calculate size of training data
def calc_size(file):
    """Return the on-disk size of ``file`` as a human-readable string.

    The byte count from ``os.path.getsize`` is divided by 1024 until it drops
    below 1024 and is then rendered as ``"<value> <unit>"``, e.g. ``"512 B"``
    or ``"1.5 KB"``. Values are rounded to one decimal place.

    Args:
        file: Path of the file to measure.

    Returns:
        str: The formatted size, using B, KB, MB, GB, TB or PB as the unit.

    Raises:
        OSError: If ``file`` does not exist or is inaccessible (raised by
            ``os.path.getsize``).
    """
    size = os.path.getsize(file)
    units = ["B", "KB", "MB", "GB", "TB", "PB"]
    for unit in units[:-1]:
        if size < 1024.0:
            return f"{round(size, 1)} {unit}"
        size /= 1024.0
    # Anything that is still >= 1024 TB is reported in the largest unit rather
    # than falling off the end of the loop and returning None.
    return f"{round(size, 1)} {units[-1]}"
# human-readable sizes of the training corpus and of the saved model file
train_data_size = calc_size(TRAIN_DATA_PATH)
model_data_size = calc_size(MODEL_PATH)

# calculate hash of training data
# Hashed in-process with hashlib (same hex digest the `md5sum` tool prints), so
# the cell does not depend on an external binary being installed. The file is
# read in 1 MiB chunks to keep memory flat for large corpora.
md5 = hashlib.md5()
with open(TRAIN_DATA_PATH, "rb") as train_file:
    for chunk in iter(lambda: train_file.read(1024 * 1024), b""):
        md5.update(chunk)
train_data_md5_hash = md5.hexdigest()


# aggregate into metadata dictionary
metadata = {
    # The notebook defines no TRAIN_DATA_NAME; TRAIN_DATA_DESCRIPTION is the
    # only train-data label read from the environment, so it is recorded here.
    "train_data_name": TRAIN_DATA_DESCRIPTION,
    "train_data_size": train_data_size,
    "train_data_md5_hash": train_data_md5_hash,
    "training_vector_size": VECTOR_SIZE,
    "window": WINDOW,
    "min_count": MIN_COUNT,
    # Same expression the training cell passes as `workers=`; no WORKERS
    # constant exists in this notebook.
    "workers": os.cpu_count(),
    "training_duration (minutes)": round(duration, 1),
    "model_data_size": model_data_size,
}

# write to yaml
with open(MODEL_METADATA_PATH, "w") as f:
    # iteration over dictionary to ensure the yaml writer respects the order
    for k, v in metadata.items():
        yaml.dump({k: v}, f)