# Import

In [12]:
import csv
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## Add configuration and utility module paths

In [2]:
# Make the project's config and util packages importable from this notebook.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [3]:
from ALL import config 
from util import *

## Set conditions

In [4]:
# Register tqdm's pandas integration, then cap how much of a frame is rendered.
tqdm.pandas()
pd.set_option(
    "display.max_columns", 100,
    "display.max_rows", 50,
)

In [5]:
# Dataset identifier; it is interpolated into every input and output path below.
data_type = "AgNewsTitle"

# Read data

In [6]:
# Load the preprocessed master table; the first CSV column becomes the index.
master_csv = f"../Preprocessing/data/{data_type}/master.csv"
df = pd.read_csv(master_csv, index_col=0)

In [7]:
# Read the class-label file. `newline=""` is what the csv module asks for when
# handing it a file object, so that newlines embedded in quoted fields are
# parsed correctly instead of being translated by the text layer first.
with open(
    f"../Preprocessing/data/{data_type}/class.csv", mode="r", newline=""
) as f:
    reader = csv.reader(f)
    # One entry per CSV row; each entry is the list of that row's fields.
    class_labels = list(reader)

In [8]:
# Tag every document with its positional index; tokens come from splitting the
# `words` column on single spaces.
documents = [
    TaggedDocument(words=text.split(" "), tags=[doc_id])
    for doc_id, text in enumerate(df["words"])
]

In [9]:
# Pull the Doc2Vec sweep settings out of the shared project configuration.
doc2vec_config = config["vectorize"]["doc2vec"]
vector_dims = doc2vec_config["dims"]  # iterated as the vector sizes to train
max_model_num = doc2vec_config["max_model_num"]  # models (seeds) per vector size

In [10]:
def get_doc2vec(texts, vector_size, seed, path):
    """Train a Doc2Vec model on ``texts``, save it, and return inferred vectors.

    Args:
        texts: Iterable of tokenised documents (each one a sequence of tokens).
        vector_size: Dimensionality of the document vectors.
        seed: Seed forwarded to ``Doc2Vec``.
        path: Destination passed to ``model.save``.

    Returns:
        list: One ``model.infer_vector`` result per document, in input order.
    """
    # Materialise once so a one-shot iterable can feed both training and inference.
    token_lists = list(texts)
    # Build the training corpus from the argument rather than from a
    # notebook-level global, so the model is trained on exactly the documents
    # this function was given. Tags are the positional indices.
    corpus = [
        TaggedDocument(tokens, [doc_id]) for doc_id, tokens in enumerate(token_lists)
    ]
    model = Doc2Vec(
        corpus,
        vector_size=vector_size,
        window=5,
        min_count=1,
        epochs=50,
        seed=seed,
        # NOTE(review): per the gensim docs, a fixed seed is only fully
        # reproducible with workers=1 (and a fixed PYTHONHASHSEED). Kept
        # multi-threaded here because training time matters; confirm whether
        # exact reproducibility is required.
        workers=os.cpu_count(),
    )
    model.save(path)
    return [model.infer_vector(tokens) for tokens in token_lists]

In [11]:
# Output roots for the exported vectors and the saved models.
vectors_path = f"data/{data_type}/doc2vec/vector"
models_path = f"data/{data_type}/doc2vec/model"

# Tokenise once up front: the split does not depend on the vector size or the
# seed, so there is no reason to redo it for every model.
tokenized_docs = df.words.apply(lambda text: text.split(" "))

for vector_size in tqdm(vector_dims):
    for model_num in range(max_model_num):
        # model_num doubles as the training seed and as the output file stem.
        # NOTE(review): make_filepath comes from the project's util module;
        # presumably it prepares the parent directory and returns the path.
        vectors = get_doc2vec(
            tokenized_docs,
            vector_size,
            seed=model_num,
            path=make_filepath(f"{models_path}/{vector_size}/{model_num}.model"),
        )

        # One row per document, one column per vector component.
        np.savetxt(
            make_filepath(f"{vectors_path}/{vector_size}/{model_num}.csv"),
            np.stack(vectors),
            delimiter=",",
        )

100%|██████████| 9/9 [2:02:00<00:00, 813.39s/it]
