# Import

In [1]:
import csv
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## Add the configuration directory to the import path

In [2]:
# Make the shared configuration package importable from this notebook.
# NOTE(review): this is an absolute, environment-specific path; consider
# deriving it from an environment variable or a path relative to the repo.
CONFIG_DIR = "/home/jovyan/core/config/"
if CONFIG_DIR not in sys.path:  # keep the cell idempotent when re-run
    sys.path.append(CONFIG_DIR)

In [3]:
from ALL import config 

## Set display and progress-bar options

In [4]:
# Enable tqdm's pandas integration.
tqdm.pandas()

# Limit how many columns / rows pandas renders when displaying a frame.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

# Read data

In [5]:
# Load the preprocessed master table; the first CSV column is used as the
# DataFrame index.
newsgroups_df = pd.read_csv(
    filepath_or_buffer="../../Preprocessing/data/20News/master.csv",
    index_col=0,
)

In [6]:
# Read the class-label CSV. `newline=""` lets the csv module do its own
# newline handling, as its documentation requires for files passed to
# csv.reader (otherwise newlines embedded in quoted fields are misread).
with open("../../Preprocessing/data/20News/class.csv", mode="r", newline="") as f:
    # Each element of `class_labels` is one CSV row, i.e. a list of strings.
    class_labels = list(csv.reader(f))

In [7]:
# One TaggedDocument per row of `newsgroups_df.words`: the text is split on
# single spaces and tagged with its positional index.
documents = [
    TaggedDocument(words=text.split(" "), tags=[position])
    for position, text in enumerate(newsgroups_df.words)
]

In [8]:
# Pull the experiment settings out of the shared configuration.
doc2vec_config = config["vectorize"]["doc2vec"]
gmm_config = config["clustering"]["gmm"]

vector_dims = doc2vec_config["dims"]
vector_model_num = doc2vec_config["max_model_num"]
# NOTE(review): `model_nums` is read from the GMM section, not the doc2vec
# section -- confirm this is the intended source.
model_nums = gmm_config["max_model_num"]
covariance_types = gmm_config["covariance_types"]

In [9]:
def get_doc2vec(texts, vector_size, seed, path):
    """Train a Doc2Vec model on ``texts``, save it, and return inferred vectors.

    Args:
        texts: Iterable of tokenized documents (each an iterable of string
            tokens). It is materialized once, so one-shot iterables work.
        vector_size: Dimensionality passed to ``Doc2Vec``.
        seed: Random seed passed to ``Doc2Vec``.
        path: Destination passed to ``model.save``.

    Returns:
        list: One vector per document in ``texts`` (same order), each produced
        by ``model.infer_vector``.
    """
    # Materialize so the corpus can be walked twice (training + inference).
    token_lists = list(texts)

    # Train on the corpus that was actually passed in instead of relying on a
    # notebook-level `documents` variable; tags are the positional indices.
    tagged_corpus = [
        TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(token_lists)
    ]

    # NOTE(review): per the gensim documentation, `seed` alone does not make
    # training fully reproducible when more than one worker thread is used.
    model = Doc2Vec(
        tagged_corpus,
        vector_size=vector_size,
        window=5,
        min_count=1,
        epochs=50,
        seed=seed,
        # os.cpu_count() may return None when the count is undetermined.
        workers=os.cpu_count() or 1,
    )
    model.save(path)

    return [model.infer_vector(tokens) for tokens in token_lists]

In [10]:
# Overrides the `model_nums` value read from
# config["clustering"]["gmm"]["max_model_num"], so the training loop that
# iterates `range(model_nums)` builds a single model per vector size.
# NOTE(review): presumably done to limit runtime -- confirm, and consider
# moving this into the configuration instead of shadowing it here.
model_nums = 1

In [11]:
# Output roots for the inferred document vectors and the trained models.
vectors_path = "../data/20News/doc2vec/vector/"
models_path = "../data/20News/doc2vec/model/"

# Tokenize once: the corpus is identical for every (vector_size, model_num)
# combination, so there is no need to re-split it inside the loops.
tokenized_docs = newsgroups_df.words.apply(lambda word: word.split(" "))

for vector_size in tqdm(vector_dims):
    # One sub-directory per vector size. os.path.join avoids the doubled
    # separator produced by concatenating "…/" + "/"; the trailing "" keeps
    # the directory paths ending in a separator.
    vector_path = os.path.join(vectors_path, str(vector_size), "")
    model_path = os.path.join(models_path, str(vector_size), "")
    os.makedirs(vector_path, exist_ok=True)
    os.makedirs(model_path, exist_ok=True)

    for model_num in range(model_nums):
        # The model index doubles as the training seed.
        vectors = get_doc2vec(
            tokenized_docs,
            vector_size,
            seed=model_num,
            path=os.path.join(model_path, f"{model_num}.model"),
        )

        # Persist the vectors as a comma-separated matrix, one row per document.
        np.savetxt(
            os.path.join(vector_path, f"{model_num}.csv"),
            np.stack(vectors),
            delimiter=",",
        )

100%|██████████| 14/14 [2:49:39<00:00, 727.11s/it] 
