# Import

In [3]:
import csv
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## Add configuration file

In [4]:
# Extend the module search path with the project's config and util directories
# (appended in this order, after any existing entries).
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [5]:
from ALL import config 
from util import *

## Set condition

In [6]:
# Enable tqdm's pandas integration.
tqdm.pandas()

# Raise the notebook's DataFrame display limits.
for option_name, limit in (
    ("display.max_columns", 100),
    ("display.max_rows", 50),
):
    pd.set_option(option_name, limit)

In [7]:
# Dataset identifier; interpolated into every S3 key and local path in this notebook.
data_type = "AgNewsTitle"

In [8]:
# Storage helper used below for the downloads, the final upload, and local cleanup.
# NOTE(review): S3Manager is not imported by name here; presumably it comes from
# `from util import *` — confirm.
s3 = S3Manager()

# Read data

In [9]:
# Fetch the preprocessed master CSV for this dataset. The return value is passed
# straight to pd.read_csv, so it is presumably a local file path — confirm
# against S3Manager.download.
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [10]:
# Load the master table; the CSV's first column becomes the DataFrame index.
df = pd.read_csv(df_path, index_col=0)

In [11]:
# Fetch the class-label CSV for this dataset. The return value is later passed
# to open(), so it is presumably a local file path — confirm against
# S3Manager.download.
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [12]:
# Read every row of the class-label CSV; each row is kept as a list of strings.
# newline="" lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader (otherwise
# newlines embedded in quoted fields can be misread).
with open(labels_path, mode="r", newline="") as f:
    class_labels = list(csv.reader(f))

# Vectorize

In [13]:
# Training corpus: one TaggedDocument per row of df.words, tokenized on single
# spaces and tagged with the row's positional index.
documents = [
    TaggedDocument(words=sentence.split(" "), tags=[position])
    for position, sentence in enumerate(df.words)
]

In [14]:
# Pull the doc2vec settings out of the shared configuration:
# the vector sizes to train and how many seeded models to build per size.
doc2vec_config = config["vectorize"]["doc2vec"]
vector_dims = doc2vec_config["dims"]
max_model_num = doc2vec_config["max_model_num"]

In [15]:
def get_doc2vec(texts, vector_size, seed, path):
    """Train a Doc2Vec model on ``texts``, save it, and return one vector per text.

    Args:
        texts: Iterable of tokenized documents (each an iterable of str tokens).
        vector_size: Dimensionality of the learned document vectors.
        seed: Seed forwarded to ``Doc2Vec``.
        path: Destination passed to ``model.save``.

    Returns:
        A list with one inferred vector per entry of ``texts``, in input order.
    """
    # Materialize once so a one-shot iterable can be used for both training
    # and inference.
    texts = list(texts)

    # Build the training corpus from the argument instead of relying on a
    # notebook-global ``documents`` variable; each document is tagged with its
    # position, which matches how the notebook builds its global corpus.
    corpus = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(texts)]

    # NOTE: gensim documents that a fully reproducible run additionally needs
    # workers=1 (and a fixed PYTHONHASHSEED); multi-threaded training is kept
    # here for speed, so ``seed`` alone does not guarantee identical results.
    model = Doc2Vec(
        corpus,
        vector_size=vector_size,
        window=5,
        min_count=1,
        epochs=50,
        seed=seed,
        # os.cpu_count() may return None when the count is undetermined.
        workers=os.cpu_count() or 1,
    )
    model.save(path)
    return [model.infer_vector(tokens) for tokens in texts]

In [None]:
# Local output roots for the saved vectors and the trained models.
vectors_path = f"../temporary/Vectorize/{data_type}/doc2vec/vector"
models_path = f"../temporary/Vectorize/{data_type}/doc2vec/model"

# Tokenize once up front: the result does not depend on vector_size or
# model_num, so recomputing it for every (size, seed) pair is wasted work.
tokenized_docs = df.words.apply(lambda word: word.split(" "))

# Train one model per (vector size, seed) pair and store its document vectors.
for vector_size in tqdm(vector_dims):
    for model_num in range(max_model_num):
        vectors = get_doc2vec(
            tokenized_docs,
            vector_size,
            seed=model_num,  # the model index doubles as the training seed
            path=make_filepath(f"{models_path}/{vector_size}/{model_num}.model"),
        )

        # Stack the per-document vectors into a single array before saving.
        np.save(
            make_filepath(f"{vectors_path}/{vector_size}/{model_num}.npy"),
            np.stack(vectors),
        )

 33%|███▎      | 3/9 [41:56<1:24:05, 840.98s/it]

## Upload files

In [None]:
# Push the locally generated doc2vec artifacts for this dataset to remote storage.
local_dir = f"../temporary/Vectorize/{data_type}/doc2vec/"
remote_prefix = f"Vectorize/{data_type}/doc2vec"
s3.upload(local_dir, remote_prefix)

In [None]:
# Clean up local files through the storage helper.
# NOTE(review): presumably removes the downloaded/temporary local copies —
# confirm against S3Manager.delete_local_all before relying on it.
s3.delete_local_all()