# Imports

In [1]:
import csv
import os
import sys

import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm

## Add configuration and utility module paths

In [2]:
# Make the modules under core/config and core/util importable from this
# notebook (they are imported right after this cell).
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [3]:
from ALL import config 
from util import *

## Set conditions

In [4]:
# Notebook display limits: show at most 50 rows and 100 columns of a DataFrame.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 100)

# Register tqdm's pandas integration.
tqdm.pandas()

In [5]:
# Client used for every S3 download in this notebook. S3Manager is not imported
# by name, so it presumably comes from the `from util import *` star import.
s3 = S3Manager()

In [24]:
# Vectorizer whose outputs are processed; used as a key of config["vectorize"]
# and as a path component.
vectorize_type = "doc2vec"
# Dataset name; used as a path component of the S3 keys and local directories.
data_type = "AgNews"

In [25]:
# Sentence-transformer model id. In this part of the notebook it is only read
# when vectorize_type == "sentenceBERT", where it becomes part of the S3 prefix.
transformer_model = "sentence-transformers/all-MiniLM-L6-v2"

# Read data

In [26]:
# Fetch the preprocessed master table for the selected dataset. The result is
# indexed with [0] when read, so download() evidently returns a sequence of
# local paths.
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [27]:
# Load the master table, using the CSV's first column as the DataFrame index.
df = pd.read_csv(df_path[0], index_col=0)

In [28]:
# Fetch the class-label CSV for the selected dataset (same download() contract
# as for the master table: the first returned element is the local path).
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [29]:
# Read the class-label CSV into a list of rows; each row is itself a list of
# strings, exactly as csv.reader yields it.
# newline="" is what the csv module asks for when it is handed a file object,
# so that newlines embedded in quoted fields are parsed correctly.
with open(labels_path[0], mode="r", newline="") as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [30]:
# Settings of the selected vectorizer: the exclusive upper bound of the model
# index and the vector dimensions iterated over by the conversion loop.
_vectorize_config = config["vectorize"][vectorize_type]
max_vector_model_nums = _vectorize_config["max_model_num"]
vector_dims = _vectorize_config["dims"]

In [31]:
# Build the S3 prefix that holds the raw vectors. Only the two vectorizers
# below are supported; sentenceBERT vectors are additionally grouped by the
# transformer model that produced them.
if vectorize_type not in ("doc2vec", "sentenceBERT"):
    raise NotImplementedError

vector_object = (
    f"Vectorize/{data_type}/{vectorize_type}/vector/"
    if vectorize_type == "doc2vec"
    else f"Vectorize/{data_type}/{vectorize_type}/{transformer_model}/vector"
)

In [32]:
# Download everything under the vector prefix. The return value (a list of
# local file paths, shown as the cell output) is displayed but not stored.
s3.download(vector_object)

['/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/10/0.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/10/1.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/160/0.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/160/1.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/2/0.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/2/1.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/2/2.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/2/3.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/2/4.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/2/5.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/2/6.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/20/0.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/20/1.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vector/3/0.csv',
 '/home/jovyan/temporary/Vectorize/AgNews/doc2vec/vect

# Functions

In [33]:
def centralize_array(array):
    """Shift an array so that its mean along axis 0 is zero.

    Args:
        array: NumPy array; for a 2-D input each column is centered
            independently.

    Returns:
        ``array`` minus its axis-0 mean (broadcast over the rows).
    """
    column_means = np.mean(array, axis=0)
    return array - column_means

In [34]:
def normarize_array(array):
    """Scale every row of a 2-D array to unit Euclidean (L2) length.

    Args:
        array: 2-D NumPy array whose rows are the vectors to normalize.

    Returns:
        Array of the same shape with each row divided by its L2 norm.
        A row of all zeros has norm 0, so its entries are divided by zero.
    """
    squared_row_sums = np.sum(array * array, axis=1)
    # Column vector of norms so the division broadcasts across each row.
    row_norms = np.sqrt(squared_row_sums).reshape(-1, 1)
    return array / row_norms

In [35]:
def normarize_vector(vector):
    """Scale a vector to unit Euclidean (L2) length.

    The norm is the square root of the sum of squares over *all* elements,
    whatever the input's shape.

    Args:
        vector: NumPy array to normalize.

    Returns:
        ``vector`` divided by its norm.
    """
    norm = np.sqrt(np.sum(vector * vector))
    return vector / norm

In [36]:
def get_average_vector(vectors):
    """Return the unit-length direction of the sum of ``vectors``.

    The vectors are summed along axis 0 and the sum is L2-normalized with
    ``normarize_vector``; it is not divided by the number of vectors.

    Args:
        vectors: Array-like of vectors stacked along axis 0.

    Returns:
        The normalized sum vector.
    """
    return normarize_vector(np.sum(vectors, axis=0))

# Centralize and Normalize

In [None]:
# Local directory holding the raw vectors fetched by s3.download(). The
# download listing shows the files under .../temporary/Vectorize/<data_type>/...
# with no "data/" component, so this path must not contain one either.
# NOTE(review): assumes ../../temporary resolves to the download root
# (/home/jovyan/temporary) from the notebook's working directory — confirm.
# NOTE(review): for vectorize_type == "sentenceBERT" the S3 prefix also
# contains transformer_model; confirm whether this path needs it as well.
vectors_path = f"../../temporary/Vectorize/{data_type}/{vectorize_type}/vector"
# Output root for the converted (.npy) vectors.
converted_vectors_path = f"../data/{data_type}/{vectorize_type}/vector"

In [13]:
# For every (model, dimension) pair: load the raw CSV vectors, mean-center
# them, L2-normalize the centered rows, and save both variants as .npy files
# under <converted_vectors_path>/<dim>/{centralized,normalized}/<model>.npy.
for vector_model_num in range(max_vector_model_nums):
    for vector_dim in tqdm(vector_dims):
        vector = np.loadtxt(
            f"{vectors_path}/{vector_dim}/{vector_model_num}.csv",
            delimiter=",",
        )
        centralized_vector = centralize_array(vector)
        # Normalization is applied to the already-centered vectors.
        normarized_vector = normarize_array(centralized_vector)

        # Same save order as before: centralized first, then normalized.
        for _subdir, _converted in (
            ("centralized", centralized_vector),
            ("normalized", normarized_vector),
        ):
            np.save(
                make_filepath(
                    f"{converted_vectors_path}/{vector_dim}/{_subdir}/{vector_model_num}.npy"
                ),
                _converted,
            )

100%|██████████| 9/9 [00:22<00:00,  2.49s/it]


# Normalize