# Import

In [19]:
import csv
import os
import sys

import numpy as np
import pandas as pd
import umap
from scipy.sparse.csgraph import connected_components
from sklearn.decomposition import PCA
from tqdm import tqdm

## Add configuration file

In [20]:
# Make the project's config and util directories importable from this notebook.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [21]:
from ALL import config 
from util import *

## Set condition

In [22]:
# Register tqdm's pandas integration, then raise the notebook display limits.
tqdm.pandas()
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 50),
):
    pd.set_option(option_name, option_value)

In [23]:
# Storage helper used below for download, upload and local cleanup.
# NOTE(review): S3Manager is presumably provided by the wildcard `util` import — confirm.
s3 = S3Manager()

In [24]:
# Dataset name; it is interpolated into every storage path used below.
data_type = "AgNews"

In [25]:
# Sentence-transformer model identifier; it is also used as a path component
# of the Vectorize storage locations below.
transformer_model = "sentence-transformers/all-MiniLM-L6-v2"

# Read data

In [26]:
# Fetch the preprocessed master table; the result is indexed with [0] below,
# so it is a sequence of local file paths.
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [27]:
# Load the master table, using the first CSV column as the DataFrame index.
df = pd.read_csv(df_path[0], index_col=0)

In [28]:
# Fetch the class-label CSV; the first returned path is opened in the next cell.
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [29]:
# Read the class-label CSV; every row becomes one list of strings.
# `newline=""` is what the csv module asks for when it is handed a file object,
# so that newlines embedded in quoted fields are parsed correctly.
with open(labels_path[0], mode="r", newline="") as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [30]:
# Download the stored sentenceBERT artifacts for this dataset and model.
# The return value is not kept; the notebook only displays it as cell output.
s3.download(f"Vectorize/{data_type}/sentenceBERT/{transformer_model}")

['/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/1_Pooling/config.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/README.md',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/config.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/config_sentence_transformers.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/modules.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/pytorch_model.bin',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/sentence_bert_config.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/special_tokens_map.json',
 '/home/j

In [31]:
# sentenceBERT settings from the project config:
# - max_model_num bounds the `range(...)` of model numbers iterated below,
# - vector_dims lists the target dimensionalities for the reductions below.
max_model_num = config["vectorize"]["sentenceBERT"]["max_model_num"]
vector_dims = config["vectorize"]["sentenceBERT"]["dims"]

# Dimension Reduction

In [32]:
# Reduce each model's raw embedding matrix with PCA, once per target
# dimensionality, and store the result as pca/<dim>/<model_num>.npy.
vector_path = f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/vector"

for vector_dim in tqdm(vector_dims):
    for model_num in range(max_model_num):
        vectors = np.load(f"{vector_path}/raw/{model_num}.npy")

        # The model number doubles as the random seed of the reduction.
        pca = PCA(n_components=vector_dim, random_state=model_num)
        reduced_vectors = pca.fit_transform(vectors)

        # NOTE(review): make_filepath presumably prepares the parent directory
        # and returns the path — confirm in util.
        output_path = make_filepath(f"{vector_path}/pca/{vector_dim}/{model_num}.npy")
        np.save(output_path, reduced_vectors)

100%|██████████| 10/10 [00:23<00:00,  2.30s/it]


In [40]:
# Store the unreduced vectors next to the PCA outputs, in a directory named
# after their original dimensionality. The raw file is reloaded for every
# model number here; relying on `vectors` / `model_num` left over from an
# earlier loop would only ever write the last model's file.
for model_num in range(max_model_num):
    vectors = np.load(f"{vector_path}/raw/{model_num}.npy")
    np.save(
        make_filepath(f"{vector_path}/pca/{vectors.shape[1]}/{model_num}.npy"),
        vectors,
    )

In [33]:
# Reduce each model's raw embedding matrix with UMAP, once per target
# dimensionality, and store the result as umap/<dim>/<model_num>.npy.
vector_path = (
    f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/vector"
)

for vector_dim in tqdm(vector_dims):
    for model_num in range(max_model_num):
        vectors = np.load(f"{vector_path}/raw/{model_num}.npy")

        # The model number doubles as the random seed of the reduction.
        reducer = umap.UMAP(n_components=vector_dim, random_state=model_num)
        reduced_vectors = reducer.fit_transform(vectors)

        output_path = make_filepath(f"{vector_path}/umap/{vector_dim}/{model_num}.npy")
        np.save(output_path, reduced_vectors)

100%|██████████| 10/10 [1:45:35<00:00, 633.53s/it] 


In [41]:
# Store the unreduced vectors next to the UMAP outputs, in a directory named
# after their original dimensionality. The raw file is reloaded for every
# model number here; relying on `vectors` / `model_num` left over from an
# earlier loop would only ever write the last model's file.
for model_num in range(max_model_num):
    vectors = np.load(f"{vector_path}/raw/{model_num}.npy")
    np.save(
        make_filepath(f"{vector_path}/umap/{vectors.shape[1]}/{model_num}.npy"),
        vectors,
    )

## Upload files

In [42]:
# Push the local vector directory (raw and reduced vectors) to the matching
# storage prefix.
local_vector_dir = f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/vector"
remote_vector_dir = f"Vectorize/{data_type}/sentenceBERT/{transformer_model}/vector"
s3.upload(local_vector_dir, remote_vector_dir)

'../../temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/vector'

In [43]:
# Clean up local files handled by the storage helper; the captured output below
# lists the uploaded directory and the downloaded paths.
# NOTE(review): presumably this deletes those local copies — confirm in S3Manager.
s3.delete_local_all()

../../temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/vector
/home/jovyan/temporary/Preprocessing/AgNews/master.csv
/home/jovyan/temporary/Preprocessing/AgNews/class.csv
/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/1_Pooling/config.json
/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/README.md
/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/config.json
/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/config_sentence_transformers.json
/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/modules.json
/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/pytorch_model.bin
/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v

In [36]:
# Signal that the run for this dataset/model has finished.
# NOTE(review): send_line_notify presumably comes from `util` and posts a LINE message — confirm.
send_line_notify(f"end {data_type} sentenceBERT {transformer_model}")