# Import

In [1]:
import csv
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

## Add configuration and utility paths

In [2]:
# Make the project's config and util directories importable; the order of the
# two entries on sys.path is preserved.
for extra_dir in ("/home/jovyan/core/config/", "/home/jovyan/core/util/"):
    sys.path.append(extra_dir)

In [3]:
from ALL import config 
from util import *

## Set condition

In [4]:
# Register tqdm's progress-aware pandas methods (e.g. progress_apply).
tqdm.pandas()

# Display limits for DataFrames rendered in this notebook.
display_limits = {
    "display.max_columns": 100,
    "display.max_rows": 50,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [5]:
# Handle used below for every S3 transfer (download, upload, local cleanup).
# S3Manager is not defined in this notebook; presumably it is provided by the
# star-import of `util` — TODO confirm.
s3 = S3Manager()

In [6]:
# Dataset name; interpolated into every S3 key and local scratch path below.
data_type = "20News"

In [7]:
# Model identifier passed to SentenceTransformer(); also used as a path
# component for the saved vectors and models (note that it contains a "/").
transformer_model = "sentence-transformers/all-MiniLM-L6-v2"

# Read data

In [8]:
# Fetch the dataset's master CSV through the S3 helper; the returned value is
# handed to pd.read_csv below (presumably a local file path — TODO confirm).
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [9]:
# Load the master table, using the first CSV column as the index.
# The embedding step further down reads its `text` column.
df = pd.read_csv(df_path, index_col=0)

In [10]:
# Fetch the class-label CSV for the same dataset; the returned value is passed
# to open() below (presumably a local file path — TODO confirm).
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [11]:
# Read every row of the class-label CSV. Each element of `class_labels` is the
# list of fields of one row, exactly as csv.reader yields it.
# newline="" leaves line-ending handling to the csv module, which it needs in
# order to parse quoted fields that contain line breaks correctly.
with open(labels_path, mode="r", newline="") as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [12]:
# Number of embedding runs to produce, read from the shared config; it bounds
# the loop in the Embedding section.
max_model_num = config["vectorize"]["sentenceBERT"]["max_model_num"]

# Embedding

In [13]:
def get_sentenceBERT(texts, seed, path):
    """Embed ``texts`` with the configured SentenceTransformer and save the model.

    The model named by the notebook-level ``transformer_model`` is loaded,
    written to ``path`` and then used to encode ``texts``.

    Args:
        texts: Sentences to embed; passed unchanged to
            ``SentenceTransformer.encode``.
        seed: Integer used to seed Python's ``random``, NumPy and PyTorch
            before the model is loaded, so that any stochastic step in
            loading or encoding is repeatable. ``None`` leaves the global
            generators untouched.
        path: Destination handed to ``SentenceTransformer.save``.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``.
    """
    if seed is not None:
        # Imported here so the function carries its own dependencies; torch is
        # already required by sentence_transformers.
        import random

        import torch

        random.seed(seed)
        np.random.seed(seed % 2**32)  # NumPy only accepts seeds in [0, 2**32)
        torch.manual_seed(seed)

    model = SentenceTransformer(transformer_model)
    # Persist a copy of the exact model that produces the vectors.
    model.save(path)
    return model.encode(texts)

In [None]:
# Local scratch locations for this dataset/model pair: one folder for the
# encoded vectors, one for the saved model copies.
vectors_path = f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/vector"
models_path = f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/model"

# One pass per model index: embed every document, then write the result to
# "<vectors_path>/<model_num>.npy".
for model_num in tqdm(range(max_model_num)):
    documents = df.text.tolist()
    model_dir = make_filepath(f"{models_path}/{model_num}")
    vectors = get_sentenceBERT(documents, seed=model_num, path=model_dir)

    vector_file = make_filepath(f"{vectors_path}/{model_num}.npy")
    np.save(vector_file, np.stack(vectors))

  0%|          | 0/1 [00:00<?, ?it/s]

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Upload files

In [None]:
# Hand the local scratch folder for this dataset/model pair to the S3 helper,
# together with the matching key prefix (presumably uploads the folder's
# contents under that prefix — TODO confirm against S3Manager.upload).
local_dir = f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/"
remote_prefix = f"Vectorize/{data_type}/sentenceBERT/{transformer_model}/"
s3.upload(local_dir, remote_prefix)

In [None]:
# Clean up through the S3 helper — presumably removes the locally downloaded
# inputs and temporary outputs; TODO confirm against S3Manager.
s3.delete_local_all()