# Import

In [58]:
import csv
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

## Add config and util module paths

In [59]:
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")

In [60]:
from ALL import config 
from util import *

## Set condition

In [61]:
# Register tqdm's pandas integration so pandas objects gain the
# progress-bar variants of apply (e.g. `progress_apply`).
tqdm.pandas()
# Raise the notebook display limits: up to 100 columns and 50 rows.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

In [62]:
s3 = S3Manager()

In [63]:
data_type = "AgNewsTitle"

In [64]:
transformer_model = "sentence-transformers/all-mpnet-base-v2"

# Read data

In [65]:
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [66]:
df = pd.read_csv(df_path, index_col=0)

In [67]:
# Sampling for a quick test run (the result must be assigned, otherwise `df` is unchanged)
# df = df.sample(n=1000, random_state=0)

In [68]:
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [69]:
# Read the class-label CSV into `class_labels`: one list of strings per row.
# newline="" hands newline handling to the csv module, as its documentation
# requires for file objects passed to csv.reader (otherwise newlines embedded
# in quoted fields can be misread).
with open(labels_path, mode="r", newline="") as f:
    class_labels = list(csv.reader(f))

In [70]:
max_model_num = config["vectorize"]["sentenceBERT"]["max_model_num"]

# Embedding

In [71]:
def get_sentenceBERT(texts, seed, path):
    """Encode `texts` with the sentence-transformer named by `transformer_model`.

    A fresh `SentenceTransformer` is built from the module-level
    `transformer_model` identifier, a copy of it is written to `path`, and
    the texts are then encoded.

    Args:
        texts: The sentences to encode (the caller passes a list of strings).
        seed: Accepted for interface compatibility.
            NOTE(review): this value is never read in the body, so every
            call produces its vectors independently of `seed` -- confirm
            whether seeding was intended.
        path: Destination handed to `model.save`.

    Returns:
        Whatever `SentenceTransformer.encode` returns for `texts`.
    """
    encoder = SentenceTransformer(transformer_model)
    # Persist the loaded model before encoding, matching the original order.
    encoder.save(path)
    return encoder.encode(texts)

In [None]:
# Local staging locations for the encoded vectors and the saved model copies.
vectors_path = f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/vector"
models_path = f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/model"

# One pass per model index: encode every text in `df.text`, then store the
# resulting vectors as raw/<index>.npy under `vectors_path`.
for model_num in tqdm(range(max_model_num)):
    texts = df.text.tolist()
    model_dir = make_filepath(f"{models_path}/{model_num}")
    vectors = get_sentenceBERT(texts, seed=model_num, path=model_dir)

    output_file = make_filepath(f"{vectors_path}/raw/{model_num}.npy")
    np.save(output_file, np.stack(vectors))

  0%|          | 0/1 [00:00<?, ?it/s]

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Upload files

In [None]:
# Push the locally staged output directory to the matching S3 prefix.
local_dir = f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/"
remote_dir = f"Vectorize/{data_type}/sentenceBERT/{transformer_model}/"
s3.upload(local_dir, remote_dir)

In [None]:
s3.delete_local_all()

In [None]:
send_line_notify(f"end {data_type} sentenceBERT {transformer_model}")