# Import

In [1]:
import csv
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

## Add configuration and utility paths

In [2]:
# Append the project's config and util directories to the import search path.
# The `ALL` and `util` modules imported in the next cell are presumably
# resolved from these two locations.
sys.path.extend(
    (
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    )
)

In [3]:
from ALL import config 
from util import *

## Set condition

In [4]:
# Enable tqdm's pandas integration (progress-bar variants of apply).
tqdm.pandas()
# Widen the notebook display limits: up to 100 columns and 50 rows.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

In [5]:
# S3 helper used below for download, upload and local cleanup.
# NOTE(review): S3Manager is not imported explicitly; presumably it is
# provided by `from util import *` — confirm.
s3 = S3Manager()

In [6]:
# Dataset identifier; interpolated into the S3 keys and local paths below.
data_type = "20News"

In [7]:
# Model identifier handed to SentenceTransformer. It is also interpolated
# into the output paths, where its "/" adds an extra path segment.
transformer_model = "sentence-transformers/gtr-t5-large"

# Read data

In [8]:
# Download the preprocessed master table for this dataset.
# NOTE(review): the result is indexed with [0] when read, so it is presumably
# a sequence of local file paths — confirm against S3Manager.download.
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [9]:
# Load the master table; the first CSV column is used as the index.
df = pd.read_csv(df_path[0], index_col=0)

In [10]:
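# Optional: subsample the data for a quick test run.
# NOTE: the result must be assigned (df = df.sample(...)) to take effect.
# df.sample(n=1000, random_state=0)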

In [11]:
# Download the class-label file for this dataset.
# NOTE(review): indexed with [0] when opened, so presumably a sequence of
# local file paths — confirm against S3Manager.download.
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [12]:
# Read the class-label CSV. Every parsed row is a list of strings, so
# class_labels is a list of rows rather than a flat list of names.
# newline="" is what the csv module requires of the file object so that
# quoted fields containing line breaks are parsed correctly.
with open(labels_path[0], mode="r", newline="") as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [13]:
# Number of embedding runs, read from the project configuration; used as the
# loop bound in the Embedding section.
max_model_num = config["vectorize"]["sentenceBERT"]["max_model_num"]

# Embedding

In [14]:
def get_sentenceBERT(texts, seed, path):
    """Encode ``texts`` with the configured SentenceTransformer model.

    A new ``SentenceTransformer`` is built from the module-level
    ``transformer_model`` on every call, written to ``path`` with
    ``save``, and then used to encode the input.

    Args:
        texts: Input passed unchanged to ``SentenceTransformer.encode``.
        seed: Accepted for interface compatibility.
            NOTE(review): never read in the body, so it does not influence
            the result here — confirm whether seeding was intended.
        path: Destination handed to ``SentenceTransformer.save``.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``.
    """
    encoder = SentenceTransformer(transformer_model)
    encoder.save(path)
    return encoder.encode(texts)

In [None]:
# Local staging directories for this dataset/model combination. Both live
# under one shared root, so the root is built once and reused.
output_root = f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}"
vectors_path = f"{output_root}/vector"
models_path = f"{output_root}/model"

# The input texts do not change between runs, so build the list once instead
# of converting the column again on every iteration.
texts = df.text.tolist()

# Produce one saved model and one vector file per run index.
# NOTE(review): make_filepath comes from the util star import; presumably it
# prepares the parent directory and returns the path — confirm.
for model_num in tqdm(range(max_model_num)):
    vectors = get_sentenceBERT(
        texts,
        seed=model_num,
        path=make_filepath(f"{models_path}/{model_num}"),
    )

    # np.stack turns the per-text vectors into a single array before saving.
    np.save(
        make_filepath(f"{vectors_path}/raw/{model_num}.npy"),
        np.stack(vectors),
    )


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading (…)071a2/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)c5306071a2/README.md:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading (…)306071a2/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/670M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)"spiece.model";:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)071a2/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading (…)06071a2/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

## Upload files

In [None]:
# Upload the local staging directory (the models and vectors written by the
# Embedding section) to the matching key prefix on S3.
s3.upload(
    f"../../temporary/Vectorize/{data_type}/sentenceBERT/{transformer_model}/", 
    f"Vectorize/{data_type}/sentenceBERT/{transformer_model}/"
)

In [None]:
# Clean up local files after the upload.
# NOTE(review): the exact scope of what gets deleted is defined in
# S3Manager.delete_local_all — confirm before running on shared storage.
s3.delete_local_all()

In [None]:
# Send a completion message naming the dataset and model that just finished.
# NOTE(review): send_line_notify presumably comes from `from util import *`.
send_line_notify(f"end {data_type} sentenceBERT {transformer_model}")

In [None]:
# IPython shell escape: print the machine's memory usage after the run.
!free

In [None]:
# IPython shell escape: runs the `df` disk-usage command (this is not the
# pandas DataFrame named `df`).
!df