# Import

In [38]:
import csv
import os
import sys

import numpy as np
import pandas as pd
from bertopic import BERTopic
from tqdm import tqdm

## Add configuration file

In [39]:
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")

In [40]:
from ALL import config
import MultilayerDict as md
from util import *

## Set condition

In [41]:
tqdm.pandas()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

In [42]:
s3 = S3Manager()

In [43]:
sampling_nums = [1, 2, 4, 8, 16, 32, 64, 128]

In [44]:
# data_types = [f"20NewsSampled{sampling_num}" for sampling_num in sampling_nums]
data_types = ["TweetTopic", "TweetFinance"]
clustering_model = "BERTopic"

In [45]:
data_type = data_types[0]

In [46]:
model_nums = config["clustering"][clustering_model]["max_model_num"]
topic_nums = {
    data_type: config["data"][data_type_classifier(data_type)]["class_num"]
    for data_type in data_types
}

# Read data

In [47]:
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [48]:
df = pd.read_csv(df_path[0], index_col=0)

In [49]:
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [50]:
label = df["class"].to_numpy()

In [51]:
with open(labels_path[0], mode="r") as f:
    reader = csv.reader(f)
    class_labels = [label for label in reader]

In [52]:
base_pred_path = f"Clustering/{data_type}/BERTopic/pred"
base_model_path = f"Clustering/{data_type}BERTopic/model"
base_prob_path = f"Postprocessing/{data_type}/BERTopic/prob"

# Clustering

In [53]:
for model_num in tqdm(range(model_nums)):
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(df.text)
    topic_model.save(
        make_filepath(f"{root_path_temporary}{base_model_path}/{model_nums}.npy")
    )
    np.save(
        make_filepath(f"{root_path_temporary}{base_pred_path}/{model_num}.npy"), topics
    )
    np.save(
        make_filepath(f"{root_path_temporary}{base_prob_path}/{model_num}.npy"), probs
    )

100%|██████████| 1/1 [02:52<00:00, 172.55s/it]


## upload file

In [54]:
s3.upload(f"{root_path_temporary}{base_pred_path}")
s3.upload(f"{root_path_temporary}{base_prob_path}")
s3.upload(f"{root_path_temporary}{base_model_path}")

'/home/jovyan/temporary/Clustering/TweetTopicBERTopic/model'

In [55]:
s3.delete_local_all()

/home/jovyan/temporary/Clustering/TweetTopic/BERTopic/pred
/home/jovyan/temporary/Postprocessing/TweetTopic/BERTopic/prob
/home/jovyan/temporary/Clustering/TweetTopicBERTopic/model
/home/jovyan/temporary/Preprocessing/TweetTopic/master.csv
/home/jovyan/temporary/Preprocessing/TweetTopic/class.csv


In [56]:
send_line_notify(f"BERTopic.ipynb {data_type} ")