# Import

In [1]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score
from tqdm import tqdm

## Add config and util directories to the import path

In [2]:
# Put the project's config and util directories on the import path; these are
# presumably where the `ALL` and `util` modules imported in the next cell live.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [3]:
from ALL import config 
from util import *

## Set condition

In [7]:
# Register tqdm's pandas integration.
tqdm.pandas()

# Cap how many columns / rows of a DataFrame are rendered in cell output.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

In [8]:
# Project helper used below for download / upload / local cleanup.
# `S3Manager` is not imported explicitly; presumably it comes from
# `from util import *` — confirm.
s3 = S3Manager()

In [9]:
# Powers of two from 1 to 128. Within this notebook only the commented-out
# 20NewsSampled `data_types` line refers to this list.
sampling_nums = [2**exponent for exponent in range(8)]

In [12]:
# Alternative dataset selection (disabled): one 20News variant per sampling size.
# data_types = [f"20NewsSampled{sampling_num}" for sampling_num in sampling_nums]
data_types = ["TweetTopic", "TweetFinance"]
# Selects the config section read for `model_nums` and the path segment used
# for the prediction / probability / stats locations.
clustering_model = "LDA"

In [13]:
# Only the first entry of `data_types` is processed by the cells below;
# change the index to run the notebook for another dataset.
data_type = data_types[0]

In [14]:
# Upper bound on the model index iterated over when computing stats.
model_nums = config["clustering"][clustering_model]["max_model_num"]

# Configured class count for every candidate dataset, keyed by dataset name.
# The comprehension variable is deliberately not called `data_type`, so it
# cannot be confused with the notebook-level `data_type` selected above.
topic_nums = {
    name: config["data"][data_type_classifier(name)]["class_num"]
    for name in data_types
}

# Read data

In [15]:
# Fetch the dataset's master table. The result is indexed with [0] when the
# CSV is read, so it is presumably a list of local paths — confirm.
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [16]:
# Load the master table, using its first column as the index.
df = pd.read_csv(df_path[0], index_col=0)

In [17]:
# Fetch the class file stored under the same Preprocessing prefix as the
# master table.
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [18]:
# Per-row class column as a NumPy array; compared against each model's
# predictions in the stats loop.
label = df["class"].to_numpy()

In [19]:
# Read the class file into a list of rows (each row is a list of strings).
# The csv module requires files to be opened with newline="" so that quoted
# fields containing line breaks are parsed correctly on every platform.
with open(labels_path[0], mode="r", newline="") as f:
    reader = csv.reader(f)
    # list() materialises all rows while the file is still open and avoids a
    # comprehension variable named `label`, which reads like the global array.
    class_labels = list(reader)

In [28]:
# Path fragments relative to both the S3 layout and the local temporary root
# (they are appended to `root_path_temporary` further down).
# Per-model prediction arrays (`<model_num>.npy`).
base_pred_path = f"Clustering/{data_type}/{clustering_model}/pred"
# Probability outputs; downloaded below but not read by the stats loop.
base_prob_path = f"Clustering/{data_type}/{clustering_model}/prob"
# Destination for the per-model stats CSVs written by the stats loop.
base_stats_path = f"Postprocessing/{data_type}/{clustering_model}/stats"

In [29]:
# Fetch the prediction arrays that the stats loop loads with np.load.
# NOTE(review): the recorded output is an empty list — confirm the files are
# actually present before relying on the loop below.
s3.download(base_pred_path)

[]

In [30]:
# Fetch the probability outputs; nothing later in this notebook reads them.
# NOTE(review): the recorded output is an empty list — confirm this download
# is still needed.
s3.download(base_prob_path)

[]

# Calculate Stats

In [18]:
# For every model index, load its predictions, score them against `label`
# with adjusted mutual information, and write a one-row CSV (header `mi`).
for model_num in range(model_nums):
    pred = np.load(f"{root_path_temporary}{base_pred_path}/{model_num}.npy")
    mi = adjusted_mutual_info_score(pred, label)
    stats = {"mi": mi}
    # newline="" lets the csv writer control line endings itself; without it
    # every row is followed by a blank line on platforms that use CRLF.
    # `make_filepath` presumably creates missing parent directories and
    # returns the path unchanged — confirm against util.
    with open(
        make_filepath(f"{root_path_temporary}{base_stats_path}/{model_num}.csv"),
        "w",
        newline="",
    ) as f:
        writer = csv.DictWriter(f, stats.keys())
        writer.writeheader()
        writer.writerow(stats)

## Upload file

In [19]:
# Upload the local stats directory written by the stats loop.
# NOTE(review): the recorded output shows a BERTopic path although
# `clustering_model` is "LDA" above, so it appears to come from an earlier
# run — re-run to refresh.
s3.upload(f"{root_path_temporary}{base_stats_path}")

'/home/jovyan/temporary/Postprocessing/TweetTopic/BERTopic/stats'

In [20]:
# Clean up local copies. The recorded output lists paths under
# /home/jovyan/temporary, presumably the files that were removed — confirm.
# Cells that read from `root_path_temporary` after this point may no longer
# find their inputs.
s3.delete_local_all()

/home/jovyan/temporary/Postprocessing/TweetTopic/BERTopic/stats
/home/jovyan/temporary/Preprocessing/TweetTopic/master.csv
/home/jovyan/temporary/Preprocessing/TweetTopic/class.csv
/home/jovyan/temporary/Clustering/TweetTopic/BERTopic/pred/0.npy
/home/jovyan/temporary/Postprocessing/TweetTopic/BERTopic/prob/0.npy


In [21]:
# Shows the stats dict left over from the final loop iteration only
# (model index `model_nums - 1`), not a summary over all models.
stats

{'mi': 0.2309421558442277}

In [7]:
# Read the class file into a list of rows (each row is a list of strings).
# NOTE(review): this path contains an extra `data/` segment compared with the
# `Preprocessing/{data_type}/class.csv` location downloaded earlier in the
# notebook — confirm which layout is current.
# The csv module requires files to be opened with newline="" so that quoted
# fields containing line breaks are parsed correctly on every platform.
with open(
    f"{root_path_temporary}Preprocessing/data/{data_type}/class.csv",
    mode="r",
    newline="",
) as f:
    reader = csv.reader(f)
    # list() materialises all rows while the file is still open and avoids a
    # comprehension variable named `label`, which reads like the global array.
    class_labels = list(reader)

In [8]:
# Re-assigns `model_nums` with the model name hard-coded as "LDA"; this gives
# the same value as the earlier lookup only while `clustering_model` is "LDA".
model_nums = config["clustering"]["LDA"]["max_model_num"]

In [10]:
# Same assignment as the earlier `label` cell; re-derives the class array
# from `df`.
label = df["class"].to_numpy()

In [13]:
# Local directory holding one prediction array per model index.
pred_path = f"{root_path_temporary}Clustering/data/{data_type}/LDA/pred/"

# Adjusted mutual information between `label` and each model's predictions,
# keyed by model index.
mi = {
    model_idx: adjusted_mutual_info_score(
        label, np.load(f"{pred_path}{model_idx}.npy")
    )
    for model_idx in range(model_nums)
}

In [18]:
# Destination of the combined score table.
mi_path = f"{root_path_temporary}Postprocessing/{data_type}/LDA/mi.csv"

# One row per model index, with the score in a single unnamed column.
pd.DataFrame(
    data=list(mi.values()),
    index=list(mi.keys()),
).to_csv(make_filepath(mi_path))