# Import

In [45]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.mixture import GaussianMixture
from tqdm import tqdm

## Add configuration and utility module paths

In [34]:
# Append the project's config and util directories to the module search path
# so that later imports can resolve modules located there.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [35]:
from ALL import config 
from util import *

## Set condition

In [36]:
# Register tqdm with pandas (enables `progress_apply` and friends).
tqdm.pandas()

# Raise the display limits used when DataFrames are rendered.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

In [37]:
# Dataset and vectorizer selected for this run; both values are interpolated
# into the data paths and used as config keys.
data_type = "AgNewsTitle"
vectorize_type = "sentenceBERT"

# Read data

In [38]:
# Load the preprocessed master table; the first CSV column becomes the index.
df = pd.read_csv(f"../../Preprocessing/data/{data_type}/master.csv", index_col=0)

In [39]:
# Read the class-label file into a list with one list of string fields per
# CSV row. newline="" leaves newline handling to the csv module, as its
# documentation requires for file objects passed to csv.reader.
with open(
    f"../../Preprocessing/data/{data_type}/class.csv", mode="r", newline=""
) as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [40]:
# Settings of the selected vectorizer, looked up once.
_vectorize_config = config["vectorize"][vectorize_type]
max_vector_model_num = _vectorize_config["max_model_num"]
normalization = _vectorize_config["normalization"]

# The config-driven values below are kept for reference; this run uses the
# hard-coded overrides instead.
# vector_dims = config["vectorize"][vectorize_type]["dims"]
vector_dims = [20]
# model_nums = config["clustering"]["gmm"]["max_model_num"]
model_nums = 10
# covariance_types = config["clustering"]["gmm"]["covariance_types"]
covariance_types = ["spherical"]

# Numbers of mixture components to fit.
topic_nums = [2, 4, 8, 16, 32, 64, 128]

In [41]:
# n_components = config["data"][data_type]["class_num"]

# Clustering

In [42]:
def getGMM(vectors, n_components, covariance_type, seed, path):
    """Fit a Gaussian mixture model, pickle it to ``path``, and return labels.

    Args:
        vectors: Data the mixture is fitted on and predicted for; passed
            unchanged to ``GaussianMixture.fit`` and ``.predict``.
        n_components: Number of mixture components.
        covariance_type: Covariance parameterisation forwarded to
            ``GaussianMixture`` (e.g. ``"spherical"``).
        seed: Value used as ``random_state`` of the model.
        path: File the fitted model is pickled to. Missing parent
            directories are created.

    Returns:
        The result of ``gmm.predict(vectors)`` for the fitted model.
    """
    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=seed,
        max_iter=400,
        init_params="k-means++",
        n_init=3,
    )
    gmm.fit(vectors)

    # save model
    # A bare file name has an empty dirname, which os.makedirs rejects.
    model_dir = os.path.dirname(path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    # Use a context manager so the handle is flushed and closed right away.
    with open(path, "wb") as model_file:
        pickle.dump(gmm, model_file)

    return gmm.predict(vectors)

In [43]:
# Root directories for the input vectors, the pickled models and the
# predicted labels of the selected dataset / vectorizer.
vectors_path = f"../data/{data_type}/{vectorize_type}/vector"
models_path = f"../data/{data_type}/{vectorize_type}/GMM/model"
pred_path = f"../data/{data_type}/{vectorize_type}/GMM/pred"
for vector_model_num in range(max_vector_model_num):
    for vector_dim in tqdm(vector_dims):
        # The vector file depends only on vector_dim, normalization and
        # vector_model_num, so load it once here rather than once per fit.
        vectors = np.load(
            f"{vectors_path}/{vector_dim}/{normalization}/{vector_model_num}.npy"
        )

        for model_num in range(model_nums):
            for covariance_type in covariance_types:
                for topic_num in topic_nums:
                    # model_num doubles as the random seed of the fit.
                    pred = getGMM(
                        vectors,
                        seed=model_num,
                        n_components=topic_num,
                        covariance_type=covariance_type,
                        path=f"{models_path}/{vector_dim}/{normalization}/{covariance_type}/{topic_num}/{model_num}.sav",
                    )

                    # save prediction
                    # NOTE(review): make_filepath is not defined in this
                    # file; presumably it creates the parent directory and
                    # returns the path unchanged - confirm.
                    np.save(
                        make_filepath(
                            f"{pred_path}/{vector_dim}/{normalization}/{covariance_type}/{topic_num}/{model_num}.npy"
                        ),
                        pred,
                    )

100%|██████████| 1/1 [54:21<00:00, 3261.27s/it]


In [44]:
# Report that the run for this dataset / vectorizer combination has finished.
# NOTE(review): send_line_notify is not defined in this file; presumably it
# comes from the `util` star import and posts a notification - confirm.
send_line_notify(f"end {data_type} {vectorize_type}")