# Import

In [2]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from tqdm import tqdm

## Add configuration file

In [3]:
# Make the project's configuration directory importable. The membership
# check keeps sys.path free of duplicate entries when this cell is re-run.
config_dir = "/home/jovyan/core/config/"
if config_dir not in sys.path:
    sys.path.append(config_dir)

In [4]:
# NOTE(review): `ALL` is presumably a module inside the config directory that
# the preceding cell appends to sys.path, so this import must run after it.
from ALL import config 

## Set display and progress-bar options

In [5]:
# Cap how much of a DataFrame pandas renders in cell output.
display_limits = {
    "display.max_columns": 100,
    "display.max_rows": 50,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# Read data

In [6]:
# Load the master table; the first CSV column is used as the index.
master_csv_path = "../../../Preprocessing/data/20News/master.csv"
newsgroups_df = pd.read_csv(master_csv_path, index_col=0)

In [7]:
# Read the class-label file; each element of class_labels is the list of
# fields from one CSV row. newline="" lets the csv module handle line endings
# itself, as its documentation requires for file objects passed to a reader.
with open("../../../Preprocessing/data/20News/class.csv", mode="r", newline="") as f:
    class_labels = list(csv.reader(f))

In [8]:
# Pull the experiment grid out of the shared configuration.
doc2vec_config = config["vectorize"]["doc2vec"]
gmm_config = config["clustering"]["gmm"]

vector_dims = doc2vec_config["dims"]
vector_model_num = doc2vec_config["max_model_num"]
model_nums = gmm_config["max_model_num"]
covariance_types = gmm_config["covariance_types"]

In [9]:
# Number of mixture components used for every GaussianMixture fit below.
# NOTE(review): presumably chosen to match the 20 categories of the
# 20 Newsgroups dataset — confirm.
n_components = 20

In [10]:
def getGMM(vectors, n_components, covariance_type, seed, path):
    """Fit a Gaussian mixture model, pickle it to ``path``, and return labels.

    Parameters
    ----------
    vectors
        Samples to cluster; passed unchanged to ``GaussianMixture.fit`` and
        ``GaussianMixture.predict``.
    n_components
        Number of mixture components.
    covariance_type
        Covariance parameterisation forwarded to ``GaussianMixture``.
    seed
        Value used as the model's ``random_state``.
    path
        Destination file for the pickled fitted model. Missing parent
        directories are created; a bare filename is written to the current
        working directory.

    Returns
    -------
    The component label predicted for each row of ``vectors``.
    """
    gmm = GaussianMixture(
        n_components=n_components, covariance_type=covariance_type, random_state=seed
    )
    gmm.fit(vectors)

    # save model
    parent_dir = os.path.dirname(path)
    # os.makedirs("") raises, so only create a directory when path has one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # The context manager closes the handle even if pickling fails.
    with open(path, "wb") as model_file:
        pickle.dump(gmm, model_file)

    pred = gmm.predict(vectors)
    return pred

In [11]:
# Fit one GMM per (vector dimension, seed, covariance type) combination and
# save both the fitted model and its predicted labels.
vectors_path = "../../data/20News/doc2vec/vector/"
models_path = "../../data/20News/doc2vec/GMM/model/"
pred_path = "../../data/20News/doc2vec/GMM/pred/"
for vector_dim in tqdm(vector_dims):
    # The input file depends only on the dimension, so load it once per
    # dimension rather than once per (seed, covariance type) pair.
    # NOTE(review): the file stem is `vector_model_num`, i.e. the configured
    # doc2vec "max_model_num" value, not a loop index — confirm this is the
    # intended doc2vec model to cluster.
    vector_path = f"{vectors_path}{vector_dim}/normalized/{vector_model_num}.csv"
    vectors = np.loadtxt(vector_path, delimiter=",")

    for model_num in range(model_nums):
        for covariance_type in covariance_types:
            # model_num is also passed as the random seed of the fit.
            pred = getGMM(
                vectors,
                seed=model_num,
                n_components=n_components,
                covariance_type=covariance_type,
                path=f"{models_path}{vector_dim}/{covariance_type}/{model_num}.sav",
            )

            # save prediction as a single CSV row
            pred_dir = f"{pred_path}{vector_dim}/{covariance_type}"
            os.makedirs(pred_dir, exist_ok=True)
            # newline="" stops platform newline translation from corrupting
            # the line terminator that csv.writer emits.
            with open(f"{pred_dir}/{model_num}.csv", "w", newline="") as f:
                writer = csv.writer(f)
                writer.writerow(pred)

100%|██████████| 14/14 [20:41<00:00, 88.70s/it] 
