# Import

In [15]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from scipy import linalg, sparse
from tqdm import tqdm
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_mutual_info_score

## Add configuration file

In [2]:
# Append the three project directories to the module search path, in this
# order, so that the project-local modules imported next can be found.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
        "../Function/",
    ]
)

In [3]:
from ALL import config 
from util import *
from extmath import row_norms

## Set condition

In [4]:
# Register tqdm's pandas integration.
tqdm.pandas()

# Notebook display limits for DataFrames: up to 100 columns and 50 rows.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

In [5]:
# Experiment selectors. Both values are interpolated into the data paths
# below; `vectorize_type` is also used as a key into `config`.
data_type, vectorize_type = "20News", "sentenceBERT"

In [6]:
# Sub-sections of the shared config that describe this experiment.
_vectorize_cfg = config["vectorize"][vectorize_type]
_gmm_cfg = config["clustering"]["gmm"]

# Values iterated over by the evaluation loops further down.
vector_dims = _vectorize_cfg["dims"]
model_nums = _gmm_cfg["max_model_num"]
vector_model_nums = _vectorize_cfg["max_model_num"]
covariance_types = _gmm_cfg["covariance_types"]

# Read data

In [7]:
# Load the master table for the selected dataset; the first CSV column is
# used as the DataFrame index.
master_csv_path = f"../../Preprocessing/data/{data_type}/master.csv"
df = pd.read_csv(master_csv_path, index_col=0)

In [8]:
# Read the class-label CSV. Every CSV row is kept as a list of strings, so
# `class_labels` is a list of rows rather than a flat list of names.
# newline="" is what the csv module asks for when opening files, so that
# quoted fields containing line breaks are parsed correctly.
with open(
    f"../../Preprocessing/data/{data_type}/class.csv", mode="r", newline=""
) as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [9]:
# The `class` column of the master table as a NumPy array; it is passed to
# gmm_value() where the cluster predictions are scored against it.
label = df["class"].to_numpy()

In [10]:
def _estimate_maharanobis_dist(X, means, precisions_chol, covariance_type):
    """Estimate the log Gaussian probability.
    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    means : array-like of shape (n_components, n_features)
    precisions_chol : array-like
        Cholesky decompositions of the precision matrices.
        'full' : shape of (n_components, n_features, n_features)
        'tied' : shape of (n_features, n_features)
        'diag' : shape of (n_components, n_features)
        'spherical' : shape of (n_components,)
    covariance_type : {'full', 'tied', 'diag', 'spherical'}
    Returns
    -------
    log_prob : array, shape (n_samples, n_components)
    """
    n_samples, n_features = X.shape
    n_components, _ = means.shape
    
    if covariance_type == "full":
        log_prob = np.empty((n_samples, n_components))
        for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
            y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)

    elif covariance_type == "tied":
        log_prob = np.empty((n_samples, n_components))
        for k, mu in enumerate(means):
            y = np.dot(X, precisions_chol) - np.dot(mu, precisions_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)

    elif covariance_type == "diag":
        precisions = precisions_chol**2
        log_prob = (
            np.sum((means**2 * precisions), 1)
            - 2.0 * np.dot(X, (means * precisions).T)
            + np.dot(X**2, precisions.T)
        )

    elif covariance_type == "spherical":
        precisions = precisions_chol**2
        log_prob = (
            np.sum(means**2, 1) * precisions
            - 2 * np.dot(X, means.T * precisions)
            + np.outer(row_norms(X, squared=True), precisions)
        )
    # Since we are using the precision of the Cholesky decomposition,
    # `- 0.5 * log_det_precision` becomes `+ log_det_precision_chol`
    return log_prob

In [11]:
def gmm_value(gmm, vectors, label):
    """Evaluate a fitted mixture model on ``vectors`` and gather the results.

    Parameters
    ----------
    gmm : fitted mixture model
        Must provide ``predict``, ``_estimate_weighted_log_prob``, ``aic``,
        ``bic``, ``score`` and the attributes ``means_``,
        ``precisions_cholesky_`` and ``covariance_type``.
    vectors : array-like of shape (n_samples, n_features)
    label : array-like of shape (n_samples,)
        Reference labels the predictions are compared with.

    Returns
    -------
    dict
        Keys, in insertion order: ``pred``, ``prob``, ``dist``, ``aic``,
        ``bic``, ``mi``, ``logl``. Keep this order: code consuming the dict
        may unpack it positionally.
    """
    result = {}

    # Per-sample outputs.
    result["pred"] = gmm.predict(vectors)
    # exp() of the model's weighted log-probabilities; np.exp can overflow to
    # inf when a log value is very large.
    result["prob"] = np.exp(gmm._estimate_weighted_log_prob(vectors))
    result["dist"] = _estimate_maharanobis_dist(
        vectors, gmm.means_, gmm.precisions_cholesky_, gmm.covariance_type
    )

    # Scalar model-quality statistics.
    result["aic"] = gmm.aic(vectors)
    result["bic"] = gmm.bic(vectors)
    result["mi"] = adjusted_mutual_info_score(result["pred"], label)
    result["logl"] = gmm.score(vectors, label)

    return result

In [12]:
def save_output(output, path):
    """Save ``output`` to ``path`` with ``np.save``, creating parent folders.

    Parameters
    ----------
    output : array-like
        Data handed to ``np.save``.
    path : str
        Destination file. ``np.save`` appends ``.npy`` when the name does not
        already end with it.
    """
    parent_dir = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    np.save(path, output)

In [None]:
# Input locations (embeddings, fitted models) and output locations
# (predictions, probabilities, distances) for the selected experiment.
vectors_path = f"../../Clustering/data/{data_type}/{vectorize_type}/vector"
models_path = f"../../Clustering/data/{data_type}/{vectorize_type}/GMM/model/"
pred_path = f"../../Clustering/data/{data_type}/{vectorize_type}/GMM/pred/"
prob_path = f"../../Clustering/data/{data_type}/{vectorize_type}/GMM/prob/"
dist_path = f"../../Clustering/data/{data_type}/{vectorize_type}/GMM/dist/"

# Keys of gmm_value()'s result that hold per-sample arrays; everything else
# is a scalar statistic collected into `stats_lists`.
array_keys = ("pred", "prob", "dist")

# stats_lists[i][j][k] is the statistics dict for the i-th pass of the two
# outer loops, the j-th vector dimension and the k-th covariance type.
stats_lists = []

# NOTE(review): neither the model path nor the output paths contain
# `vector_model_num`. If `vector_model_nums` is greater than 1, every pass
# evaluates the same model files and overwrites the previous pass's
# pred/prob/dist files, while `stats_lists` still grows by one entry per
# (vector_model_num, model_num) pair. Confirm that this is intended.
for vector_model_num in range(vector_model_nums):
    for model_num in tqdm(range(model_nums)):
        stats_list = []
        for vector_dim in vector_dims:
            # The embedding depends only on the dimension and the vector
            # model, so load it once instead of once per covariance type.
            vectors = np.load(
                f"{vectors_path}/{vector_dim}/normalized/{vector_model_num}.npy"
            )

            stats = []
            for covariance_type in covariance_types:
                model_file = (
                    f"{models_path}{vector_dim}/{covariance_type}/{model_num}.sav"
                )
                # NOTE: unpickling runs arbitrary code; only load model files
                # that were produced by this project.
                with open(model_file, "rb") as model_fh:
                    gmm = pickle.load(model_fh)

                value = gmm_value(gmm, vectors, label)

                # Persist the per-sample arrays, one file per model.
                output_name = f"{vector_dim}/{covariance_type}/{model_num}.npy"
                save_output(value["pred"], f"{pred_path}{output_name}")
                save_output(value["prob"], f"{prob_path}{output_name}")
                save_output(value["dist"], f"{dist_path}{output_name}")

                # Keep only the scalar statistics, in their original order.
                stats.append(
                    {
                        key: stat_value
                        for key, stat_value in value.items()
                        if key not in array_keys
                    }
                )
            stats_list.append(stats)
        stats_lists.append(stats_list)

  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.ex

  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
  prob = np.ex

In [None]:
# Write one CSV of statistics per (covariance type, model number), with one
# row per vector dimension.
# NOTE(review): `stats_lists` is indexed here by `model_num` alone; that
# presumably relies on the list holding one entry per model number. Verify
# against the loop that fills it.
for model_num in range(model_nums):
    for covariance_type in covariance_types:
        # For every vector dimension, pick the statistics row that belongs to
        # the current covariance type.
        rows_for_type = []
        for dim_stats in stats_lists[model_num]:
            frame_by_covariance = pd.DataFrame(dim_stats, index=covariance_types)
            rows_for_type.append(frame_by_covariance.loc[covariance_type, :])

        # Selected rows become columns on concat; transpose to get one row per
        # vector dimension, then label the rows with the dimensions.
        stats_df = pd.concat(rows_for_type, axis=1).T
        stats_df.index = vector_dims

        stats_path = f"../data/{data_type}/{vectorize_type}/GMM/stats/{covariance_type}/{model_num}.csv"
        os.makedirs(os.path.dirname(stats_path), exist_ok=True)
        stats_df.to_csv(stats_path)