# Import

In [1]:
import csv
import os
import pickle
import sys

import numpy as np
import pandas as pd
from scipy import linalg, sparse
from tqdm import tqdm
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_mutual_info_score

## Add configuration file

In [2]:
# Make the project's config, util and local Function directories importable.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
        "../Function/",
    ]
)

In [3]:
from ALL import config 
from util import *
from extmath import row_norms

## Set conditions

In [4]:
# Enable tqdm's pandas integration, then widen the DataFrame display limits.
tqdm.pandas()
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

In [10]:
# Storage client used by the `s3.download(...)` calls in later cells.
# NOTE(review): S3Manager is not defined in this file; presumably it comes
# from the `util` star-import — confirm.
s3 = S3Manager()

In [11]:
# Experiment selectors: they index `config` and are interpolated into every
# storage path built in this notebook.
data_type="AgNews"
vectorize_type = "sentenceBERT"
# NOTE(review): the trailing `sys.argv[2]` suggests this value was once taken
# from the command line — confirm before turning the notebook into a script.
transformer_model = "sentence-transformers/all-MiniLM-L6-v2"#sys.argv[2]

In [12]:
# Pull the sweep parameters for the selected vectorizer / dataset from `config`.
vector_dims = config["vectorize"][vectorize_type]["dims"]
model_nums = config["clustering"]["gmm"]["max_model_num"]
vector_model_nums = config["vectorize"][vectorize_type]["max_model_num"]
covariance_types = config["clustering"]["gmm"]["covariance_types"]
# The configured covariance types are immediately overridden: only the
# "spherical" models are processed by this notebook run.
covariance_types = ["spherical"]
normalization = config["vectorize"][vectorize_type]["normalization"]
# `topic_nums` is iterated over in the loops below, so "class_num" is expected
# to hold an iterable of topic counts.
topic_nums = config["data"][data_type_classifier(data_type)]["class_num"]

# Read data

In [13]:
# Download the preprocessed master table; the result is indexed with [0]
# when it is read, so it is a sequence of local paths.
df_path = s3.download(f"Preprocessing/{data_type}/master.csv")

In [14]:
# Load the master table, using its first CSV column as the index.
df = pd.read_csv(df_path[0], index_col=0)

In [15]:
# Download the class-name CSV for this dataset (read with csv.reader below).
labels_path = s3.download(f"Preprocessing/{data_type}/class.csv")

In [16]:
# Values of the "class" column as a NumPy array; passed to gmm_value() as the
# reference labels when scoring the cluster assignments.
label = df["class"].to_numpy()

In [17]:
# Read every row of the class-name CSV into a list (one list of fields per row).
# newline="" is what the csv module requires of the file object, so that
# newlines embedded in quoted fields are parsed correctly.
with open(labels_path[0], mode="r", newline="") as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [None]:
# Locations of the document vectors and of the fitted GMM pickles.
# Only the two known vectorizers are supported; anything else is rejected first.
if vectorize_type not in ("doc2vec", "sentenceBERT"):
    raise NotImplementedError

if vectorize_type == "sentenceBERT":
    # sentenceBERT results are additionally keyed by the transformer model name.
    vectors_path = f"../../temporary/Clustering/{data_type}/{vectorize_type}/{transformer_model}/vector"
    models_path = f"../../temporary/Clustering/{data_type}/{vectorize_type}/{transformer_model}/GMM/model/"
else:
    vectors_path = f"../../temporary/Clustering/{data_type}/{vectorize_type}/vector"
    models_path = f"../../temporary/Clustering/{data_type}/{vectorize_type}/GMM/model/"

In [18]:
# Fetch the vector files; the returned list of local paths is only displayed
# as the cell output and is not stored.
s3.download(vectors_path)

['/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/1_Pooling/config.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/README.md',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/config.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/config_sentence_transformers.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/modules.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/pytorch_model.bin',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/sentence_bert_config.json',
 '/home/jovyan/temporary/Vectorize/AgNews/sentenceBERT/sentence-transformers/all-MiniLM-L6-v2/model/0/special_tokens_map.json',
 '/home/j

In [None]:
# Fetch the fitted GMM model files that the stats loop unpickles later.
s3.download(models_path)

# Functions

In [19]:
def _estimate_maharanobis_dist(X, means, precisions_chol, covariance_type):
    """Estimate the log Gaussian probability.
    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    means : array-like of shape (n_components, n_features)
    precisions_chol : array-like
        Cholesky decompositions of the precision matrices.
        'full' : shape of (n_components, n_features, n_features)
        'tied' : shape of (n_features, n_features)
        'diag' : shape of (n_components, n_features)
        'spherical' : shape of (n_components,)
    covariance_type : {'full', 'tied', 'diag', 'spherical'}
    Returns
    -------
    log_prob : array, shape (n_samples, n_components)
    """
    n_samples, n_features = X.shape
    n_components, _ = means.shape
    
    if covariance_type == "full":
        log_prob = np.empty((n_samples, n_components))
        for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
            y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)

    elif covariance_type == "tied":
        log_prob = np.empty((n_samples, n_components))
        for k, mu in enumerate(means):
            y = np.dot(X, precisions_chol) - np.dot(mu, precisions_chol)
            log_prob[:, k] = np.sum(np.square(y), axis=1)

    elif covariance_type == "diag":
        precisions = precisions_chol**2
        log_prob = (
            np.sum((means**2 * precisions), 1)
            - 2.0 * np.dot(X, (means * precisions).T)
            + np.dot(X**2, precisions.T)
        )

    elif covariance_type == "spherical":
        precisions = precisions_chol**2
        log_prob = (
            np.sum(means**2, 1) * precisions
            - 2 * np.dot(X, means.T * precisions)
            + np.outer(row_norms(X, squared=True), precisions)
        )
    # Since we are using the precision of the Cholesky decomposition,
    # `- 0.5 * log_det_precision` becomes `+ log_det_precision_chol`
    return log_prob

In [20]:
def gmm_value(gmm, vectors, label):
    """Evaluate a fitted Gaussian mixture on ``vectors``.

    Parameters
    ----------
    gmm : fitted mixture model
        Must provide ``predict``, ``_estimate_weighted_log_prob``, ``aic``,
        ``bic``, ``score``, ``means_``, ``precisions_cholesky_`` and
        ``covariance_type``.
    vectors : array of shape (n_samples, n_features)
    label : array of shape (n_samples,)
        Reference labels compared against the predicted components.

    Returns
    -------
    dict
        Keys, in this order: ``pred`` (predicted component per sample),
        ``prob`` (exponential of the weighted log-probabilities), ``dist``
        (output of ``_estimate_maharanobis_dist``), ``aic``, ``bic``, ``mi``
        (adjusted mutual information between ``pred`` and ``label``) and
        ``logl`` (value of ``gmm.score``).
    """
    assignments = gmm.predict(vectors)
    weighted_prob = np.exp(gmm._estimate_weighted_log_prob(vectors))
    sq_dist = _estimate_maharanobis_dist(
        vectors,
        gmm.means_,
        gmm.precisions_cholesky_,
        gmm.covariance_type,
    )

    # Insertion order is part of the contract: consumers may unpack
    # `.items()` positionally (arrays first, scalar statistics afterwards).
    result = {"pred": assignments, "prob": weighted_prob, "dist": sq_dist}
    result["aic"] = gmm.aic(vectors)
    result["bic"] = gmm.bic(vectors)
    result["mi"] = adjusted_mutual_info_score(assignments, label)
    result["logl"] = gmm.score(vectors, label)
    return result

In [21]:
def save_output(output, path):
    """Save ``output`` to ``path`` with ``np.save``, creating parent directories.

    Parameters
    ----------
    output : array-like
        Data handed to ``np.save``.
    path : str or path-like
        Destination file. Missing parent directories are created first.
    """
    parent = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    np.save(path, output)

# Calculate Stats

In [None]:
# Output directories for the per-model predictions, weighted probabilities and
# distances. The branches must form a single if/elif/else chain: with two
# separate `if` statements the "doc2vec" case fell through to the final `else`
# and always raised NotImplementedError.
if vectorize_type == "doc2vec":
    pred_path = (
        f"../../temporary/Postprocessing/data/{data_type}/{vectorize_type}/GMM/pred/"
    )
    prob_path = (
        f"../../temporary/Postprocessing/data/{data_type}/{vectorize_type}/GMM/prob/"
    )
    dist_path = (
        f"../../temporary/Postprocessing/data/{data_type}/{vectorize_type}/GMM/dist/"
    )
elif vectorize_type == "sentenceBERT":
    # sentenceBERT outputs are additionally keyed by the transformer model name.
    pred_path = f"../../temporary/Postprocessing/data/{data_type}/{vectorize_type}/{transformer_model}/GMM/pred/"
    prob_path = f"../../temporary/Postprocessing/data/{data_type}/{vectorize_type}/{transformer_model}/GMM/prob/"
    dist_path = f"../../temporary/Postprocessing/data/{data_type}/{vectorize_type}/{transformer_model}/GMM/dist/"
else:
    raise NotImplementedError

In [51]:
# Nested result table, addressed as
#   stats_lists[model_num][vector_dim][covariance_type][topic_num] -> dict of stats
# Every leaf starts out as an empty dict and is filled by the evaluation loop.
stats_lists = {
    m: {
        dim: {
            cov: {n_topics: {} for n_topics in topic_nums}
            for cov in covariance_types
        }
        for dim in vector_dims
    }
    for m in range(model_nums)
}

In [52]:
# Evaluate every stored GMM on its vectors: save the per-sample outputs
# (pred / prob / dist) as .npy files and keep the scalar statistics in
# `stats_lists`.
# NOTE(review): neither the output paths nor `stats_lists` are keyed by
# `vector_model_num`, so with vector_model_nums > 1 later iterations overwrite
# the results of earlier ones — confirm this is intended.
for vector_model_num in range(vector_model_nums):
    for model_num in tqdm(range(model_nums)):
        for vector_dim in vector_dims:
            # The vectors depend only on (vector_dim, vector_model_num), so
            # load them once here instead of once per covariance/topic pair.
            vectors = np.load(
                f"{vectors_path}/{vector_dim}/{normalization}/{vector_model_num}.npy"
            )
            for covariance_type in covariance_types:
                for topic_num in topic_nums:
                    # NOTE: pickle can execute arbitrary code on load; only
                    # open model files produced by this project.
                    with open(
                        f"{models_path}{vector_dim}/{normalization}/{covariance_type}/{topic_num}/{model_num}.sav",
                        "rb",
                    ) as model_file:
                        gmm = pickle.load(model_file)

                    value = gmm_value(gmm, vectors, label)

                    # Address the results by key rather than by position in
                    # the dict, so a reordering in gmm_value cannot silently
                    # swap the saved arrays.
                    save_output(
                        value["pred"],
                        f"{pred_path}{vector_dim}/{normalization}/{covariance_type}/{topic_num}/{model_num}.npy",
                    )
                    save_output(
                        value["prob"],
                        f"{prob_path}{vector_dim}/{normalization}/{covariance_type}/{topic_num}/{model_num}.npy",
                    )
                    save_output(
                        value["dist"],
                        f"{dist_path}{vector_dim}/{normalization}/{covariance_type}/{topic_num}/{model_num}.npy",
                    )
                    # Everything that is not a per-sample array is a statistic.
                    stats_lists[model_num][vector_dim][covariance_type][
                        topic_num
                    ] = {
                        key: stat
                        for key, stat in value.items()
                        if key not in ("pred", "prob", "dist")
                    }

100%|██████████| 10/10 [01:47<00:00, 10.75s/it]


In [74]:
# Write one CSV per (model_num, covariance_type, topic_num): rows are vector
# dimensions, columns are the statistics collected in `stats_lists`.
# Each row is looked up from its own vector_dim entry. Previously a single
# DataFrame built for the outer loop's vector_dim was reused for every row, so
# all rows of a file were identical. The outer loops over vector_model_num and
# vector_dim only rewrote the same file and are dropped.
for model_num in tqdm(range(model_nums)):
    for covariance_type in covariance_types:
        for topic_num in topic_nums:
            stats_df = pd.concat(
                {
                    vector_dim: pd.Series(
                        stats_lists[model_num][vector_dim][covariance_type][
                            topic_num
                        ]
                    )
                    for vector_dim in vector_dims
                },
                axis=1,
            ).T
            stats_path = f"../data/{data_type}/{vectorize_type}/GMM/stats/{normalization}/{covariance_type}/{topic_num}/{model_num}.csv"
            os.makedirs(os.path.dirname(stats_path), exist_ok=True)
            stats_df.to_csv(stats_path)

100%|██████████| 10/10 [00:00<00:00, 40.82it/s]


In [None]:
# Send a completion message naming the dataset and vectorizer.
# NOTE(review): send_line_notify is not defined in this file; presumably a
# project helper from the `util` star-import — confirm.
send_line_notify(f"calcStats {data_type} {vectorize_type}")

In [None]:
# Write one stats CSV per (model_num, covariance_type, topic_num), with one row
# per vector dimension.
# `stats_lists[model_num]` is a dict keyed by vector_dim, so iterating it
# directly yields the keys rather than the nested stats; each row is therefore
# looked up explicitly through vector_dim, covariance_type and topic_num.
for model_num in range(model_nums):
    for covariance_type in covariance_types:
        for topic_num in topic_nums:
            per_dim_stats = [
                pd.Series(
                    stats_lists[model_num][vector_dim][covariance_type][topic_num]
                )
                for vector_dim in vector_dims
            ]
            stats_df = pd.concat(per_dim_stats, axis=1).T
            stats_df.index = vector_dims

            stats_path = f"../data/{data_type}/{vectorize_type}/GMM/stats/{normalization}/{covariance_type}/{topic_num}/{model_num}.csv"
            os.makedirs(os.path.dirname(stats_path), exist_ok=True)
            stats_df.to_csv(stats_path)