# Imports

In [1]:
import csv
import os
import pickle
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm

## Add import paths and load configuration

In [2]:
# Extend the module search path (in this order) so that the project modules
# imported in the following cell can be found.
# NOTE(review): the first two entries are absolute, machine-specific paths.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
        "../PlotFunction/lineplot/",
        "../PlotFunction/config/",
    ]
)

In [3]:
from ALL import config
from line_plot_1 import line_plot_1
from line_plot_error_1 import line_plot_error_1
from line_plot_1_layout import layout
from util import *

## Set conditions

In [4]:
# Notebook display limits for pandas frames.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 100)

# Enable tqdm's pandas integration.
tqdm.pandas()

In [5]:
# Embedding methods analysed below; each name is used as a directory component
# when building the input paths.
vectorize_types = [
    "doc2vec",
    "sentenceBERT",
]
# Dataset name; also a directory component of every input path.
data_type = "AgNews"

In [6]:
# Number of GMM runs per setting; the loaders iterate over range(model_nums).
model_nums = config["clustering"]["gmm"]["max_model_num"]

# Normalization variant; used as a directory component of the input paths.
normalization = "centralized"

# Covariance types analysed in this notebook. The list is fixed here; the
# entry config["clustering"]["gmm"]["covariance_types"] is not used.
covariance_types = ["spherical", "diag", "full"]

# Embedding dimensions per embedding method. For sentenceBERT, 384 is added
# on top of the configured dimensions (`+` builds a new list, so the config
# object itself is not modified).
# NOTE(review): 384 is presumably the model's native output size - confirm.
vector_dims = {
    "doc2vec": config["vectorize"]["doc2vec"]["dims"],
    "sentenceBERT": config["vectorize"]["sentenceBERT"]["dims"] + [384],
}

In [7]:
# Column names that are extracted from every GMM stats CSV.
stats_vals = [
    "aic",
    "bic",
    "mi",
    "logl",
]

# Stats

## Read data

In [8]:
def load_stats_data(vectorize_type, stats_vals, covariance_types, model_nums):
    """Load the GMM statistics CSVs for one embedding method.

    One file is read per (covariance_type, model_num) pair from
    ``../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/stats/
    {normalization}/{covariance_type}/{model_num}.csv``. ``data_type`` and
    ``normalization`` are taken from the notebook's global namespace.

    Each file is read exactly once and all requested columns are taken from
    that single frame (instead of re-reading the file for every statistic).

    Args:
        vectorize_type: Embedding name used as a path component.
        stats_vals: Column names to extract from every CSV.
        covariance_types: Covariance-type names used as path components.
        model_nums: Number of runs; files ``0.csv`` .. ``{model_nums-1}.csv``
            are read.

    Returns:
        Nested dict ``stats[stats_val][covariance_type][model_num]`` whose
        leaves are the ``stats_val`` column of the corresponding CSV.

    Raises:
        FileNotFoundError: If an expected CSV is missing (from ``pd.read_csv``).
        KeyError: If a CSV lacks one of the requested columns.
    """
    stats = {
        stats_val: {covariance_type: {} for covariance_type in covariance_types}
        for stats_val in stats_vals
    }
    if not stats:
        # Nothing requested: do not touch the file system at all.
        return stats

    for covariance_type in covariance_types:
        for model_num in range(model_nums):
            stats_path = f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/stats/{normalization}/{covariance_type}/{model_num}.csv"
            df = pd.read_csv(stats_path, index_col=0)
            for stats_val in stats_vals:
                stats[stats_val][covariance_type][model_num] = df.loc[:, stats_val]
    return stats

In [9]:
def load_lda_mi(data_type):
    """Read the LDA ``mi.csv`` for ``data_type`` and summarise it.

    Args:
        data_type: Dataset name used as a path component.

    Returns:
        The first element of ``get_describe(<mi table>, axis=0)``; the second
        element of that result is discarded.
    """
    mi_path = f"../../Postprocessing/data/{data_type}/LDA/mi.csv"
    mi_table = pd.read_csv(mi_path, index_col=0)
    summary, _unused_keys = get_describe(mi_table, axis=0)
    return summary

In [10]:
# Raw GMM statistics for every embedding method, keyed by embedding name.
stats_dict = {
    embedding: load_stats_data(embedding, stats_vals, covariance_types, model_nums)
    for embedding in vectorize_types
}

In [11]:
# Summary of the LDA mi.csv, used further down as the LDA baseline.
describe_lda_mi = load_lda_mi(data_type=data_type)

## Data shaping

In [12]:
def shape_stats_df(stats):
    """Join the per-run columns of every (statistic, covariance type) pair.

    Iterates over the notebook-level ``stats_vals`` and ``covariance_types``.

    Args:
        stats: Nested mapping ``stats[stats_val][covariance_type]`` ->
            ``{model_num: column}``, as built by ``load_stats_data``.

    Returns:
        ``{stats_val: {covariance_type: DataFrame}}`` where each frame is the
        column-wise concatenation over ``model_num`` (one column per run).
    """
    return {
        stat_name: {
            cov_name: pd.concat(stats[stat_name][cov_name], axis=1)
            for cov_name in covariance_types
        }
        for stat_name in stats_vals
    }

In [13]:
def shape_describe(stats_df):
    """Summarise every (statistic, covariance type) frame with ``get_describe``.

    Iterates over the notebook-level ``stats_vals`` and ``covariance_types``.

    Args:
        stats_df: ``{stats_val: {covariance_type: DataFrame}}`` as returned by
            ``shape_stats_df``.

    Returns:
        Tuple ``(describe, describe_keys)``. ``describe[stats_val]
        [covariance_type]`` is the first element returned by
        ``get_describe(frame, axis=1)``; ``describe_keys`` is the second
        element from the last call. When there is nothing to iterate over,
        ``describe_keys`` is an empty list instead of the function failing
        with ``UnboundLocalError``.
    """
    describe = {}
    # Fallback so the return statement is valid even if no call is made.
    describe_keys = []

    for stats_val in stats_vals:
        describe[stats_val] = {}
        for covariance_type in covariance_types:
            describe[stats_val][covariance_type], describe_keys = get_describe(
                stats_df[stats_val][covariance_type], axis=1
            )
    return describe, describe_keys

In [14]:
def shape_data(describe, describe_keys):
    """Regroup the summaries so covariance types become columns.

    Iterates over the notebook-level ``stats_vals`` and ``covariance_types``.

    Args:
        describe: ``describe[stats_val][covariance_type][describe_key]`` as
            returned by ``shape_describe``.
        describe_keys: Keys available in each summary.

    Returns:
        ``{stats_val: {describe_key: DataFrame}}``; each frame is the
        column-wise concatenation over the covariance types.
    """
    return {
        stat_name: {
            key: pd.concat(
                {
                    cov_name: describe[stat_name][cov_name][key]
                    for cov_name in covariance_types
                },
                axis=1,
            )
            for key in describe_keys
        }
        for stat_name in stats_vals
    }

In [15]:
# Build the summary tables for every embedding method:
# raw per-run columns -> joined frames -> summaries -> tables keyed by summary key.
data_stats = {}
for vectorize_type, stats in stats_dict.items():
    # One frame per (statistic, covariance type), one column per run.
    stats_df = shape_stats_df(stats)
    describe, describe_keys = shape_describe(stats_df)
    data_stats[vectorize_type] = shape_data(describe, describe_keys)
# NOTE: `stats_df`, `describe` and `describe_keys` stay bound to the values of
# the LAST loop iteration; the following cell reads `stats_df`.

In [16]:
# Per-run MI for the "full" covariance type, with the column-wise maximum
# highlighted. The embedding is selected explicitly rather than relying on
# whichever `stats_df` the previous loop happened to leave behind (that was
# the last entry of `vectorize_types`, i.e. "sentenceBERT").
inspected_vectorize_type = "sentenceBERT"
shape_stats_df(stats_dict[inspected_vectorize_type])["mi"]["full"].style.highlight_max(axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
2,0.559034,0.55868,0.501671,0.559035,0.558948,0.559038,0.559023,0.559086,0.501823,0.559021,0.558667,0.559075,0.501833,0.559021,0.559039,0.558888,0.501684,0.55903,0.559023,0.501936,0.559044,0.558915,0.501779,0.559023,0.558948,0.558724,0.559023,0.558979,0.559044,0.55897
4,0.609963,0.609998,0.610009,0.554481,0.554599,0.609963,0.610034,0.610004,0.554481,0.554498,0.609999,0.610018,0.609963,0.61002,0.609999,0.60999,0.554524,0.61002,0.554498,0.609999,0.610034,0.609998,0.554498,0.610006,0.609999,0.610005,0.609999,0.610005,0.61002,0.610039
6,0.611221,0.611243,0.611288,0.525048,0.52504,0.6113,0.590708,0.590425,0.52504,0.52504,0.52504,0.611408,0.611305,0.525048,0.590685,0.611288,0.611403,0.611403,0.525048,0.52504,0.590712,0.590712,0.61128,0.525048,0.611401,0.590708,0.611222,0.611403,0.611408,0.611408
8,0.59704,0.544894,0.59704,0.545506,0.544894,0.544854,0.59704,0.59704,0.59704,0.59703,0.596973,0.545506,0.59698,0.597008,0.59704,0.59704,0.545536,0.544884,0.59704,0.545536,0.597052,0.597029,0.59704,0.597033,0.544892,0.59698,0.59698,0.597047,0.59703,0.544894
10,0.521723,0.521723,0.565478,0.521723,0.565525,0.565493,0.521723,0.521723,0.565512,0.521723,0.521723,0.521723,0.565471,0.565474,0.521723,0.521723,0.521723,0.521723,0.521723,0.565512,0.565478,0.521723,0.565483,0.565512,0.521723,0.565465,0.521723,0.565493,0.521723,0.521723
20,0.565719,0.565687,0.579595,0.565706,0.579564,0.56573,0.579552,0.579552,0.57957,0.565724,0.579552,0.56573,0.56567,0.565719,0.565719,0.565654,0.579552,0.565654,0.565719,0.56567,0.56573,0.565654,0.579595,0.565719,0.565654,0.579552,0.579583,0.579563,0.579552,0.579587
40,0.542845,0.559191,0.590087,0.590087,0.590087,0.542845,0.542832,0.590087,0.559203,0.559203,0.559203,0.542832,0.590087,0.559191,0.542832,0.590087,0.559191,0.559203,0.559191,0.590087,0.542832,0.590092,0.590087,0.542832,0.559191,0.590087,0.590087,0.590087,0.590087,0.590092
80,0.590857,0.558812,0.542162,0.587414,0.591112,0.540588,0.556944,0.540588,0.540588,0.540588,0.542162,0.591129,0.587414,0.540588,0.590852,0.557532,0.558812,0.540588,0.558812,0.587418,0.540588,0.590869,0.540588,0.591068,0.590853,0.591068,0.587414,0.591116,0.590852,0.587414
160,0.563133,0.585039,0.55131,0.55131,0.585154,0.585043,0.55131,0.55131,0.563133,0.585154,0.55131,0.563133,0.585154,0.585154,0.585154,0.585154,0.585154,0.585154,0.563133,0.563133,0.585154,0.585154,0.55131,0.563133,0.550576,0.585154,0.585154,0.55131,0.585154,0.585154
384,0.608797,0.6087,0.608813,0.608788,0.608793,0.608793,0.60864,0.608759,0.608665,0.608722,0.608857,0.608665,0.608813,0.608734,0.608632,0.608777,0.608665,0.608811,0.608822,0.60858,0.510852,0.608772,0.608797,0.608757,0.608157,0.608769,0.60875,0.510756,0.608743,0.608801


# Coherence

## Read data

In [17]:
def load_coherence(data_type, vectorize_type, vector_dims, normalization):
    """Read one coherence CSV per embedding dimension.

    Args:
        data_type: Dataset name (path component).
        vectorize_type: Embedding name (path component).
        vector_dims: Iterable of dimensions; each one names a ``{dim}.csv``.
        normalization: Normalization variant (path component).

    Returns:
        ``{vector_dim: DataFrame}`` in the iteration order of ``vector_dims``;
        every frame is read with its first column as the index.
    """
    return {
        vector_dim: pd.read_csv(
            f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/coherence/{normalization}/{vector_dim}.csv",
            index_col=0,
        )
        for vector_dim in vector_dims
    }

In [18]:
# Coherence tables per embedding method, then per embedding dimension.
coherences = {
    embedding: load_coherence(
        data_type, embedding, vector_dims[embedding], normalization
    )
    for embedding in vectorize_types
}

In [19]:
# LDA baseline: read the coherence table and summarise it.
coherence_lda_path = f"../../Postprocessing/data/{data_type}/LDA/coherence.csv"
coherence_lda = pd.read_csv(coherence_lda_path, index_col=0)
# Take only the first element of the result. Unpacking into `_` at notebook
# top level would rebind IPython's "last output" variable.
describe_lda_coherence = get_describe(coherence_lda, axis=0)[0]

## Data shaping

In [20]:
def shape_coherence_df(coherences, covariance_types, vector_dims):
    """Regroup coherence tables from per-dimension to per-covariance-type.

    Args:
        coherences: ``{vector_dim: DataFrame}`` with one column per
            covariance type.
        covariance_types: Column names to extract.
        vector_dims: Dimensions to include, in output column order.

    Returns:
        ``{covariance_type: DataFrame}`` where each frame has one column per
        ``vector_dim`` holding that dimension's ``covariance_type`` column.
    """
    return {
        cov_name: pd.concat(
            {dim: coherences[dim].loc[:, cov_name] for dim in vector_dims},
            axis=1,
        )
        for cov_name in covariance_types
    }

In [21]:
# Coherence frames per embedding method, regrouped by covariance type.
coherence_df = {
    embedding: shape_coherence_df(
        coherences[embedding], covariance_types, vector_dims[embedding]
    )
    for embedding in vectorize_types
}

In [22]:
def shape_coherence_describe(coherence_df, covariance_types):
    """Summarise the coherence frame of every covariance type.

    Args:
        coherence_df: ``{covariance_type: DataFrame}`` as returned by
            ``shape_coherence_df``.
        covariance_types: Covariance types to process.

    Returns:
        Tuple ``(describe, describe_keys)``. ``describe[covariance_type]`` is
        the first element returned by ``get_describe(frame, axis=0)``;
        ``describe_keys`` is the second element from the last call. For an
        empty ``covariance_types`` it is an empty list instead of the function
        failing with ``UnboundLocalError``.
    """
    describe = {}
    # Fallback so the return statement is valid even if no call is made.
    describe_keys = []
    # Compute the summary statistics.
    for covariance_type in covariance_types:
        describe[covariance_type], describe_keys = get_describe(
            coherence_df[covariance_type], axis=0
        )
    return describe, describe_keys

In [23]:
# Summaries of the coherence frames for every embedding method.
coherence_describe = {}
for vectorize_type in vectorize_types:
    # Tuple-unpacks straight into the dict entry and the module-level
    # `describe_keys`; the latter keeps the value from the last iteration and
    # is read again by a later cell.
    coherence_describe[vectorize_type], describe_keys = shape_coherence_describe(
        coherence_df[vectorize_type], covariance_types
    )

In [24]:
def shape_coherence_data(describe, describe_keys):
    """Regroup coherence summaries so covariance types become columns.

    Iterates over the notebook-level ``covariance_types``.

    Args:
        describe: ``describe[covariance_type][describe_key]`` as returned by
            ``shape_coherence_describe``.
        describe_keys: Keys available in each summary.

    Returns:
        ``{describe_key: DataFrame}``; each frame is the column-wise
        concatenation over the covariance types.
    """
    return {
        key: pd.concat(
            {cov_name: describe[cov_name][key] for cov_name in covariance_types},
            axis=1,
        )
        for key in describe_keys
    }

In [25]:
# Coherence summary tables per embedding method, keyed by summary key.
coherence_data = {
    embedding: shape_coherence_data(coherence_describe[embedding], describe_keys)
    for embedding in vectorize_types
}

# Make Chart

In [26]:
# For every embedding method, pick the table cell with the highest mean MI and
# collect the values found at that same cell in the other tables.
chart_data = {}
for vectorize_type in vectorize_types:
    mi_stats = data_stats[vectorize_type]["mi"]
    # (row label, column label) of the largest entry of the mean-MI table.
    mi_max_idx = mi_stats["mean"].stack().idxmax()
    # NOTE(review): the row labelled "分散" (variance) is filled from the
    # "std" table - confirm which of the two is intended.
    row_sources = {
        "mutual information": mi_stats["mean"],
        "分散": mi_stats["std"],
        "coherence": coherence_data[vectorize_type]["mean"],
    }
    chart_data[vectorize_type] = {
        "埋め込み次元": mi_max_idx[0],
        **{label: table.loc[mi_max_idx] for label, table in row_sources.items()},
    }

In [27]:
# One column per embedding method, one row per collected quantity.
chart_df = pd.DataFrame(data=chart_data)

In [28]:
# Add the LDA baseline as an extra column. The values are positional and must
# follow the row order of `chart_df`:
# embedding dimension (not set for LDA), mean MI, MI std, mean coherence.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
chart_df["LDA"] = [
    np.nan,
    describe_lda_mi["mean"].to_numpy()[0],
    describe_lda_mi["std"].to_numpy()[0],
    describe_lda_coherence["mean"].to_numpy()[0],
]

In [29]:
# Show MI and coherence per method (methods become rows after the transpose).
# No per-column formatter is passed: the previous one targeted a
# "document_count" column that is not among the two selected columns, so it
# had no effect.
chart_df.T.loc[:, ["mutual information", "coherence"]].style.format(escape="latex")

Unnamed: 0,mutual information,coherence
doc2vec,0.430702,0.783172
sentenceBERT,0.609755,0.925091
LDA,0.407566,0.806057


In [30]:
# Emit the comparison table as LaTeX source (printed so it can be copied
# verbatim). The table has three columns - the method index plus the two
# selected value columns - so the column specification is "rrr".
print(
    chart_df.T.loc[:, ["mutual information", "coherence"]]
    .style.format(precision=3, escape="latex")
    .to_latex(
        column_format="rrr",
        position="h",
        position_float="centering",
        hrules=True,
        caption="miの比較と埋め込み次元",
        label="table:1",
        multicol_align="r",
    )
)

\begin{table}[h]
\centering
\caption{miの比較と埋め込み次元}
\label{table:1}
\begin{tabular}{rrrr}
\toprule
 & mutual information & coherence \\
\midrule
doc2vec & 0.431 & 0.783 \\
sentenceBERT & 0.610 & 0.925 \\
LDA & 0.408 & 0.806 \\
\bottomrule
\end{tabular}
\end{table}

