# Import

In [31]:
import csv
import os
import pickle
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm

## Add module search paths

In [32]:
# Extend the module search path so the imports that follow can be resolved.
# NOTE(review): the first two entries are absolute paths under /home/jovyan,
# the last two are relative to the current working directory.
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")
sys.path.append("../PlotFunction/lineplot/")
sys.path.append("../PlotFunction/config/")

In [33]:
from ALL import config
from line_plot_1 import line_plot_1
from line_plot_error_1 import line_plot_error_1
from line_plot_1_layout import layout
from util import *

## Set conditions

In [34]:
# Register tqdm's pandas integration.
tqdm.pandas()
# Cap how many columns / rows pandas prints when displaying a frame.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

In [35]:
# Dataset name and vectorization methods; both are interpolated into the
# data paths read further down.
data_type = "AgNewsTitle"
vectorize_types = ["doc2vec", "sentenceBERT"]

In [36]:
# Used as range(model_nums) when the per-model stats files are loaded.
model_nums = config["clustering"]["gmm"]["max_model_num"]
normalization = "centralized"
# The covariance types are fixed here rather than read from
# config["clustering"]["gmm"]["covariance_types"]; a single assignment keeps
# it obvious which value is actually in effect.
covariance_types = ["spherical", "diag", "full"]
# Embedding dimensions per vectorization method; 384 is appended to the
# configured sentenceBERT dimensions.
vector_dims = {
    "doc2vec": config["vectorize"]["doc2vec"]["dims"],
    "sentenceBERT": config["vectorize"]["sentenceBERT"]["dims"] + [384],
}

In [37]:
# Column names pulled out of every stats CSV.
stats_vals = ["aic", "bic", "mi", "logl"]

# Stats

## Read data

In [38]:
def load_stats_data(vectorize_type, stats_vals, covariance_types, model_nums):
    """Load the per-model GMM statistics of one vectorization method.

    For every covariance type and every model number in ``range(model_nums)``
    the matching ``{model_num}.csv`` is read (first column as index) and the
    requested statistic columns are extracted. ``data_type`` and
    ``normalization`` are taken from module scope.

    Args:
        vectorize_type: vectorization method; used as a path component.
        stats_vals: names of the columns to extract from each CSV.
        covariance_types: covariance types; used as path components.
        model_nums: number of model files; ``0.csv`` .. ``model_nums - 1``.

    Returns:
        Nested dict ``stats[stats_val][covariance_type][model_num]`` whose
        leaves are the ``stats_val`` column of the corresponding CSV.
    """
    # Prepare the nested containers up front so the key order follows
    # stats_vals -> covariance_types regardless of the read order below.
    stats = {
        stats_val: {covariance_type: {} for covariance_type in covariance_types}
        for stats_val in stats_vals
    }

    # Load the data: each file is parsed once and all requested columns are
    # taken from that single frame.
    for covariance_type in covariance_types:
        for model_num in range(model_nums):
            stats_path = f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/stats/{normalization}/{covariance_type}/{model_num}.csv"
            df = pd.read_csv(stats_path, index_col=0)
            for stats_val in stats_vals:
                stats[stats_val][covariance_type][model_num] = df.loc[:, stats_val]
    return stats

In [39]:
def load_lda_mi(data_type):
    """Read the LDA ``mi.csv`` of ``data_type`` and summarise it.

    The CSV is loaded with its first column as index and handed to
    ``get_describe`` with ``axis=0``; only the first element of the returned
    pair is passed back to the caller.
    """
    mi_path = f"../../Postprocessing/data/{data_type}/LDA/mi.csv"
    mi_table = pd.read_csv(mi_path, index_col=0)
    summary, _unused_keys = get_describe(mi_table, axis=0)
    return summary

In [40]:
# Load the GMM statistics once per vectorization method.
stats_dict = {}
for vectorize_type in vectorize_types:
    stats_dict[vectorize_type] = load_stats_data(
        vectorize_type, stats_vals, covariance_types, model_nums
    )

In [41]:
# Summary of the LDA mutual information, used as the comparison baseline.
describe_lda_mi = load_lda_mi(data_type)

## Data shaping

In [42]:
def shape_stats_df(stats):
    """Join the per-model series into one frame per statistic and covariance type.

    Args:
        stats: nested dict ``stats[stats_val][covariance_type][model_num]``
            whose leaves are pandas objects sharing an index.

    Returns:
        Nested dict ``stats_df[stats_val][covariance_type]``; each value is the
        column-wise concatenation of the per-model entries, with the
        ``model_num`` keys used as the concat keys.
    """
    # The keys are taken from ``stats`` itself, so the function no longer
    # depends on module-level ``stats_vals`` / ``covariance_types`` lists.
    return {
        stats_val: {
            # Concatenate the loaded stats across model_num.
            covariance_type: pd.concat(per_model, axis=1)
            for covariance_type, per_model in by_covariance.items()
        }
        for stats_val, by_covariance in stats.items()
    }

In [43]:
def shape_describe(stats_df):
    """Summarise every frame in ``stats_df`` with ``get_describe(..., axis=1)``.

    Args:
        stats_df: nested dict ``stats_df[stats_val][covariance_type]``.

    Returns:
        ``(describe, describe_keys)`` where ``describe`` mirrors the nesting of
        ``stats_df`` and holds the first element returned by ``get_describe``,
        and ``describe_keys`` is the second element from the last call.
    """
    # Walk the keys of the argument instead of module-level lists so the
    # result always matches what was passed in.
    describe = {}
    for stats_val, by_covariance in stats_df.items():
        describe[stats_val] = {}
        for covariance_type, frame in by_covariance.items():
            describe[stats_val][covariance_type], describe_keys = get_describe(
                frame, axis=1
            )
    return describe, describe_keys

In [44]:
def shape_data(describe, describe_keys):
    """Regroup the summaries by statistic and summary key.

    Args:
        describe: nested dict ``describe[stats_val][covariance_type]``; each
            leaf must support ``leaf[describe_key]``.
        describe_keys: summary keys to extract from every leaf.

    Returns:
        Nested dict ``data[stats_val][describe_key]``: the column-wise
        concatenation over covariance types, with the covariance types used
        as the concat keys.
    """
    # Statistic names and covariance types come from ``describe`` itself
    # rather than from module-level lists.
    data = {}
    for stats_val, by_covariance in describe.items():
        data[stats_val] = {
            # Concatenate the data across covariance_type.
            describe_key: pd.concat(
                {
                    covariance_type: summary[describe_key]
                    for covariance_type, summary in by_covariance.items()
                },
                axis=1,
            )
            for describe_key in describe_keys
        }
    return data

In [45]:
# Run the three shaping steps for every vectorization method.
# stats_df, describe and describe_keys stay bound to the values of the last
# iteration once the loop has finished.
data_stats = {}
for vectorize_type, stats in stats_dict.items():
    stats_df = shape_stats_df(stats)
    describe, describe_keys = shape_describe(stats_df)
    data_stats[vectorize_type] = shape_data(describe, describe_keys)

In [46]:
# Display the "mi" values for the "full" covariance type with the maximum of
# each column highlighted. stats_df is whatever the preceding loop left bound,
# i.e. the last entry of stats_dict.
stats_df["mi"]["full"].style.highlight_max(axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
2,0.387009,0.387105,0.386822,0.385757,0.306069,0.387086,0.387101,0.387111,0.387086,0.387064,0.387074,0.306107,0.386398,0.386927,0.386989,0.385715,0.387094,0.386986,0.387111,0.387074,0.333532,0.387094,0.387034,0.387082,0.387057,0.38562,0.386947,0.387101,0.38706,0.385746
4,0.492016,0.492024,0.492592,0.47882,0.492418,0.492449,0.492418,0.492463,0.492024,0.492617,0.478843,0.49242,0.478838,0.492385,0.492375,0.492552,0.492036,0.492407,0.492024,0.492391,0.478798,0.492592,0.492592,0.492013,0.492596,0.478822,0.478787,0.487707,0.492375,0.492596
6,0.496961,0.496981,0.49708,0.496981,0.496961,0.496973,0.496968,0.496968,0.496961,0.496968,0.496961,0.496961,0.496974,0.496961,0.496981,0.496974,0.496961,0.496981,0.449132,0.496959,0.496961,0.496961,0.49697,0.496968,0.496959,0.496968,0.496974,0.497064,0.496961,0.496959
8,0.502793,0.501973,0.502022,0.502959,0.502772,0.501721,0.502772,0.502606,0.501863,0.501936,0.502772,0.502772,0.501739,0.501717,0.501672,0.501739,0.501755,0.501751,0.501775,0.502612,0.501672,0.501672,0.501746,0.501672,0.501705,0.501717,0.502629,0.502154,0.501672,0.502793
10,0.509529,0.509548,0.509505,0.462207,0.509516,0.509579,0.509261,0.509405,0.509524,0.509597,0.509579,0.51004,0.509516,0.509394,0.509597,0.509348,0.509997,0.509523,0.509516,0.509529,0.509955,0.509597,0.509597,0.509529,0.509579,0.510017,0.510065,0.509597,0.509597,0.509955
20,0.482147,0.501295,0.481539,0.466976,0.470794,0.401461,0.462896,0.477781,0.455748,0.387262,0.419348,0.465241,0.474764,0.462857,0.463184,0.498162,0.441803,0.459765,0.465833,0.441571,0.501339,0.511805,0.479277,0.497783,0.470426,0.477706,0.462621,0.472803,0.497738,0.491264
40,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505198,0.505654,0.505654,0.505198,0.505198,0.505198
80,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.50274,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734,0.502734
160,0.50274,0.503151,0.502902,0.504264,0.503151,0.503151,0.504264,0.503151,0.503169,0.503169,0.503151,0.503169,0.503172,0.504106,0.504288,0.502902,0.503169,0.504264,0.503169,0.504264,0.502716,0.502902,0.502742,0.502902,0.502739,0.504264,0.504239,0.503169,0.504264,0.503169
384,0.500618,0.500739,0.500823,0.500821,0.500974,0.500898,0.500746,0.500734,0.501613,0.500642,0.501549,0.500877,0.50085,0.5007,0.501681,0.500768,0.500801,0.500863,0.500758,0.501108,0.501463,0.501314,0.500766,0.500633,0.501418,0.50074,0.500862,0.500771,0.501277,0.501626


# Coherence

## Read data

In [47]:
def load_coherence(data_type, vectorize_type, vector_dims, normalization):
    """Read one GMM coherence CSV per embedding dimension.

    Args:
        data_type: dataset name; path component.
        vectorize_type: vectorization method; path component.
        vector_dims: dimensions to load; each one names a ``{dim}.csv`` file.
        normalization: normalization name; path component.

    Returns:
        Dict mapping each entry of ``vector_dims`` to the DataFrame read from
        its CSV (first column used as index).
    """
    return {
        vector_dim: pd.read_csv(
            f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/coherence/{normalization}/{vector_dim}.csv",
            index_col=0,
        )
        for vector_dim in vector_dims
    }

In [48]:
# Load the coherence tables of every vectorization method, one per dimension.
coherences = {}
for vectorize_type in vectorize_types:
    coherences[vectorize_type] = load_coherence(
        data_type, vectorize_type, vector_dims[vectorize_type], normalization
    )

In [49]:
# Read the LDA coherence CSV (first column as index) and summarise it with
# get_describe; only the first element of the returned pair is kept.
coherence_lda_path = f"../../Postprocessing/data/{data_type}/LDA/coherence.csv"
coherence_lda = pd.read_csv(coherence_lda_path, index_col=0)
describe_lda_coherence, _ = get_describe(coherence_lda, axis=0)

## Data shaping

In [50]:
def shape_coherence_df(coherences, covariance_types, vector_dims):
    """Regroup coherence tables from per-dimension to per-covariance-type.

    Args:
        coherences: dict ``coherences[vector_dim]`` of DataFrames that have
            one column per covariance type.
        covariance_types: column names to extract.
        vector_dims: dimensions to include, in output column order.

    Returns:
        Dict ``coherence_df[covariance_type]``: a DataFrame whose columns are
        the entries of ``vector_dims``.
    """
    regrouped = {}
    for covariance_type in covariance_types:
        # Collect this covariance type's column from every dimension's table.
        per_dim = {}
        for vector_dim in vector_dims:
            per_dim[vector_dim] = coherences[vector_dim].loc[:, covariance_type]
        regrouped[covariance_type] = pd.concat(per_dim, axis=1)
    return regrouped

In [51]:
# Regroup the coherence tables by covariance type for every vectorization method.
coherence_df = {}
for vectorize_type in vectorize_types:
    coherence_df[vectorize_type] = shape_coherence_df(
        coherences[vectorize_type], covariance_types, vector_dims[vectorize_type]
    )

In [52]:
def shape_coherence_describe(coherence_df, covariance_types):
    """Summarise each covariance type's coherence frame via ``get_describe``.

    Returns:
        ``(describe, describe_keys)``: ``describe[covariance_type]`` holds the
        first element returned by ``get_describe(..., axis=0)`` and
        ``describe_keys`` the second element from the last call.
    """
    # Compute the summary statistics.
    summaries = {}
    for covariance_type in covariance_types:
        frame = coherence_df[covariance_type]
        summaries[covariance_type], describe_keys = get_describe(frame, axis=0)
    return summaries, describe_keys

In [53]:
# Summarise the coherence frames of every vectorization method.
# describe_keys is overwritten on each pass and keeps the last value afterwards.
coherence_describe = {}
for vectorize_type in vectorize_types:
    coherence_describe[vectorize_type], describe_keys = shape_coherence_describe(
        coherence_df[vectorize_type], covariance_types
    )

In [54]:
def shape_coherence_data(describe, describe_keys):
    """Regroup the coherence summaries by summary key.

    Args:
        describe: dict ``describe[covariance_type]``; each value must support
            ``value[describe_key]``.
        describe_keys: summary keys to extract from every value.

    Returns:
        Dict ``data[describe_key]``: the column-wise concatenation over the
        covariance types found in ``describe``, which serve as concat keys.
    """
    # Covariance types are read from ``describe`` itself rather than from a
    # module-level list.
    return {
        # Concatenate the data across covariance_type.
        describe_key: pd.concat(
            {
                covariance_type: summary[describe_key]
                for covariance_type, summary in describe.items()
            },
            axis=1,
        )
        for describe_key in describe_keys
    }

In [55]:
# Regroup the coherence summaries by summary key for every vectorization method.
# describe_keys is the value left over from the preceding describe loop.
coherence_data = {}
for vectorize_type in vectorize_types:
    coherence_data[vectorize_type] = shape_coherence_data(
        coherence_describe[vectorize_type], describe_keys
    )

# Make Chart

In [56]:
# For every vectorization method, locate the (row label, column label) pair
# with the highest mean "mi" and collect the values found at that position.
# NOTE(review): the "分散" entry is filled from the "std" table - confirm the
# label matches the quantity that is meant to be reported.
# NOTE(review): the coherence lookup reuses the same (row, column) pair, so it
# presumably expects the coherence table to share the mi table's labels - verify.
chart_data = {}
for vectorize_type in vectorize_types:
    mi_max_idx = data_stats[vectorize_type]["mi"]["mean"].stack().idxmax()
    chart_data[vectorize_type] = {
        "埋め込み次元": mi_max_idx[0],
        "mutual information": data_stats[vectorize_type]["mi"]["mean"].loc[mi_max_idx],
        "分散": data_stats[vectorize_type]["mi"]["std"].loc[mi_max_idx],
        "coherence": coherence_data[vectorize_type]["mean"].loc[mi_max_idx],
    }

In [57]:
# One column per vectorization method, one row per collected quantity.
chart_df = pd.DataFrame(chart_data)

In [58]:
# Add the LDA baseline as a further column. The values are positional, so
# their order must match the row order of chart_df; the first row has no LDA
# counterpart and is left as NaN.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
chart_df["LDA"] = [
    np.nan,
    describe_lda_mi["mean"].to_numpy()[0],
    describe_lda_mi["std"].to_numpy()[0],
    describe_lda_coherence["mean"].to_numpy()[0],
]

In [59]:
# Show methods as rows with only the two comparison columns.
# NOTE(review): "document_count" is not among the selected columns, so this
# formatter entry appears to have no effect - confirm and drop if unused.
chart_df.T.loc[:, ["mutual information", "coherence"]].style.format(
    escape="latex", formatter={"document_count": "{:.0f}"}
)

Unnamed: 0,mutual information,coherence
doc2vec,0.194729,0.747001
sentenceBERT,0.517748,1.0
LDA,0.02258,0.704813


In [60]:
# Print the comparison table as a LaTeX table environment, values rounded to
# three decimals.
# NOTE(review): column_format declares four columns ("rrrr") while the
# selection yields the index plus two value columns - confirm the spec.
print(
    chart_df.T.loc[:, ["mutual information", "coherence"]]
    .style.format(precision=3, escape="latex")
    .to_latex(
        column_format="rrrr",
        position="h",
        position_float="centering",
        hrules=True,
        caption="miの比較と埋め込み次元",
        label="table:1",
        multicol_align="r",
    )
)

\begin{table}[h]
\centering
\caption{miの比較と埋め込み次元}
\label{table:1}
\begin{tabular}{rrrr}
\toprule
 & mutual information & coherence \\
\midrule
doc2vec & 0.195 & 0.747 \\
sentenceBERT & 0.518 & 1.000 \\
LDA & 0.023 & 0.705 \\
\bottomrule
\end{tabular}
\end{table}

