# Import

In [1]:
import csv
import os
import pickle
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm

## Add configuration and utility module search paths

In [2]:
# Directories holding the project-local modules imported in the next cell.
# NOTE(review): the first two entries are absolute paths tied to one machine
# layout; consider deriving them from a configurable project root.
extra_module_dirs = (
    "/home/jovyan/core/config/",
    "/home/jovyan/core/util/",
    "../PlotFunction/lineplot/",
    "../PlotFunction/config/",
)
for module_dir in extra_module_dirs:
    sys.path.append(module_dir)

In [3]:
from ALL import config
from line_plot_1 import line_plot_1
from line_plot_error_1 import line_plot_error_1
from line_plot_1_layout import layout
from util import *

## Set condition

In [4]:
# Enable tqdm's pandas integration, then cap how much of a frame is rendered.
tqdm.pandas()

display_limits = {
    "display.max_columns": 100,
    "display.max_rows": 50,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [23]:
# Dataset whose post-processed statistics are analysed.
data_type = "20News"
# Embedding methods for which statistics are available.
vectorize_types = ["doc2vec", "sentenceBERT"]
# Embedding method analysed in this run. Later cells read `vectorize_type`
# (singular); without this assignment a fresh-kernel run fails with NameError.
# "sentenceBERT" matches the recorded outputs further down the notebook.
vectorize_type = "sentenceBERT"

# Read data

In [24]:
# GMM settings from the shared project configuration.
gmm_config = config["clustering"]["gmm"]
# Used below as range(model_nums), i.e. the number of result files per
# covariance type.
model_nums = gmm_config["max_model_num"]
# Covariance structures for which results are read.
covariance_types = gmm_config["covariance_types"]

In [25]:
# Column names read from every stats CSV.
stats_vals = [
    "aic",
    "bic",
    "mi",  # reported below as "mutual information"
    "logl",  # presumably log-likelihood -- TODO confirm
]

In [10]:
# Container layout: stats[stats_val][covariance_type][model_num] -> the
# `stats_val` column of that model's CSV. Inner dicts start empty and are
# filled below (no placeholder objects that could be mistaken for data).
stats = {
    stats_val: {covariance_type: {} for covariance_type in covariance_types}
    for stats_val in stats_vals
}

# Load the data. Each CSV is read exactly once and its columns are fanned out
# to every statistic, instead of re-reading the same file once per statistic.
# `vectorize_type` must already be set to the embedding method being analysed.
stats_dir = f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/stats"
for covariance_type in covariance_types:
    for model_num in range(model_nums):
        stats_path = f"{stats_dir}/{covariance_type}/{model_num}.csv"
        df = pd.read_csv(stats_path, index_col=0)
        for stats_val in stats_vals:
            stats[stats_val][covariance_type][model_num] = df.loc[:, stats_val]

In [11]:
# LDA mutual-information values for the same dataset (first CSV column is the index).
lda_mi_path = f"../../Postprocessing/data/{data_type}/LDA/mi.csv"
lda_mi = pd.read_csv(lda_mi_path, index_col=0)

In [12]:
# Summarise the LDA mutual-information table along axis 0. `get_describe`
# returns a pair; only the first element is kept. Its "mean" and "std" entries
# are read when the LDA column of the summary table is built.
# NOTE(review): binding the discarded element to `_` also rebinds the name
# IPython uses for the last cell output.
describe_lda_mi, _ = get_describe(lda_mi, axis=0)

# Data shaping

In [13]:
# For every statistic and covariance type, join the per-model Series side by
# side: one column per model_num.
stats_df = {
    stats_val: {
        covariance_type: pd.concat(stats[stats_val][covariance_type], axis=1)
        for covariance_type in covariance_types
    }
    for stats_val in stats_vals
}

In [14]:
# describe[stats_val][covariance_type] holds the summary returned by
# get_describe over the model axis (axis=1).
describe = {stats_val: {} for stats_val in stats_vals}

for stats_val, per_covariance in describe.items():
    for covariance_type in covariance_types:
        # get_describe returns (summary, keys); `describe_keys` is kept at
        # notebook scope because the next cell iterates over it.
        summary, describe_keys = get_describe(
            stats_df[stats_val][covariance_type], axis=1
        )
        per_covariance[covariance_type] = summary

In [15]:
# data[stats_val][describe_key] is a table with one column per covariance type.
data = {}
for stats_val in stats_vals:
    per_statistic = {}
    for describe_key in describe_keys:
        # Join the summaries across covariance types.
        _data = {
            cov_type: describe[stats_val][cov_type][describe_key]
            for cov_type in covariance_types
        }
        per_statistic[describe_key] = pd.concat(_data, axis=1)
    data[stats_val] = per_statistic

In [16]:
# NOTE(review): this loop only rebinds `_data` to each statistic's "mean"
# table; nothing is displayed, stored or plotted, so the cell has no visible
# effect. It looks like a leftover from a plotting cell -- confirm and remove,
# or display the tables explicitly.
for stats_val in stats_vals:
    _data = data[stats_val]["mean"]

# Make summary table

In [17]:
# Flatten the mean-MI table to a Series keyed by (row label, covariance type)
# and locate its largest entry; the result is that pair of labels.
mi_mean_stacked = data["mi"]["mean"].stack()
mi_max_idx = mi_mean_stacked.idxmax()

In [18]:
# Summary of the best mean-MI configuration, keyed by the embedding method.
mi_tables = data["mi"]
best_mi_summary = {
    # Row label of the best entry.
    "埋め込み次元": mi_max_idx[0],
    "mutual information": mi_tables["mean"].loc[mi_max_idx],
    # NOTE(review): this value is taken from the "std" table.
    "分散": mi_tables["std"].loc[mi_max_idx],
}
chart_data = {vectorize_type: best_mi_summary}

In [19]:
# Display the assembled summary dict for a quick sanity check.
chart_data

{'sentenceBERT': {'埋め込み次元': 10,
  'mutual information': 0.6328436557631199,
  '分散': 0.009302632150168468}}

In [20]:
# Outer keys become columns (one per method); inner keys become the row labels.
chart_df = pd.DataFrame.from_dict(chart_data, orient="columns")

In [21]:
# Add the LDA column. Values are keyed by row label so they line up with
# chart_df's index whatever order pandas derived for it; a positional list
# would silently land in the wrong rows if that order differed.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
chart_df["LDA"] = pd.Series(
    {
        "mutual information": describe_lda_mi["mean"].to_numpy()[0],
        "分散": describe_lda_mi["std"].to_numpy()[0],
        # LDA has no embedding dimension.
        "埋め込み次元": np.nan,
    }
)

In [96]:
# Show one row per method. The formatter is keyed by a column that actually
# exists in the transposed table, so the embedding dimension renders without
# decimals; a key naming no column is silently ignored by Styler.format.
chart_df.T.style.format(
    escape="latex", formatter={"埋め込み次元": "{:.0f}"}
)

Unnamed: 0,mutual information,分散,埋め込み次元
sentenceBERT,0.632844,0.009303,10.0
LDA,0.332196,0.016865,


In [97]:
# Render the summary table as LaTeX source (two decimals, LaTeX-escaped
# labels) and print it so it can be pasted into a document.
latex_styler = chart_df.style.format(precision=2, escape="latex")
latex_table = latex_styler.to_latex(
    column_format="rrr",
    position="h",
    position_float="centering",
    hrules=True,
    caption="データの統計値",
    label="table:1",
    multicol_align="r",
)
print(latex_table)

\begin{table}[h]
\centering
\caption{データの統計値}
\label{table:1}
\begin{tabular}{rrr}
\toprule
 & sentenceBERT & LDA \\
\midrule
mutual information & 0.63 & 0.33 \\
分散 & 0.01 & 0.02 \\
埋め込み次元 & 10.00 & nan \\
\bottomrule
\end{tabular}
\end{table}

