# Import

In [2]:
import csv
import os
import pickle
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm

## Add configuration file

In [3]:
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")
sys.path.append("../../PlotFunction/lineplot/")
sys.path.append("../../PlotFunction/config/")

In [4]:
from ALL import config
from line_plot_1 import line_plot_1
from line_plot_error_1 import line_plot_error_1
from line_plot_1_layout import layout
from util import *

## Set condition

In [5]:
tqdm.pandas()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

# Read data

In [6]:
newsgroups_df = pd.read_csv("../../../Preprocessing/data/20News//master.csv", index_col=0)

In [7]:
with open("../../../Preprocessing/data/20News/class.csv", mode="r") as f:
    reader = csv.reader(f)
    class_labels = [label for label in reader]

In [8]:
vector_dims = config["vectorize"]["doc2vec"]["dims"]
model_nums = config["clustering"]["gmm"]["max_model_num"]
vector_model_num = config["vectorize"]["doc2vec"]["max_model_num"]
covariance_types = config["clustering"]["gmm"]["covariance_types"]

In [9]:
stats_vals = ["aic", "bic", "mi", "logl"]

In [10]:
# データ型定義
stats = {
    stats_val: {
        covariance_type: {model_num: pd.DataFrame for model_num in range(model_nums)}
        for covariance_type in covariance_types
    }
    for stats_val in stats_vals
}

# データ取得
for stats_val in stats_vals:
    for covariance_type in covariance_types:
        for model_num in range(model_nums):
            stats_path = f"../../../Postprocessing/data/20News/doc2vec/GMM/stats/{covariance_type}/{model_num}.csv"
            df = pd.read_csv(stats_path, index_col=0)
            stats[stats_val][covariance_type][model_num] = df.loc[:, stats_val]

In [11]:
lda_mi = pd.read_csv("../../../Postprocessing/data/20News/LDA/mi.csv", index_col=0)

In [12]:
describe_lda_mi, _ = get_describe(lda_mi, axis=0)

In [32]:
describe_lda_mi

{'mean': 0    0.332196
 dtype: float64,
 'median': 0    0.332461
 dtype: float64,
 'std': 0    0.016865
 dtype: float64,
 'var': 0    0.000284
 dtype: float64,
 '75': 0    0.344277
 Name: 0.75, dtype: float64,
 '25': 0    0.318369
 Name: 0.25, dtype: float64}

# Data shaping

In [13]:
stats_df = {
    stats_val: {covariance_type: pd.DataFrame() for covariance_type in covariance_types}
    for stats_val in stats_vals
}

for stats_val in stats_vals:
    for covariance_type in covariance_types:
        # model_numについて取得データstatsを結合
        stats_df[stats_val][covariance_type] = pd.concat(
            stats[stats_val][covariance_type], axis=1
        )

In [14]:
describe = {
    stats_val: {covariance_type: dict() for covariance_type in covariance_types}
    for stats_val in stats_vals
}

for stats_val in stats_vals:
    for covariance_type in covariance_types:
        describe[stats_val][covariance_type], describe_keys = get_describe(
            stats_df[stats_val][covariance_type], axis=1
        )

In [15]:
data = {
    stats_val: {describe_key: pd.DataFrame() for describe_key in describe_keys}
    for stats_val in stats_vals
}
for stats_val in stats_vals:
    for describe_key in describe_keys:
        # covariance_typeについてデータを結合
        _data = {
            covariance_type: describe[stats_val][covariance_type][describe_key]
            for covariance_type in covariance_types
        }
        data[stats_val][describe_key] = pd.concat(_data, axis=1)

In [16]:
for stats_val in stats_vals:
    _data = data[stats_val]["mean"]

# Make Chart

In [31]:
data["mi"]["mean"].idxmax()

spherical     80
diag          80
tied         160
full         160
dtype: int64

In [27]:
data["mi"]["std"].loc[data["mi"]["mean"].idxmax()]

Unnamed: 0,spherical,diag,tied,full
80,0.00522,0.005102,0.005088,0.008136
80,0.00522,0.005102,0.005088,0.008136
160,0.007952,0.008268,0.00869,0.008906
160,0.007952,0.008268,0.00869,0.008906


In [17]:
chart_data = {
    "埋め込み次元": data["mi"]["mean"].idxmax(),
    "mutual information": data["mi"]["mean"].max(),
    "分散": data["mi"]["std"]
}

In [18]:
chart_df = pd.DataFrame(chart_data).T

In [19]:
chart_df["LDA"] = [
    np.NaN,
    describe_lda_mi["mean"].to_numpy()[0],
    describe_lda_mi["std"].to_numpy()[0],
]

In [20]:
chart_df.loc["埋め込み次元", :].apply(lambda x: round(x, 0))

spherical     80.0
diag          80.0
tied         160.0
full         160.0
LDA            NaN
Name: 埋め込み次元, dtype: float64

In [21]:
chart_df.loc["mutual information", :].apply(lambda x: round(x, 3))

spherical    0.536
diag         0.535
tied         0.565
full         0.531
LDA          0.332
Name: mutual information, dtype: float64

In [24]:
print(chart_df.style.to_latex())

\begin{tabular}{lrrrrr}
 & spherical & diag & tied & full & LDA \\
埋め込み次元 & 80.000000 & 80.000000 & 160.000000 & 160.000000 & nan \\
mutual information & 0.535979 & 0.535291 & 0.565293 & 0.531171 & 0.332196 \\
\end{tabular}

