# Import

In [24]:
import csv
import os
import pickle
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm

## Add configuration and utility module paths

In [25]:
# Extend the import search path with the config, util and PlotFunction
# directories, in this order.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
        "../PlotFunction/lineplot/",
        "../PlotFunction/config/",
    ]
)

In [26]:
from ALL import config
from line_plot_1 import line_plot_1
from line_plot_error_1 import line_plot_error_1
from line_plot_1_layout import layout
# from line_plot_error_1_layout import layout
from util import *

## Set conditions

In [27]:
# Register tqdm's progress-aware pandas methods.
tqdm.pandas()
# Limit how many columns / rows pandas prints for a DataFrame.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

In [49]:
# Dataset names and vectorizer names; the cells below iterate over both lists
# and use the values as dictionary keys and as directory names in data paths.
data_types = ["AgNews", "20News"]
vectorize_types = ["doc2vec", "sentenceBERT"]

# Read data

In [51]:
# "dims" per vectorizer: doc2vec's list comes from the config, sentenceBERT is
# fixed to [384] (presumably embedding dimensions — confirm against the config).
vector_dims = {"doc2vec": config["vectorize"]["doc2vec"]["dims"], "sentenceBERT": [384]}
# Used as range(model_nums) when loading the per-model GMM stats files.
model_nums = config["clustering"]["gmm"]["max_model_num"]
vector_model_num = config["vectorize"]["doc2vec"]["max_model_num"]
# Both values below appear as directory names in the stats CSV paths.
normalizations = ["centralized", "normalized"]
covariance_type = "spherical"

In [53]:
# Column labels to extract from each stats CSV.
stats_vals = ["mi"]

In [54]:
def get_stats_vals(
    data_type, vectorize_type, covariance_type, normalizations, stats_vals
):
    """Load the requested columns from every per-model GMM stats CSV.

    One CSV is read for each (normalization, model_num) pair from
    ``../../Postprocessing/data/<data_type>/<vectorize_type>/GMM/stats/
    <normalization>/<covariance_type>/<model_num>.csv``. ``model_num`` runs
    over ``range(model_nums)``, where ``model_nums`` is a module-level value.

    Args:
        data_type: Dataset directory name used in the path.
        vectorize_type: Vectorizer directory name used in the path.
        covariance_type: Covariance-type directory name used in the path.
        normalizations: Normalization directory names to iterate over.
        stats_vals: Column labels to extract from each CSV.

    Returns:
        Nested dict ``stats[stats_val][normalization][model_num]`` holding the
        ``stats_val`` column of the corresponding CSV.

    Raises:
        FileNotFoundError: If an expected CSV does not exist.
        KeyError: If a CSV lacks one of the requested columns.
    """
    # Build the nested result structure up front so key order is
    # stats_val -> normalization -> model_num.
    stats = {
        stats_val: {normalization: {} for normalization in normalizations}
        for stats_val in stats_vals
    }
    if not stats:
        # Nothing was requested, so there is no reason to touch the disk.
        return stats

    # Load the data.
    for normalization in normalizations:
        for model_num in range(model_nums):
            stats_path = f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/stats/{normalization}/{covariance_type}/{model_num}.csv"
            # The path does not depend on stats_val: read the file once and
            # slice every requested column from the same frame.
            df = pd.read_csv(stats_path, index_col=0)
            for stats_val in stats_vals:
                stats[stats_val][normalization][model_num] = df.loc[:, stats_val]
    return stats

In [55]:
# Load the stats for every (data_type, vectorize_type) combination; the result
# is indexed as stats[data_type][vectorize_type] and holds whatever
# get_stats_vals returns for that pair.
stats = {
    data_type: {
        vectorize_type: get_stats_vals(
            data_type, vectorize_type, covariance_type, normalizations, stats_vals
        )
        for vectorize_type in vectorize_types
    }
    for data_type in data_types
}

In [56]:
# NOTE(review): this calls get_stats_vals with exactly the same arguments as
# the `stats` cell, so the same CSVs are read a second time — confirm whether
# a different data source was intended for the "const" variant.
stats_const = {
    data_type: {
        vectorize_type: get_stats_vals(
            data_type, vectorize_type, covariance_type, normalizations, stats_vals
        )
        for vectorize_type in vectorize_types
    }
    for data_type in data_types
}

In [59]:
# Read each dataset's LDA mi.csv, using the first CSV column as the index.
lda_mi = {
    data_type: pd.read_csv(
        f"../../Postprocessing/data/{data_type}/LDA/mi.csv", index_col=0
    )
    for data_type in data_types
}

In [60]:
# get_describe returns a (describe, describe_keys) pair (see its use in
# make_describe); keep only the describe part for each dataset. Unpacking the
# dict itself into two names would bind the dict's keys (the dataset names),
# not the results, and would fail for any number of datasets other than two.
describe_lda_mi = {
    data_type: get_describe(lda_mi[data_type], axis=0)[0]
    for data_type in data_types
}

# Data shaping

In [63]:
def make_stats_df(stats, normalizations, stats_vals):
    """Join the per-model entries of ``stats`` into one DataFrame per group.

    For every statistic and normalization, ``stats[stats_val][normalization]``
    (a mapping keyed by model number) is concatenated column-wise, so the
    mapping keys become the column labels.

    Args:
        stats: Nested mapping ``stats[stats_val][normalization][model_num]``.
        normalizations: Normalization names to process.
        stats_vals: Statistic names to process.

    Returns:
        Nested dict ``stats_df[stats_val][normalization]`` of DataFrames.
    """

    def _combine(per_model):
        # Concatenate the loaded stats across model_num.
        return pd.concat(per_model, axis=1)

    return {
        stats_val: {
            normalization: _combine(stats[stats_val][normalization])
            for normalization in normalizations
        }
        for stats_val in stats_vals
    }

In [65]:
# Combine the per-model stats of every (data_type, vectorize_type) pair into
# DataFrames: stats_df[data_type][vectorize_type][stats_val][normalization].
stats_df = {
    data_type: {
        vectorize_type: make_stats_df(stats[data_type][vectorize_type], normalizations, stats_vals)
        for vectorize_type in vectorize_types
    }
    for data_type in data_types
}

In [66]:
def make_describe(stats_df, normalization, stats_vals):
    """Summarise every DataFrame in ``stats_df`` with ``get_describe``.

    Args:
        stats_df: Nested mapping ``stats_df[stats_val][normalization]`` of
            DataFrames.
        normalization: The normalization name(s) to process. The singular
            parameter name is kept for compatibility with existing callers;
            a list of names is expected, and a single string is treated as a
            one-element list.
        stats_vals: Statistic names to process.

    Returns:
        ``(describe, describe_keys)``. ``describe[stats_val][normalization]``
        is the first value returned by ``get_describe(..., axis=1)``.
        ``describe_keys`` is the second value from the last call, or an empty
        list when there was nothing to process.
    """
    # Iterate over the argument that was passed in, not over a module-level
    # list that merely happens to hold the same values.
    if isinstance(normalization, str):
        names = [normalization]
    else:
        names = list(normalization)
    stats_vals = list(stats_vals)

    describe = {stats_val: {} for stats_val in stats_vals}
    # Defined up front so the return statement is valid for empty input.
    describe_keys = []

    for stats_val in stats_vals:
        for name in names:
            describe[stats_val][name], describe_keys = get_describe(
                stats_df[stats_val][name], axis=1
            )
    return describe, describe_keys

In [67]:
# Summarise every (data_type, vectorize_type) combination. describe_keys is
# overwritten on each pass, so the value left afterwards is the one returned
# by the last make_describe call.
describe = {}
for data_type in data_types:
    describe[data_type] = {}
    for vectorize_type in vectorize_types:
        _describe, describe_keys = make_describe(
            stats_df[data_type][vectorize_type], normalizations, stats_vals
        )
        describe[data_type][vectorize_type] = _describe

In [68]:
def make_data(describe, normalizations, stats_vals, describe_keys):
    """Regroup summaries so each summary key holds one frame across normalizations.

    Args:
        describe: Nested mapping
            ``describe[stats_val][normalization][describe_key]``.
        normalizations: Normalization names; they become the column labels.
        stats_vals: Statistic names to process.
        describe_keys: Summary keys to process.

    Returns:
        Nested dict ``data[stats_val][describe_key]``; each value is the
        column-wise concatenation of that summary over all normalizations.
    """

    def _join(stats_val, describe_key):
        # Concatenate the data across normalization.
        columns = {}
        for normalization in normalizations:
            columns[normalization] = describe[stats_val][normalization][describe_key]
        return pd.concat(columns, axis=1)

    return {
        stats_val: {
            describe_key: _join(stats_val, describe_key)
            for describe_key in describe_keys
        }
        for stats_val in stats_vals
    }

In [71]:
# Regroup the summaries of every (data_type, vectorize_type) pair with
# make_data. describe_keys is the notebook-level value left behind by the loop
# that built `describe`.
data = {
    data_type: {
        vectorize_type: make_data(
            describe[data_type][vectorize_type],
            normalizations,
            stats_vals,
            describe_keys,
        )
        for vectorize_type in vectorize_types
    }
    for data_type in data_types
}

In [74]:
# For each dataset, take the row labelled 384 from the sentenceBERT "mi" mean
# frame (384 is the only sentenceBERT entry in vector_dims; presumably the
# embedding dimension — confirm).
chart_data = {
    data_type: data[data_type]["sentenceBERT"]["mi"]["mean"].loc[384, :]
    for data_type in data_types
}

In [77]:
# Place the per-dataset Series side by side: one column per dataset.
chart_df = pd.concat(chart_data, axis=1)

In [79]:
chart_df

Unnamed: 0,AgNews,20News
centralized,0.578525,0.576343
normalized,0.582227,0.582425


In [78]:
# Render the comparison table as LaTeX. The column spec is derived from the
# frame (index levels + data columns) so it always matches the table width
# rather than relying on a hard-coded column count.
print(
    chart_df
    .style.format(precision=3, escape="latex")
    .to_latex(
        column_format="r" * (chart_df.index.nlevels + chart_df.shape[1]),
        position="h",
        position_float="centering",
        hrules=True,
        caption="miの比較と埋め込み次元",
        label="table:1",
        multicol_align="r",
    )
)

\begin{table}[h]
\centering
\caption{miの比較と埋め込み次元}
\label{table:1}
\begin{tabular}{rrrr}
\toprule
 & centralized & normalized \\
\midrule
AgNews & 0.579 & 0.582 \\
20News & 0.576 & 0.582 \\
\bottomrule
\end{tabular}
\end{table}

