# Import

In [1]:
import csv
import os
import pickle
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm

## Add configuration and utility module paths

In [2]:
# Make the project's config/util modules and the plotting helpers importable.
# NOTE(review): the first two entries are absolute paths tied to one machine layout.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
        "../PlotFunction/lineplot/",
        "../PlotFunction/config/",
    ]
)

In [3]:
from ALL import config
from line_plot_1 import line_plot_1
from line_plot_error_1 import line_plot_error_1
from line_plot_1_layout import layout
from util import *

## Set condition

In [4]:
# Enable tqdm's pandas integration.
tqdm.pandas()
# Cap how many columns / rows pandas renders in cell output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

In [5]:
# Datasets and embedding methods that the loading and shaping cells iterate over.
data_types = ["AgNews", "AgNewsTitle", "20News"]
vectorize_types = ["doc2vec", "sentenceBERT"]

In [6]:
# Number of GMM runs per setting, taken from the project config; the loader
# iterates model_num over range(model_nums).
model_nums = config["clustering"]["gmm"]["max_model_num"]
# Covariance types to evaluate, keyed by dataset and then by embedding method.
covariance_types = {
    "AgNews": {"doc2vec": ["full"], "sentenceBERT": ["full"]},
    "20News": {"doc2vec": ["full"], "sentenceBERT": ["full"]},
    "AgNewsTitle": {
        "doc2vec": ["full"],
        "sentenceBERT": ["full"],
    },
}
# Name of the normalization sub-directory used when building input file paths.
normalization = {
    "AgNews": {"doc2vec": "normalized", "sentenceBERT": "normalized"},
    "20News": {"doc2vec": "normalized", "sentenceBERT": "normalized"},
    "AgNewsTitle": {"doc2vec": "normalized", "sentenceBERT": "normalized"},
}
# Embedding dimensions to consider for each dataset / embedding method.
vector_dims = {
    "AgNews": {"doc2vec": [8], "sentenceBERT": [384]},
    "20News": {"doc2vec": [80], "sentenceBERT": [384]},
    "AgNewsTitle": {
        "doc2vec": [2, 4, 6, 8, 10, 20, 40, 80, 160],
        "sentenceBERT": [384],
    },
}

In [7]:
# Column names pulled out of each per-model statistics CSV.
stats_vals = ["aic", "bic", "mi", "logl"]

# Stats

## Read data

In [8]:
def load_stats_data(
    data_type, vectorize_type, stats_vals, covariance_types, model_nums, normalization
):
    """Load the requested statistic columns from the per-model GMM stats CSVs.

    Each file
    ``../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/stats/{normalization}/{covariance_type}/{model_num}.csv``
    is read once (first column as index) and split into the requested columns.

    Args:
        data_type: Dataset name used in the file path.
        vectorize_type: Embedding method name used in the file path.
        stats_vals: Column names to extract from every CSV.
        covariance_types: Covariance types to load (one directory each).
        model_nums: Number of model files; ``range(model_nums)`` is loaded.
        normalization: Normalization directory name used in the file path.

    Returns:
        Nested dict ``stats[stats_val][covariance_type][model_num]`` holding the
        ``stats_val`` column of the corresponding CSV as a Series.
    """
    # Result skeleton; the innermost dicts are filled in below.
    stats = {
        stats_val: {covariance_type: {} for covariance_type in covariance_types}
        for stats_val in stats_vals
    }

    # Read every file exactly once, then fan its columns out per statistic.
    for covariance_type in covariance_types:
        for model_num in range(model_nums):
            stats_path = f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/stats/{normalization}/{covariance_type}/{model_num}.csv"
            df = pd.read_csv(stats_path, index_col=0)
            for stats_val in stats_vals:
                stats[stats_val][covariance_type][model_num] = df.loc[:, stats_val]
    return stats

In [9]:
def load_lda_mi(data_type):
    """Read the LDA mutual-information CSV for ``data_type`` and summarise it.

    The table is passed to ``get_describe(..., axis=0)``; only the first
    element of the returned pair is handed back to the caller.
    """
    mi_path = f"../../Postprocessing/data/{data_type}/LDA/mi.csv"
    mi_table = pd.read_csv(mi_path, index_col=0)
    summary, _unused_keys = get_describe(mi_table, axis=0)
    return summary

In [10]:
# Load the GMM statistics for every dataset / embedding-method pair.
stats_dict = {}
for data_type in data_types:
    stats_dict[data_type] = {}
    for vectorize_type in vectorize_types:
        stats_dict[data_type][vectorize_type] = load_stats_data(
            data_type=data_type,
            vectorize_type=vectorize_type,
            stats_vals=stats_vals,
            covariance_types=covariance_types[data_type][vectorize_type],
            model_nums=model_nums,
            normalization=normalization[data_type][vectorize_type],
        )

In [11]:
# Summary of the LDA baseline's mutual information, one entry per dataset.
describe_lda_mi = {}
for data_type in data_types:
    describe_lda_mi[data_type] = load_lda_mi(data_type)

## Data shaping

In [12]:
def shape_stats_df(stats, covariance_types):
    """Concatenate the per-model Series of every statistic into one DataFrame.

    The statistics to process are taken from the keys of ``stats`` itself, so
    the function does not depend on a notebook-level ``stats_vals`` variable.

    Args:
        stats: Nested dict ``stats[stats_val][covariance_type][model_num]`` of
            Series, as produced by ``load_stats_data``.
        covariance_types: Covariance types to process for each statistic.

    Returns:
        ``stats_df[stats_val][covariance_type]``: a DataFrame with one column
        per ``model_num`` (the dict keys become the column labels).
    """
    return {
        stats_val: {
            # Join the loaded stats side by side over model_num.
            covariance_type: pd.concat(per_covariance[covariance_type], axis=1)
            for covariance_type in covariance_types
        }
        for stats_val, per_covariance in stats.items()
    }

In [13]:
def shape_describe(stats_df, covariance_types):
    """Summarise every statistic's DataFrame across its columns.

    The statistics to process are taken from the keys of ``stats_df`` itself
    rather than from a notebook-level ``stats_vals`` variable.

    Args:
        stats_df: ``stats_df[stats_val][covariance_type]`` DataFrames, as
            produced by ``shape_stats_df``.
        covariance_types: Covariance types to process for each statistic.

    Returns:
        A pair ``(describe, describe_keys)``. ``describe[stats_val][covariance_type]``
        is the first element returned by ``get_describe(..., axis=1)``;
        ``describe_keys`` is the second element from the last call made.

    Note:
        If ``stats_df`` or ``covariance_types`` is empty, ``get_describe`` is
        never called and ``describe_keys`` is unbound at the return statement.
    """
    describe = {stats_val: {} for stats_val in stats_df}

    for stats_val, per_covariance in stats_df.items():
        for covariance_type in covariance_types:
            describe[stats_val][covariance_type], describe_keys = get_describe(
                per_covariance[covariance_type], axis=1
            )
    return describe, describe_keys

In [14]:
def shape_data(describe, describe_keys, covariance_types):
    """Regroup the summaries so covariance types become DataFrame columns.

    The statistics to process are taken from the keys of ``describe`` itself
    rather than from a notebook-level ``stats_vals`` variable.

    Args:
        describe: ``describe[stats_val][covariance_type][describe_key]`` values
            that ``pd.concat`` can join (e.g. Series).
        describe_keys: Summary names to extract (the notebook uses "mean" and "std").
        covariance_types: Covariance types to place side by side.

    Returns:
        ``data[stats_val][describe_key]``: a DataFrame with one column per
        covariance type.
    """
    return {
        stats_val: {
            # Join the data side by side over covariance_type.
            describe_key: pd.concat(
                {
                    covariance_type: per_covariance[covariance_type][describe_key]
                    for covariance_type in covariance_types
                },
                axis=1,
            )
            for describe_key in describe_keys
        }
        for stats_val, per_covariance in describe.items()
    }

In [15]:
# Run the three shaping steps for every dataset / embedding-method pair.
data_stats = {}
for data_type in data_types:
    data_stats[data_type] = {}
    for vectorize_type, stats in stats_dict[data_type].items():
        stats_df = shape_stats_df(stats, covariance_types[data_type][vectorize_type])
        describe, describe_keys = shape_describe(
            stats_df, covariance_types[data_type][vectorize_type]
        )
        data_stats[data_type][vectorize_type] = shape_data(
            describe, describe_keys, covariance_types[data_type][vectorize_type]
        )

# Coherence

## Read data

In [16]:
def load_coherence(data_type, vectorize_type, vector_dims, normalization):
    """Read one GMM coherence CSV per embedding dimension.

    Returns a dict mapping each entry of ``vector_dims`` to the DataFrame read
    from its CSV file (first column used as the index).
    """
    return {
        vector_dim: pd.read_csv(
            f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/coherence/{normalization}/{vector_dim}.csv",
            index_col=0,
        )
        for vector_dim in vector_dims
    }

In [17]:
# Load the GMM coherence tables for every dataset / embedding-method pair.
coherences = {}
for data_type in data_types:
    coherences[data_type] = {}
    for vectorize_type in vectorize_types:
        coherences[data_type][vectorize_type] = load_coherence(
            data_type=data_type,
            vectorize_type=vectorize_type,
            vector_dims=vector_dims[data_type][vectorize_type],
            normalization=normalization[data_type][vectorize_type],
        )

In [18]:
# Summarise the LDA baseline's coherence table for each dataset.
describe_lda_coherence = {}
for data_type in data_types:
    coherence_lda_path = f"../../Postprocessing/data/{data_type}/LDA/coherence.csv"
    coherence_lda = pd.read_csv(coherence_lda_path, index_col=0)
    # get_describe returns a pair; the second element is discarded into `_`.
    describe_lda_coherence[data_type], _ = get_describe(coherence_lda, axis=0)

## Data shaping

In [19]:
def shape_coherence_df(coherences, covariance_types, vector_dims):
    """Collect, per covariance type, that column from every dimension's table.

    Returns a dict mapping each covariance type to a DataFrame whose columns
    are the entries of ``vector_dims``.
    """
    return {
        cov: pd.concat(
            {dim: coherences[dim].loc[:, cov] for dim in vector_dims},
            axis=1,
        )
        for cov in covariance_types
    }

In [20]:
# Reshape the loaded coherence tables for every dataset / embedding-method pair.
coherence_df = {}
for data_type in data_types:
    coherence_df[data_type] = {}
    for vectorize_type in vectorize_types:
        coherence_df[data_type][vectorize_type] = shape_coherence_df(
            coherences=coherences[data_type][vectorize_type],
            covariance_types=covariance_types[data_type][vectorize_type],
            vector_dims=vector_dims[data_type][vectorize_type],
        )

In [21]:
def shape_coherence_describe(coherence_df, covariance_types):
    """Compute summary statistics of the coherence table of each covariance type.

    Returns ``(summaries, describe_keys)``: ``summaries`` maps each covariance
    type to the first element returned by ``get_describe(..., axis=0)``, and
    ``describe_keys`` is the second element from the last call made.
    """
    summaries = {}
    for cov in covariance_types:
        summaries[cov], describe_keys = get_describe(coherence_df[cov], axis=0)
    return summaries, describe_keys

In [22]:
# Summarise the coherence tables for every dataset / embedding-method pair.
# `describe_keys` is overwritten on every iteration; the value left after the
# last iteration stays in the notebook namespace.
coherence_describe = {data_type: {} for data_type in data_types}
for data_type in data_types:
    for vectorize_type in vectorize_types:
        coherence_describe[data_type][vectorize_type], describe_keys = shape_coherence_describe(
            coherence_df[data_type][vectorize_type], covariance_types[data_type][vectorize_type]
        )

In [23]:
def shape_coherence_data(describe, describe_keys, covariance_types):
    """Regroup coherence summaries so covariance types become columns.

    Returns a dict mapping each entry of ``describe_keys`` to a DataFrame with
    one column per covariance type.
    """
    return {
        key: pd.concat(
            # Join the data side by side over covariance_type.
            {cov: describe[cov][key] for cov in covariance_types},
            axis=1,
        )
        for key in describe_keys
    }

In [24]:
# Regroup the coherence summaries for every dataset / embedding-method pair.
# `describe_keys` is the notebook-level value left over from the cell that
# built `coherence_describe`.
coherence_data = {}
for data_type in data_types:
    coherence_data[data_type] = {}
    for vectorize_type in vectorize_types:
        coherence_data[data_type][vectorize_type] = shape_coherence_data(
            describe=coherence_describe[data_type][vectorize_type],
            describe_keys=describe_keys,
            covariance_types=covariance_types[data_type][vectorize_type],
        )

# Make Chart

In [25]:
# Collect, per dataset and method, the figures shown in the comparison table.
chart_data = {data_type: {} for data_type in data_types}
for data_type in data_types:
    for vectorize_type in vectorize_types:
        # (row label, column label) of the largest mean MI among the configured
        # embedding dimensions and covariance types.
        mi_max_idx = (
            data_stats[data_type][vectorize_type]["mi"]["mean"]
            .loc[
                vector_dims[data_type][vectorize_type],
                covariance_types[data_type][vectorize_type],
            ]
            .stack()
            .idxmax()
        )
        chart_data[data_type][vectorize_type] = {
            "埋め込み次元": mi_max_idx[0],
            "mutual information": data_stats[data_type][vectorize_type]["mi"][
                "mean"
            ].loc[mi_max_idx],
            # NOTE(review): the key "分散" reads "variance", but the stored value
            # comes from the "std" table.
            "分散": data_stats[data_type][vectorize_type]["mi"]["std"].loc[mi_max_idx],
            "coherence": coherence_data[data_type][vectorize_type]["mean"].loc[
                mi_max_idx
            ],
        }
    # LDA baseline: no embedding dimension is recorded for it.
    chart_data[data_type]["LDA"] = {
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        "埋め込み次元": np.nan,
        "mutual information": describe_lda_mi[data_type]["mean"].to_numpy()[0],
        "分散": describe_lda_mi[data_type]["std"].to_numpy()[0],
        "coherence": describe_lda_coherence[data_type]["mean"].to_numpy()[0],
    }

In [26]:
# One frame per dataset (methods as rows, metrics as columns), stacked under a
# dataset-level outer index.
chart_df = pd.concat(
    {
        dataset: pd.DataFrame(per_method).transpose()
        for dataset, per_method in chart_data.items()
    }
)

In [27]:
def highlight_edge(Series: pd.Series, highlight_type="bold", max_min="max", axis=0):
    """Build per-element CSS strings that emphasise the extreme value of a Series.

    Args:
        Series: Values to inspect.
        highlight_type: ``font-weight`` value applied to the extreme element.
        max_min: ``"max"`` to mark the largest value, ``"min"`` for the smallest.
        axis: Passed through to ``Series.idxmax`` / ``Series.idxmin``.

    Returns:
        A list with one CSS string per index label: ``"font-weight: <highlight_type>"``
        for the label of the extreme value and ``"font-weight: "`` for the rest.

    Raises:
        NotImplementedError: If ``max_min`` is neither ``"max"`` nor ``"min"``.
    """
    if max_min == "max":
        highlight_idx = Series.idxmax(axis=axis)
    elif max_min == "min":
        # Previously this branch also called idxmax, so "min" marked the maximum.
        highlight_idx = Series.idxmin(axis=axis)
    else:
        raise NotImplementedError
    return [
        "font-weight: %s" % highlight_type if _id == highlight_idx else "font-weight: "
        for _id in Series.index
    ]

In [28]:
def find_edge(Series: pd.Series, max_min="max", axis=0):
    """Return a boolean mask marking the extreme value of a Series.

    Args:
        Series: Values to inspect.
        max_min: ``"max"`` to mark the largest value, ``"min"`` for the smallest.
        axis: Passed through to ``Series.idxmax`` / ``Series.idxmin``.

    Returns:
        A list of bools, one per index label, True only at the label of the
        extreme value.

    Raises:
        NotImplementedError: If ``max_min`` is neither ``"max"`` nor ``"min"``.
    """
    if max_min == "max":
        highlight_idx = Series.idxmax(axis=axis)
    elif max_min == "min":
        # Previously this branch also called idxmax, so "min" marked the maximum.
        highlight_idx = Series.idxmin(axis=axis)
    else:
        raise NotImplementedError
    return [bool(_id == highlight_idx) for _id in Series.index]

In [29]:
# Boolean mask with chart_df's shape: find_edge runs per column within each
# outer-index group and uses its default max_min="max".
max_val = chart_df.groupby(level=0).transform(find_edge , axis=0)

In [30]:
# Replace every cell outside the mask with the string "nan", then bold the rest.
# NOTE(review): Styler.applymap is deprecated in recent pandas in favour of
# Styler.map — confirm against the pandas version this notebook targets.
max_style = chart_df.where(max_val, "nan").style.applymap(
    lambda x: "font-weight: bold" if x != "nan" else "font-weight:"
)

In [31]:
# Display the masked table (only the per-group maxima remain visible).
max_style

Unnamed: 0,Unnamed: 1,埋め込み次元,mutual information,分散,coherence
AgNews,doc2vec,,,,
AgNews,sentenceBERT,384.0,0.607587,,0.957356
AgNews,LDA,,,0.07255,
AgNewsTitle,doc2vec,,,,
AgNewsTitle,sentenceBERT,384.0,0.501186,,0.836579
AgNewsTitle,LDA,,,0.008628,
20News,doc2vec,,,,
20News,sentenceBERT,384.0,0.593456,,0.591931
20News,LDA,,,0.016865,


In [32]:
# Show the full table, reusing the style exported from max_style.
chart_df.style.use(max_style.export())

Unnamed: 0,Unnamed: 1,埋め込み次元,mutual information,分散,coherence
AgNews,doc2vec,8.0,0.475413,0.000222,0.79772
AgNews,sentenceBERT,384.0,0.607587,0.018031,0.957356
AgNews,LDA,,0.407566,0.07255,0.806057
AgNewsTitle,doc2vec,6.0,0.21608,0.000567,0.716951
AgNewsTitle,sentenceBERT,384.0,0.501186,0.000344,0.836579
AgNewsTitle,LDA,,0.02258,0.008628,0.704813
20News,doc2vec,80.0,0.522142,0.008136,0.587233
20News,sentenceBERT,384.0,0.593456,0.011134,0.591931
20News,LDA,,0.332196,0.016865,0.538046


In [33]:
# Rows of the "AgNews" group, restricted to the two reported metrics.
chart_df.loc[pd.IndexSlice["AgNews", :], ["mutual information", "coherence"]]

Unnamed: 0,Unnamed: 1,mutual information,coherence
AgNews,doc2vec,0.475413,0.79772
AgNews,sentenceBERT,0.607587,0.957356
AgNews,LDA,0.407566,0.806057


In [34]:
# Emit the comparison table as LaTeX; the saved output shows the best value of
# each metric bolded within each dataset group.
print(
    chart_df.loc[:, ["mutual information", "coherence"]]
    .style
    # One highlight_max call per dataset group (rows of that group, all columns).
    .highlight_max(subset=pd.IndexSlice["AgNews", :], props="font-weight: bold")
    .highlight_max(subset=pd.IndexSlice["20News", :], props="font-weight: bold")
    .highlight_max(subset=pd.IndexSlice["AgNewsTitle", :], props="font-weight: bold")
    .format(precision=3, escape="latex")
    .to_latex(
        column_format="rrrr",
        position="ht",
        position_float="centering",
        hrules=True,
        caption="提案手法と既存手法のAMIと$C_v$の比較",
        # NOTE(review): the other to_latex cells in this notebook use the same label.
        label="table:1",
        multicol_align="r",
        # Needed so the CSS font-weight is emitted as \bfseries in the output.
        convert_css=True,
    )
)

\begin{table}[ht]
\centering
\caption{提案手法と既存手法のAMIと$C_v$の比較}
\label{table:1}
\begin{tabular}{rrrr}
\toprule
 &  & mutual information & coherence \\
\midrule
\multirow[c]{3}{*}{AgNews} & doc2vec & 0.475 & 0.798 \\
 & sentenceBERT & \bfseries 0.608 & \bfseries 0.957 \\
 & LDA & 0.408 & 0.806 \\
\multirow[c]{3}{*}{AgNewsTitle} & doc2vec & 0.216 & 0.717 \\
 & sentenceBERT & \bfseries 0.501 & \bfseries 0.837 \\
 & LDA & 0.023 & 0.705 \\
\multirow[c]{3}{*}{20News} & doc2vec & 0.522 & 0.587 \\
 & sentenceBERT & \bfseries 0.593 & \bfseries 0.592 \\
 & LDA & 0.332 & 0.538 \\
\bottomrule
\end{tabular}
\end{table}



In [35]:
# Same table as the previous LaTeX cell, but with multicol_align="c" and
# multirow_align="t" (the saved output shows \multirow[t] for the dataset labels).
print(
    chart_df.loc[:, ["mutual information", "coherence"]]
    .style
    # One highlight_max call per dataset group (rows of that group, all columns).
    .highlight_max(subset=pd.IndexSlice["AgNews", :], props="font-weight: bold")
    .highlight_max(subset=pd.IndexSlice["20News", :], props="font-weight: bold")
    .highlight_max(subset=pd.IndexSlice["AgNewsTitle", :], props="font-weight: bold")
    .format(precision=3, escape="latex")
    .to_latex(
        column_format="rrrr",
        position="ht",
        position_float="centering",
        hrules=True,
        caption="提案手法と既存手法のAMIと$C_v$の比較",
        # NOTE(review): the other to_latex cells in this notebook use the same label.
        label="table:1",
        multicol_align="c",
        multirow_align="t",
        # Needed so the CSS font-weight is emitted as \bfseries in the output.
        convert_css=True,
    )
)

\begin{table}[ht]
\centering
\caption{提案手法と既存手法のAMIと$C_v$の比較}
\label{table:1}
\begin{tabular}{rrrr}
\toprule
 &  & mutual information & coherence \\
\midrule
\multirow[t]{3}{*}{AgNews} & doc2vec & 0.475 & 0.798 \\
 & sentenceBERT & \bfseries 0.608 & \bfseries 0.957 \\
 & LDA & 0.408 & 0.806 \\
\multirow[t]{3}{*}{AgNewsTitle} & doc2vec & 0.216 & 0.717 \\
 & sentenceBERT & \bfseries 0.501 & \bfseries 0.837 \\
 & LDA & 0.023 & 0.705 \\
\multirow[t]{3}{*}{20News} & doc2vec & 0.522 & 0.587 \\
 & sentenceBERT & \bfseries 0.593 & \bfseries 0.592 \\
 & LDA & 0.332 & 0.538 \\
\bottomrule
\end{tabular}
\end{table}



In [84]:
# Plain (unhighlighted) LaTeX version of the comparison table.
# NOTE(review): this cell's execution count is out of sequence and its saved
# output shows different values from the cells above, so it appears to have been
# run against a different chart_df — re-run to confirm.
print(
    chart_df.loc[:, ["mutual information", "coherence"]]
    .style.format(precision=3, escape="latex")
    .to_latex(
        # column_format is left unset here; the saved output shows {llrr}.
        position="h",
        position_float="centering",
        hrules=True,
        caption="miの比較と埋め込み次元",
        # NOTE(review): the other to_latex cells in this notebook use the same label.
        label="table:1",
        multicol_align="r",
    )
)

\begin{table}[h]
\centering
\caption{miの比較と埋め込み次元}
\label{table:1}
\begin{tabular}{llrr}
\toprule
 &  & mutual information & coherence \\
\midrule
\multirow[c]{3}{*}{AgNews} & doc2vec & 0.431 & 0.783 \\
 & sentenceBERT & 0.610 & 0.925 \\
 & LDA & 0.408 & 0.806 \\
\multirow[c]{3}{*}{AgNewsTitle} & doc2vec & 0.195 & 0.747 \\
 & sentenceBERT & 0.518 & 1.000 \\
 & LDA & 0.023 & 0.705 \\
\multirow[c]{3}{*}{20News} & doc2vec & 0.440 & 0.587 \\
 & sentenceBERT & 0.649 & 0.599 \\
 & LDA & 0.332 & 0.538 \\
\bottomrule
\end{tabular}
\end{table}

