# Import

In [245]:
import csv
import os
import pickle
import sys
import copy
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools
import pandas as pd
from tqdm import tqdm
from sympy.combinatorics import Permutation

## Add configuration file

In [2]:
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")
sys.path.append("../PlotFunction/lineplot/")
sys.path.append("../PlotFunction/config/")

In [3]:
from ALL import config
from line_plot_1 import line_plot_1
from line_plot_error_1 import line_plot_error_1
from line_plot_1_layout import layout
from util import *

## Set condition

In [4]:
tqdm.pandas()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

In [5]:
data_types = ["AgNews"]
vectorize_types = ["sentenceBERT"]

In [6]:
# model_nums = config["clustering"]["gmm"]["max_model_num"]
model_nums = 10
# covariance_types = config["clustering"]["gmm"]["covariance_types"]
normalization = "normalized"
covariance_types = ["spherical"]
vector_dims = {
    "sentenceBERT": [20]
}
topic_nums = [2, 4, 8, 16, 32, 64, 128]

In [7]:
stats_vals = ["aic", "bic", "mi", "logl"]

# Stats

## Read data

In [8]:
def load_stats_data(
    vectorize_type, stats_vals, covariance_types, model_nums, topic_nums
):
    # データ型定義
    stats = make_multilayer_dict(
        [stats_vals, covariance_types, range(model_nums), topic_nums]
    )

    # データ取得
    for stats_val in stats_vals:
        for covariance_type in covariance_types:
            for model_num in range(model_nums):
                for topic_num in topic_nums:
                    stats_path = f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/stats/{normalization}/{covariance_type}/{topic_num}/{model_num}.csv"
                    df = pd.read_csv(stats_path, index_col=0)
                    stats[stats_val][covariance_type][model_num] = df.loc[:, stats_val]
    return stats

In [8]:
def load_stats_data(vectorize_type, stats_vals, covariance_types, model_nums, topic_nums):
    # データ型定義
    stats = {
        stats_val: {
            covariance_type: {
                model_num: {topic_num: pd.DataFrame for topic_num in topic_nums}
                for model_num in range(model_nums)
            }
            for covariance_type in covariance_types
        }
        for stats_val in stats_vals
    }

    # データ取得
    for stats_val in stats_vals:
        for covariance_type in covariance_types:
            for model_num in range(model_nums):
                for topic_num in topic_nums:
                    stats_path = f"../../Postprocessing/data/{data_type}/{vectorize_type}/GMM/stats/{normalization}/{covariance_type}/{topic_num}/{model_num}.csv"
                    df = pd.read_csv(stats_path, index_col=0)
                    stats[stats_val][covariance_type][model_num] = df.loc[:, stats_val]
    return stats

In [9]:
def load_lda_mi(data_type):
    lda_mi = pd.read_csv(
        f"../../Postprocessing/data/{data_type}/LDA/mi.csv", index_col=0
    )
    describe_lda_mi, _ = get_describe(lda_mi, axis=0)
    return describe_lda_mi

In [10]:
stats_dict = {}
for vectorize_type in vectorize_types:
    for data_type in data_types:
        stats_dict[vectorize_type] = load_stats_data(
            vectorize_type, stats_vals, covariance_types, model_nums, topic_nums
        )

In [23]:
describe_lda_mi = load_lda_mi(data_type)

## Data shaping

In [24]:
def shape_stats_df(stats):
    stats_df = {
        stats_val: {covariance_type: pd.DataFrame() for covariance_type in covariance_types}
        for stats_val in stats_vals
    }

    for stats_val in stats_vals:
        for covariance_type in covariance_types:
            # model_numについて取得データstatsを結合
            stats_df[stats_val][covariance_type] = pd.concat(
                stats[stats_val][covariance_type], axis=1
            )
    return stats_df

In [25]:
def shape_describe(stats_df):
    describe = {
        stats_val: {covariance_type: dict() for covariance_type in covariance_types}
        for stats_val in stats_vals
    }

    for stats_val in stats_vals:
        for covariance_type in covariance_types:
            describe[stats_val][covariance_type], describe_keys = get_describe(
                stats_df[stats_val][covariance_type], axis=1
            )
    return describe, describe_keys

In [26]:
def shape_data(describe, describe_keys):
    data = {
        stats_val: {describe_key: pd.DataFrame() for describe_key in describe_keys}
        for stats_val in stats_vals
    }
    for stats_val in stats_vals:
        for describe_key in describe_keys:
            # covariance_typeについてデータを結合
            _data = {
                covariance_type: describe[stats_val][covariance_type][describe_key]
                for covariance_type in covariance_types
            }
            data[stats_val][describe_key] = pd.concat(_data, axis=1)
    return data

In [73]:
describes = {
    vectorize_type: {
        stats_val: {covariance_type: {} for covariance_type in covariance_types}
        for stats_val in stats_vals
    }
    for vectorize_type in vectorize_types
}

for vectorize_type in vectorize_types:
    for stats_val in stats_vals:
        for covariance_type in covariance_types:
            df[covariance_type] = pd.concat(
                stats_dict["sentenceBERT"][stats_val][covariance_type], axis=1
            )
            (
                describes[vectorize_type][stats_val][covariance_type],
                describe_key,
            ) = get_describe(df[covariance_type], axis=1)

In [403]:
class multilayer_dict:
    def __init__(self, names_keys: dict):
        self.dict = self.make_multilayer_dict(list(names_keys.values()))
        self.names = list(names_keys.keys())
        self.names_keys = names_keys

    def make_multilayer_dict(self, keys: list):
        def _multilayer_dict_recursive(_d: dict, _keys: list):
            if not _keys:
                return _d, []
            else:
                return _multilayer_dict_recursive(
                    {_key: copy.deepcopy(_d) for _key in _keys[-1]}, _keys[:-1]
                )

        _multilayer_dict, _ = _multilayer_dict_recursive(dict(), keys)
        return _multilayer_dict

    def name_is_in(self, name):
        return name in self.name

    def loc(self, key_list: list):
        def _loc_recursive(_val, _key_list: list):
            if not _key_list:
                return _val
            else:
                return _loc_recursive(_val[_key_list[0]], _key_list[1:])

        return _loc_recursive(self.dict, key_list)

    def update(self, key_list: list, val):
        def _make_key_path_recursive(_key_path: dict, _key_list: list, val=None):
            if not (_key_path == dict() or val is None):
                raise ValueError(f"_key_path or val mast be empty dict(or None).")
            if not _key_path:
                _key_path = {_key_list[-1]: val}
                return _make_key_path_recursive(_key_path, _key_list[:-1])
            elif not _key_list:
                return _key_path
            else:
                return _make_key_path_recursive(
                    {_key_list[-1]: _key_path}, _key_list[:-1], val
                )

        _key_path = _make_key_path_recursive({}, key_list, val)
        print(_key_path)

        self.dict.update(_key_path)

In [404]:
# 外部関数にした方がいい
def swap_keys(old_multi_dict, new_names: list):
    def index_to_num(index_lists):
        def _make_index_list(_index_list):
            return {_index: _num for _num, _index in enumerate(_index_list)}

        _perm_dict = _make_index_list(index_lists[0])
        return [
            [_perm_dict[_index] for _index in _index_list]
            for _index_list in index_lists
        ]

    def lists_to_permutation(two_row_perm: list):
        if set(two_row_perm[0]) != set(two_row_perm[1]):
            raise ValueError("Permutations do not match.")
        _two_row_perm = dict(zip(*two_row_perm))
        _cycles = []
        _done = set()
        for _i in _two_row_perm.keys():
            if _i not in _done:
                _cycle = [_i]
                _next_elem = _two_row_perm[_i]
                _done.add(_next_elem)
                while _next_elem != _i:
                    _cycle.append(_next_elem)
                    _next_elem = _two_row_perm[_next_elem]
                    _done.add(_next_elem)
                _cycles.append(_cycle)
        return _cycles

    if not (set(new_names) == set(old_multi_dict.names)):
        raise KeyError(
            f"Keys {set(new_names).symmetric_difference(self.keys)} do not match."
        )

    _two_row_perm = index_to_num([old_multi_dict.names, new_names])
    _cyclic_perm = lists_to_permutation(_two_row_perm)
    _perm = Permutation(_cyclic_perm)

    _new_keys_names = dict(
        zip(
            _perm(list(old_multi_dict.names_keys.keys())),
            _perm(list(old_multi_dict.names_keys.values())),
        )
    )
    _new_multilayer_dict = multilayer_dict(_new_keys_names)
    for _index_keys in itertools.product(*old_multi_dict.names_keys.values()):
        _new_multilayer_dict.update(_perm(_index_keys), old_multi_dict.loc(_index_keys))
    return _new_multilayer_dict

In [405]:
for _index_keys in itertools.product(*multi_d.names_keys.values()):
    print(_index_keys)
    print(multi_d.loc(_index_keys))

('a', 'd', 'e', 'g')
a
('a', 'd', 'f', 'g')


KeyError: 'f'

In [442]:
data = {2: ["a", "b", "c"], 1: ["d", "c"], 3: ["e", "f"], 4: ["g"]}

In [443]:
multi_d = multilayer_dict(data)

In [444]:
multi_d.dict["a"]["d"]["e"]["g"] = ""

In [445]:
_d = multi_d.dict["a"]["d"]

In [446]:
__d = _d["e"]

In [447]:
__d["g"] = ""

In [431]:
multi_d.update(["a", "d", "e", "g"], "end")

{'a': {'d': {'e': {'g': 'end'}}}}


In [441]:
multi_d.dict

{'a': {'d': {'e': {'g': ''}}},
 'b': {'d': {'e': {'g': {}}, 'f': {'g': {}}},
  'c': {'e': {'g': {}}, 'f': {'g': {}}}},
 'c': {'d': {'e': {'g': {}}, 'f': {'g': {}}},
  'c': {'e': {'g': {}}, 'f': {'g': {}}}}}

In [388]:
multi_d.names

[2, 1, 3, 4]

In [421]:
_d = {1:{2:"a", 3: "c"}}

In [422]:
_d

{1: {2: 'a', 3: 'c'}}

In [423]:
_d.update({1: {2: "b"}})

In [424]:
_d

{1: {2: 'b'}}

In [389]:
swap_keys(multi_d, [1, 2, 3, 4])

KeyError: 'f'

In [344]:
multi_d.names_keys

{2: ['a', 'b', 'c'], 1: ['d', 'c'], 3: ['e', 'f'], 4: ['g']}

In [248]:
l_ = list(itertools.product(*[["a"], ["b"], ["c"], ["d" ,"e"]]))

In [251]:
p = Permutation(1, 2, 3)

In [314]:
index_lists = [["a", "b", "c", "d"], ["a", "c", "b", "d"]]

In [315]:
def index_to_num(index_lists):
    def _make_index_list(_index_list):
        return {_index: _num for _num, _index in enumerate(_index_list)}

    _perm_dict = _make_index_list(index_lists[0])
    return [[_perm_dict[_index] for _index in _index_list] for _index_list in index_lists]

In [316]:
def lists_to_permutation(two_row_perm: list):
    if set(two_row_perm[0]) != set(two_row_perm[1]):
        raise ValueError("Permutations do not match.")
    _two_row_perm = dict(zip(*two_row_perm))
    _cycles = []
    _done = set()
    for _i in _two_row_perm.keys():
        if _i not in _done:
            _cycle = [_i]
            _next_elem = _two_row_perm[_i]
            _done.add(_next_elem)
            while _next_elem != _i:
                _cycle.append(_next_elem)
                _next_elem = _two_row_perm[_next_elem]
                _done.add(_next_elem)
            _cycles.append(_cycle)
    return _cycles

In [317]:
two_row_perm = index_to_num(index_lists)

In [318]:
two_row_perm

[[0, 1, 2, 3], [0, 2, 1, 3]]

In [320]:
cyclic_perm = lists_to_permutation(two_row_perm)

In [321]:
p = Permutation(cyclic_perm)

In [337]:
p({0: "a", 2: "b", 1:"c", 3: "d"})

['a', 'b', 'c', 'd']

In [338]:
p({0: "a", 1: "b", 2:"c", 3: "d"})

['a', 'c', 'b', 'd']

In [234]:
multi_d.loc(["a", "d", "e", "g"])

{}

In [142]:
d = make_multilayer_dict([["a", "b", "c"], ["d", "c"], ["e", "f"], ["g"]])

In [165]:
_ = {1: {1: 2}}

In [167]:
_.update({1: {1: 3}})

In [168]:
_

{1: {1: 3}}

In [None]:
def to_DataFrame_recursive(d, keys):
    if isinstance(d, pd.DataFrame):
        return d
    else:
        for _k in keys:
            d[k] = pd.concat({k: pd.DataFrame(v).T for k, v in _d.items()}, axis=0)
        return (
            to_DataFrame_recursive(
                pd.concat({k: pd.DataFrame(v).T for k, v in _d.items()}, axis=0)
            ),
            [],
        )

In [37]:
not {}

True

In [45]:
df = pd.concat({k: pd.DataFrame(v).T for k, v in d.items()}, axis=0)

In [48]:
df

Unnamed: 0,Unnamed: 1,e,f
a,d,{'g': {}},{'g': {}}
a,c,{'g': {}},{'g': {}}
b,d,{'g': {}},{'g': {}}
b,c,{'g': {}},{'g': {}}
c,d,{'g': {}},{'g': {}}
c,c,{'g': {}},{'g': {}}


In [51]:
df.swaplevel(0, 1).mean(axis=1)

  df.swaplevel(0, 1).mean(axis=0)


Series([], dtype: float64)

In [43]:
d

{'a': {'d': {'e': {'g': {}}, 'f': {'g': {}}},
  'c': {'e': {'g': {}}, 'f': {'g': {}}}},
 'b': {'d': {'e': {'g': {}}, 'f': {'g': {}}},
  'c': {'e': {'g': {}}, 'f': {'g': {}}}},
 'c': {'d': {'e': {'g': {}}, 'f': {'g': {}}},
  'c': {'e': {'g': {}}, 'f': {'g': {}}}}}

In [44]:
pd.read_csv("test.csv")

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,e,f
0,a,d,{'g': {}},{'g': {}}
1,a,c,{'g': {}},{'g': {}}
2,b,d,{'g': {}},{'g': {}}
3,b,c,{'g': {}},{'g': {}}
4,c,d,{'g': {}},{'g': {}}
5,c,c,{'g': {}},{'g': {}}


In [23]:
flatten_dict(s)

{}

In [21]:
pd.DataFrame.from_dict(s)

Unnamed: 0,a,b,c
d,"{'e': {}, 'f': {}}","{'e': {}, 'f': {}}","{'e': {}, 'f': {}}"
c,"{'e': {}, 'f': {}}","{'e': {}, 'f': {}}","{'e': {}, 'f': {}}"


In [27]:
data_stats = {}
for stats_val in stats_vals:
    for vectorize_type, stats in stats_dict["sentenceBERT"][stats_val].items():
        stats_df = shape_stats_df(stats)
        describe, describe_keys = shape_describe(stats_df)
        data_stats[vectorize_type] = shape_data(describe, describe_keys)

In [30]:
stats_df["mi"]["spherical"].style.highlight_max(axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
20,0.341989,0.341835,0.341814,0.338648,0.343823,0.342038,0.343106,0.341574,0.344153,0.34305


# Make Chart

In [32]:
chart_data = {}
chart_data["sentenceBERT"] = data_stats["sentenceBERT"]["mi"]["mean"].loc[384, :]

KeyError: 384

In [33]:
pd.DataFrame(chart_data)

In [132]:
chart_df = pd.DataFrame(chart_data)

In [133]:
chart_df.T.style.format(
    escape="latex", formatter={"document_count": "{:.0f}"}
)

Unnamed: 0,spherical,diag,full
doc2vec,0.454906,0.429935,0.475413
sentenceBERT,0.582227,0.585635,0.607587


In [134]:
chart_df.to_csv(make_filepath(f"../data/{data_type}/CovarianceChart.csv"))

In [135]:
print(
    chart_df.T
    .style.format(precision=3, escape="latex")
    .to_latex(
        column_format="rrrr",
        position="h",
        position_float="centering",
        hrules=True,
        caption="miの比較と埋め込み次元",
        label="table:1",
        multicol_align="r",
    )
)

\begin{table}[h]
\centering
\caption{miの比較と埋め込み次元}
\label{table:1}
\begin{tabular}{rrrr}
\toprule
 & spherical & diag & full \\
\midrule
doc2vec & 0.455 & 0.430 & 0.475 \\
sentenceBERT & 0.582 & 0.586 & 0.608 \\
\bottomrule
\end{tabular}
\end{table}

