# Import

In [63]:
import csv
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sps
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.mixture import GaussianMixture
from tqdm import tqdm

## Add configuration and utility module paths

In [2]:
# Put the project's config, util and plotting directories on the module
# search path so the imports in the next cell can be resolved.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
        "../../Visualize/PlotFunction/lineplot/",
        "../../Visualize/PlotFunction/config/",
    ]
)

In [3]:
from ALL import config
from line_plot_1 import line_plot_1
from line_plot_error_1 import line_plot_error_1
from line_plot_error_1_layout import layout
from util import *

## Set conditions

In [4]:
# Register tqdm's pandas integration.
tqdm.pandas()

# Raise the pandas display limits used when rendering frames in the notebook.
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 50),
):
    pd.set_option(option_name, option_value)

In [13]:
# Experiment selection; these names are interpolated into the data paths below.
data_type, vectorize_type, normalization = (
    "AgNewsTitle",
    "sentenceBERT",
    "normalized",
)

# Read data

In [14]:
# Load the master table of the selected dataset; the first CSV column
# becomes the DataFrame index.
master_csv_path = f"../../Preprocessing/data/{data_type}/master.csv"
df = pd.read_csv(master_csv_path, index_col=0)

In [15]:
# Read every row of class.csv; each entry of class_labels is the list of
# string fields of one CSV row.
# newline="" hands line-ending handling to the csv module, as its
# documentation requires for files passed to csv.reader.
with open(
    f"../../Preprocessing/data/{data_type}/class.csv", mode="r", newline=""
) as f:
    reader = csv.reader(f)
    class_labels = list(reader)

In [32]:
# This run is pinned to a single vector dimension and covariance type.
# Alternative (disabled): read the lists from the config instead, i.e.
#   config["vectorize"][vectorize_type]["dims"]
#   config["clustering"]["gmm"]["covariance_types"]
vector_dims = [384]
covariance_types = ["full"]

# Model counts come from the shared config.
_gmm_config = config["clustering"]["gmm"]
_vectorize_config = config["vectorize"][vectorize_type]
model_nums = _gmm_config["max_model_num"]
vector_model_num = _vectorize_config["max_model_num"]

In [33]:
# Values of the "class" column of the master table, as a NumPy array.
label = df.loc[:, "class"].to_numpy()

# Data shaping

In [56]:
# Probability passed as q to the chi-square quantile function (sps.chi2.ppf)
# when selecting samples in the data-shaping cell.
threshold = 5e-3

In [57]:
# Root directory of the saved GMM distance arrays for the selected dataset
# and vectorizer; per-model .npy files are resolved beneath it.
dist_path = "/".join(
    ("../../Clustering/data", data_type, vectorize_type, "GMM/dist")
)

In [58]:
# Distance cutoff: the chi-square quantile at probability `threshold` with
# `vector_dim` degrees of freedom. It depends only on the dimension, so it is
# computed once per dimension instead of once per model.
cutoffs = {
    vector_dim: sps.chi2.ppf(q=threshold, df=vector_dim)
    for vector_dim in vector_dims
}

# Collect, for every (covariance_type, vector_dim, model_num), the row labels
# of df whose smallest per-row distance is below the cutoff. Layout:
#   high_confirm_index[covariance_type][vector_dim][model_num] -> pandas Index
high_confirm_index = {}
for covariance_type in tqdm(covariance_types):
    high_confirm_index[covariance_type] = {}
    for vector_dim in vector_dims:
        selected_by_model = {}
        for model_num in range(model_nums):
            # NOTE(review): dist is assumed to have one row per row of df and
            # one column per mixture component -- confirm against the code
            # that writes these files.
            dist = np.load(
                f"{dist_path}/{vector_dim}/{normalization}/{covariance_type}/{model_num}.npy",
            )
            # Index df.index directly; building df[mask] only to read its
            # index would copy the whole frame for every model.
            selected_by_model[model_num] = df.index[
                dist.min(axis=1) < cutoffs[vector_dim]
            ]
        high_confirm_index[covariance_type][vector_dim] = selected_by_model

100%|██████████| 1/1 [00:00<00:00,  3.08it/s]


In [62]:
def getmax_rev(series, topnum=100, getmin=False, getindex=False):
    """Return the ``topnum`` largest (or smallest) entries of ``series``.

    Args:
        series: pandas Series to select from.
        topnum: Maximum number of entries to keep.
        getmin: If true, keep the smallest values instead of the largest.
        getindex: If true, return the original index labels of the selected
            entries instead of their values.

    Returns:
        The index labels of the selected entries when ``getindex`` is true.
        Otherwise the selected values, re-labelled 1..k in selection order,
        where k is the number of entries actually selected (at most
        ``topnum``).
    """
    out = series.nsmallest(topnum) if getmin else series.nlargest(topnum)
    if getindex:
        return out.index
    # The selection can hold fewer than ``topnum`` rows (short input, NaNs
    # dropped), so size the rank labels from the result itself; a fixed
    # 1..topnum range would raise a length-mismatch error in that case.
    return out.set_axis(pd.RangeIndex(1, len(out) + 1))

In [64]:
# Display the collected index sets (rendered as the notebook cell output below).
high_confirm_index

{'full': {384: {0: Int64Index([     4,      5,      6,     14,     27,     79,     80,     81,
                   83,     86,
               ...
               119971, 119974, 119975, 119978, 119982, 119983, 119993, 119994,
               119998, 119999],
              dtype='int64', length=28971),
   1: Int64Index([     4,      5,      6,     14,     27,     79,     80,     81,
                   83,     86,
               ...
               119971, 119974, 119975, 119978, 119982, 119983, 119993, 119994,
               119998, 119999],
              dtype='int64', length=28995),
   2: Int64Index([     4,      5,      6,     14,     27,     79,     80,     81,
                   83,     86,
               ...
               119971, 119974, 119975, 119978, 119982, 119983, 119993, 119994,
               119998, 119999],
              dtype='int64', length=28948),
   3: Int64Index([     4,      5,      6,     14,     27,     79,     80,     81,
                   83,     86,
             