In [None]:
# ライブラリの読み込み
import json
import pickle
import random
import re
import statistics

import japanize_matplotlib

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pprint import pprint
from sklearn.metrics import mean_absolute_error
from tqdm.auto import tqdm

In [None]:
# Settings: font used in figures and how many columns/rows pandas displays
plt.rcParams.update({"font.family": "IPAexGothic"})

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 100)

In [None]:
# Load the data files
df_result = pd.read_csv("human0821.csv")
df_gt = pd.read_csv("all_data.csv", index_col=0)
df_train = pd.read_csv("train.csv", index_col=0).transpose()
df_test = pd.read_csv("test.csv", index_col=0)

# Transposed view of the ground-truth table
df_t = df_gt.transpose()
# Distinct target IDs found in the result file (their order is not significant)
selected_ids = list(set(df_result["target_ID"]))
# Ground-truth table without the rows whose index label is one of those IDs
df_drop = df_gt.drop(index=selected_ids)

In [None]:
# Load the saved weight arrays: for every dimension 5..29, one .npy file per
# index i in 0..999, stored in a list in index order.
matrix_dict = {}

for dim in tqdm(range(5, 30)):
    weights_for_dim = []
    for i in range(1000):
        weights_for_dim.append(
            np.load(f"nmf_w/train/nmf_w_r{i}/nmf_50000_w_{dim}_r{i}.npy")
        )
    matrix_dict[dim] = weights_for_dim

In [None]:
# Inference
def calculate_score(target_id: int, selected_images: list, dims: tuple):
    """Estimate one full row of ``df_gt`` from a subset of its columns.

    For each dimension in ``dims``, every array in ``matrix_dict[dim]``
    (indices 0..999) is used once: the rows picked by ``selected_images`` are
    pseudo-inverted and applied to the observed values, the full array maps
    the resulting vector back to an estimate for every column, and the
    estimate is clipped to the range [0, 10].  The clipped estimates are then
    averaged.

    Args:
        target_id: Positional row index into ``df_gt`` (used with ``iloc``).
        selected_images: Positional column indices of ``df_gt`` whose values
            are treated as observed; also used as row indices into each array.
        dims: Keys of ``matrix_dict`` to evaluate.

    Returns:
        A dict mapping each dimension to ``{"q": averaged estimate,
        "MAE": mean absolute error between the estimate and the full row}``.
    """
    truth = df_gt.iloc[target_id].to_numpy()
    observed = df_gt.iloc[target_id, selected_images].to_numpy()

    result = {}
    for dim in dims:
        estimates = []
        result[dim] = {}
        for random_state in range(1000):
            weights = matrix_dict[dim][random_state]
            # Solve for the coefficients using only the observed rows.
            coeffs = np.dot(np.linalg.pinv(weights[selected_images]), observed)
            # Reconstruct every column, then force the values into [0, 10].
            estimates.append(np.clip(np.dot(weights, coeffs), 0, 10))
        averaged = sum(estimates) / len(estimates)
        result[dim]["q"] = averaged
        result[dim]["MAE"] = mean_absolute_error(averaged, truth)
    return result

In [None]:
# Dimensions to evaluate; each value is used as a key of matrix_dict
dims = (5, 10, 15)

In [None]:
# Variance of every column of the training data
variances = df_train.var()
# Column labels ordered from the largest variance to the smallest
sorted_columns = variances.sort_values(ascending=False).index.tolist()

In [None]:
# Convert a column name into an image_id
def find_index(col_name):
    """Return the zero-based image index encoded in ``col_name``.

    The name is searched for ``No.<digits>``; the number that is found is
    returned minus one.  When the pattern does not occur, -1 is returned.
    """
    found = re.search(r"No\.(\d+)", col_name)
    if found is None:
        return -1
    return int(found.group(1)) - 1

In [None]:
# Histogram of the per-column variances of the training data
plt.hist(variances, width=0.5)

In [None]:
# Three fixed selections of 10 images based on the training variance.
# Every row of df_test is scored once with each selection.
# iterrows() is a generator, so tqdm is given the row count explicitly in
# order to show real progress.

# Ask about the 10 images with the largest variance
var_large_10 = sorted(find_index(col_name) for col_name in sorted_columns[:10])
print(var_large_10)
var_large_10_results = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    var_large_10_results.append(calculate_score(row.name, var_large_10, dims))

# Ask about the 10 images with the smallest variance
var_small_10 = sorted(find_index(col_name) for col_name in sorted_columns[-10:])
print(var_small_10)
var_small_10_results = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    var_small_10_results.append(calculate_score(row.name, var_small_10, dims))

# Ask about the 5 largest-variance and the 5 smallest-variance images
var_large_5_small_5 = sorted(
    find_index(col_name) for col_name in sorted_columns[-5:] + sorted_columns[:5]
)
print(var_large_5_small_5)
var_large_5_small_5_results = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    var_large_5_small_5_results.append(
        calculate_score(row.name, var_large_5_small_5, dims)
    )

# MAE of every test row, grouped by dimension.  The keys are taken from `dims`
# (instead of being written out as 5/10/15) so that the dictionaries always
# match the dimensions that were actually scored.
MAE_var_large_10 = {
    dim: [result[dim]["MAE"] for result in var_large_10_results] for dim in dims
}
MAE_var_small_10 = {
    dim: [result[dim]["MAE"] for result in var_small_10_results] for dim in dims
}
MAE_var_large_5_small_5 = {
    dim: [result[dim]["MAE"] for result in var_large_5_small_5_results]
    for dim in dims
}

In [None]:
# One panel per dimension, overlaying the MAE histograms of the three selections
plt.rcParams["figure.figsize"] = (6, 6)
labelled_maes = (
    ("大10", MAE_var_large_10),
    ("小10", MAE_var_small_10),
    ("大5小5", MAE_var_large_5_small_5),
)
for position, dim in enumerate(dims, start=1):
    plt.subplot(3, 1, position)
    for label, mae_by_dim in labelled_maes:
        plt.hist(mae_by_dim[dim], label=label, alpha=0.5)
    plt.title(f"{dim}次元")
    plt.legend()

plt.tight_layout()

In [None]:
# One panel per selection, overlaying the MAE histograms of every dimension
plt.rcParams["figure.figsize"] = (6, 6)
panels = (
    ("分散が大きい画像10枚", MAE_var_large_10),
    ("分散が小さい画像10枚", MAE_var_small_10),
    ("分散が大きい画像5枚と小さい画像5枚", MAE_var_large_5_small_5),
)
for position, (panel_title, mae_by_dim) in enumerate(panels, start=1):
    plt.subplot(3, 1, position)
    for dim in dims:
        plt.hist(mae_by_dim[dim], label=f"{dim}次元", alpha=0.5)
        plt.title(panel_title)
        plt.xticks(list(range(7)))
        plt.legend()
plt.tight_layout()

In [None]:
# MAE table for the 10 largest-variance images (one column per dimension) and its summary statistics
df_MAE_var_large_10 = pd.DataFrame(
    {f"{dim}次元": MAE_var_large_10[dim] for dim in (5, 10, 15)}
)
df_MAE_var_large_10.describe()

In [None]:
# MAE table for the 10 smallest-variance images (one column per dimension) and its summary statistics
df_MAE_var_small_10 = pd.DataFrame(
    {f"{dim}次元": MAE_var_small_10[dim] for dim in (5, 10, 15)}
)
df_MAE_var_small_10.describe()

In [None]:
# MAE table for the 5 largest + 5 smallest variance images (one column per dimension) and its summary statistics
df_MAE_var_large_5_small_5 = pd.DataFrame(
    {f"{dim}次元": MAE_var_large_5_small_5[dim] for dim in (5, 10, 15)}
)
df_MAE_var_large_5_small_5.describe()

In [None]:
# Pairwise correlation between the columns of the training data
df_train_corr = df_train.corr()

In [None]:
# For the highest-variance column, find the label holding the second-largest
# value in its correlation column (the largest is presumably the column's
# correlation with itself).
top_variance_name = sorted_columns[0]
print(top_variance_name)
var_max_col = df_train_corr[top_variance_name]
second_largest = var_max_col.nlargest(2).iloc[-1]
var_max_col.index[var_max_col == second_largest][0]

In [None]:
# Build a list of 10 columns by alternating between the next column in
# descending-variance order and the column most strongly correlated with the
# one that was just added.

var_corr_max = []
current_index = 0
n_largest = 2
while len(var_corr_max) < 10:
    if len(var_corr_max) % 2 == 1:
        # Odd length: find a partner for the last column added.  n_largest
        # starts at 2, presumably to skip the column's correlation with
        # itself, and grows while the candidate is already in the list.
        raw = df_train_corr[var_corr_max[-1]]
        nth_value = raw.nlargest(n_largest).iloc[-1]
        n_largest_index = raw[raw == nth_value].index[0]
        if n_largest_index in var_corr_max:
            n_largest += 1
        else:
            var_corr_max.append(n_largest_index)
            n_largest = 2
        continue
    # Even length: move on to the next column in descending-variance order.
    col_name = sorted_columns[current_index]
    current_index += 1
    if col_name not in var_corr_max:
        var_corr_max.append(col_name)

var_corr_max_ids = sorted(find_index(col_name) for col_name in var_corr_max)

# Score every test row with this selection
var_corr_max_results = []
for index, row in tqdm(df_test.iterrows()):
    var_corr_max_results.append(calculate_score(row.name, var_corr_max_ids, dims))

In [None]:
# Build a list of 10 columns by alternating between the next column in
# descending-variance order and the column with the smallest correlation to
# the one that was just added.
# NOTE(review): nsmallest picks the most negative correlation value, not the
# value closest to zero — confirm this is the intended meaning of "weak".

var_corr_zero = []
current_index = 0
n_smallest = 1
while len(var_corr_zero) < 10:
    if len(var_corr_zero) % 2 == 0:
        col_name = sorted_columns[current_index]
        # Always advance, even when the column is skipped.  Advancing only
        # after a successful append made the loop spin forever as soon as the
        # next high-variance column had already been added as a partner.
        current_index += 1
        if col_name not in var_corr_zero:
            var_corr_zero.append(col_name)
    else:
        raw = df_train_corr[var_corr_zero[-1]]
        n_smallest_index = raw[raw == raw.nsmallest(n_smallest).iloc[-1]].index[0]
        if n_smallest_index not in var_corr_zero:
            var_corr_zero.append(n_smallest_index)
            n_smallest = 1
        else:
            # Candidate already used: try the next-smallest correlation.
            n_smallest += 1

var_corr_zero_ids = sorted(find_index(col_name) for col_name in var_corr_zero)

# Score every test row with this selection
var_corr_zero_results = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    var_corr_zero_results.append(calculate_score(row.name, var_corr_zero_ids, dims))

In [None]:
# MAE of every test row, grouped by dimension, for the two correlation-based
# selections.  The keys are taken from `dims` (instead of being written out
# as 5/10/15) so they always match the dimensions that were scored.
MAE_var_corr_max = {
    dim: [result[dim]["MAE"] for result in var_corr_max_results] for dim in dims
}

MAE_var_corr_zero = {
    dim: [result[dim]["MAE"] for result in var_corr_zero_results] for dim in dims
}

In [None]:
# MAE table for the strongly-correlated selection (one column per dimension) and its summary statistics
df_MAE_var_corr_max = pd.DataFrame(
    {f"{dim}次元": MAE_var_corr_max[dim] for dim in (5, 10, 15)}
)
df_MAE_var_corr_max.describe()

In [None]:
# MAE table for the weakly-correlated selection (one column per dimension) and its summary statistics
df_MAE_var_corr_zero = pd.DataFrame(
    {f"{dim}次元": MAE_var_corr_zero[dim] for dim in (5, 10, 15)}
)
df_MAE_var_corr_zero.describe()

In [None]:
# One panel per dimension, overlaying the MAE histograms of the two correlation-based selections
plt.rcParams["figure.figsize"] = (6, 6)
labelled_maes = (
    ("相関強", MAE_var_corr_max),
    ("相関弱", MAE_var_corr_zero),
)
for position, dim in enumerate(dims, start=1):
    plt.subplot(3, 1, position)
    for label, mae_by_dim in labelled_maes:
        plt.hist(mae_by_dim[dim], label=label, alpha=0.5)
    plt.title(f"{dim}次元")
    plt.legend()

plt.tight_layout()

In [None]:
# One panel per selection, overlaying the MAE histograms of every dimension
plt.rcParams["figure.figsize"] = (6, 4)
panels = (
    ("分散が大きい画像5枚と相関が強い画像", MAE_var_corr_max),
    ("分散が大きい画像5枚と相関が弱い画像", MAE_var_corr_zero),
)
for position, (panel_title, mae_by_dim) in enumerate(panels, start=1):
    plt.subplot(2, 1, position)
    for dim in dims:
        plt.hist(mae_by_dim[dim], label=f"{dim}次元", alpha=0.5)
        plt.title(panel_title)
        plt.xticks(list(range(7)))
        plt.legend()

plt.tight_layout()

In [None]:
# Load the pickled object stored in random_samples.pkl.
# NOTE: unpickling can run arbitrary code; only load files from a trusted source.
with open("random_samples.pkl", "rb") as pkl_file:
    random_samples = pickle.load(pkl_file)

In [None]:
# Random selections: score every test row with every entry of random_samples.
# Progress is reported by the tqdm bar (given the row count explicitly, since
# iterrows() is a generator) instead of printing every row label.
random_results_0_10 = []

for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    for random_sample in random_samples:
        random_results_0_10.append(calculate_score(row.name, random_sample, dims))

# Save the list built above.  The original dumped `random_results`, a name
# that is not defined anywhere in this notebook.
with open("random_results_0_10.pkl", "wb") as file:
    pickle.dump(random_results_0_10, file)

In [None]:
# Six-image versions of the two correlation-based selections.
# NOTE(review): nsmallest picks the most negative correlation value, not the
# value closest to zero — confirm this is the intended meaning of "weak".

# High-variance columns, each followed by its most strongly correlated column.
# The loop tests and fills var_corr_max_6 with a target of 6, matching the
# var_corr_zero_6 loop below.  It used to test and append to var_corr_max,
# which already holds 10 entries, so it never ran and the selection was empty.
var_corr_max_6 = []
current_index = 0
n_largest = 2
while len(var_corr_max_6) < 6:
    if len(var_corr_max_6) % 2 == 0:
        col_name = sorted_columns[current_index]
        current_index += 1
        if col_name not in var_corr_max_6:
            var_corr_max_6.append(col_name)
    else:
        raw = df_train_corr[var_corr_max_6[-1]]
        n_largest_index = raw[raw == raw.nlargest(n_largest).iloc[-1]].index[0]
        if n_largest_index not in var_corr_max_6:
            var_corr_max_6.append(n_largest_index)
            n_largest = 2
        else:
            # Candidate already used: try the next-largest correlation.
            n_largest += 1

var_corr_max_6_ids = sorted(find_index(col_name) for col_name in var_corr_max_6)

var_corr_max_6_results = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    var_corr_max_6_results.append(
        calculate_score(row.name, var_corr_max_6_ids, dims)
    )

# High-variance columns, each followed by the column with the smallest
# correlation to it.
var_corr_zero_6 = []
current_index = 0
n_smallest = 1
while len(var_corr_zero_6) < 6:
    if len(var_corr_zero_6) % 2 == 0:
        col_name = sorted_columns[current_index]
        # Always advance, even when the column is skipped; otherwise the loop
        # never terminates once a high-variance column is already in the list.
        current_index += 1
        if col_name not in var_corr_zero_6:
            var_corr_zero_6.append(col_name)
    else:
        # Named `raw` so it does not share a name with the test-row loop variable.
        raw = df_train_corr[var_corr_zero_6[-1]]
        n_smallest_index = raw[raw == raw.nsmallest(n_smallest).iloc[-1]].index[0]
        if n_smallest_index not in var_corr_zero_6:
            var_corr_zero_6.append(n_smallest_index)
            n_smallest = 1
        else:
            n_smallest += 1

var_corr_zero_6_ids = sorted(find_index(col_name) for col_name in var_corr_zero_6)

var_corr_zero_6_results = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    var_corr_zero_6_results.append(
        calculate_score(row.name, var_corr_zero_6_ids, dims)
    )

# MAE of every test row, grouped by dimension (keys taken from `dims`).
MAE_var_corr_max_6 = {
    dim: [result[dim]["MAE"] for result in var_corr_max_6_results] for dim in dims
}

MAE_var_corr_zero_6 = {
    dim: [result[dim]["MAE"] for result in var_corr_zero_6_results] for dim in dims
}

In [None]:
# One panel per dimension, overlaying the MAE histograms of the two six-image correlation selections
plt.rcParams["figure.figsize"] = (6, 6)
labelled_maes = (
    ("相関強", MAE_var_corr_max_6),
    ("相関弱", MAE_var_corr_zero_6),
)
for position, dim in enumerate(dims, start=1):
    plt.subplot(3, 1, position)
    for label, mae_by_dim in labelled_maes:
        plt.hist(mae_by_dim[dim], label=label, alpha=0.5)
    plt.title(f"{dim}次元")
    plt.legend()
    plt.xlim(0, 7)

plt.tight_layout()

In [None]:
# NOTE(review): despite the "var_corr_zero_6" names, these indices come from
# the six highest-variance columns (sorted_columns[:6]), not from the
# var_corr_zero_6 selection — confirm this is intended.
var_corr_zero_6_index = [find_index(name) for name in sorted_columns[:6]]

# Look the indices up in the list stored in filename_list.json
with open("filename_list.json", "r") as file:
    data = json.load(file)
var_corr_zero_6_file_name = [data[position] for position in var_corr_zero_6_index]

In [None]:
# Write the looked-up file names to a JSON file
with open("sample_var_corr_zero_6.json", "w") as out_file:
    json.dump(var_corr_zero_6_file_name, out_file)

In [None]:
# Three fixed selections of 6 images based on the training variance.
# Every row of df_test is scored once with each selection.
# iterrows() is a generator, so tqdm is given the row count explicitly in
# order to show real progress.

# Ask about the 6 images with the largest variance
var_large_6 = sorted(find_index(col_name) for col_name in sorted_columns[:6])
print(var_large_6)
var_large_6_results = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    var_large_6_results.append(calculate_score(row.name, var_large_6, dims))

# Ask about the 6 images with the smallest variance
var_small_6 = sorted(find_index(col_name) for col_name in sorted_columns[-6:])
print(var_small_6)
var_small_6_results = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    var_small_6_results.append(calculate_score(row.name, var_small_6, dims))

# Ask about the 3 largest-variance and the 3 smallest-variance images
var_large_3_small_3 = sorted(
    find_index(col_name) for col_name in sorted_columns[-3:] + sorted_columns[:3]
)
print(var_large_3_small_3)
var_large_3_small_3_results = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    var_large_3_small_3_results.append(
        calculate_score(row.name, var_large_3_small_3, dims)
    )

# MAE of every test row, grouped by dimension.  The keys are taken from `dims`
# (instead of being written out as 5/10/15) so that the dictionaries always
# match the dimensions that were actually scored.
MAE_var_large_6 = {
    dim: [result[dim]["MAE"] for result in var_large_6_results] for dim in dims
}
MAE_var_small_6 = {
    dim: [result[dim]["MAE"] for result in var_small_6_results] for dim in dims
}
MAE_var_large_3_small_3 = {
    dim: [result[dim]["MAE"] for result in var_large_3_small_3_results]
    for dim in dims
}

In [None]:
# One panel per dimension, overlaying the MAE histograms of the three six-image variance selections
plt.rcParams["figure.figsize"] = (6, 6)
labelled_maes = (
    ("分散大6", MAE_var_large_6),
    ("分散小6", MAE_var_small_6),
    ("分散大3小3", MAE_var_large_3_small_3),
)
for position, dim in enumerate(dims, start=1):
    plt.subplot(3, 1, position)
    for label, mae_by_dim in labelled_maes:
        plt.hist(mae_by_dim[dim], label=label, alpha=0.5)
    plt.title(f"{dim}次元")
    plt.xlim(0, 7)
    plt.legend()

plt.tight_layout()

In [None]:
# MAE table for the 6 largest-variance images (one column per dimension) and its summary statistics
df_var_large_6 = pd.DataFrame(
    {f"{dim}次元": MAE_var_large_6[dim] for dim in (5, 10, 15)}
)
df_var_large_6.describe()

In [None]:
# MAE table for the 6 smallest-variance images (one column per dimension) and its summary statistics
df_var_small_6 = pd.DataFrame(
    {f"{dim}次元": MAE_var_small_6[dim] for dim in (5, 10, 15)}
)
df_var_small_6.describe()

In [None]:
# MAE table for the 3 largest + 3 smallest variance images (one column per dimension) and its summary statistics
df_var_large_3_small_3 = pd.DataFrame(
    {f"{dim}次元": MAE_var_large_3_small_3[dim] for dim in (5, 10, 15)}
)
df_var_large_3_small_3.describe()

In [None]:
# MAE table for the six-image strongly-correlated selection (one column per dimension) and its summary statistics
df_var_corr_max_6 = pd.DataFrame(
    {f"{dim}次元": MAE_var_corr_max_6[dim] for dim in (5, 10, 15)}
)
df_var_corr_max_6.describe()

In [None]:
# MAE table for the six-image weakly-correlated selection (one column per dimension) and its summary statistics
df_var_corr_zero_6 = pd.DataFrame(
    {f"{dim}次元": MAE_var_corr_zero_6[dim] for dim in (5, 10, 15)}
)
df_var_corr_zero_6.describe()

In [None]:
# Mean of every column of the training data, as a plain NumPy array
means = df_train.mean().to_numpy()

In [None]:
# Adaptive selection, per test row and per dimension:
#   1. start from 2 images: the highest-variance column and the column whose
#      training correlation with it is the smallest;
#   2. add the images whose first estimate differs most from the training
#      mean until 4 are selected, then re-estimate;
#   3. add the images whose estimate changed most between the first and the
#      second estimate until 6 are selected, then re-estimate.
# NOTE(review): `means - q` assumes the columns of df_train are in the same
# order as the positions of the estimate vector — confirm.


def _add_most_different(selected, scores, target_size):
    """Add indices to ``selected`` in descending order of ``scores`` until it holds ``target_size`` items."""
    for candidate in np.argsort(scores)[::-1]:
        if len(selected) >= target_size:
            break
        # Adding an index that is already present leaves the set unchanged.
        selected.add(int(candidate))


# Highest variance + smallest correlation.
# The correlation column is looked up explicitly here; the original read it
# from `row`, which at this point is whatever an earlier cell's loop left behind.
top_variance_col = sorted_columns[0]
corr_with_top = df_train_corr[top_variance_col]
var_corr_2 = [top_variance_col, corr_with_top.idxmin()]
var_corr_2_ids = sorted(find_index(col_name) for col_name in var_corr_2)

filnal_result_list = []
for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    result_first = calculate_score(row.name, var_corr_2_ids, dims)
    result_final = {}
    for dim in dims:
        # Local name, so the `selected_ids` list built in the data-loading
        # cell is not overwritten with a set.
        chosen_ids = set(var_corr_2_ids)

        diff_first = np.abs(means - result_first[dim]["q"])
        _add_most_different(chosen_ids, diff_first, 4)
        # Sorted, like every other selection passed to calculate_score.
        result_second = calculate_score(row.name, sorted(chosen_ids), (dim,))

        diff_second = np.abs(result_second[dim]["q"] - result_first[dim]["q"])
        _add_most_different(chosen_ids, diff_second, 6)
        result_third = calculate_score(row.name, sorted(chosen_ids), (dim,))

        result_final[dim] = result_third[dim]
    filnal_result_list.append(result_final)

# MAE of every test row, grouped by dimension (keys taken from `dims`).
MAE_diff_est = {
    dim: [result[dim]["MAE"] for result in filnal_result_list] for dim in dims
}

In [None]:
# One panel per dimension: MAE histogram of the adaptive selection
plt.rcParams["figure.figsize"] = (6, 6)
for position, dim in enumerate(dims, start=1):
    plt.subplot(3, 1, position)
    mae_values = MAE_diff_est[dim]
    plt.hist(mae_values, alpha=0.5)
    plt.title(f"{dim}次元")
    plt.xlim(0, 7)

plt.tight_layout()

In [None]:
# MAE table for the adaptive selection (one column per dimension) and its summary statistics
df_diff_est = pd.DataFrame(
    {f"{dim}次元": MAE_diff_est[dim] for dim in (5, 10, 15)}
)
df_diff_est.describe()