In [1]:
import random
import os
import re
import pickle
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import MinMaxScaler
from scipy.signal import find_peaks
import seaborn as sns
import pickle
import torch

pd.set_option('display.expand_frame_repr', False)  # DataFrameを改行せずに表示
pd.set_option('display.max_columns', None)  # すべての列を表示

In [2]:
def extract_finalQP(filename):
    match = re.search(r'2ndQP(\d+)', filename)
    if match:
        return int(match.group(1))
    
    match = re.search(r'1stQP(\d+)', filename)
    if match:
        return int(match.group(1))
    
    return None


def extract_1stQP(filename):
    match = re.search(r'1stQP(\d+)', filename)
    if match:
        return int(match.group(1))
    
    return None

def is_double_compressed(mean_difference, final_QP, threshold):    
    mean_difference = mean_difference[0]
    final_QP = final_QP[0]

    energy_clamp = torch.clamp(mean_difference, min=0)
    energy = torch.sum(torch.square(energy_clamp))
    mean_difference_right_clamp = torch.clamp(mean_difference[final_QP+1:52], min=0)
    right_energy = torch.sum(torch.square(mean_difference_right_clamp))
    
    if energy != 0:
        energy_ratio = right_energy / energy
        if energy_ratio >= threshold:
            return True
        elif energy_ratio < threshold:
            return False
    else:
        # energyが0の場合、エラーを処理するか、適切な値を返す
        return -1

    # if (right_energy / energy) != 0 and (right_energy / energy) >= threshold:
    #     return True
    # elif (right_energy / energy) != 0 and (right_energy / energy) < threshold:
    #     return False
    # else:
    #     return -1
    
def calculate_mae(file_path):
    try:
        with open(file_path, 'rb') as file:
            loaded_data, loaded_data_shifted = pickle.load(file)
    except Exception as e:
        print(f"Error occurred while loading {file_path}: {e}")
        return None

    # タプル内のリストを抽出
    original_mae = loaded_data
    shifted_mae = loaded_data_shifted

    # Coding ghostを計算してリストに格納する
    mae_difference = [shifted - original for original, shifted in zip(original_mae, shifted_mae)]
    
    # mae_differenceをtensorに変換
    mae_difference_tensor = torch.tensor(mae_difference)
    
    # mae_differenceの各要素においてマイナスの値を0に変換
    mae_difference_positive = [0 if val <= 0 else val for val in mae_difference]
    
    return mae_difference_positive, mae_difference_tensor

In [3]:
rootpath = "/Prove/Yoshihisa/HEIF_ghost/HEIF_IMAGES_CSV/"

single_train_csv_path1 = os.path.join(rootpath, 'HEIF_images_single_csv')
single_train_csv_path2 = os.path.join(rootpath, 'HEIF_images_second_sameQP_csv')

second_train_csv_path1 = os.path.join(rootpath, 'HEIF_images_second_csv')
second_train_csv_path2 = os.path.join(rootpath, 'HEIF_images_triple_csv')


single_train_csv_path1_list = [os.path.join(single_train_csv_path1, file) for file in sorted(os.listdir(single_train_csv_path1))]
single_train_csv_path2_list = [os.path.join(single_train_csv_path2, file) for file in sorted(os.listdir(single_train_csv_path2))]

second_train_csv_path1_list = [os.path.join(second_train_csv_path1, file) for file in sorted(os.listdir(second_train_csv_path1))]
second_train_csv_path2_list = [os.path.join(second_train_csv_path2, file) for file in sorted(os.listdir(second_train_csv_path2))]


print("single_train_csv_path1_list: ", len(single_train_csv_path1_list))
print("single_train_csv_path2_list: ", len(single_train_csv_path2_list))

print("second_train_csv_path1_list: ", len(second_train_csv_path1_list))
print("second_train_csv_path2_list: ", len(second_train_csv_path2_list))


single_train_csv_path1_list:  3080
single_train_csv_path2_list:  3080
second_train_csv_path1_list:  17556
second_train_csv_path2_list:  17556


In [4]:
second_train_csv_path1 = os.path.join(rootpath, 'HEIF_images_second_sameQP_csv')
second_train_csv_path2 = os.path.join(rootpath, 'HEIF_images_triple_sameQP_csv')

second_sameQP_train_csv_path1_list = [os.path.join(second_train_csv_path1, file) for file in sorted(os.listdir(second_train_csv_path1))]
second_sameQP_train_csv_path2_list = [os.path.join(second_train_csv_path2, file) for file in sorted(os.listdir(second_train_csv_path2))]

print("second_sameQP_train_csv_path1_list: ", len(second_sameQP_train_csv_path1_list))
print("second_sameQP_train_csv_path2_list: ", len(second_sameQP_train_csv_path2_list))

second_sameQP_train_csv_path1_list:  3080
second_sameQP_train_csv_path2_list:  3080


In [5]:
second_train_csv_path1 = os.path.join(rootpath, 'HEIF_images_second_largeQP_csv')
second_train_csv_path2 = os.path.join(rootpath, 'HEIF_images_triple_largeQP_csv')

second_largeQP_train_csv_path1_list = [os.path.join(second_train_csv_path1, file) for file in sorted(os.listdir(second_train_csv_path1))]
second_largeQP_train_csv_path2_list = [os.path.join(second_train_csv_path2, file) for file in sorted(os.listdir(second_train_csv_path2))]

print("second_largeQP_train_csv_path1_list: ", len(second_largeQP_train_csv_path1_list))
print("second_largeQP_train_csv_path2_list: ", len(second_largeQP_train_csv_path2_list))

second_largeQP_train_csv_path1_list:  12012
second_largeQP_train_csv_path2_list:  12012


In [6]:
rootpath = "/Prove/Yoshihisa/HEIF_ghost/PKL/"

single_train_pkl_path1 = os.path.join(rootpath, 'pkl_single')
single_train_pkl_path2 = os.path.join(rootpath, 'pkl_second_sameQP')

second_train_pkl_path1 = os.path.join(rootpath, 'pkl_second')
second_train_pkl_path2 = os.path.join(rootpath, 'pkl_triple')


single_train_pkl_path1_list = [os.path.join(single_train_pkl_path1, file) for file in sorted(os.listdir(single_train_pkl_path1))]
single_train_pkl_path2_list = [os.path.join(single_train_pkl_path2, file) for file in sorted(os.listdir(single_train_pkl_path2))]

second_train_pkl_path1_list = [os.path.join(second_train_pkl_path1, file) for file in sorted(os.listdir(second_train_pkl_path1))]
second_train_pkl_path2_list = [os.path.join(second_train_pkl_path2, file) for file in sorted(os.listdir(second_train_pkl_path2))]


print("single_train_pkl_path1_list: ", len(single_train_pkl_path1_list))
print("single_train_pkl_path2_list: ", len(single_train_pkl_path2_list))

print("second_train_pkl_path1_list: ", len(second_train_pkl_path1_list))
print("second_train_pkl_path2_list: ", len(second_train_pkl_path2_list))


single_train_pkl_path1_list:  3080
single_train_pkl_path2_list:  3080
second_train_pkl_path1_list:  17556
second_train_pkl_path2_list:  17556


In [7]:
second_train_pkl_path1 = os.path.join(rootpath, 'pkl_second_sameQP')
second_train_pkl_path2 = os.path.join(rootpath, 'pkl_triple_sameQP')

second_sameQP_train_pkl_path1_list = [os.path.join(second_train_pkl_path1, file) for file in sorted(os.listdir(second_train_pkl_path1))]
second_sameQP_train_pkl_path2_list = [os.path.join(second_train_pkl_path2, file) for file in sorted(os.listdir(second_train_pkl_path2))]

print("second_sameQP_train_pkl_path1_list: ", len(second_sameQP_train_pkl_path1_list))
print("second_sameQP_train_pkl_path2_list: ", len(second_sameQP_train_pkl_path2_list))

second_sameQP_train_pkl_path1_list:  3080
second_sameQP_train_pkl_path2_list:  3080


In [8]:
second_train_pkl_path1 = os.path.join(rootpath, 'pkl_second_largeQP')
second_train_pkl_path2 = os.path.join(rootpath, 'pkl_triple_largeQP')

second_largeQP_train_pkl_path1_list = [os.path.join(second_train_pkl_path1, file) for file in sorted(os.listdir(second_train_pkl_path1))]
second_largeQP_train_pkl_path2_list = [os.path.join(second_train_pkl_path2, file) for file in sorted(os.listdir(second_train_pkl_path2))]

print("second_largeQP_train_pkl_path1_list: ", len(second_largeQP_train_pkl_path1_list))
print("second_largeQP_train_pkl_path2_list: ", len(second_largeQP_train_pkl_path2_list))

second_largeQP_train_pkl_path1_list:  12012
second_largeQP_train_pkl_path2_list:  12012


In [9]:
single_train_csv = list(zip(single_train_csv_path1_list, single_train_pkl_path1_list, single_train_csv_path2_list, single_train_pkl_path2_list))

second_train_csv = list(zip(second_train_csv_path1_list, second_train_pkl_path1_list, second_train_csv_path2_list, second_train_pkl_path2_list))
second_sameQP_train_csv = list(zip(second_sameQP_train_csv_path1_list, second_sameQP_train_pkl_path1_list, second_sameQP_train_csv_path2_list, second_sameQP_train_pkl_path2_list))
second_largeQP_train_csv = list(zip(second_largeQP_train_csv_path1_list, second_largeQP_train_pkl_path1_list, second_largeQP_train_csv_path2_list, second_largeQP_train_pkl_path2_list))

single_train_csv = random.sample(single_train_csv, 3000)
second_train_csv = random.sample(second_train_csv, 1000)
second_sameQP_train_csv = random.sample(second_sameQP_train_csv, 1000)
second_largeQP_train_csv = random.sample(second_largeQP_train_csv, 1000)

train_csv_list = single_train_csv + second_train_csv + second_sameQP_train_csv + second_largeQP_train_csv
# train_csv_list = single_train_csv + second_train_csv

print("train_csv_list: ", len(train_csv_list))

train_csv_list:  6000


In [10]:
rootpath = "/Prove/Yoshihisa/HEIF_ghost/DATA_QP2_GROUP/"

single_train_csv_path1 = os.path.join(rootpath, 'HEIF_images_single_csv/QP10')
single_train_csv_path2 = os.path.join(rootpath, 'HEIF_images_second_sameQP_csv/2ndQP10')

second_train_csv_path1 = os.path.join(rootpath, 'HEIF_images_second_csv/2ndQP10')
second_train_csv_path2 = os.path.join(rootpath, 'HEIF_images_triple_csv/2ndQP10')


single_train_csv_path1_list = [os.path.join(single_train_csv_path1, file) for file in sorted(os.listdir(single_train_csv_path1))]
single_train_csv_path2_list = [os.path.join(single_train_csv_path2, file) for file in sorted(os.listdir(single_train_csv_path2))]

second_train_csv_path1_list = [os.path.join(second_train_csv_path1, file) for file in sorted(os.listdir(second_train_csv_path1))]
second_train_csv_path2_list = [os.path.join(second_train_csv_path2, file) for file in sorted(os.listdir(second_train_csv_path2))]


print("single_train_csv_path1_list: ", len(single_train_csv_path1_list))
print("single_train_csv_path2_list: ", len(single_train_csv_path2_list))

print("second_train_csv_path1_list: ", len(second_train_csv_path1_list))
print("second_train_csv_path2_list: ", len(second_train_csv_path2_list))


single_train_csv_path1_list:  308
single_train_csv_path2_list:  308
second_train_csv_path1_list:  2772
second_train_csv_path2_list:  2772


In [11]:
second_train_csv_path1 = os.path.join(rootpath, 'HEIF_images_second_sameQP_csv/2ndQP10')
second_train_csv_path2 = os.path.join(rootpath, 'HEIF_images_triple_sameQP_csv/2ndQP10')

second_sameQP_train_csv_path1_list = [os.path.join(second_train_csv_path1, file) for file in sorted(os.listdir(second_train_csv_path1))]
second_sameQP_train_csv_path2_list = [os.path.join(second_train_csv_path2, file) for file in sorted(os.listdir(second_train_csv_path2))]

print("second_sameQP_train_csv_path1_list: ", len(second_sameQP_train_csv_path1_list))
print("second_sameQP_train_csv_path2_list: ", len(second_sameQP_train_csv_path2_list))

second_sameQP_train_csv_path1_list:  308
second_sameQP_train_csv_path2_list:  308


In [12]:
# rootpath = "/Prove/Yoshihisa/HEIF_ghost/PKL/"

single_train_pkl_path1 = os.path.join(rootpath, 'pkl_single/QP10')
single_train_pkl_path2 = os.path.join(rootpath, 'pkl_second_sameQP/2ndQP10')

second_train_pkl_path1 = os.path.join(rootpath, 'pkl_second/2ndQP10')
second_train_pkl_path2 = os.path.join(rootpath, 'pkl_triple/2ndQP10')


single_train_pkl_path1_list = [os.path.join(single_train_pkl_path1, file) for file in sorted(os.listdir(single_train_pkl_path1))]
single_train_pkl_path2_list = [os.path.join(single_train_pkl_path2, file) for file in sorted(os.listdir(single_train_pkl_path2))]

second_train_pkl_path1_list = [os.path.join(second_train_pkl_path1, file) for file in sorted(os.listdir(second_train_pkl_path1))]
second_train_pkl_path2_list = [os.path.join(second_train_pkl_path2, file) for file in sorted(os.listdir(second_train_pkl_path2))]


print("single_train_pkl_path1_list: ", len(single_train_pkl_path1_list))
print("single_train_pkl_path2_list: ", len(single_train_pkl_path2_list))

print("second_train_pkl_path1_list: ", len(second_train_pkl_path1_list))
print("second_train_pkl_path2_list: ", len(second_train_pkl_path2_list))

single_train_pkl_path1_list:  308
single_train_pkl_path2_list:  308
second_train_pkl_path1_list:  2772
second_train_pkl_path2_list:  2772


In [13]:
second_train_pkl_path1 = os.path.join(rootpath, 'pkl_second_sameQP/2ndQP10')
second_train_pkl_path2 = os.path.join(rootpath, 'pkl_triple_sameQP/2ndQP10')

second_sameQP_train_pkl_path1_list = [os.path.join(second_train_pkl_path1, file) for file in sorted(os.listdir(second_train_pkl_path1))]
second_sameQP_train_pkl_path2_list = [os.path.join(second_train_pkl_path2, file) for file in sorted(os.listdir(second_train_pkl_path2))]

print("second_sameQP_train_pkl_path1_list: ", len(second_sameQP_train_pkl_path1_list))
print("second_sameQP_train_pkl_path2_list: ", len(second_sameQP_train_pkl_path2_list))

second_sameQP_train_pkl_path1_list:  308
second_sameQP_train_pkl_path2_list:  308


In [14]:
single_train_csv = list(zip(single_train_csv_path1_list, single_train_pkl_path1_list, single_train_csv_path2_list, single_train_pkl_path2_list))

second_train_csv = list(zip(second_train_csv_path1_list, second_train_pkl_path1_list, second_train_csv_path2_list, second_train_pkl_path2_list))
second_sameQP_train_csv = list(zip(second_sameQP_train_csv_path1_list, second_sameQP_train_pkl_path1_list, second_sameQP_train_csv_path2_list, second_sameQP_train_pkl_path2_list))
# second_largeQP_train_csv = list(zip(second_largeQP_train_csv_path1_list, second_largeQP_train_pkl_path1_list, second_largeQP_train_csv_path2_list, second_largeQP_train_pkl_path2_list))

single_train_csv = random.sample(single_train_csv, 300)
second_train_csv = random.sample(second_train_csv, 150)
second_sameQP_train_csv = random.sample(second_sameQP_train_csv, 150)
# second_largeQP_train_csv = random.sample(second_largeQP_train_csv, 1000)

test_csv_list = single_train_csv + second_train_csv + second_sameQP_train_csv
# train_csv_list = single_train_csv + second_train_csv

print("test_csv_list: ", len(test_csv_list))

test_csv_list:  600


In [15]:
# 列名をリストにまとめる
pu_columns = ["PU1_64", "PU1_32", "PU1_16", "PU1_8", "PU1_4",  "PU2_64","PU2_32", "PU2_16", "PU2_8", "PU2_4"]
label_columns = ["LABEL"]
mae1_columns = [f"MAE1_{i}" for i in range(52)]
mae2_columns = [f"MAE2_{i}" for i in range(52)]
mae_columns = ["MAE"]
final_qp_columns = ["FINAL_QP"]

# データフレームを初期化
train_df1 = pd.DataFrame(columns=pu_columns)
train_df2 = pd.DataFrame(columns=label_columns)
train_df3 = pd.DataFrame(columns=mae1_columns)
train_df4 = pd.DataFrame(columns=mae2_columns)
train_df5 = pd.DataFrame(columns=mae_columns)
train_df6 = pd.DataFrame(columns=final_qp_columns)

scaler = MinMaxScaler()

for path1, path2, path3, path4 in train_csv_list:
    label = 1 if ("2ndQP" in path1) and ("3rdQP" in path3) else 0
    df1 = pd.read_csv(path1)
    df2 = pd.read_csv(path3)
    train_pkl_list = [path2, path4]
    
    # pu_columnsの値を取得
    pu_values = [df1.loc[i, "pu_counts"] for i in range(5)] + [df2.loc[i, "pu_counts"] for i in range(5)]
    train_df1 = pd.concat([train_df1, pd.DataFrame([pu_values], columns=pu_columns)], ignore_index=True)
    
    # label_columnsの値を取得
    train_df2 = pd.concat([train_df2, pd.DataFrame({"LABEL": [label]})], ignore_index=True)
    
    final_QP = extract_finalQP(train_pkl_list[0])
    
    # MAEの値を取得
    mae_d1, mae_d1_old = calculate_mae(train_pkl_list[0])
    mae_d2, _ = calculate_mae(train_pkl_list[1])
    
    
    # mae_columnsの値を取得
    train_df5 = pd.concat([train_df5, pd.DataFrame({"MAE": [mae_d1_old]})], ignore_index=True)
    
    # final_qp_columnsの値を取得
    train_df6 = pd.concat([train_df6, pd.DataFrame({"FINAL_QP": [final_QP]})], ignore_index=True)
    
    # mae1_columnsの値を取得
    train_df3 = pd.concat([train_df3, pd.DataFrame({f"MAE1_{i}": [mae_d1[i]] for i in range(52)})], ignore_index=True)
    
    # mae2_columnsの値を取得
    train_df4 = pd.concat([train_df4, pd.DataFrame({f"MAE2_{i}": [mae_d2[i]] for i in range(52)})], ignore_index=True)

# インデックスをリセット
train_df1.reset_index(drop=True, inplace=True)
train_df2.reset_index(drop=True, inplace=True)
    
# データフレームを結合
train_df = pd.concat([train_df1, train_df3, train_df4], axis=1)
train_df_onlyGhost = pd.concat([train_df3, train_df4], axis=1)

# 各データフレームの長さを表示
print(f'Length of train_df: {len(train_df)}')
print(f'Length of train_df_onlyGhost: {len(train_df_onlyGhost)}')
print(f'Length of train_df5: {len(train_df5)}')
print(f'Length of train_df6: {len(train_df6)}')

Length of train_df: 6000
Length of train_df_onlyGhost: 6000
Length of train_df5: 6000
Length of train_df6: 6000


In [16]:
# 列名をリストにまとめる
pu_columns = ["PU1_64", "PU1_32", "PU1_16", "PU1_8", "PU1_4",  "PU2_64","PU2_32", "PU2_16", "PU2_8", "PU2_4"]
label_columns = ["LABEL"]
mae1_columns = [f"MAE1_{i}" for i in range(52)]
mae2_columns = [f"MAE2_{i}" for i in range(52)]
mae_columns = ["MAE"]
final_qp_columns = ["FINAL_QP"]

# データフレームを初期化
test_df1 = pd.DataFrame(columns=pu_columns)
test_df2 = pd.DataFrame(columns=label_columns)
test_df3 = pd.DataFrame(columns=mae1_columns)
test_df4 = pd.DataFrame(columns=mae2_columns)
test_df5 = pd.DataFrame(columns=mae_columns)
test_df6 = pd.DataFrame(columns=final_qp_columns)

scaler = MinMaxScaler()

for path1, path2, path3, path4 in test_csv_list:
    label = 1 if ("2ndQP" in path1) and ("3rdQP" in path3) else 0
    df1 = pd.read_csv(path1)
    df2 = pd.read_csv(path3)
    test_pkl_list = [path2, path4]
    
    # pu_columnsの値を取得
    pu_values = [df1.loc[i, "pu_counts"] for i in range(5)] + [df2.loc[i, "pu_counts"] for i in range(5)]
    test_df1 = pd.concat([test_df1, pd.DataFrame([pu_values], columns=pu_columns)], ignore_index=True)
    
    # label_columnsの値を取得
    test_df2 = pd.concat([test_df2, pd.DataFrame({"LABEL": [label]})], ignore_index=True)
    
    final_QP = extract_finalQP(test_pkl_list[0])
    
    # MAEの値を取得
    mae_d1, mae_d1_old = calculate_mae(test_pkl_list[0])
    mae_d2, _ = calculate_mae(test_pkl_list[1])
    
    
    # mae_columnsの値を取得
    test_df5 = pd.concat([test_df5, pd.DataFrame({"MAE": [mae_d1_old]})], ignore_index=True)
    
    # final_qp_columnsの値を取得
    test_df6 = pd.concat([test_df6, pd.DataFrame({"FINAL_QP": [final_QP]})], ignore_index=True)
    
    # mae1_columnsの値を取得
    test_df3 = pd.concat([test_df3, pd.DataFrame({f"MAE1_{i}": [mae_d1[i]] for i in range(52)})], ignore_index=True)
    
    # mae2_columnsの値を取得
    test_df4 = pd.concat([test_df4, pd.DataFrame({f"MAE2_{i}": [mae_d2[i]] for i in range(52)})], ignore_index=True)

# インデックスをリセット
test_df1.reset_index(drop=True, inplace=True)
test_df2.reset_index(drop=True, inplace=True)
    
# データフレームを結合
test_df = pd.concat([test_df1, test_df3, test_df4], axis=1)
test_df_onlyGhost = pd.concat([test_df3, test_df4], axis=1)

# 各データフレームの長さを表示
print(f'Length of test_df: {len(test_df)}')
print(f'Length of test_df_onlyGhost: {len(test_df_onlyGhost)}')
print(f'Length of test_df5: {len(test_df5)}')
print(f'Length of test_df6: {len(test_df6)}')

Length of test_df: 600
Length of test_df_onlyGhost: 600
Length of test_df5: 600
Length of test_df6: 600


In [17]:
# スケーラーを使って結合したデータをスケーリング
X_train = scaler.fit_transform(train_df)
X_train_onlyGhost = scaler.fit_transform(train_df_onlyGhost)

# pandasをndarrayに変換
train_df5_np = train_df5.values
FINAL_QP_train = train_df6.values

# ラベルの準備
Y_train = train_df2['LABEL'].astype(int)

print(f'Length of X_train: {len(X_train)}')
print(f'Length of X_train_onlyGhost: {len(X_train_onlyGhost)}')
print(f'Length of Y_train: {len(Y_train)}')
print(f'Length of train_df5_np: {len(train_df5_np)}')
print(f'Length of FINAL_QP: {len(FINAL_QP_train)}')

Length of X_train: 6000
Length of X_train_onlyGhost: 6000
Length of Y_train: 6000
Length of train_df5_np: 6000
Length of FINAL_QP: 6000


In [18]:
# スケーラーを使って結合したデータをスケーリング
X_test = scaler.fit_transform(test_df)
X_test_onlyGhost = scaler.fit_transform(test_df_onlyGhost)

# pandasをndarrayに変換
test_df5_np = test_df5.values
FINAL_QP_test = test_df6.values

# ラベルの準備
Y_test = test_df2['LABEL'].astype(int)

print(f'Length of X_train: {len(X_test)}')
print(f'Length of X_train_onlyGhost: {len(X_test_onlyGhost)}')
print(f'Length of Y_train: {len(Y_test)}')
print(f'Length of train_df5_np: {len(test_df5_np)}')
print(f'Length of FINAL_QP: {len(FINAL_QP_test)}')

Length of X_train: 600
Length of X_train_onlyGhost: 600
Length of Y_train: 600
Length of train_df5_np: 600
Length of FINAL_QP: 600


In [None]:
# Cの範囲を指定
C_values = {'C': [0.01, 0.1, 1, 10, 100, 1000, 2000, 3000, 4000, 5000]}
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
kfold2 = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)

# 結果のデータフレームを初期化
results = pd.DataFrame(columns=['C_RBF', 'Test_Score_RBF', 'C_LINEAR', 'Test_Score_LINEAR', 
                                'C_onlyGhost_RBF', 'Test_Score_onlyGhost_RBF', 'C_onlyGhost_LINEAR', 'Test_Score_onlyGhost_LINEAR',
                                'Threshold', 'Test_Score_old'])

original_X_train, original_X_train_onlyGhost = X_train, X_train_onlyGhost
original_X_test, original_X_test_onlyGhost = X_test, X_test_onlyGhost
original_Y_train, original_Y_test = Y_train, Y_test

original_old_train = train_df5_np
original_old_test, original_final_QP_test = test_df5_np, FINAL_QP_test

# k-fold cross-validation
for fold, (train_ids, test_ids) in enumerate(kfold.split(original_X_train, original_Y_train)):
    print(f"<Fold-{fold+1}>")
    print()
    
    results_old = []

    # 全体を訓練・検証データに分割
    X_train_val, _ = original_X_train[train_ids], original_X_train[test_ids]
    X_train_onlyGhost_val, _ = original_X_train_onlyGhost[train_ids], original_X_train_onlyGhost[test_ids]
    X_train_old_val, _ = original_old_train[train_ids], original_old_train[test_ids]
    Y_train_val, _ = original_Y_train[train_ids], original_Y_train[test_ids]

    # 訓練・検証データをさらに訓練データと検証データに分割
    X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=600, random_state=42)
    X_train_onlyGhost, X_val_onlyGhost, _, _ = train_test_split(X_train_onlyGhost_val, Y_train_val, test_size=600, random_state=42)
    
    #テストデータの生成
    X_test = original_X_test
    X_test_onlyGhost = original_X_test_onlyGhost
    X_test_old = original_old_test
    final_QP = original_final_QP_test
    Y_test = original_Y_test
    
    best_threshold = 0
    best_accuracy = 0
    best_predicted_labels = []
    best_ground_truth_labels = []


    # 最適な閾値を見つける
    for threshold in np.arange(0.01, 1.01, 0.01):        
        results_old = [is_double_compressed(X_test_old[i], final_QP[i], threshold) for i in range(600)]
        predicted_labels = [int(is_double) for is_double in results_old]
        ground_truth_labels = [label for label in Y_test]
        accuracy = sum(1 for true_label, pred_label in zip(ground_truth_labels, predicted_labels) if true_label == pred_label) / len(ground_truth_labels)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_threshold = threshold
            best_predicted_labels = predicted_labels
            best_ground_truth_labels = ground_truth_labels

    best_val_score_RBF, best_svm_model_RBF, best_c_value_RBF = 0, None, None
    best_val_score_onlyGhost_RBF, best_svm_model_onlyGhost_RBF, best_c_value_onlyGhost_RBF = 0, None, None

    best_val_score_LINEAR, best_svm_model_LINEAR, best_c_value_LINEAR = 0, None, None
    best_val_score_onlyGhost_LINEAR, best_svm_model_onlyGhost_LINEAR, best_c_value_onlyGhost_LINEAR = 0, None, None

    for C_value in C_values['C']:    
        # SVMモデルのインスタンスを作成
        svm_model_RBF = SVC(kernel='rbf', C=C_value)
        svm_model_onlyGhost_RBF = SVC(kernel='rbf', C=C_value)

        svm_model_LINEAR = SVC(kernel='linear', C=C_value)
        svm_model_onlyGhost_LINEAR = SVC(kernel='linear', C=C_value)

        # 訓練データで訓練
        svm_model_RBF.fit(X_train, Y_train)
        svm_model_onlyGhost_RBF.fit(X_train_onlyGhost, Y_train)

        svm_model_LINEAR.fit(X_train, Y_train)
        svm_model_onlyGhost_LINEAR.fit(X_train_onlyGhost, Y_train)

        # 検証データでの精度を評価
        val_accuracy_RBF = accuracy_score(Y_val, svm_model_RBF.predict(X_val))
        val_accuracy_onlyGhost_RBF = accuracy_score(Y_val, svm_model_onlyGhost_RBF.predict(X_val_onlyGhost))
        val_accuracy_LINEAR = accuracy_score(Y_val, svm_model_LINEAR.predict(X_val))
        val_accuracy_onlyGhost_LINEAR = accuracy_score(Y_val, svm_model_onlyGhost_LINEAR.predict(X_val_onlyGhost))

        # 最も高い精度を持つモデルを選択
        if val_accuracy_RBF > best_val_score_RBF:
            best_val_score_RBF, best_svm_model_RBF, best_c_value_RBF = val_accuracy_RBF, svm_model_RBF, C_value
        if val_accuracy_onlyGhost_RBF > best_val_score_onlyGhost_RBF:
            best_val_score_onlyGhost_RBF, best_svm_model_onlyGhost_RBF, best_c_value_onlyGhost_RBF = val_accuracy_onlyGhost_RBF, svm_model_onlyGhost_RBF, C_value
        if val_accuracy_LINEAR > best_val_score_LINEAR:
            best_val_score_LINEAR, best_svm_model_LINEAR, best_c_value_LINEAR = val_accuracy_LINEAR, svm_model_LINEAR, C_value
        if val_accuracy_onlyGhost_LINEAR > best_val_score_onlyGhost_LINEAR:
            best_val_score_onlyGhost_LINEAR, best_svm_model_onlyGhost_LINEAR, best_c_value_onlyGhost_LINEAR = val_accuracy_onlyGhost_LINEAR, svm_model_onlyGhost_LINEAR, C_value

    # テストデータでの評価
    test_predictions_RBF = best_svm_model_RBF.predict(X_test)
    test_accuracy_RBF = accuracy_score(Y_test, test_predictions_RBF)
    report_RBF = classification_report(Y_test, test_predictions_RBF)
    print(f'Summary_RBF:\n{report_RBF}')

    test_predictions_LINEAR = best_svm_model_LINEAR.predict(X_test)
    test_accuracy_LINEAR = accuracy_score(Y_test, test_predictions_LINEAR)
    report_LINEAR = classification_report(Y_test, test_predictions_LINEAR)
    print(f'Summary_LINEAR:\n{report_LINEAR}')

    # テストデータでの評価
    test_predictions_onlyGhost_RBF = best_svm_model_onlyGhost_RBF.predict(X_test_onlyGhost)
    test_accuracy_onlyGhost_RBF = accuracy_score(Y_test, test_predictions_onlyGhost_RBF)
    report_onlyGhost_RBF = classification_report(Y_test, test_predictions_onlyGhost_RBF)
    print(f'Summary_onlyGhost_RBF:\n{report_onlyGhost_RBF}')

    test_predictions_onlyGhost_LINEAR = best_svm_model_onlyGhost_LINEAR.predict(X_test_onlyGhost)
    test_accuracy_onlyGhost_LINEAR = accuracy_score(Y_test, test_predictions_onlyGhost_LINEAR)
    report_onlyGhost_LINEAR = classification_report(Y_test, test_predictions_onlyGhost_LINEAR)
    print(f'Summary_onlyGhost_LINEAR:\n{report_onlyGhost_LINEAR}')
    
    report_old = classification_report(best_ground_truth_labels, best_predicted_labels, labels=[0,1], target_names=['0', '1'], zero_division=0)
    print(f'Summary old_model:\n{report_old}')

    # Test結果を保存
    result_row = {'C_RBF': best_c_value_RBF, 'Test_Score_RBF': test_accuracy_RBF,
                  'C_LINEAR': best_c_value_LINEAR, 'Test_Score_LINEAR': test_accuracy_LINEAR,
                  'C_onlyGhost_RBF': best_c_value_onlyGhost_RBF, 'Test_Score_onlyGhost_RBF': test_accuracy_onlyGhost_RBF,
                  'C_onlyGhost_LINEAR': best_c_value_onlyGhost_LINEAR, 'Test_Score_onlyGhost_LINEAR': test_accuracy_onlyGhost_LINEAR,
                  'Threshold': best_threshold, 'Test_Score_old': best_accuracy}

    results = pd.concat([results, pd.DataFrame([result_row])], ignore_index=True)

# 結果を表示
print(results)


<Fold-1>

Summary_RBF:
              precision    recall  f1-score   support

           0       0.95      0.44      0.60       300
           1       0.64      0.98      0.77       300

    accuracy                           0.71       600
   macro avg       0.79      0.71      0.69       600
weighted avg       0.79      0.71      0.69       600

Summary_LINEAR:
              precision    recall  f1-score   support

           0       0.95      0.41      0.58       300
           1       0.62      0.98      0.76       300

    accuracy                           0.69       600
   macro avg       0.79      0.70      0.67       600
weighted avg       0.79      0.69      0.67       600

Summary_onlyGhost_RBF:
              precision    recall  f1-score   support

           0       0.99      0.46      0.63       300
           1       0.65      0.99      0.79       300

    accuracy                           0.73       600
   macro avg       0.82      0.73      0.71       600
weighted avg

In [None]:
def print_stats(column_name, label):
    average = round(results[column_name].mean(), 2)
    std_dev = round(results[column_name].std(), 2)
    max_value = round(results[column_name].max(), 2)
    min_value = round(results[column_name].min(), 2)

    print(f'Average Test Score {label}: {average}')
    print(f'Standard Deviation of Test Score {label}: {std_dev}')
    print(f'Maximum Test Score {label}: {max_value}')
    print(f'Minimum Test Score {label}: {min_value}')
    print()

# 'Test_Score'列に関して統計情報を表示
print_stats('Test_Score_RBF', 'with RBF')
print_stats('Test_Score_LINEAR', 'with LINEAR')

# 'Test_Score_onlyGhost'列に関して統計情報を表示
print_stats('Test_Score_onlyGhost_RBF', 'with only Ghost and RBF')
print_stats('Test_Score_onlyGhost_LINEAR', 'with only Ghost and LINEAR')

# 'Test_Score_old'列に関して統計情報を表示
print_stats('Test_Score_old', 'with old model')

In [None]:
# plt.rcParams["font.size"]=5
# plt.rcParams["figure.figsize"]=(2.0, 1.0)
# plt.rcParams["figure.dpi"]= 300


# # Cの範囲を指定
# C_values = {'C': [0.01, 0.1, 1, 10, 100, 1000, 2000, 3000, 4000, 5000]}
# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# # 結果のデータフレームを初期化
# results = pd.DataFrame(columns=['C_RBF', 'Test_Score_RBF', 'C_LINEAR', 'Test_Score_LINEAR', 
#                                 'C_onlyGhost_RBF', 'Test_Score_onlyGhost_RBF', 'C_onlyGhost_LINEAR', 'Test_Score_onlyGhost_LINEAR',
#                                 'Threshold', 'Test_Score_old'])

# original_X_train, original_X_train_onlyGhost = X_train, X_train_onlyGhost
# original_X_test, original_X_test_onlyGhost = X_test, X_test_onlyGhost
# original_Y_train, original_Y_test = Y_train, Y_test

# original_old_train = train_df5_np
# original_old_test, original_final_QP_test = test_df5_np, FINAL_QP_test

# # k-fold cross-validation
# for fold, (train_ids, test_ids) in enumerate(kfold.split(original_X_train, original_Y_train)):
#     print(f"<Fold-{fold+1}>")
#     print()
    
#     results_old = []

#     # 全体を訓練・検証データとテストデータに分割
#     X_train_val, _ = original_X_train[train_ids], original_X_train[test_ids]
#     X_train_onlyGhost_val, _ = original_X_train_onlyGhost[train_ids], original_X_train_onlyGhost[test_ids]
#     X_train_old_val, _ = original_old_train[train_ids], original_old_train[test_ids]
    
#     # 全体を訓練・検証ラベルとテストラベルに分割
#     Y_train_val, _ = original_Y_train[train_ids], original_Y_train[test_ids]
    
#     # 訓練・検証データ（ラベル）を訓練データ（ラベル）と検証データ（ラベル）に分割
#     X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=600, random_state=42)
#     X_train_onlyGhost, X_val_onlyGhost, Y_train, Y_val = train_test_split(X_train_onlyGhost_val, Y_train_val, test_size=600, random_state=42)
    
#     for fold, (train2_ids, test2_ids) in enumerate(kfold.split(original_X_test, original_Y_test)):
#         _, X_test = original_X_test[train2_ids], original_X_test[test2_ids]
#         _, X_test_onlyGhost = original_X_test_onlyGhost[train2_ids], original_X_test_onlyGhost[test2_ids]
#         _, X_test_old = original_old_test[train2_ids], original_old_test[test2_ids]
        
#         final_QP = original_final_QP_test[test2_ids]
        
#         _, Y_test = original_Y_test[train2_ids], original_Y_test[test2_ids]
        
#         # print(len(X_test), len(X_test_onlyGhost), len(X_test_old), len(Y_test), len(final_QP))
        
#         best_threshold = 0
#         best_accuracy = 0
#         best_predicted_labels = []
#         best_ground_truth_labels = []
    
#         for threshold in np.arange(0.00,1.01,0.01):
#             results_old = [is_double_compressed(X_test_old[i], final_QP[i], threshold) for i in range(60)]
#             predicted_labels = [int(is_double) for is_double in results_old]
#             ground_truth_labels = [label for label in Y_test]
#             accuracy = sum(1 for true_label, pred_label in zip(ground_truth_labels, predicted_labels) if true_label == pred_label) / len(ground_truth_labels)

#             if accuracy > best_accuracy:
#                 best_accuracy = accuracy
#                 best_threshold = threshold
#                 best_predicted_labels = predicted_labels
#                 best_ground_truth_labels = ground_truth_labels

#         best_val_score_RBF, best_svm_model_RBF, best_c_value_RBF = 0, None, None
#         best_val_score_onlyGhost_RBF, best_svm_model_onlyGhost_RBF, best_c_value_onlyGhost_RBF = 0, None, None

#         best_val_score_LINEAR, best_svm_model_LINEAR, best_c_value_LINEAR = 0, None, None
#         best_val_score_onlyGhost_LINEAR, best_svm_model_onlyGhost_LINEAR, best_c_value_onlyGhost_LINEAR = 0, None, None

#         for C_value in C_values['C']:    
#             # SVMモデルのインスタンスを作成
#             svm_model_RBF = SVC(kernel='rbf', C=C_value)
#             svm_model_onlyGhost_RBF = SVC(kernel='rbf', C=C_value)

#             svm_model_LINEAR = SVC(kernel='linear', C=C_value)
#             svm_model_onlyGhost_LINEAR = SVC(kernel='linear', C=C_value)

#             # 訓練データで訓練
#             svm_model_RBF.fit(X_train, Y_train)
#             svm_model_onlyGhost_RBF.fit(X_train_onlyGhost, Y_train)

#             svm_model_LINEAR.fit(X_train, Y_train)
#             svm_model_onlyGhost_LINEAR.fit(X_train_onlyGhost, Y_train)


#             val_accuracy_RBF = accuracy_score(Y_val, svm_model_RBF.predict(X_val))
#             val_accuracy_onlyGhost_RBF = accuracy_score(Y_val, svm_model_onlyGhost_RBF.predict(X_val_onlyGhost))

#             val_accuracy_LINEAR = accuracy_score(Y_val, svm_model_LINEAR.predict(X_val))
#             val_accuracy_onlyGhost_LINEAR = accuracy_score(Y_val, svm_model_onlyGhost_LINEAR.predict(X_val_onlyGhost))


#             # 検証データでの精度が最も高かった場合、そのモデルを保存
#             if val_accuracy_RBF > best_val_score_RBF:
#                 best_val_score_RBF, best_svm_model_RBF, best_c_value_RBF = val_accuracy_RBF, svm_model_RBF, C_value

#             if val_accuracy_onlyGhost_RBF > best_val_score_onlyGhost_RBF:
#                 best_val_score_onlyGhost_RBF, best_svm_model_onlyGhost_RBF, best_c_value_onlyGhost_RBF = val_accuracy_onlyGhost_RBF, svm_model_onlyGhost_RBF, C_value

#             if val_accuracy_LINEAR > best_val_score_LINEAR:
#                 best_val_score_LINEAR, best_svm_model_LINEAR, best_c_value_LINEAR = val_accuracy_LINEAR, svm_model_LINEAR, C_value

#             if val_accuracy_onlyGhost_LINEAR > best_val_score_onlyGhost_LINEAR:
#                 best_val_score_onlyGhost_LINEAR, best_svm_model_onlyGhost_LINEAR, best_c_value_onlyGhost_LINEAR = val_accuracy_onlyGhost_LINEAR, svm_model_onlyGhost_LINEAR, C_value


#         # テストデータで評価
#         test_predictions_RBF = best_svm_model_RBF.predict(X_test)
#         test_predictions_prob_RBF = best_svm_model_RBF.decision_function(X_test)
#         test_accuracy_RBF = accuracy_score(Y_test, test_predictions_RBF)
#         report_RBF = classification_report(Y_test, test_predictions_RBF)
#         print(f'Summary_RBF:\n{report_RBF}')


#         test_predictions_LINEAR = best_svm_model_LINEAR.predict(X_test)
#         test_predictions_prob_LINEAR = best_svm_model_LINEAR.decision_function(X_test)
#         test_accuracy_LINEAR = accuracy_score(Y_test, test_predictions_LINEAR)
#         report_LINEAR = classification_report(Y_test, test_predictions_LINEAR)
#         print(f'Summary_LINEAR:\n{report_LINEAR}')


#         # テストデータで評価
#         test_predictions_onlyGhost_RBF = best_svm_model_onlyGhost_RBF.predict(X_test_onlyGhost)
#         test_predictions_prob_onlyGhost_RBF = best_svm_model_onlyGhost_RBF.decision_function(X_test_onlyGhost)
#         test_accuracy_onlyGhost_RBF = accuracy_score(Y_test, test_predictions_onlyGhost_RBF)
#         report_onlyGhost_RBF = classification_report(Y_test, test_predictions_onlyGhost_RBF)
#         print(f'Summary_onlyGhost_RBF:\n{report_onlyGhost_RBF}')

#         test_predictions_onlyGhost_LINEAR = best_svm_model_onlyGhost_LINEAR.predict(X_test_onlyGhost)
#         test_predictions_prob_onlyGhost_LINEAR = best_svm_model_onlyGhost_LINEAR.decision_function(X_test_onlyGhost)
#         test_accuracy_onlyGhost_LINEAR = accuracy_score(Y_test, test_predictions_onlyGhost_LINEAR)
#         report_onlyGhost_LINEAR = classification_report(Y_test, test_predictions_onlyGhost_LINEAR)
#         print(f'Summary_onlyGhost_LINEAR:\n{report_onlyGhost_LINEAR}')


#         report_old = classification_report(best_ground_truth_labels, best_predicted_labels, labels=[0,1], target_names=['0', '1'], zero_division=0)
#         print(f'Summary old_model:\n{report_old}')

#         # Test結果を保存


#         result_row = {'C_RBF': best_c_value_RBF, 'Test_Score_RBF': test_accuracy_RBF,
#                   'C_LINEAR': best_c_value_LINEAR, 'Test_Score_LINEAR': test_accuracy_LINEAR,
#                   'C_onlyGhost_RBF': best_c_value_onlyGhost_RBF, 'Test_Score_onlyGhost_RBF': test_accuracy_onlyGhost_RBF,
#                   'C_onlyGhost_LINEAR': best_c_value_onlyGhost_LINEAR, 'Test_Score_onlyGhost_LINEAR': test_accuracy_onlyGhost_LINEAR,
#                   'Threshold': best_threshold, 'Test_Score_old': best_accuracy}

#         results = pd.concat([results, pd.DataFrame([result_row])], ignore_index=True)

# # 結果を表示
# print(results)