1. 96개를 다써서 training한다
2. 96개의 데이터셋 중 랜덤하게 10개를 뽑아서 각 technique에 대해 optimal tnc score의 평균을 구한다 -> 최종 Dr technique들의 rank 구하기
3. 2번을 50번 반복 -> (50개의 rank가 생기겠쥬)
4. 50개의 rank에 대해 pairwise하게 rank corrleation 계산 (rank들의 consistency 구하기)
5. 2~4번을 반복하는데 이번에는 그 96개로 train한 각 dataset의 optimal score를 바탕으로 점수들을 normalize해서 똑같이 진행
6. normalize했을때와 안했을때 유의미한 차이가 생기는지 report

In [1]:
# Step 0: Setup constants

INPUT_TYPE = ["dc_5", "nc_3", "nc_5", "nc_10", "nc_25", "nc_30", "nc_50", "nc_75"]
OUTPUT_TYPE = [
    "umato_srho_0",
    "pca_tnc_25",
    "tsne_pr_0",
    "umato_tnc_25",
    "isomap_tnc_25",
    "lle_pr_0",
    "isomap_pr_0",
    "tsne_tnc_25",
    "umap_pr_0",
    "umap_tnc_25",
    "pca_pr_0",
    "lle_tnc_25",
    "umato_pr_0",
]
SCORE_TYPE = [
    "explained_variance_score",
    "max_error",
    "mean_absolute_error",
    "mean_squared_error",
    "root_mean_squared_error",
    "mean_squared_log_error",
    "root_mean_squared_log_error",
    "median_absolute_error",
    "r2_score",
    "mean_poisson_deviance",
    "mean_gamma_deviance",
    "mean_absolute_percentage_error",
    "d2_absolute_error_score",
    "d2_pinball_score",
    "d2_tweedie_score",
]

In [2]:
import os

import pandas as pd

input = pd.read_csv("data/input.csv", index_col=0)
label = pd.read_csv("data/output.csv", index_col=0)

RAND_SEED = 0
RESULT_DIR = "result/application2/"

MODEL_DIR = "pretrained_model/"

if not os.path.exists(RESULT_DIR):
    os.makedirs(RESULT_DIR)
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)

In [3]:
# Step 2: Select 10 samples from the input data and calculate Mean of Optimal TNC Score
import joblib
import numpy as np

for seed in range(50):
    np.random.seed(seed)
    idx = np.random.choice(input.index, 10, replace=False)  # Randomly select 86 samples
    print(f"Seed {seed} - idx: ", idx)

    score_dict = {}

    for t in OUTPUT_TYPE:  # For each Model
        score = 0

        model = joblib.load(f"{MODEL_DIR}/{t}.pkl")
        y_pred = model.predict(input.loc[idx])
        score += y_pred.mean()

        score_dict[t] = score  # store the score in a dictionary
        print(f"\tScore of model {t}: {score}")

    # Rank scores
    score_df = pd.DataFrame(score_dict, index=["score"]).T
    score_df = score_df.sort_values("score", ascending=False)
    # Save df as JSON
    score_df.to_json(f"{RESULT_DIR}/rand-{seed}.json")
    print(score_df.head())

Seed 0 - idx:  ['breast_tissue' 'ecoli' 'hiva' 'secom' 'fetal_health_classification'
 'magic_gamma_telescope' 'spambase' 'cifar10' 'wine' 'pumpkin_seeds']
	Score of model umato_srho_0: 0.8149273309856653
	Score of model pca_tnc_25: 0.9182648601941764
	Score of model tsne_pr_0: 0.7923843702301383
	Score of model umato_tnc_25: 0.9277738513424992
	Score of model isomap_tnc_25: 0.9281607501208782
	Score of model lle_pr_0: 0.7438757816329599
	Score of model isomap_pr_0: 0.8698216893710196
	Score of model tsne_tnc_25: 0.9553808711469174
	Score of model umap_pr_0: 0.6421034961473197
	Score of model umap_tnc_25: 0.9452556381002069
	Score of model pca_pr_0: 0.8726403925567865
	Score of model lle_tnc_25: 0.909860360622406
	Score of model umato_pr_0: 0.7963905140757561
                  score
tsne_tnc_25    0.955381
umap_tnc_25    0.945256
isomap_tnc_25  0.928161
umato_tnc_25   0.927774
pca_tnc_25     0.918265
Seed 0 - idx:  ['human_stress_detection' 'mammographic_mass' 'customer_classification'


In [4]:
# Step 4: Calculate Rank Correlation between ranks

import pandas as pd

aggr_df = pd.DataFrame()

# Load ranks
for seed in range(50):
    rank_path = f"{RESULT_DIR}/rand-{seed}.json"
    score_df = pd.read_json(rank_path)
    aggr_df[seed] = score_df.index

              0              1              2              3              4   \
0    tsne_tnc_25    tsne_tnc_25    tsne_tnc_25    tsne_tnc_25    tsne_tnc_25   
1    umap_tnc_25    umap_tnc_25    umap_tnc_25    umap_tnc_25    umap_tnc_25   
2  isomap_tnc_25  isomap_tnc_25  isomap_tnc_25  isomap_tnc_25  isomap_tnc_25   
3   umato_tnc_25   umato_tnc_25   umato_tnc_25   umato_tnc_25   umato_tnc_25   
4     pca_tnc_25     pca_tnc_25     pca_tnc_25     pca_tnc_25     pca_tnc_25   

              5              6              7              8              9   \
0    tsne_tnc_25    tsne_tnc_25    tsne_tnc_25    tsne_tnc_25    tsne_tnc_25   
1    umap_tnc_25    umap_tnc_25    umap_tnc_25    umap_tnc_25    umap_tnc_25   
2   umato_tnc_25   umato_tnc_25  isomap_tnc_25  isomap_tnc_25  isomap_tnc_25   
3  isomap_tnc_25  isomap_tnc_25   umato_tnc_25   umato_tnc_25   umato_tnc_25   
4     pca_tnc_25     pca_tnc_25     pca_tnc_25     pca_tnc_25     pca_tnc_25   

   ...             40             41  

In [9]:
import pandas as pd
from scipy import stats

# Create an empty DataFrame to store the results
results_df = pd.DataFrame(
    columns=["Spearman correlation", "Kendall tau", "rank1", "rank2"]
)

for i in range(50):
    for j in range(i):
        rankings1 = aggr_df[i]
        rankings2 = aggr_df[j]

        # Calculate Spearman rank-order correlation coefficient
        spearman_corr, _ = stats.spearmanr(rankings1, rankings2)

        # Calculate Kendall's tau
        kendall_tau, _ = stats.kendalltau(rankings1, rankings2)

        # Create a DataFrame with the results
        temp_df = pd.DataFrame(
            {
                "Spearman correlation": [spearman_corr],
                "Kendall tau": [kendall_tau],
                "rank1": [i],
                "rank2": [j],
            }
        )

        # Append the results to the DataFrame
        results_df = pd.concat([results_df, temp_df], ignore_index=True)

# Print the results DataFrame
print(results_df.head())

results_df.to_csv("result/app2_rank-corr.csv")

  results_df = pd.concat([results_df, temp_df], ignore_index=True)


   Spearman correlation  Kendall tau rank1 rank2
0              0.714286     0.564103     1     0
1              0.824176     0.641026     2     0
2              0.978022     0.923077     2     1
3              0.824176     0.641026     3     0
4              0.978022     0.923077     3     1


In [None]:
# Step 5: Normalize the scores with original TNC optimal score
