(Step 1) 96개의 데이터셋 중 86개를 랜덤으로 뽑아 cross validation 써서 t-SNE의 tnc_25 optimal score 예측하는 모델 학습 \
(Step 2) 나머지 10개의 데이터셋의 optimal score를 예측 \
(Step 3-A) Bayesian optimization을 총 50회 (init_points: 10, n_iter: 40) 수행하여 10개의 데이터셋 각각에 대해 t-SNE를 돌려서 최적 score 구하기 (이때 dataset은 있는 그대로 사용하고, standardization 같은 전처리는 하면 안 됨) \
(Step 3-B) Step 3-A와 같은 과정을 진행하되, 이번에는 Bayesian optimization이 Step 2에서 예측한 optimal score 이상의 성능을 기록하는 경우 iteration을 중단하기 \
(Step 4) Step 3-A와 Step 3-B의 실행 시간과 Iteration에 따라 optimal score가 어떻게 업데이트되는지 기록 \
(Step 5) Step 1~4를 다른 난수 시드로 5번 반복 \
(Step 6) 반복 결과를 조합하여 Step 3-A와 Step 3-B의 실행 시간 / 최종 optimal score에 유의미한 차이가 존재하는지 분석

In [1]:
# Step 0: Shared constants used throughout the notebook.

# Presumably the feature columns of the input table (not referenced in the
# visible cells) -- TODO confirm against input.csv.
INPUT_TYPE = [
    "dc_5",
    "nc_3", "nc_5", "nc_10", "nc_25", "nc_30", "nc_50", "nc_75",
]

# Known prediction targets; the target selected later must be one of these.
# The order is kept exactly as originally declared.
OUTPUT_TYPE = [
    "umato_srho_0", "pca_tnc_25", "tsne_pr_0", "umato_tnc_25",
    "isomap_tnc_25", "lle_pr_0", "isomap_pr_0", "tsne_tnc_25",
    "umap_pr_0", "umap_tnc_25", "pca_pr_0", "lle_tnc_25",
    "umato_pr_0",
]

# Names of regression metrics (they look like scikit-learn metric names;
# not referenced in the visible cells).
SCORE_TYPE = [
    "explained_variance_score", "max_error",
    "mean_absolute_error", "mean_squared_error", "root_mean_squared_error",
    "mean_squared_log_error", "root_mean_squared_log_error",
    "median_absolute_error", "r2_score",
    "mean_poisson_deviance", "mean_gamma_deviance",
    "mean_absolute_percentage_error",
    "d2_absolute_error_score", "d2_pinball_score", "d2_tweedie_score",
]

In [4]:
import os

import pandas as pd

# Load the per-dataset feature table and the per-dataset score table; both are
# indexed by their first CSV column.
# NOTE(review): `input` shadows the Python builtin of the same name. The name is
# kept because later cells index into it; rename it notebook-wide when convenient.
input = pd.read_csv("../../data/input.csv", index_col=0)
label = pd.read_csv("../../data/output.csv", index_col=0)

# Seed that controls the train / hold-out split; results are stored per seed.
RAND_SEED = 0
RESULT_DIR = f"result/application/{RAND_SEED}"

# Target column to predict: must be a known output type AND present in the
# loaded score table, otherwise fail here rather than deep inside a later cell.
t = "tsne_tnc_25"
assert t in OUTPUT_TYPE, f"unknown output type: {t}"
assert t in label.columns, f"column {t!r} not found in output.csv"
MODEL_DIR = "pretrained_model/application/"

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

In [5]:
import numpy as np

# Seed NumPy's global generator so the train / hold-out split is reproducible.
np.random.seed(RAND_SEED)

# Draw 86 distinct dataset names (no replacement) to train on; the names that
# are not drawn form the hold-out set used in the later steps.
idx = np.random.choice(input.index, size=86, replace=False)
print(idx)

['breast_tissue' 'ecoli' 'hiva' 'secom' 'fetal_health_classification'
 'magic_gamma_telescope' 'spambase' 'cifar10' 'wine' 'pumpkin_seeds'
 'date_fruit' 'image_segmentation' 'diabetic_retinopathy_debrecen'
 'heart_disease' 'birds_bones_and_living_habits' 'planning_relax'
 'letter_recognition' 'dry_bean' 'durum_wheat_features' 'coil20'
 'dermatology' 'extyaleb' 'fashion_mnist' 'banknote_authentication'
 'seeds' 'ionosphere' 'zoo' 'insurance_company_benchmark'
 'sentiment_labeld_sentences' 'world12d' 'hepatitis'
 'pima_indians_diabetes_database' 'boston' 'water_quality'
 'heart_attack_analysis_prediction_dataset' 'olivetti_faces'
 'classification_in_asteroseismology' 'breast_cancer_wisconsin_original'
 'weather' 'labeled_faces_in_the_wild' 'breast_cancer_coimbra'
 'website_phishing' 'harbermans_survival' 'har' 'spectf_heart'
 'wireless_indoor_localization' 'hate_speech' 'human_stress_detection'
 'mobile_price_classification' 'student_grade'
 'blood_transfusion_service_center' 'imdb'
 'br

In [6]:
import autosklearn.regression
import joblib
import sklearn
import sklearn.metrics  # `import sklearn` alone does not guarantee the submodule is loaded

# Step 1: Select 86 samples out of whole dataset and Train the model
# X holds the feature rows of the training datasets, y the target score `t`.
X = input.loc[idx, :]
y = label.loc[idx, t]

# Auto-sklearn search with 5-fold cross-validation as the resampling strategy.
# Per the auto-sklearn API, the two time limits are in seconds and the memory
# limit is in MB.
reg = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=600,
    per_run_time_limit=30,
    memory_limit=10000,
    resampling_strategy="cv",
    resampling_strategy_arguments={"folds": 5},
)
reg.fit(X, y)

# Persist the fitted model so the prediction step can reload it.
# NOTE(review): the file name does not include RAND_SEED, so training with a
# different seed overwrites the model of the previous split.
joblib.dump(reg, f"{MODEL_DIR}/{t}.pkl")
print(reg.leaderboard())

# R2 on the training data itself (a fit sanity check, not a generalisation
# estimate). Stored under its own name so it cannot clobber the hold-out
# predictions `pred` that the early-stopping step indexes with `data_idx`.
train_pred = reg.predict(X)
print(f"{t} - R2:", sklearn.metrics.r2_score(y, train_pred))

ModuleNotFoundError: No module named 'autosklearn'

In [5]:
import joblib
import sklearn
import sklearn.metrics  # `import sklearn` alone does not guarantee the submodule is loaded

# Step 2: Predict the Optimal Score with 10 samples
# Hold-out set = every dataset name that was not drawn for training.
# Index.difference returns the names in sorted order.
idx_not_trained = input.index.difference(idx)
print(f"Predicting {t} with {len(idx_not_trained)} samples")

X_not_trained = input.loc[idx_not_trained, :]
y_not_trained = label.loc[idx_not_trained, t]

# NOTE(review): joblib.load unpickles arbitrary objects -- only load model
# files produced by this project.
# NOTE(review): the model path is not keyed by RAND_SEED; if the training cell
# was not re-run for the current seed, this loads a model fitted on a
# different split, which may include the "hold-out" datasets.
reg = joblib.load(f"{MODEL_DIR}/{t}.pkl")

# `pred` is positionally aligned with idx_not_trained; the early-stopping step
# reads pred[data_idx] as its stopping threshold.
pred = reg.predict(X_not_trained)
print(f"{t} - R2:", sklearn.metrics.r2_score(y_not_trained, pred))
print(f"{t} - prediction:", pred)
print(f"{t} - actual:", y_not_trained)

Predicting tsne_tnc_25 with 10 samples


KeyboardInterrupt: 

In [6]:
# Step 3-A: Bayesian Optimization
import json
import time

from bayes_opt import BayesianOptimization
from sklearn.manifold import TSNE
from zadu import zadu

import reader as rd

# Load one held-out dataset for the optimization experiment.
# data_idx is the position inside idx_not_trained of the dataset to use.
data_idx = 0
data_name = idx_not_trained[data_idx]
# NOTE(review): rd.read_dataset is a project helper not shown here; presumably
# it returns (feature matrix, class labels) for the named dataset under
# "labeled-datasets" -- confirm against reader.py. `label_` is not used in the
# cells visible here.
data, label_ = rd.read_dataset(data_name, "labeled-datasets")


# Define the function to optimize
def optimize_tsne(perplexity):
    """Objective for Bayesian optimization: quality of a t-SNE embedding.

    Embeds the module-level ``data`` with t-SNE at the given perplexity and
    scores the embedding with ZADU's trustworthiness & continuity measure
    (k=25), combined into their harmonic mean.

    Parameters
    ----------
    perplexity : float
        Candidate t-SNE perplexity proposed by the optimizer.

    Returns
    -------
    float
        Harmonic mean of trustworthiness and continuity (0.0 if both are 0).
    """
    # scikit-learn requires perplexity < n_samples and raises otherwise. The
    # search range goes up to 500, so clamp for datasets with fewer samples
    # instead of crashing in the middle of an optimization run.
    perplexity = min(perplexity, len(data) - 1)

    # Fixed seed: makes the objective a deterministic function of perplexity,
    # so runs that probe the same perplexity are directly comparable.
    model = TSNE(perplexity=perplexity, random_state=RAND_SEED)

    # Fit and transform the data
    X_transformed = model.fit_transform(data)

    # Calculate the score: trustworthiness & continuity over 25 neighbours.
    spec = [{"id": "tnc", "params": {"k": 25}}]
    score_module = zadu.ZADU(spec, data, return_local=True)
    # The per-point (local) scores are returned as well but are not needed.
    score, _ = score_module.measure(X_transformed)
    tr = score[0]["trustworthiness"]
    cn = score[0]["continuity"]

    # Harmonic mean; guard the degenerate case where both terms are zero.
    if tr + cn == 0:
        return 0.0
    return 2 * tr * cn / (tr + cn)


# Search range for the t-SNE perplexity (also reused by the early-stopping run).
pbounds = {"perplexity": (2, 500)}

# Create the optimizer
optimizer = BayesianOptimization(
    f=optimize_tsne,
    pbounds=pbounds,
    random_state=1,
)
print("Initialized")

# perf_counter is a monotonic, high-resolution clock intended for measuring
# durations; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
# Optimize: 10 random initial probes followed by 40 guided iterations.
optimizer.maximize(
    init_points=10,
    n_iter=40,
)
exec_time = time.perf_counter() - start_time

# Print the best result
print(optimizer.max)

# Step 4: Save the result
# One entry per evaluation (keyed by evaluation order) plus the wall time.
scores = {}
for i, res in enumerate(optimizer.res):
    print("Iteration {}: \n\t{}".format(i, res))
    scores[i] = res
scores["total_time"] = exec_time

# NOTE(review): RESULT_DIR already ends with the seed, so the seed appears
# twice in this path. Kept unchanged so the "_A" and "_B" result files stay in
# the same directory.
out_dir = f"{RESULT_DIR}/{RAND_SEED}"
os.makedirs(out_dir, exist_ok=True)  # idempotent, no check-then-create race

out_path = f"{out_dir}/{data_name}_A.json"
with open(out_path, "w") as f:
    json.dump(scores, f, indent=4)

print("Save to " + out_path)

Initialized
|   iter    |  target   | perple... |
-------------------------------------
| [0m1        [0m | [0m0.9592   [0m | [0m209.7    [0m |
| [0m2        [0m | [0m0.9556   [0m | [0m360.7    [0m |
| [0m3        [0m | [0m0.929    [0m | [0m2.057    [0m |
| [95m4        [0m | [95m0.966    [0m | [95m152.6    [0m |
| [0m5        [0m | [0m0.965    [0m | [0m75.08    [0m |
| [0m6        [0m | [0m0.9634   [0m | [0m47.98    [0m |
| [0m7        [0m | [0m0.9647   [0m | [0m94.76    [0m |
| [0m8        [0m | [0m0.9659   [0m | [0m174.1    [0m |
| [0m9        [0m | [0m0.9655   [0m | [0m199.6    [0m |


In [None]:
# Step 3-B: Do the same thing with 3-A, but stop iteration when it achieves the optimal score
# Guard against hidden state: `pred` must be the hold-out predictions (one per
# entry of idx_not_trained), otherwise pred[data_idx] is a meaningless threshold.
assert len(pred) == len(idx_not_trained), "`pred` is not the hold-out prediction array"
optimal_score = pred[data_idx]

# Create the optimizer (same objective, bounds and seed as the full run)
optimizer_dr = BayesianOptimization(
    f=optimize_tsne,
    pbounds=pbounds,
    random_state=1,
)

n_init_points = 10  # random initial probes
n_bo_iter = 40  # guided iterations

start_time = time.perf_counter()

# Manual optimization loop with stopping criterion.
# Exactly ONE evaluation per pass: a random probe during the first
# n_init_points passes, a guided suggestion afterwards. Calling
# maximize(init_points=1, n_iter=1) would run two evaluations per pass, giving
# 60 evaluations in total with random and guided probes interleaved, which
# does not match the 10 + 40 budget of the full run.
for i in range(n_init_points + n_bo_iter):
    if i < n_init_points:
        optimizer_dr.maximize(init_points=1, n_iter=0)
    else:
        optimizer_dr.maximize(init_points=0, n_iter=1)
    # Stop as soon as the best score so far reaches the predicted optimum.
    if optimizer_dr.max["target"] >= optimal_score:
        break

exec_time = time.perf_counter() - start_time

# Print the best result
print(optimizer_dr.max)

# Step 4: Save the result
# One entry per evaluation (keyed by evaluation order) plus the wall time.
scores = {}
for i, res in enumerate(optimizer_dr.res):
    print("Iteration {}: \n\t{}".format(i, res))
    scores[i] = res
scores["total_time"] = exec_time

# Create the output directory here as well so this cell does not depend on
# another cell having created it.
out_dir = f"{RESULT_DIR}/{RAND_SEED}"
os.makedirs(out_dir, exist_ok=True)

out_path = f"{out_dir}/{data_name}_B.json"
with open(out_path, "w") as f:
    json.dump(scores, f, indent=4)

print("Save to " + out_path)

(10,)
