In [1]:
from __future__ import annotations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from carps.analysis.process_data import get_interpolated_performance_df, load_logs


def print_overview(df_trials: pd.DataFrame) -> None:
    """Print a short summary of a trials frame.

    Prints the column index first, then the distinct values of the
    ``optimizer_id``, ``benchmark_id`` and ``problem_id`` columns (one line
    of output per column), and finally the number of distinct seeds.

    Args:
        df_trials: Frame that contains at least the columns ``optimizer_id``,
            ``benchmark_id``, ``problem_id`` and ``seed``.
    """
    print(df_trials.columns)
    for id_column in ("optimizer_id", "benchmark_id", "problem_id"):
        print(df_trials[id_column].unique())
    n_seeds = df_trials["seed"].nunique()
    print("Number of seeds", n_seeds)


# Directory the run logs are read from (relative to the notebook's working directory).
rundir = "../runs"

# `load_logs` returns two objects, unpacked here as the trial log frame `df`
# and `df_cfg` (presumably the matching run configurations — TODO confirm).
df, df_cfg = load_logs(rundir=rundir)
# Print columns, optimizer/benchmark/problem ids and the number of seeds.
print_overview(df)
# Interpolated performance over the helper's default x column ...
perf = get_interpolated_performance_df(df)
# ... and over the "time_norm" column instead.
perf_time = get_interpolated_performance_df(df, x_column="time_norm")

  df = pd.read_csv(logs_fn)


Index(['n_trials', 'n_function_calls', 'trial_info__config',
       'trial_info__instance', 'trial_info__seed', 'trial_info__budget',
       'trial_info__normalized_budget', 'trial_info__name',
       'trial_info__checkpoint', 'trial_value__cost', 'trial_value__time',
       'trial_value__virtual_time', 'trial_value__status',
       'trial_value__starttime', 'trial_value__endtime', 'seed',
       'benchmark_id', 'problem_id', 'optimizer_id', 'problem.model_id',
       'problem.dataset_id', 'problem.surrogates_dir', 'task.n_trials',
       'task.time_budget', 'task.n_workers', 'task.n_objectives',
       'task.objectives', 'task.is_multifidelity', 'task.fidelity_type',
       'task.min_budget', 'task.max_budget', 'task.has_constraints',
       'task.domain', 'task.objective_function_approximation',
       'task.has_virtual_time', 'task.deterministic', 'task.dimensions',
       'task.search_space_n_categoricals', 'task.search_space_n_ordinals',
       'task.search_space_n_integers', 'tas

In [None]:
# Restrict the data to the multi-fidelity scenario on the HPOBench benchmark.
# `df` and `perf` each get a boolean mask built from their own columns.
ids = df["scenario"].eq("multi-fidelity") & df["benchmark_id"].eq("HPOBench")
ids_ = perf["scenario"].eq("multi-fidelity") & perf["benchmark_id"].eq("HPOBench")

df = df[ids]
# NOTE(review): `ids` was computed from `df`, not from `df_cfg`; this only
# selects the intended rows if both frames share the same index — confirm.
df_cfg = df_cfg[ids]
perf = perf[ids_]

## Installation of SDSSP Heuristics
```bash
git clone https://github.com/frclement/SDSSP_Heuristics.git lib/SDSSP_Heuristics
cd lib/SDSSP_Heuristics

# On the cluster, make sure to have gcc and gsl cblas available
# Those commands are cluster specific
# If you are not on the cluster, install those libraries
ml numlib/GSL/2.7-GCC-13.2.0
ml compiler/GCC/13.2.0

# Compile
gcc shift_TAnobrute.c -o shift -O3 -lm -lgsl -lgslcblas

# Run
./shift pointfile.txt dim npoints kpoints outfile.txt
```
Where `pointfile.txt` is a text file containing the points, one point per line with its coordinates separated by spaces: line 1 is $x_{1,1}\ x_{1,2}\ \dots\ x_{1,d}$, line 2 is $x_{2,1}\ \dots\ x_{2,d}$, and so on.

dim is the dimension

`npoints` is $n$, the number of points we select from

kpoints is the number of points to obtain

outfile.txt is whatever file you want the output to go to

The number of runs for a single instance can be adapted at line 1767 (`nb_tries`; for the moment the loop goes up to 10). From past experience, expect 5-10 minutes per run per instance.

In [2]:
from carps.analysis.run_autorank import get_df_crit
# Input and output files handed to the `shift` binary.
filename = "pointfile.txt"
filename_out = "pointfile_out.txt"

# Build the point table from the blackbox runs only.
# NOTE(review): if `df` was already narrowed to the multi-fidelity scenario by
# an earlier cell, this selection is empty — confirm the intended cell order.
df_crit = get_df_crit(df[df["scenario"]=="blackbox"])

# The installation notes describe the point file as one point per line with
# space-separated coordinates, so write it with a space separator (the
# `to_csv` default is a comma) and without header or index.
df_crit.to_csv(filename, sep=" ", header=False, index=False)

# Arguments follow the usage line `./shift pointfile dim npoints kpoints outfile`:
# dim is the number of columns and npoints the number of rows of the point table.
command = "./shift {filename} {dim} {npoints} {kpoints} {filename_out}".format(
    filename=filename,
    dim=df_crit.shape[1],
    npoints=df_crit.shape[0],
    kpoints=20,
    filename_out=filename_out
)
print(command)

NameError: name 'df_crit' is not defined

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# ax = sns.lineplot(data=perf, x="n_trials_norm", y="trial_value__cost_inc_norm", style="seed", hue="problem_id")
# ax.get_legend().remove()
# plt.show()

In [None]:
import tsfel
from functools import partial
from multiprocessing import Pool

# Feature configuration used by `extract`: every tsfel feature domain.
cfg_file = tsfel.get_features_by_domain()               # All features will be extracted.
# NOTE(review): the assignments below target `cgf_file`, not `cfg_file`, so they
# do not change which features `extract` uses — confirm whether this is a typo.
cgf_file = tsfel.get_features_by_domain("statistical")  # Statistical-domain configuration (not read by `extract`)
# cgf_file = tsfel.get_features_by_domain("temporal")     # All temporal domain features will be extracted
# cgf_file = tsfel.get_features_by_domain("spectral") 

# Number of points every trajectory must have to be used as a signal.
window_size = 21
groups = list(perf.groupby(by=["optimizer_id", "problem_id", "seed"])["trial_value__cost_inc_norm"])

# Keep only trajectories with exactly `window_size` points. The group ids are
# filtered together with the signals so that `gids[i]` always describes row i
# of `signal_windows`; filtering only the signals would shift the ids as soon
# as one trajectory is dropped.
complete_groups = [
    (gid, series.to_numpy()) for gid, series in groups if len(series) == window_size
]
gids = [gid for gid, _ in complete_groups]
signal_windows = np.array([signal for _, signal in complete_groups])
print(signal_windows.shape)

def extract(signal_windows):
    """Run the tsfel feature extractor on ``signal_windows``.

    Uses the module-level ``cfg_file`` feature configuration and
    ``window_size``, and returns whatever
    ``tsfel.time_series_features_extractor`` returns for that input.
    """
    extractor_options = {
        "dict_features": cfg_file,
        "signal_windows": signal_windows,
        "fs": None,
        "window_size": window_size,
        "verbose": 0,
        "n_jobs": None,
    }
    return tsfel.time_series_features_extractor(**extractor_options)

# Run `extract` on every entry of `signal_windows` in a pool of four worker
# processes and stack the per-signal results into a single array.
with Pool(processes=4) as pool:
    X_train = np.concatenate(pool.map(extract, signal_windows))

X_train.shape

In [None]:
# Persist the extracted feature matrix to disk as CSV.
pd.DataFrame(data=X_train).to_csv(path_or_buf="tsfeatures.csv")

In [None]:
# Show which benchmarks are present in the performance frame.
perf.loc[:, "benchmark_id"].unique()

In [None]:
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

# Number of KMeans clusters.
n_clusters = 20

# Report whether the raw feature matrix contains any NaN.
print(np.isnan(X_train).any())

# Replace NaNs by the column mean, then rescale with sklearn's `normalize`
# using its default settings.
X_clean = normalize(
    SimpleImputer(missing_values=np.nan, strategy="mean").fit_transform(X_train)
)

# Project the cleaned features onto two principal components.
X_pca = PCA(n_components=2).fit_transform(X_clean)
print(X_pca.shape)

# Cluster in the 2-D PCA space and assign every sample to a cluster.
kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
kmeans.fit(X_pca)
clusters = kmeans.predict(X_pca)

centers = kmeans.cluster_centers_
print(centers.shape)

def idx_min_dist_to_center(P, c):
    """Return the index of the row of ``P`` that lies closest to ``c``.

    Closeness is the squared Euclidean distance, i.e. the sum over axis 1 of
    the squared differences ``P - c``. The result is what ``np.argmin``
    returns for those distances.
    """
    squared_distances = np.square(P - c).sum(axis=1)
    return np.argmin(squared_distances)


# For every cluster centre, find the sample that lies closest to it in PCA space.
ids = [idx_min_dist_to_center(X_pca, c) for c in centers]
closest_points = X_pca[ids]
# Print the (optimizer_id, problem_id, seed) group ids of those samples.
# NOTE(review): assumes `gids[i]` describes row i of `X_pca` — confirm.
print(np.array(gids)[ids])

# Scatter plot of all samples, the cluster centres and the representative
# samples; labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, label="samples (coloured by cluster)")
ax.scatter(centers[:, 0], centers[:, 1], color="black", label="cluster centres")
ax.scatter(closest_points[:, 0], closest_points[:, 1], color="red", label="sample closest to each centre")
ax.set_xlabel("PCA component 1")
ax.set_ylabel("PCA component 2")
ax.set_title("KMeans clusters of the time-series features in PCA space")
ax.legend()
plt.show()