In [None]:
import numbers

import pandas as pd
from pflacco.classical_ela_features import *
from pflacco.sampling import create_initial_sample
from umap import UMAP
from experiments.datasets import DATASETS
from distribution_optimization_py.problem import ScaledGaussianMixtureProblemForELA

def get_ela_features(
    problem,
    lower_bound: float = -5,
    upper_bound: float = 5,
    sample_seed: int | None = None,
    ic_seed: int | None = 100,
):
    """Compute classical ELA features of ``problem`` from one initial sample.

    An initial sample is drawn inside ``[lower_bound, upper_bound]`` in every
    dimension, ``problem`` is evaluated on each sampled row, and the pflacco
    feature sets (information content, meta model, y-distribution, nearest
    better clustering, dispersion, levelset) are computed on that sample.

    Args:
        problem: Callable objective. It must expose ``lower`` (its length is
            used as the problem dimension) and accept one row of the sample.
        lower_bound: Lower sampling bound (default ``-5``).
        upper_bound: Upper sampling bound (default ``5``).
        sample_seed: Seed for the initial sample. ``None`` (default) leaves the
            sample unseeded; pass an integer to make the features reproducible.
        ic_seed: Seed handed to ``calculate_information_content`` (default
            ``100``).

    Returns:
        A single-row ``pandas.DataFrame`` (index ``[0]``) holding all feature
        values plus a ``"dim"`` column with the problem dimension.
    """
    dim = len(problem.lower)

    # Forward the seed only when one was requested, so the default call is
    # exactly the unseeded call.
    sample_kwargs = {} if sample_seed is None else {"seed": sample_seed}
    X = create_initial_sample(
        dim, lower_bound=lower_bound, upper_bound=upper_bound, **sample_kwargs
    )
    y = X.apply(lambda x: problem(x), axis=1)

    ela_meta = calculate_ela_meta(X, y)
    ela_distr = calculate_ela_distribution(X, y)
    ela_level = calculate_ela_level(X, y)
    nbc = calculate_nbc(X, y)
    disp = calculate_dispersion(X, y)
    ic = calculate_information_content(X, y, seed=ic_seed)

    # The merge order fixes the column order of the resulting frame (and of any
    # CSV written from it), so it is kept stable.
    features = {
        **ic,
        **ela_meta,
        **ela_distr,
        **nbc,
        **disp,
        **ela_level,
        "dim": dim,
    }
    return pd.DataFrame(features, index=[0])

def calculate_gaussian_ela_features() -> None:
    """Compute ELA features for every entry of ``DATASETS`` and save them as CSV.

    One ``ScaledGaussianMixtureProblemForELA`` is built per dataset from its
    ``data`` and ``nr_of_modes``. The resulting feature row is tagged with the
    dataset name as ``fid`` and a constant ``iid`` of 1, and all rows are
    written to ``gaussian_ela_features.csv`` without an index column.
    """

    def _feature_row(dataset):
        # One feature row for a single dataset, tagged with its identifiers.
        mixture_problem = ScaledGaussianMixtureProblemForELA(
            dataset.data, dataset.nr_of_modes
        )
        return get_ela_features(mixture_problem).assign(fid=dataset.name, iid=1)

    feature_rows = [_feature_row(dataset) for dataset in DATASETS]
    pd.concat(feature_rows).to_csv("gaussian_ela_features.csv", index=False)

In [None]:
# calculate_gaussian_ela_features() writes this file with index=False, so it has
# no index column. Reading it with index_col=0 would turn the first feature
# column into the index and drop it from the feature set.
gaussian_features_df = pd.read_csv("gaussian_ela_features.csv", index_col=None)

In [None]:
# Every CSV column is kept as data; no column is used as the index (the default).
bbob_features_df = pd.read_csv("ela_features.csv")

In [None]:
PROBLEM_CLASSES = [
    "separable",
    "separable",
    "separable",
    "separable",
    "separable",
    "low-conditioning",
    "low-conditioning",
    "low-conditioning",
    "low-conditioning",
    "unimodal",
    "unimodal",
    "unimodal",
    "unimodal",
    "unimodal",
    "multimodal-adequate",
    "multimodal-adequate",
    "multimodal-adequate",
    "multimodal-adequate",
    "multimodal-adequate",
    "multimodal-weak",
    "multimodal-weak",
    "multimodal-weak",
    "multimodal-weak",
    "multimodal-weak",
]


def get_problem_class(fid: str | int) -> str:
    if isinstance(fid, int):
        return PROBLEM_CLASSES[fid - 1]
    return "distribution-optimization"


# Stack both feature tables, then label every row with the problem class
# derived from its "fid" value.
all_features_df = pd.concat([bbob_features_df, gaussian_features_df]).assign(
    **{"class": lambda stacked: [get_problem_class(fid) for fid in stacked["fid"]]}
)

UMAP visualization:

In [None]:
import plotly.express as px

# Identifier/label columns are not embedded. "ela_level.lda_qda_25" is excluded
# as well (NOTE(review): presumably because it is not usable for every row - confirm).
excluded_columns = ["fid", "iid", "dim", "class", "ela_level.lda_qda_25"]
embedding_input = all_features_df.drop(columns=excluded_columns)

# Two-dimensional embedding with a fixed random state.
umap = UMAP(n_components=2, random_state=42)
umap_all = umap.fit_transform(embedding_input)

# Attach the labels positionally (.values), ignoring the index of all_features_df.
umap_df = pd.DataFrame(umap_all, columns=["UMAP-1", "UMAP-2"])
umap_df["class"] = all_features_df["class"].values

# One fixed colour per problem class.
color_map = {
    "distribution-optimization": "red",
    "low-conditioning": "green",
    "separable": "pink",
    "unimodal": "orange",
    "multimodal-adequate": "blue",
    "multimodal-weak": "purple",
}

fig = px.scatter(
    umap_df,
    x="UMAP-1",
    y="UMAP-2",
    color="class",
    color_discrete_map=color_map,
    title="UMAP Projection of ELA Features with Problem Class Labels",
    width=1200,  # pixels
    height=1200,  # pixels
)

fig.show()

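Sanity check of the classification pipeline: train on all BBOB instances except one held-out instance (`iid == 20`) and show the confusion matrix of the predicted problem classes for that held-out instance: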

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Rows of one instance id form the test set; all other instances are used for training.
SELECTED_IID = 20
X = bbob_features_df.drop(columns=["ela_level.lda_qda_25"])
is_held_out = X["iid"] == SELECTED_IID
train_rows = X[~is_held_out]
test_rows = X[is_held_out]

# Labels are derived from "fid" before the identifier columns are removed.
identifier_columns = ["fid", "iid"]
y_train = list(map(get_problem_class, train_rows["fid"]))
y_test = list(map(get_problem_class, test_rows["fid"]))
X_train = train_rows.drop(columns=identifier_columns)
X_test = test_rows.drop(columns=identifier_columns)

# Guard against misaligned labels and against the label leaking into the features.
assert len(y_train) == len(X_train)
assert "class" not in X_train.columns
assert "class" not in X_test.columns

# Standardise, keep the 10 features with the best ANOVA F-score, then classify.
pipeline = make_pipeline(
    StandardScaler(),
    SelectKBest(score_func=f_classif, k=10),
    RandomForestClassifier(random_state=42),
)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
confusion_matrix(y_test, y_pred)

Predicting the BBOB problem class of each distribution-optimization problem (trained on all BBOB features):

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Train on every BBOB row; the target is the problem class derived from "fid".
non_feature_columns = ["fid", "iid", "ela_level.lda_qda_25"]
X_train = bbob_features_df.drop(columns=non_feature_columns)
y_train = list(map(get_problem_class, bbob_features_df["fid"]))

# Use exactly the training columns, in the same order, for the Gaussian problems.
X_test = gaussian_features_df.drop(columns=non_feature_columns).loc[:, X_train.columns]

# Guard against misaligned labels and against the label leaking into the features.
assert len(y_train) == len(X_train)
assert "class" not in X_train.columns
assert "class" not in X_test.columns

# Standardise, keep the 10 features with the best ANOVA F-score, then classify.
pipeline = make_pipeline(
    StandardScaler(),
    SelectKBest(score_func=f_classif, k=10),
    RandomForestClassifier(random_state=42),
)
pipeline.fit(X_train, y_train)

pipeline.predict(X_test)

Predicting the BBOB function id (`fid`) for each distribution-optimization problem (trained on all BBOB features):

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Train on every BBOB row; the target is the function id itself.
non_feature_columns = ["fid", "iid", "ela_level.lda_qda_25"]
X_train = bbob_features_df.drop(columns=non_feature_columns)
y_train = bbob_features_df["fid"]

# Use exactly the training columns, in the same order, for the Gaussian problems.
X_test = gaussian_features_df.drop(columns=non_feature_columns).loc[:, X_train.columns]

# Guard against misaligned labels and against label columns leaking into the features.
assert len(y_train) == len(X_train)
assert "class" not in X_train.columns
assert "class" not in X_test.columns
assert "fid" not in X_train.columns
assert "fid" not in X_test.columns

# Standardise, keep the 10 features with the best ANOVA F-score, then classify.
pipeline = make_pipeline(
    StandardScaler(),
    SelectKBest(score_func=f_classif, k=10),
    RandomForestClassifier(random_state=42),
)
pipeline.fit(X_train, y_train)

pipeline.predict(X_test)