In [2]:
import sys
import numpy as np
import pandas as pd
import itertools
import pickle
from natsort import natsort_keygen
import fanova

In [4]:
# Directory that receives the generated result files.
save_path = "Results"

# Experiment grid.
algorithm_names = ["modCMA-ES", "modDE"]
dimensions = [5, 30]
# Budgets are fixed multiples of the problem dimension.
budgets = {
    dim: [multiplier * dim for multiplier in (50, 100, 300, 500, 1000, 1500)]
    for dim in dimensions
}
targets = ["log1"]
# Ids 1..24 (the upper bound of np.arange is exclusive).
f_ids = np.arange(1, 25)
# Number of configurations available per algorithm.
n_configurations = {"modCMA-ES": 324, "modDE": 576}

In [5]:
def load_and_preprocess_hyperparameters(data_path: str, algorithm_name: str = None):
    """Load a configuration grid and encode its categorical values as integers.

    Parameters
    ----------
    data_path : str
        Path to the configuration-grid CSV. Its first column is read as the
        index and exposed as ``configuration_id``.
    algorithm_name : str, optional
        Key selecting which value-to-int mapping to apply ("modCMA-ES" or
        "modDE"). When omitted, the module-level variable ``algorithm_name``
        is used, so existing single-argument calls keep working.

    Returns
    -------
    tuple
        ``(hyperparameter_values_to_int_map, data)`` where the first item is
        the full mapping for all algorithms and ``data`` is an integer
        DataFrame indexed by ``configuration_id``.

    Raises
    ------
    NameError
        If no algorithm name is passed and no module-level one exists.
    KeyError
        If the algorithm name has no mapping defined here.
    AssertionError
        If NaN values remain after reading, or appear after mapping (i.e. a
        value in the file is not covered by the mapping).
    """
    hyperparameter_values_to_int_map = {
        "modCMA-ES": {
            "base_sampler": {'gaussian': 0, 'sobol': 1, 'halton': 2},
            "elitist": {False: 0, True: 1},
            "local_restart": {'None': 0, 'IPOP': 1, 'BIPOP': 2},
            "mirrored": {'None': 0, 'mirrored': 1, 'mirrored pairwise': 2},
            "step_size_adaptation": {'csa': 0, 'psr': 1},
            "weights_option": {'default': 0, 'equal': 1, '1/2^lambda': 2},
        },
        "modDE": {
            "adaptation_method": {'None': 0, 'shade': 1, 'jDE': 2},
            "crossover": {'bin': 0, 'exp': 1},
            "lpsr": {False: 0, True: 1},
            "mutation_base": {'rand': 0, 'best': 1, 'target': 2},
            "mutation_n_comps": {1: 0, 2: 1},
            "mutation_reference": {'None': 0, 'pbest': 1, 'best': 2, 'rand': 3},
            "use_archive": {False: 0, True: 1},
        },
    }

    # Resolve the algorithm explicitly instead of silently depending on a
    # global; the global is only a fallback for older call sites.
    if algorithm_name is None:
        algorithm_name = globals().get("algorithm_name")
        if algorithm_name is None:
            raise NameError(
                "algorithm_name was not passed and no module-level 'algorithm_name' is defined."
            )
    if algorithm_name not in hyperparameter_values_to_int_map:
        raise KeyError(
            f"Unknown algorithm_name {algorithm_name!r}; expected one of "
            f"{list(hyperparameter_values_to_int_map)}."
        )

    # Read data from csv file
    data = pd.read_csv(data_path, index_col=0).reset_index().rename(columns={"index": "configuration_id"})
    # Missing cells become the literal string "None", which the mappings
    # above translate to 0 for the columns that list it.
    data = data.fillna("None")
    assert data.isnull().sum().sum() == 0, f"NaN values exist in the DataFrame. Please check the data after reading. \n{data[data.isnull().any(axis=1)]}."
    # Map values to int; values missing from a mapping turn into NaN and are
    # caught by the assertion below.
    for key, value in hyperparameter_values_to_int_map[algorithm_name].items():
        data[key] = data[key].map(value)
    assert data.isnull().sum().sum() == 0, f"NaN values exist in the DataFrame. Please check the data after mapping. \n{data[data.isnull().any(axis=1)]}."
    # Set configuration_id as index
    data = data.set_index("configuration_id")

    return hyperparameter_values_to_int_map, data.astype('int')

In [6]:
def load_and_preprocess_performance_data(data_path: str, budget: int, dimension: int, n_configurations:int):
    """Read one performance CSV per configuration and average each file to a single row.

    Files are expected at
    ``{data_path}/budget_{budget}_conf_{configuration_id}_{dimension}D.csv``
    for ``configuration_id`` in ``range(n_configurations)``.

    Returns a DataFrame with one row per configuration: the column-wise mean
    of that configuration's file plus a ``configuration_id`` column.
    """
    def _mean_row(conf_id):
        # Tag every row with its configuration, then collapse the file to its mean.
        csv_file = f"{data_path}/budget_{budget}_conf_{conf_id}_{dimension}D.csv"
        runs = pd.read_csv(csv_file).assign(configuration_id=conf_id)
        return runs.groupby(["configuration_id"]).mean().reset_index()

    per_configuration = [_mean_row(conf_id) for conf_id in range(n_configurations)]
    return pd.concat(per_configuration, ignore_index=True)

In [7]:
def prepare_dataset(hyperparameters: pd.DataFrame, algorithm_performance: pd.DataFrame):
    """Join hyperparameters with performance and split into features and target.

    Both inputs are merged on ``configuration_id``, which then becomes the
    index. Returns ``(X, y)``: ``y`` is the single-column ``target`` frame and
    ``X`` holds every other column, ordered alphabetically by column name.
    A preview of the merged data and of ``X`` is printed.
    """
    merged = hyperparameters.merge(algorithm_performance, on="configuration_id")
    merged = merged.set_index(["configuration_id"])
    print("\ndataset: ")
    print(merged.head())
    print(merged.shape)

    # Response kept as a one-column DataFrame; features get a stable column order.
    response = merged[["target"]]
    features = merged.drop(columns="target").sort_index(axis=1)

    print("\nX: ")
    print(features.head())
    print(features.shape)

    return features, response

In [8]:
def calculate_hyperparameter_importance(X: pd.DataFrame, y: pd.DataFrame):
    """Quantify fANOVA importance for single hyperparameters, pairs and triples.

    A ``fanova.fANOVA`` model is fitted on ``X.values`` / ``y.values`` with a
    fixed seed. Importance is queried for every subset of 1, 2 and 3 column
    positions of ``X``, in that order; each queried subset is printed.

    Returns ``(f, data_df)``: the fitted fANOVA object and a DataFrame with one
    row per subset. Columns ``level_0``..``level_2`` hold the column names of
    the subset members (empty string where the subset is smaller), and rows are
    sorted by ``"individual importance"`` in descending order.
    """
    forest = fanova.fANOVA(X.values, y.values, n_trees=100, bootstrapping=True, seed=42)

    column_ids = list(range(X.shape[1]))
    importance_by_subset = {}
    for subset_size in (1, 2, 3):
        for subset in itertools.combinations(column_ids, subset_size):
            # Single hyperparameters are logged as a bare index, larger subsets as tuples.
            print(subset[0] if subset_size == 1 else subset)
            importance_by_subset[subset] = forest.quantify_importance(subset)[subset]

    # One row per subset; the tuple keys become the level_* columns.
    importance_table = pd.DataFrame(importance_by_subset).T.reset_index()
    # Keys are floats because the shorter tuples leave gaps in the level_* columns.
    id_to_name = {float(position): name for position, name in enumerate(X.columns.tolist())}
    for level in ("level_0", "level_1", "level_2"):
        importance_table[level] = importance_table[level].map(id_to_name)
    importance_table = importance_table.fillna("")

    importance_table = importance_table.sort_values(by="individual importance", ascending=False)

    return forest, importance_table

In [9]:
# Driver: for every algorithm, dimension, target and budget, build the dataset,
# compute hyperparameter importance and write one CSV per combination.
for algorithm_name in algorithm_names:
    print(f"\nalgorithm_name: {algorithm_name}")
    hyperparameter_values_to_int_map, hyperparameters = load_and_preprocess_hyperparameters(
        data_path=f"Data/{algorithm_name}_conf_grid.csv"
    )
    print("\nhyperparameters: ")
    print(hyperparameters)

    for dimension, target in itertools.product(dimensions, targets):
        print(f"\ndimension: {dimension}")
        print(f"target: {target}")
        for budget in budgets[dimension]:
            print(f"budget: {budget}")
            algorithm_performance = load_and_preprocess_performance_data(
                f"Data/{algorithm_name}/{target}",
                budget,
                dimension,
                n_configurations[algorithm_name],
            )
            print("\nalgorithm_performance:")
            print(algorithm_performance)

            # Prepare dataset. prepare_dataset takes exactly two arguments; the
            # former third argument `f_id` was never defined and raised NameError.
            X, y = prepare_dataset(hyperparameters, algorithm_performance)
            print("\nX, y: ")
            print(X.head())
            print(X.shape)
            print(y.head())
            print(y.shape)
            print(X.columns)

            # Calculate hyperparameter importance
            f, hyperparameters_importance = calculate_hyperparameter_importance(X, y)
            print("\n hyperparameters_importance: ")
            print(hyperparameters_importance.head())
            # Save under the configured output directory rather than a hard-coded one.
            hyperparameters_importance.to_csv(
                f"{save_path}/hyperparameter_importance_{algorithm_name}_{dimension}_{budget}_{target}.csv",
                index=False,
            )


algorithm_name: modCMA-ES

hyperparameters: 
                  elitist  mirrored  base_sampler  weights_option  \
configuration_id                                                    
0                       1         0             0               0   
1                       1         0             0               0   
2                       1         0             0               0   
3                       1         0             0               0   
4                       1         0             0               0   
...                   ...       ...           ...             ...   
319                     0         2             2               2   
320                     0         2             2               2   
321                     0         2             2               2   
322                     0         2             2               2   
323                     0         2             2               2   

                  local_restart  step_size_adaptation  


FileNotFoundError: [Errno 2] No such file or directory: 'Data/modCMA-ES/log/budget_250_conf_0_5D.csv'