In [1]:
# import libraries
# to handle paths 
import os

# for data manipulation
import numpy as np
import pandas as pd

# for iterating
from itertools import product

# cap DataFrame display at 10 columns (wider frames are shown truncated with "...")
pd.set_option('display.max_columns', 10)

In [2]:
# define directories
# root data folder; a plain string literal, since there is nothing to interpolate
data_folder = "Data"
# folder the raw CSV inputs of experiment 1 are read from
raw_folder = f"{data_folder}/raw/experiment_1"

# create directory
# folder the preprocessed CSVs are written to; exist_ok=True keeps the cell re-runnable
preprocessed_folder = f"{data_folder}/preprocessed/experiment_1"
os.makedirs(preprocessed_folder, exist_ok=True)

In [3]:
# experiment configuration
# benchmark suite names (lower-case) iterated over when writing the per-suite datasets
suites = ["cec2013", "cec2014", "cec2015", "cec2017", "bbob"]
# optimizer names iterated over when writing the per-algorithm datasets
algorithms = ["DE", "RealSpacePSO", "CMA"]
# budget value embedded in the output file names
budget = 100_000

# Target preparation

In [4]:
# load raw target data (performance_data.csv); the first CSV column becomes the row index
target = pd.read_csv(f"{raw_folder}/performance_data.csv", index_col=0)
# preview the first 15 rows
target.head(15)

Unnamed: 0,optimizer_name,optimizer_budget,benchmark_suite,fid,iid,f_opt,dimensions,function_evaluations,f_best
0,DE,100000,CEC2013,1,1,-1400.0,10,98327.0,-1399.999822
1,DE,100000,CEC2013,1,1,-1400.0,10,83418.0,-1397.565531
2,DE,100000,CEC2013,1,1,-1400.0,10,96597.0,-1399.529409
3,DE,100000,CEC2013,1,1,-1400.0,10,90473.0,-1399.842448
4,DE,100000,CEC2013,1,1,-1400.0,10,99745.0,-1399.685237
5,DE,100000,CEC2013,1,1,-1400.0,10,86820.0,-1399.804799
6,DE,100000,CEC2013,1,1,-1400.0,10,96680.0,-1399.729248
7,DE,100000,CEC2013,1,1,-1400.0,10,61177.0,-1395.332492
8,DE,100000,CEC2013,1,1,-1400.0,10,88784.0,-1399.912419
9,DE,100000,CEC2013,1,1,-1400.0,10,77293.0,-1399.190778


In [5]:
# (rows, columns) of the raw target table
target.shape

(19980, 9)

In [6]:
# column dtypes of the raw target table
target.dtypes

optimizer_name           object
optimizer_budget          int64
benchmark_suite          object
fid                       int64
iid                       int64
f_opt                   float64
dimensions                int64
function_evaluations    float64
f_best                  float64
dtype: object

In [7]:
# list the distinct values of selected columns of the raw target table
for column in ("optimizer_name", "optimizer_budget", "benchmark_suite", "dimensions"):
    # a single print emits the column name, its unique values and the blank separator lines
    print(column, target[column].unique(), "\n", sep="\n")

optimizer_name
['DE' 'RealSpacePSO' 'CMA']


optimizer_budget
[100000]


benchmark_suite
['CEC2013' 'CEC2014' 'CEC2015' 'CEC2017' 'BBOB']


dimensions
[10]




In [8]:
# precision: absolute difference between the f_opt and f_best columns of each row
target["precision"] = (target["f_opt"] - target["f_best"]).abs()
target.head(31)

Unnamed: 0,optimizer_name,optimizer_budget,benchmark_suite,fid,iid,f_opt,dimensions,function_evaluations,f_best,precision
0,DE,100000,CEC2013,1,1,-1400.0,10,98327.0,-1399.999822,0.000178
1,DE,100000,CEC2013,1,1,-1400.0,10,83418.0,-1397.565531,2.434469
2,DE,100000,CEC2013,1,1,-1400.0,10,96597.0,-1399.529409,0.470591
3,DE,100000,CEC2013,1,1,-1400.0,10,90473.0,-1399.842448,0.157552
4,DE,100000,CEC2013,1,1,-1400.0,10,99745.0,-1399.685237,0.314763
5,DE,100000,CEC2013,1,1,-1400.0,10,86820.0,-1399.804799,0.195201
6,DE,100000,CEC2013,1,1,-1400.0,10,96680.0,-1399.729248,0.270752
7,DE,100000,CEC2013,1,1,-1400.0,10,61177.0,-1395.332492,4.667508
8,DE,100000,CEC2013,1,1,-1400.0,10,88784.0,-1399.912419,0.087581
9,DE,100000,CEC2013,1,1,-1400.0,10,77293.0,-1399.190778,0.809222


In [9]:
# collapse repeated rows: median precision per (optimizer, suite, function id, instance id)
group_keys = ["optimizer_name", "benchmark_suite", "fid", "iid"]
median_precision = target.groupby(group_keys)["precision"].median()
# turn the group keys back into ordinary columns
target = median_precision.reset_index()
target.head()

Unnamed: 0,optimizer_name,benchmark_suite,fid,iid,precision
0,CMA,BBOB,1,1,1.675725e-08
1,CMA,BBOB,1,2,1.689847e-08
2,CMA,BBOB,1,3,1.986993e-08
3,CMA,BBOB,1,4,1.943977e-08
4,CMA,BBOB,1,5,2.301761e-08


In [10]:
# one row per (optimizer, suite, fid, iid) group after the median aggregation;
# the raw table had 19980 rows, i.e. 30 rows per group if the groups are balanced
target.shape

(666, 5)

In [11]:
# lower-case the suite names so they match the lower-case suite values of the feature table
target = target.assign(benchmark_suite=target["benchmark_suite"].str.lower())
target

Unnamed: 0,optimizer_name,benchmark_suite,fid,iid,precision
0,CMA,bbob,1,1,1.675725e-08
1,CMA,bbob,1,2,1.689847e-08
2,CMA,bbob,1,3,1.986993e-08
3,CMA,bbob,1,4,1.943977e-08
4,CMA,bbob,1,5,2.301761e-08
...,...,...,...,...,...
661,RealSpacePSO,cec2017,25,1,4.456342e+02
662,RealSpacePSO,cec2017,26,1,3.471494e+02
663,RealSpacePSO,cec2017,27,1,4.032556e+02
664,RealSpacePSO,cec2017,28,1,5.948495e+02


In [12]:
# column dtypes of the aggregated target table
target.dtypes

optimizer_name      object
benchmark_suite     object
fid                  int64
iid                  int64
precision          float64
dtype: object

In [13]:
# give the function / instance id columns the names shared with the feature table
key_renames = {"fid": "f_id", "iid": "i_id"}
target = target.rename(columns=key_renames)
target.head()

Unnamed: 0,optimizer_name,benchmark_suite,f_id,i_id,precision
0,CMA,bbob,1,1,1.675725e-08
1,CMA,bbob,1,2,1.689847e-08
2,CMA,bbob,1,3,1.986993e-08
3,CMA,bbob,1,4,1.943977e-08
4,CMA,bbob,1,5,2.301761e-08


In [14]:
# number of missing values in each column of the target table
target.isna().sum()

optimizer_name     0
benchmark_suite    0
f_id               0
i_id               0
precision          0
dtype: int64

In [14]:
# save the aggregated target table as CSV, without writing the row index
target.to_csv(f"{preprocessed_folder}/target_preprocessed.csv", index=False)

# Features preparation

In [15]:
# load raw feature data (ela.csv); the first CSV column becomes the row index
features = pd.read_csv(f"{raw_folder}/ela.csv", index_col=0)
features.head()

Unnamed: 0,run_id,suite,dimensions,fid,iid,...,pca.expl_var.cor_init,pca.expl_var_PC1.cov_x,pca.expl_var_PC1.cor_x,pca.expl_var_PC1.cov_init,pca.expl_var_PC1.cor_init
1,1,bbob,10,1,1,...,0.9091,0.1234,0.1234,0.9455,0.1621
2,2,bbob,10,1,1,...,0.9091,0.1244,0.1244,0.9436,0.1527
3,3,bbob,10,1,1,...,0.9091,0.1183,0.1183,0.9478,0.1586
4,4,bbob,10,1,1,...,0.9091,0.1172,0.1172,0.949,0.1607
5,5,bbob,10,1,1,...,0.8182,0.1188,0.1188,0.9491,0.1598


In [16]:
# (rows, columns) of the raw feature table
features.shape

(10170, 70)

In [17]:
# column dtypes of the raw feature table
features.dtypes

run_id                         int64
suite                         object
dimensions                     int64
fid                            int64
iid                            int64
                              ...   
pca.expl_var.cor_init        float64
pca.expl_var_PC1.cov_x       float64
pca.expl_var_PC1.cor_x       float64
pca.expl_var_PC1.cov_init    float64
pca.expl_var_PC1.cor_init    float64
Length: 70, dtype: object

In [18]:
# list the distinct values of the suite and dimensions columns of the raw feature table
for column in ("suite", "dimensions"):
    # a single print emits the column name, its unique values and the blank separator lines
    print(column, features[column].unique(), "\n", sep="\n")

suite
['bbob' 'cec2013' 'cec2014' 'cec2015' 'cec2017' 'all']


dimensions
[10]




In [19]:
# drop run_id, cluster and dimensions (dimensions holds a single value, 10, in this data)
features = features.drop(["run_id", "cluster", "dimensions"], axis=1)
features.head()

Unnamed: 0,suite,fid,iid,disp.ratio_mean_02,disp.ratio_mean_05,...,pca.expl_var.cor_init,pca.expl_var_PC1.cov_x,pca.expl_var_PC1.cor_x,pca.expl_var_PC1.cov_init,pca.expl_var_PC1.cor_init
1,bbob,1,1,0.6454,0.705,...,0.9091,0.1234,0.1234,0.9455,0.1621
2,bbob,1,1,0.6132,0.6832,...,0.9091,0.1244,0.1244,0.9436,0.1527
3,bbob,1,1,0.6083,0.6779,...,0.9091,0.1183,0.1183,0.9478,0.1586
4,bbob,1,1,0.645,0.7061,...,0.9091,0.1172,0.1172,0.949,0.1607
5,bbob,1,1,0.6263,0.6872,...,0.8182,0.1188,0.1188,0.9491,0.1598


In [20]:
# remove the rows whose suite is "all", keeping only the individual benchmark suites
features = features[features["suite"] != "all"]
features.shape

(6630, 67)

In [21]:
# names of the feature columns that contain at least one NaN
features.columns[features.isna().any()].tolist()

['ic.eps.s']

In [22]:
# distinct (suite, fid, iid) combinations that have at least one row containing a NaN
features[features.isna().any(axis=1)][["suite", "fid", "iid"]].drop_duplicates()

Unnamed: 0,suite,fid,iid
3661,cec2013,3,1
3781,cec2013,7,1


In [23]:
# (rows, columns) of the subset of rows containing at least one NaN
features[features.isna().any(axis=1)].shape

(60, 67)

In [24]:
# discard every row that has a NaN in any column
features = features.dropna(axis=0, how="any")

In [25]:
# (rows, columns) of the feature table after removing the rows with NaN
features.shape

(6570, 67)

In [26]:
# collapse repeated rows: per-column median for each (suite, fid, iid) combination
problem_keys = ["suite", "fid", "iid"]
features = features.groupby(problem_keys).median().reset_index()
features.head()

Unnamed: 0,suite,fid,iid,disp.ratio_mean_02,disp.ratio_mean_05,...,pca.expl_var.cor_init,pca.expl_var_PC1.cov_x,pca.expl_var_PC1.cor_x,pca.expl_var_PC1.cov_init,pca.expl_var_PC1.cor_init
0,bbob,1,1,0.62485,0.69035,...,0.8182,0.1202,0.1202,0.94845,0.16085
1,bbob,1,2,0.6648,0.7289,...,0.8182,0.11955,0.11955,0.9792,0.1754
2,bbob,1,3,0.6622,0.7297,...,0.8182,0.12045,0.12045,0.97665,0.17465
3,bbob,1,4,0.64655,0.71385,...,0.8182,0.11915,0.11915,0.9667,0.17005
4,bbob,1,5,0.63995,0.7086,...,0.8182,0.12035,0.12035,0.96695,0.1698


In [27]:
# column dtypes of the aggregated feature table
features.dtypes

suite                         object
fid                            int64
iid                            int64
disp.ratio_mean_02           float64
disp.ratio_mean_05           float64
                              ...   
pca.expl_var.cor_init        float64
pca.expl_var_PC1.cov_x       float64
pca.expl_var_PC1.cor_x       float64
pca.expl_var_PC1.cov_init    float64
pca.expl_var_PC1.cor_init    float64
Length: 67, dtype: object

In [28]:
# rename the key columns to the names used in the target table, so both can be merged on them
feature_key_renames = {"suite": "benchmark_suite", "fid": "f_id", "iid": "i_id"}
features = features.rename(columns=feature_key_renames)
features.head()

Unnamed: 0,benchmark_suite,f_id,i_id,disp.ratio_mean_02,disp.ratio_mean_05,...,pca.expl_var.cor_init,pca.expl_var_PC1.cov_x,pca.expl_var_PC1.cor_x,pca.expl_var_PC1.cov_init,pca.expl_var_PC1.cor_init
0,bbob,1,1,0.62485,0.69035,...,0.8182,0.1202,0.1202,0.94845,0.16085
1,bbob,1,2,0.6648,0.7289,...,0.8182,0.11955,0.11955,0.9792,0.1754
2,bbob,1,3,0.6622,0.7297,...,0.8182,0.12045,0.12045,0.97665,0.17465
3,bbob,1,4,0.64655,0.71385,...,0.8182,0.11915,0.11915,0.9667,0.17005
4,bbob,1,5,0.63995,0.7086,...,0.8182,0.12035,0.12035,0.96695,0.1698


In [29]:
# (rows, columns) of the aggregated feature table: one row per (benchmark_suite, f_id, i_id)
features.shape

(219, 67)

In [30]:
# save the aggregated feature table as CSV, without writing the row index
features.to_csv(f"{preprocessed_folder}/ela_preprocessed.csv", index=False)

# Merge features and target

In [31]:
# merge on benchmark_suite, f_id, i_id
# Inner join: target rows without a matching feature row (and feature rows without a
# matching target row) are dropped from the result.
# validate="many_to_one" makes pandas raise a MergeError if the feature table ever holds
# duplicate keys, which would otherwise silently duplicate target rows in the dataset.
dataset = pd.merge(
    target,
    features,
    how="inner",
    on=["benchmark_suite", "f_id", "i_id"],
    validate="many_to_one",
)
dataset.head()

Unnamed: 0,optimizer_name,benchmark_suite,f_id,i_id,precision,...,pca.expl_var.cor_init,pca.expl_var_PC1.cov_x,pca.expl_var_PC1.cor_x,pca.expl_var_PC1.cov_init,pca.expl_var_PC1.cor_init
0,CMA,bbob,1,1,1.675725e-08,...,0.8182,0.1202,0.1202,0.94845,0.16085
1,DE,bbob,1,1,0.05714724,...,0.8182,0.1202,0.1202,0.94845,0.16085
2,RealSpacePSO,bbob,1,1,1.575405e-08,...,0.8182,0.1202,0.1202,0.94845,0.16085
3,CMA,bbob,1,2,1.689847e-08,...,0.8182,0.11955,0.11955,0.9792,0.1754
4,DE,bbob,1,2,0.03070117,...,0.8182,0.11955,0.11955,0.9792,0.1754


In [32]:
# (rows, columns) of the merged dataset
dataset.shape

(654, 69)

In [33]:
# column dtypes of the merged dataset
dataset.dtypes

optimizer_name                object
benchmark_suite               object
f_id                           int64
i_id                           int64
precision                    float64
                              ...   
pca.expl_var.cor_init        float64
pca.expl_var_PC1.cov_x       float64
pca.expl_var_PC1.cor_x       float64
pca.expl_var_PC1.cov_init    float64
pca.expl_var_PC1.cor_init    float64
Length: 69, dtype: object

In [34]:
# rows of the merged dataset that contain any NaN (an empty frame means there are none)
dataset[dataset.isna().any(axis=1)]

Unnamed: 0,optimizer_name,benchmark_suite,f_id,i_id,precision,...,pca.expl_var.cor_init,pca.expl_var_PC1.cov_x,pca.expl_var_PC1.cor_x,pca.expl_var_PC1.cov_init,pca.expl_var_PC1.cor_init


In [35]:
# list the optimizers and benchmark suites that are present in the merged dataset
for column in ("optimizer_name", "benchmark_suite"):
    # a single print emits the column name, its unique values and the blank separator lines
    print(column, dataset[column].unique(), "\n", sep="\n")

optimizer_name
['CMA' 'DE' 'RealSpacePSO']


benchmark_suite
['bbob' 'cec2013' 'cec2014' 'cec2015' 'cec2017']




# Create str datasets

In [36]:
# write one CSV per (algorithm, suite) pair containing only the rows of that pair
for algorithm_name, benchmark_suite in product(algorithms, suites):
    print("\n")
    print(f"algorithm_name: {algorithm_name}, benchmark_suite: {benchmark_suite}")

    # boolean masks selecting the current optimizer and the current benchmark suite
    is_algorithm = dataset["optimizer_name"] == algorithm_name
    is_suite = dataset["benchmark_suite"] == benchmark_suite
    subset = dataset[is_algorithm & is_suite]

    # both selector columns are constant within the subset, so they are removed
    dataset_str = subset.drop(columns=["optimizer_name", "benchmark_suite"])

    # show the first row and the size of what is about to be written
    print(dataset_str.head(1))
    print(dataset_str.shape)

    # save under a file name that encodes suite, algorithm and budget
    title = f"suite={benchmark_suite}_algorithm_name={algorithm_name}_budget={budget}_dataset.csv"
    dataset_str.to_csv(f"{preprocessed_folder}/{title}", index=False)



algorithm_name: DE, benchmark_suite: cec2013
     f_id  i_id  precision  disp.ratio_mean_02  disp.ratio_mean_05  ...  \
361     1     1   0.153794              0.6411             0.70705  ...   

     pca.expl_var.cor_init  pca.expl_var_PC1.cov_x  pca.expl_var_PC1.cor_x  \
361                 0.8182                 0.12065                 0.12065   

     pca.expl_var_PC1.cov_init  pca.expl_var_PC1.cor_init  
361                     0.9999                    0.16975  

[1 rows x 67 columns]
(25, 67)


algorithm_name: DE, benchmark_suite: cec2014
     f_id  i_id  precision  disp.ratio_mean_02  disp.ratio_mean_05  ...  \
436     1     1   8.093189               0.855             0.87865  ...   

     pca.expl_var.cor_init  pca.expl_var_PC1.cov_x  pca.expl_var_PC1.cor_x  \
436                 0.8182                  0.1212                  0.1212   

     pca.expl_var_PC1.cov_init  pca.expl_var_PC1.cor_init  
436                        1.0                    0.16475  

[1 rows x 67 colu