In [1]:
# import libraries 
# to handle paths 
import os

# for data manipulation
import numpy as np
import pandas as pd

# for iterating
from itertools import product

In [2]:
# root of the data tree and the raw inputs of experiment 2
data_folder = "Data"
raw_folder = f"{data_folder}/raw/experiment_2"

# preprocessed data of experiment 2 and the folder the BS6 datasets are saved to
preprocessed_folder = f"{data_folder}/preprocessed/experiment_2"
save_path = f"{data_folder}/preprocessed/experiment_2-BS6"

# create both directories; directories that already exist are left as they are
for folder in (preprocessed_folder, save_path):
    os.makedirs(folder, exist_ok=True)

In [3]:
# experiment configuration

# full optimizer names mapped to the short names used in the output file names
algorithms_map = {
    "DiagonalCMA": "diagCMA",
    "DifferentialEvolution": "DE",
    "PSO": "PSO",
}
algorithms = list(algorithms_map)

# each seed is mapped to a suite label: 0 -> BS1, 100 -> BS2, ..., 400 -> BS5
seeds = list(range(0, 500, 100))
suites_map = {seed: f"BS{position}" for position, seed in enumerate(seeds, start=1)}

# dimension and budgets
dim = 5
budgets = [10_000]

In [4]:
# load the feature table; the file name is built from `dim` so that it follows
# the configured dimension instead of repeating a hard-coded 5
features = pd.read_csv(f"{raw_folder}/dim_{dim}_clusters.csv")
features.head()

Unnamed: 0,f0,f1,dimension,alpha,ela_distr.skewness,ela_meta.lin_simple.adjr2,ela_meta.lin_simple.intercept,ela_meta.lin_simple.coef_max,ela_meta.quad_simple.adjr2,ela_meta.quad_simple.coef.min_by_max,ela_level.lda_mmce_25,ela_level.qda_mmce_25,ela_level.lda_qda_25,ic.eps.s,ic.eps.ratio,disp.ratio_mean_02,entropy.y,entropy.sig_dth_order,cluster
0,bbob_f004_i01_d05,shifted-bbob_f003_i01_d05,5,0.895461,0.025004,0.000146,-0.971645,0.165889,0.016886,0.000421,0.016037,0.018428,0.019366,0.054461,0.036642,0.01143,0.148543,-0.001039,3
1,bbob_f004_i01_d05,shifted-bbob_f003_i01_d05,5,0.868363,0.069912,0.00047,-0.856249,0.357447,0.035296,0.000819,0.035375,0.040097,0.042442,0.123517,0.080178,0.025629,0.325461,-0.002638,3
2,bbob_f003_i01_d05,shifted-bbob_f009_i01_d05,5,0.001103,0.014462,0.005248,0.65366,0.753491,0.007445,0.000443,0.006992,0.007661,0.00796,0.02152,0.01471,0.004392,0.061782,-0.000631,1
3,bbob_f003_i01_d05,shifted-bbob_f023_i01_d05,5,0.78541,0.055693,0.054763,-0.28807,0.796018,0.089175,0.002576,0.074781,0.081935,0.08682,0.133785,0.082366,0.052081,0.471219,-0.00501,0
4,bbob_f017_i01_d05,shifted-bbob_f003_i01_d05,5,0.345955,0.043652,0.018836,-0.720596,0.632389,0.037757,0.001061,0.034868,0.03908,0.039672,0.08301,0.049183,0.020635,0.25114,-0.003837,3


In [5]:
# (rows, columns) of the feature table as loaded
features.shape

(236, 19)

In [6]:
# list the distinct values found in the "dimension" and "cluster" columns
for column in ("dimension", "cluster"):
    print(column, features[column].unique(), "\n", sep="\n")

dimension
[5]


cluster
[3 1 0 2]




In [7]:
# dtype of every column in the feature table
features.dtypes

f0                                       object
f1                                       object
dimension                                 int64
alpha                                   float64
ela_distr.skewness                      float64
ela_meta.lin_simple.adjr2               float64
ela_meta.lin_simple.intercept           float64
ela_meta.lin_simple.coef_max            float64
ela_meta.quad_simple.adjr2              float64
ela_meta.quad_simple.coef.min_by_max    float64
ela_level.lda_mmce_25                   float64
ela_level.qda_mmce_25                   float64
ela_level.lda_qda_25                    float64
ic.eps.s                                float64
ic.eps.ratio                            float64
disp.ratio_mean_02                      float64
entropy.y                               float64
entropy.sig_dth_order                   float64
cluster                                   int64
dtype: object

In [8]:
# store alpha as float32; target["alpha"] gets the same cast and alpha is
# later used as one of the merge keys
features = features.astype({"alpha": np.float32})

In [9]:
# keep only the rows whose "cluster" value is 3
in_cluster_3 = features["cluster"].eq(3)
features = features.loc[in_cluster_3]
features.head()

Unnamed: 0,f0,f1,dimension,alpha,ela_distr.skewness,ela_meta.lin_simple.adjr2,ela_meta.lin_simple.intercept,ela_meta.lin_simple.coef_max,ela_meta.quad_simple.adjr2,ela_meta.quad_simple.coef.min_by_max,ela_level.lda_mmce_25,ela_level.qda_mmce_25,ela_level.lda_qda_25,ic.eps.s,ic.eps.ratio,disp.ratio_mean_02,entropy.y,entropy.sig_dth_order,cluster
0,bbob_f004_i01_d05,shifted-bbob_f003_i01_d05,5,0.895461,0.025004,0.000146,-0.971645,0.165889,0.016886,0.000421,0.016037,0.018428,0.019366,0.054461,0.036642,0.01143,0.148543,-0.001039,3
1,bbob_f004_i01_d05,shifted-bbob_f003_i01_d05,5,0.868363,0.069912,0.00047,-0.856249,0.357447,0.035296,0.000819,0.035375,0.040097,0.042442,0.123517,0.080178,0.025629,0.325461,-0.002638,3
4,bbob_f017_i01_d05,shifted-bbob_f003_i01_d05,5,0.345955,0.043652,0.018836,-0.720596,0.632389,0.037757,0.001061,0.034868,0.03908,0.039672,0.08301,0.049183,0.020635,0.25114,-0.003837,3
7,bbob_f010_i01_d05,shifted-bbob_f022_i01_d05,5,0.999951,0.00654,0.002294,-0.782031,0.622436,0.002573,3.6e-05,0.002558,0.003403,0.002765,0.010115,0.006946,0.002252,0.027659,-0.000153,3
9,bbob_f003_i01_d05,shifted-bbob_f005_i01_d05,5,0.873739,0.049904,0.071131,-0.437716,0.527886,0.13451,0.007059,0.109636,0.12213,0.125507,0.165839,0.095865,0.077062,0.647035,-0.008662,3


In [10]:
# "dimension" and "cluster" are not needed in the merged dataset
features = features.drop(columns=["dimension", "cluster"])

In [11]:
# (rows, columns) left after the cluster filter and the column drop
features.shape

(62, 17)

In [12]:
# read the target table from the preprocessed folder
target_path = f"{preprocessed_folder}/target_affine.csv"
target = pd.read_csv(target_path)
target.head()

Unnamed: 0,optimizer_name,budget,f0,f1,alpha,precision
0,DiagonalCMA,100,bbob_f001_i01_d05,shifted-bbob_f003_i01_d05,0.701592,10.260378
1,DiagonalCMA,100,bbob_f001_i01_d05,shifted-bbob_f019_i01_d05,0.724414,2.14759
2,DiagonalCMA,100,bbob_f001_i01_d05,shifted-bbob_f019_i01_d05,0.792491,1.587453
3,DiagonalCMA,100,bbob_f001_i01_d05,shifted-bbob_f019_i01_d05,0.803477,1.773336
4,DiagonalCMA,100,bbob_f001_i01_d05,shifted-bbob_f019_i01_d05,0.807921,1.784482


In [13]:
# (rows, columns) of the target table as loaded
target.shape

(4956, 6)

In [14]:
# list the distinct values found in the "optimizer_name" and "budget" columns
for column in ("optimizer_name", "budget"):
    print(column, target[column].unique(), "\n", sep="\n")

optimizer_name
['DiagonalCMA' 'DifferentialEvolution' 'PSO']


budget
[  100   250   500  1000  2000  5000 10000]




In [15]:
# dtype of every column in the target table
target.dtypes

optimizer_name     object
budget              int64
f0                 object
f1                 object
alpha             float64
precision         float64
dtype: object

In [16]:
# store alpha as float32, the same dtype features["alpha"] is cast to, because
# alpha is used as one of the merge keys
target = target.astype({"alpha": np.float32})

# Merge features and target

In [17]:
# build one merged dataset per (algorithm, budget) pair and write it to disk
for algorithm_name, budget in product(algorithms, budgets):
    print(f"algorithm_name: {algorithm_name}, budget: {budget}")

    # keep only the target rows of the current algorithm at the current budget
    target_temp = target[(target["budget"]==budget)&(target["optimizer_name"]==algorithm_name)]
    print(target_temp.shape)

    # attach the target to the features; the join keys are "f0", "f1" and "alpha"
    dataset = pd.merge(features, target_temp, how='inner', on=["f0", "f1", "alpha"])

    # an inner join on a float key drops rows whose keys do not match exactly
    # and repeats rows whose keys are duplicated; report a differing row count
    # instead of letting it pass unnoticed (non-fatal on purpose)
    if len(dataset) != len(features):
        print(f"WARNING: merged dataset has {len(dataset)} rows, but features has {len(features)} rows")

    # drop unnecessary columns: both hold a single value after the selection above
    dataset = dataset.drop(["optimizer_name", "budget"], axis=1)

    # quick look at the result: first row, missing values, dtypes and shape
    print("\n")
    print(dataset.head(1))

    print(dataset.isna().sum())
    print(dataset.dtypes)
    print(dataset.shape)

    # the file name uses the short algorithm name from algorithms_map
    dataset.to_csv(f"{save_path}/suite=BS6_algorithm_name={algorithms_map[algorithm_name]}_budget={budget}_dataset.csv", index=False)

algorithm_name: DiagonalCMA, budget: 10000
(236, 6)


                  f0                         f1     alpha  ela_distr.skewness  \
0  bbob_f004_i01_d05  shifted-bbob_f003_i01_d05  0.895461            0.025004   

   ela_meta.lin_simple.adjr2  ela_meta.lin_simple.intercept  \
0                   0.000146                      -0.971645   

   ela_meta.lin_simple.coef_max  ela_meta.quad_simple.adjr2  \
0                      0.165889                    0.016886   

   ela_meta.quad_simple.coef.min_by_max  ela_level.lda_mmce_25  \
0                              0.000421               0.016037   

   ela_level.qda_mmce_25  ela_level.lda_qda_25  ic.eps.s  ic.eps.ratio  \
0               0.018428              0.019366  0.054461      0.036642   

   disp.ratio_mean_02  entropy.y  entropy.sig_dth_order  precision  
0             0.01143   0.148543              -0.001039   6.964708  
f0                                      0
f1                                      0
alpha                    

In [18]:
# `dataset` is the frame left over from the last iteration of the merge loop
dataset.head()

Unnamed: 0,f0,f1,alpha,ela_distr.skewness,ela_meta.lin_simple.adjr2,ela_meta.lin_simple.intercept,ela_meta.lin_simple.coef_max,ela_meta.quad_simple.adjr2,ela_meta.quad_simple.coef.min_by_max,ela_level.lda_mmce_25,ela_level.qda_mmce_25,ela_level.lda_qda_25,ic.eps.s,ic.eps.ratio,disp.ratio_mean_02,entropy.y,entropy.sig_dth_order,precision
0,bbob_f004_i01_d05,shifted-bbob_f003_i01_d05,0.895461,0.025004,0.000146,-0.971645,0.165889,0.016886,0.000421,0.016037,0.018428,0.019366,0.054461,0.036642,0.01143,0.148543,-0.001039,43.258314
1,bbob_f004_i01_d05,shifted-bbob_f003_i01_d05,0.868363,0.069912,0.00047,-0.856249,0.357447,0.035296,0.000819,0.035375,0.040097,0.042442,0.123517,0.080178,0.025629,0.325461,-0.002638,53.656167
2,bbob_f017_i01_d05,shifted-bbob_f003_i01_d05,0.345955,0.043652,0.018836,-0.720596,0.632389,0.037757,0.001061,0.034868,0.03908,0.039672,0.08301,0.049183,0.020635,0.25114,-0.003837,45.609048
3,bbob_f010_i01_d05,shifted-bbob_f022_i01_d05,0.999951,0.00654,0.002294,-0.782031,0.622436,0.002573,3.6e-05,0.002558,0.003403,0.002765,0.010115,0.006946,0.002252,0.027659,-0.000153,40260.521364
4,bbob_f003_i01_d05,shifted-bbob_f005_i01_d05,0.873739,0.049904,0.071131,-0.437716,0.527886,0.13451,0.007059,0.109636,0.12213,0.125507,0.165839,0.095865,0.077062,0.647035,-0.008662,49.591396


In [19]:
# (rows, columns) of the dataset left over from the last loop iteration
dataset.shape

(62, 18)

In [20]:
# column dtypes of the dataset left over from the last loop iteration
dataset.dtypes

f0                                       object
f1                                       object
alpha                                   float32
ela_distr.skewness                      float64
ela_meta.lin_simple.adjr2               float64
ela_meta.lin_simple.intercept           float64
ela_meta.lin_simple.coef_max            float64
ela_meta.quad_simple.adjr2              float64
ela_meta.quad_simple.coef.min_by_max    float64
ela_level.lda_mmce_25                   float64
ela_level.qda_mmce_25                   float64
ela_level.lda_qda_25                    float64
ic.eps.s                                float64
ic.eps.ratio                            float64
disp.ratio_mean_02                      float64
entropy.y                               float64
entropy.sig_dth_order                   float64
precision                               float64
dtype: object