Analysis of the impact of sample size on exploratory landscape analysis (ELA) features

In [None]:
import cocoex
from pflacco.classical_ela_features import *
from pflacco.sampling import create_initial_sample

_features = []
# Iterate over the 24 noiseless single-objective BBOB functions in dimensions
# 2, 3 and 5, restricted to the first five instances of each function.
suite = cocoex.Suite("bbob", "instances:1-5", "function_indices:1-24 dimensions:2,3,5")
for problem in suite:
    dim = problem.dimension
    fid = problem.id_function
    iid = problem.id_instance
    # One feature row per sample size n. (An earlier variant of this loop
    # varied `sample_coefficient` instead of the absolute sample size.)
    for n in [10, 20, 50]:
        # Draw an 'lhs' sample of n points inside [-5, 5]^dim and evaluate the
        # problem on every row.
        X = create_initial_sample(dim, n=n, sample_type='lhs', lower_bound=-5, upper_bound=5)
        y = X.apply(lambda row: problem(row), axis=1)

        # ELA feature sets; the ela_level set is intentionally not computed.
        ela_meta = calculate_ela_meta(X, y)
        ela_distr = calculate_ela_distribution(X, y)
        # Neighbourhood size for NBC: 5% of the sample, but never below 2.
        fast_k = max(math.ceil(0.05 * X.shape[0]), 2)
        nbc = calculate_nbc(X, y, fast_k=fast_k)
        disp = calculate_dispersion(X, y)
        ic = calculate_information_content(X, y, seed=100)
        pca = calculate_pca(X, y)

        # Single-row frame: all features followed by the identifying columns.
        data = pd.DataFrame(
            {
                **ic,
                **ela_meta,
                **ela_distr,
                **nbc,
                **disp,
                **pca,
                'fid': fid,
                'dim': dim,
                'iid': iid,
                'n': n,
            },
            index=[0],
        )
        _features.append(data)

# Stack all rows into one table with a fresh 0..N-1 index.
features_test = pd.concat(_features).reset_index(drop=True)

In [None]:
# Persist the feature table to disk as a gzip-compressed parquet file.
# NOTE(review): the preceding cell stores its result in `features_test`, not
# `features` — confirm which frame is meant to be written here.
features.to_parquet('df.lhs.gzip', compression='gzip')

In [None]:
import cocoex
from pflacco.classical_ela_features import *
from pflacco.sampling import create_initial_sample
# Reload the feature table from the parquet file 'df.lhs.gzip'.
features = pd.read_parquet('df.lhs.gzip')

In [None]:
from scipy.optimize import minimize
import cma

import cocoex
from pflacco.classical_ela_features import *
from pflacco.sampling import create_initial_sample

# scipy methods are referenced by name; CMA-ES is referenced by its class.
optimizers = ['BFGS', 'L-BFGS-B', 'SLSQP', cma.CMAEvolutionStrategy]
func_evals = []
# Run every optimizer on the 24 noiseless single-objective BBOB functions in
# dimensions 2, 3, 5 and 10 for the first five instances.
suite = cocoex.Suite("bbob", "instances:1-5", "function_indices:1-24 dimensions:2,3,5,10")
for problem in suite:
    dim = problem.dimension
    fid = problem.id_function
    iid = problem.id_instance

    for optim in optimizers:
        # Five runs per optimizer, each from a different proposed start point.
        for i in range(5):
            x0 = problem.initial_solution_proposal(i)

            # isinstance (not identity with the `str` type) is what selects
            # the scipy methods: `'BFGS' is str` is always False.
            if isinstance(optim, str):
                sol = minimize(problem, x0, method=optim)
                func_evals.append(pd.DataFrame({'fid': fid, 'dim': dim, 'iid': iid, 'optim': optim, **dict(enumerate(x0)), 'fun': sol.fun, 'nfev': sol.nfev, 'status': sol.status}, index=[0]))
            elif optim is cma.CMAEvolutionStrategy:
                x, es = cma.fmin2(problem, x0, 0.5)
                sol = es.result._asdict()
                # CMA-ES rows carry no 'status' column; it is left missing.
                func_evals.append(pd.DataFrame({'fid': fid, 'dim': dim, 'iid': iid, 'optim': 'CMA-ES', **dict(enumerate(x0)), 'fun': sol['fbest'], 'nfev': sol['evaluations']}, index=[0]))
# Stack the per-run rows collected above (not the unrelated `features` table).
func_evals = pd.concat(func_evals).reset_index(drop=True)

In [None]:
# The start-point coordinates were stored under integer column labels
# (dict(enumerate(x0)) when the runs were collected); cast every label to str
# before writing — presumably because the parquet writer rejects non-string
# column names (confirm against the pandas documentation).
func_evals.columns = func_evals.columns.astype(str)
# Persist the optimizer runs as a gzip-compressed parquet file.
func_evals.to_parquet('func_evals.gzip', compression='gzip')

In [None]:
# Reload the optimizer runs from the parquet file 'func_evals.gzip'.
func_evals = pd.read_parquet('func_evals.gzip')

Read in the benchmark and, for each (dim, fid) pair, keep the optimizers whose cost lies within a relative error margin of the best one

In [None]:
# Cost table with one column per optimizer, indexed by (dim, fid).
benchmark = (
    pd.read_csv('imputed_relERT_costs_50d.csv')
    .drop(columns=['repetition'])
    .set_index(['dim', 'fid'])
)
# Relative tolerance around the per-row best cost.
deviation = 0.01
mins = (1 + deviation) * benchmark.min(axis=1)
# Blank out every cost above the tolerance, reshape to long format (one row
# per optimizer still within tolerance) and drop the blanked entries.
benchmark = (
    benchmark.where(benchmark.le(mins, axis='index'))
    .melt(value_vars=benchmark.columns, var_name='optimizer', value_name='niter', ignore_index=False)
    .dropna()
    .sort_index()
)
# Position in this list == integer code that replaces the optimizer name below.
cat_mapping = benchmark['optimizer'].astype('category').cat.categories.to_list()
benchmark['optimizer'] = benchmark['optimizer'].astype('category').cat.codes


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
# Bookkeeping columns that identify a sample rather than describe the
# landscape. Depending on how a table was generated it carries either
# 'sample_coefficient' or 'n' as its sample-size column.
_ID_COLUMNS = ['fid', 'dim', 'iid', 'sample_coefficient', 'n']


def _training_data(frame, dim=2):
    """Build the (X, y) training pair for one problem dimension.

    Rows of *frame* with ``frame.dim == dim`` are kept; columns containing
    missing values, runtime-cost columns (``costs_runtime``), ``ela_level``
    columns and the identifying columns are removed. Positive infinity is
    replaced by a large finite constant.

    Returns:
        X: DataFrame of the remaining feature columns.
        y: numpy array of zero-based function ids (``fid - 1``).
    """
    _slice = frame.loc[frame.dim == dim, :].dropna(axis=1)
    _slice = _slice.loc[:, ~_slice.columns.str.contains('costs_runtime')]
    levels = _slice.columns[_slice.columns.str.contains('ela_level')].to_list()
    # errors='ignore': not every table has both sample-size columns, and a
    # hard-coded 'sample_coefficient' raises KeyError on tables keyed by 'n'.
    X = _slice.drop(columns=_ID_COLUMNS + levels, errors='ignore').replace([np.inf], 99999999)
    y = _slice.loc[:, 'fid'].to_numpy() - 1
    return X, y


# Classifier fitted on the stored feature table.
X, y = _training_data(features)
clf = GradientBoostingClassifier().fit(X, y)

##Eval
# Second classifier fitted on the freshly generated feature table.
X, y = _training_data(features_test)
clf_test = GradientBoostingClassifier().fit(X, y)



In [None]:
import gym
import numpy as np
from gym import spaces
from numpy.random import default_rng
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback

class ClassifierEnv(gym.Env):
    """Classifier environment for ELA-based automated algorithm selection (AAS).

    The agent works on one BBOB problem at a time and chooses between two
    actions:

    * ``0`` - draw 10 additional uniform random points in ``[-5, 5]^dim``,
      recompute the ELA features and receive a reward of ``-1``.
    * ``1`` - stop sampling, let the classifier predict from the current ELA
      features and end the episode. The reward is computed from the ``niter``
      value stored in ``truth`` for this problem minus the number of sampled
      points (see :meth:`step`).

    Observations are ``float32`` arrays of shape ``(1, n_features)`` as
    produced by :meth:`calc_ela`; ``observation_space`` declares 46 features.

    Args:
        classifier: fitted model exposing ``predict``.
        truth: DataFrame indexed by ``(dim, fid)`` with the columns
            ``optimizer`` and ``niter``.
        fid: function-index selection used by :meth:`reset` by default.
        iid: instance selection used by :meth:`reset` by default.
        dim: problem dimension.
        problem: name of the cocoex suite to draw problems from.
    """

    metadata = {"render_modes": ["human"], "render_fps": 30}

    def __init__(self, classifier, truth, fid='1-24', iid='1-5', dim=2, problem='bbob'):
        super().__init__()
        # The suite is created lazily in reset().
        self.suite = None
        # Kept under a separate name because `self.problem` holds the current
        # problem instance once reset() has run.
        self.suite_name = problem
        self.classifier = classifier
        self.dim = dim
        self.truth = truth
        self.fid = fid
        self.iid = iid
        self.rng = default_rng()

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(1, 46), dtype=np.float32)

    @staticmethod
    def sigmoid(x):
        """Logistic function of ``0.1 * x``."""
        return 1 / (1 + math.exp(-0.1 * x))

    @staticmethod
    def softplus(x):
        """Softplus shifted down by one: ``log(1 + exp(x)) - 1``."""
        # math has no `ln`; log1p(exp(x)) is the natural log of 1 + exp(x).
        return math.log1p(math.exp(x)) - 1.0

    @staticmethod
    def elu(x, alpha=1.0):
        """Exponential linear unit: ``x`` if positive, else ``alpha * (exp(x) - 1)``."""
        return x if x > 0.0 else alpha * (math.exp(x) - 1.0)

    def step(self, action):
        """Apply *action* and return ``(observation, reward, done, info)``.

        Raises:
            ValueError: if *action* is neither 0 nor 1.
        """
        if action == 0:
            # Extend the sample by 10 uniform random points in [-5, 5]^dim.
            cols = [f"x{i}" for i in range(self.dim)]
            X = pd.DataFrame(self.rng.random((10, self.dim)) * 10 - 5, columns=cols)
            self.X = pd.concat([self.X, X], ignore_index=True)

            observation = self.calc_ela(self.X)
            reward = -1.0
            done = False
        elif action == 1:
            # Stop sampling and let the classifier decide.
            observation = self.calc_ela(self.X)
            predict = self.classifier.predict(observation)[0]
            fid = self.problem.id_function
            dim = self.problem.dimension

            # Indexing with a list of tuples always yields a DataFrame, so the
            # lookup behaves the same whether `truth` has one or several rows
            # for this (dim, fid).
            rows = self.truth.loc[[(dim, fid)]]
            niters = rows['niter'].to_numpy()
            matches = rows['optimizer'].to_numpy() == predict

            if matches.any():
                # Prediction matches a stored entry: reward from its niter
                # minus the number of points sampled so far.
                niter = niters[matches][0]
                reward = self.elu(niter - self.X.shape[0])
            else:
                # No match: negated reward based on the smallest stored niter.
                niter = niters.min()
                reward = -self.elu(niter - self.X.shape[0])

            done = True
        else:
            raise ValueError(
                f"Received invalid action={action} which is not part of the action space"
            )

        info = {}

        return observation, reward, done, info

    def calc_ela(self, X):
        """Evaluate the current problem on *X* and return its ELA feature vector.

        Returns:
            ``float32`` array of shape ``(1, n_features)`` holding the
            information-content, meta-model, distribution, NBC, dispersion and
            PCA features (in that order) without the ``costs_runtime``
            entries. NaN entries are replaced by ``-(2**32)``.
        """
        y = X.apply(lambda x: self.problem(x), axis=1)

        ela_meta = calculate_ela_meta(X, y)
        ela_distr = calculate_ela_distribution(X, y)
        # Neighbourhood size for NBC: 5% of the sample, but never below 2.
        fast_k = max(math.ceil(0.05 * X.shape[0]), 2)
        try:
            nbc = calculate_nbc(X, y, fast_k=fast_k)
        except IndexError:
            # Dump the offending sample before propagating the error.
            print(X)
            print(y)
            raise
        disp = calculate_dispersion(X, y)
        try:
            ic = calculate_information_content(X, y, seed=100)
        except KeyError:
            assert (X.index == y.index).all()
            print(X)
            print(y)
            raise
        pca = calculate_pca(X, y)

        data = {**ic, **ela_meta, **ela_distr, **nbc, **disp, **pca}
        # Runtime measurements are not landscape features.
        data = {k: v for k, v in data.items() if "costs_runtime" not in k}
        result = np.array(list(data.values()), dtype=np.float32).reshape(1, -1)
        # Finite sentinel for undefined features.
        result[np.isnan(result)] = -(2**32)
        return result

    def next_problem(self, options=None):
        """Advance to the next problem of the suite and return its first observation.

        The sample is re-initialised with 10 'lhs' points in ``[-5, 5]^dim``.
        Raises ``StopIteration`` once the suite is exhausted.
        """
        self.X = None
        self.problem = next(self.problem_selector)
        self.X = create_initial_sample(self.dim, lower_bound=-5, upper_bound=5, n=10, sample_type='lhs')

        observation = self.calc_ela(self.X)
        return observation

    def reset(self, fid=None, iid=None, seed=0, n_samples=10, options=None):
        """Rebuild the suite and start at its first problem.

        Args:
            fid: function-index selection; defaults to the value given to the
                constructor.
            iid: instance selection; defaults to the value given to the
                constructor.
            seed, n_samples, options: accepted for interface compatibility and
                currently unused.

        Returns:
            The observation for the first problem of the suite.
        """
        # Fall back to the constructor's selection so that a bare reset()
        # (as issued by the RL library) honours it.
        fid = self.fid if fid is None else fid
        iid = self.iid if iid is None else iid

        self.suite = cocoex.Suite(self.suite_name, f"instances:{iid}", f"function_indices:{fid} dimensions:{self.dim}")
        self.problem_selector = iter(self.suite)

        return self.next_problem()

    def render(self):
        """No rendering is implemented."""
        pass

    def close(self):
        """Nothing to release."""
        pass
from stable_baselines3.common.env_checker import check_env
# Training environment around `clf`. The ground truth keeps only the first
# row per (dim, fid) index entry of `benchmark`.
env = ClassifierEnv(clf, benchmark[~benchmark.index.duplicated(keep='first')], fid='1,2,6')
# A2C agent with an MLP policy; training metrics are logged for TensorBoard
# under ./a2c_AAS_tensorboard/.
model = A2C("MlpPolicy", env, learning_rate=0.0007, verbose=1, tensorboard_log="./a2c_AAS_tensorboard/")

In [None]:
# Short initial training run of 100 timesteps.
model.learn(total_timesteps=100)

In [None]:
    




# Separate evaluation environment built around `clf_test`, using the same
# de-duplicated ground truth and function selection as the training env.
eval_env = ClassifierEnv(clf_test, benchmark[~benchmark.index.duplicated(keep='first')], fid='1,2,6')
# Evaluation callback with eval_freq=50, deterministic actions and results
# logged under ./logs/.
eval_callback = EvalCallback(eval_env, log_path="./logs/", eval_freq=50,
                            deterministic=True, render=False)

# Continue training for 1000 timesteps with the evaluation callback attached.
model.learn(total_timesteps=1000, callback=eval_callback)


In [None]:
# Reset to the first problem of functions 1-24 / instances 1-5 and query the
# trained policy once for its (deterministic) action.
obs = env.reset('1-24', '1-5')
action, _state = model.predict(obs, deterministic=True)

In [None]:
# Inspect the optimizer code kept for dim=2, fid=1 after de-duplicating the index.
benchmark[~benchmark.index.duplicated(keep='first')].loc[(2,1),'optimizer']

In [None]:
# Display the preprocessed benchmark table.
benchmark

In [None]:
# Roll out the trained policy on the first problem until it ends the episode
# (at most 100 steps).
obs = env.reset('1-24', '1-5')
for i in range(100):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        print(f"Done in {i}")
        # To keep going with the next problem of the suite instead of
        # stopping, replace this `break` with `obs = env.next_problem()`.
        break

In [None]:
# Display the current value of `y`.
y

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
def func(x):
    """Return ``log(1 + exp(x))`` where ``x < 3`` and ``0`` elsewhere.

    Every position whose value is smaller than its predecessor is replaced by
    NaN.
    """
    softplus = np.log(1 + np.exp(x))
    values = np.where(x < 3, softplus, 0)
    # Forward differences, padded with a leading 0 so they align with `values`.
    steps = np.insert(np.diff(values), 0, 0)
    values[steps < 0.0] = np.nan
    return values
def func2(x):
    """Return an ELU-shaped curve for ``x < 3`` and ``-x`` elsewhere.

    Below 3 the value is ``x`` for positive ``x`` and ``exp(x) - 1`` otherwise.
    The step sizes between neighbouring values are printed, and every position
    that drops by more than 1 relative to its predecessor is replaced by NaN.
    """
    elu_part = np.where(x > 0, x, np.exp(x) - 1)
    values = np.where(x < 3, elu_part, -x)
    # Forward differences, padded with a leading 0 so they align with `values`.
    steps = np.insert(np.diff(values), 0, 0)
    print(steps)
    values[steps < -1.0] = np.nan
    return values
# Plot func2 on [-3, 5) in steps of 0.1 as a black line in a narrow figure.
x = np.arange(-3.0, 5.0, 0.1)
plt.figure(figsize=(2,5))
plt.plot(x, func2(x), 'k')
# Call show(); the bare attribute `plt.show` only references the function.
plt.show()

In [None]:
# Load the cost table and reshape it to long format: one row per
# (dim, fid, original column) with the cost in `value`.
df = pd.read_csv('imputed_relERT_costs_50d.csv').drop(columns=['repetition'])
df = df.melt(id_vars=['dim', 'fid'], value_vars=df.columns.drop(['dim', 'fid']))

sns.set_theme(style="ticks")
f, ax = plt.subplots(figsize=(7, 6))
ax.set_yscale("log")
# One box per function id (fid < 25) for the 10-dimensional rows; whiskers
# span the full value range.
sns.boxplot(
    data=df.loc[(df.dim == 10) & (df.fid < 25)],
    x='fid',
    y='value',
    whis=[0, 100],
    width=.6,
)
ax.xaxis.grid(True)
ax.set_ylabel("")
sns.despine(left=True, trim=True)

In [None]:
# Display the long-format cost table.
df

In [None]:
# Display the long-format cost table again.
df