# Random Forest Experiments with Covertype Data Set

## 1. Setup

### 1.A. Import Libraries

In [None]:
import numpy

### 1.B. Import GitHub Repo

In [None]:
!git clone https://github.com/aamanrebello/HTVTC-Testing-Framework.git

#Enable importing code from parent directory
import os, sys
#Resolve the absolute path of every cloned directory that holds importable modules
final_HTVTC, traditional_methods, root = (
    os.path.abspath(relative_path)
    for relative_path in (
        './HTVTC-Testing-Framework/final-HTVTC',
        './HTVTC-Testing-Framework/traditional-methods',
        './HTVTC-Testing-Framework',
    )
)
#Each insert goes to position 1, so the directory inserted last is searched first
sys.path.insert(1, final_HTVTC)
sys.path.insert(1, traditional_methods)
sys.path.insert(1, root)

### 1.C. Establish the Common Metric

In [None]:
import classificationmetrics
#Loss metric passed to every evaluation function in this notebook, so that all
#optimisers are compared on the same quantity
metric = classificationmetrics.JensenShannonDivergence

### 1.D.1. Setup for HTVTC

In [None]:
!pip install tensorly

### 1.D.2. Setup for Random Search, BO-TPE, CMA-ES, Hyperband 

In [None]:
!pip install optuna

### 1.D.3. Setup for BOHB

In [None]:
!pip install hpbandster

###  1.D.4. Setup for BO-GP

In [None]:
!pip install git+https://github.com/fmfn/BayesianOptimization

### 1.E. Load Data Set and Setup Evaluation Function 

#### 1.E.1. Load and preprocess the data

**Note:** Since the full data cannot be stored within the RAM of even the servers used on Google Colab, the data is pre-processed by taking the first $k$ samples of class $0$, the first $k$ samples of class $1$ and putting these together to get a data set of size $2k$. 

In [None]:
from loaddata import loadData, trainTestSplit, extractZeroOneClasses, convertZeroOne, generateReturnDict

#Samples size/2 elements of each class to generate a truncated form of the data
#that evenly represents both classes.
def even_binary_truncate(data_dict, size):
  """Build a class-balanced subset with at most `size//2` samples per class.

  Samples are visited in their stored order. The first `size//2` samples
  labelled 0 and the first `size//2` samples labelled 1 are kept. Scanning
  stops as soon as both quotas are filled, or at the first label that is
  neither 0 nor 1.

  Args:
    data_dict: mapping with index-aligned 'labels' and 'features' entries.
    size: target total number of samples; each class gets `size//2`.

  Returns:
    The result of `generateReturnDict(features, labels)` for the selected
    samples, with all class-0 samples placed before the class-1 samples.
  """
  labels = data_dict['labels']
  features = data_dict['features']
  zero_features = []
  one_features = []
  zero_labels = []
  one_labels = []
  HALF_SIZE = size//2
  #Positional indexing is kept because the container types are defined elsewhere
  for index in range(len(labels)):
    #Nothing more can be added once both quotas are met, so stop scanning early
    if len(zero_labels) >= HALF_SIZE and len(one_labels) >= HALF_SIZE:
      break
    label = labels[index]
    if label == 1:
      if len(one_labels) < HALF_SIZE:
        one_features.append(features[index])
        one_labels.append(label)
    elif label == 0:
      if len(zero_labels) < HALF_SIZE:
        zero_features.append(features[index])
        zero_labels.append(label)
    else:
      #An unexpected label ends the scan; whatever was collected so far is returned
      break
  final_labels = zero_labels + one_labels
  final_features = zero_features + one_features
  return generateReturnDict(final_features, final_labels)

#Load the covtype data and extract a binary problem from it; classes 1 and 2 are
#passed as the zero and one classes respectively
task = 'classification'
data = loadData(source='sklearn', identifier='covtype', task=task)
binary_data = extractZeroOneClasses(data, zeroClassLabel=1, oneClassLabel=2)
#Truncate to limit elements
LIMIT = 20000 
truncated_data = even_binary_truncate(binary_data, LIMIT)

#Print out description
TOTAL_ELEMENTS = len(truncated_data['labels'])
print(f'Total number of data samples: {TOTAL_ELEMENTS}')
#Count the labels equal to 0 explicitly; count_nonzero on the raw labels would
#count the label-1 samples instead
ZERO_ELEMENTS = numpy.count_nonzero(numpy.asarray(truncated_data['labels']) == 0)
print(f'Total number of samples with label 0: {ZERO_ELEMENTS}')

#### 1.E.2. Function to Freshly Generate Evaluation Function (Which Uses a Python Generator to Achieve Cross-validation)

In [None]:
from trainmodels import crossValidationFunctionGenerator

def generate_evaluation_function(algorithm='random-forest'):
  """Return a freshly built cross-validation evaluation function.

  A new cross-validation split of `truncated_data` is created on every call
  and handed to `crossValidationFunctionGenerator` together with the
  requested `algorithm` and the notebook-level `task`.
  """
  return crossValidationFunctionGenerator(
      trainTestSplit(truncated_data, method = 'cross_validation'),
      algorithm=algorithm,
      task=task,
  )

#Smoke-test the factory with one fixed hyperparameter combination
f = generate_evaluation_function()
res = f(
    no_trees=15,
    max_tree_depth=20,
    bootstrap=False,
    min_samples_split=5,
    no_features=10,
    metric=metric,
)
print(res)

#### 1.E.3. Function to Freshly Generate Evaluation Function That Accepts a Budget

**Note:** This is defined separately from the previous function so that the performance of the previous function is not affected by the processing of the budget parameters.

In [None]:
from trainmodels import crossValidationFunctionGenerator

def generate_budget_function(algorithm='random-forest', budget_type='samples', budget_fraction=1.0):
  """Return a freshly built cross-validation evaluation function with a budget.

  A new cross-validation split of `truncated_data` is created on every call.
  `budget_type` and `budget_fraction` are forwarded unchanged to
  `crossValidationFunctionGenerator`, along with `algorithm` and the
  notebook-level `task`.
  """
  return crossValidationFunctionGenerator(
      trainTestSplit(truncated_data, method = 'cross_validation'),
      algorithm=algorithm,
      task=task,
      budget_type=budget_type,
      budget_fraction=budget_fraction,
  )

#Smoke-test the budgeted factory at a budget fraction of 0.1
f = generate_budget_function(budget_fraction=0.1)
res = f(
    no_trees=15,
    max_tree_depth=20,
    bootstrap=False,
    min_samples_split=5,
    no_features=10,
    metric=metric,
)
print(res)

## 2. Hyperparameter Optimisation Experiments 

### 2.A. HTVTC

In [None]:
from trainmodels import crossValidationFunctionGenerator
from finalAlgoImplementation import final_HTVTC

#Quantity to measure: 'EXEC-TIME' (wall clock, ns), 'CPU-TIME' (process time, ns)
#or 'MAX-MEMORY' (peak memory traced by tracemalloc)
quantity = 'EXEC-TIME'

#Begin measuring; `a` is not read anywhere in this cell
a = None
start_time = None
if quantity in ('EXEC-TIME', 'CPU-TIME'):
    import time
    start_time = time.perf_counter_ns() if quantity == 'EXEC-TIME' else time.process_time_ns()
elif quantity == 'MAX-MEMORY':
    import tracemalloc
    tracemalloc.start()

#Search space description handed to HTVTC
ranges_dict = {
    'no_trees': {'type': 'INTEGER', 'start': 1.0, 'end': 40.0, 'interval': 5.0},
    'max_tree_depth': {'type': 'INTEGER', 'start': 1.0, 'end': 20.0, 'interval': 5.0},
    'bootstrap': {'type': 'CATEGORICAL', 'values': [True, False]},
    'min_samples_split': {'type': 'INTEGER', 'start': 2.0, 'end': 11.0, 'interval': 2.0},
    'no_features': {'type': 'INTEGER', 'start': 1.0, 'end': 11.0, 'interval': 2.0},
}

func = generate_evaluation_function()
recommended_combination, history = final_HTVTC(
    eval_func=func,
    ranges_dict=ranges_dict,
    metric=metric,
    max_completion_cycles=4,
    max_size_gridsearch=51,
)

#Stop measuring and record the measured quantity in `result`
result = None
if quantity == 'MAX-MEMORY':
    _, result = tracemalloc.get_traced_memory()
    tracemalloc.stop()
elif quantity in ('EXEC-TIME', 'CPU-TIME'):
    end_time = time.perf_counter_ns() if quantity == 'EXEC-TIME' else time.process_time_ns()
    result = end_time - start_time

#Re-evaluate the recommended combination with a fresh evaluation function
#(done outside the measured region)
truefunc = generate_evaluation_function()
true_value = truefunc(metric=metric, **recommended_combination)

for report_line in (
    f'hyperparameters: {recommended_combination}',
    f'history: {history}',
    f'True value: {true_value}',
    f'{quantity}: {result}',
):
    print(report_line)

### 2.B. Random Search

In [None]:
import optuna
from optuna.samplers import RandomSampler

#Quantity to measure: 'EXEC-TIME', 'CPU-TIME' or 'MAX-MEMORY'
quantity = 'EXEC-TIME'

def objective(trial):
    """Sample one hyperparameter combination from `trial` and return its loss."""
    hyperparameters = {
        'no_trees': trial.suggest_int("no_trees", 1, 40, step=1),
        'max_tree_depth': trial.suggest_int("max_tree_depth", 1, 20, step=1),
        'bootstrap': trial.suggest_categorical("bootstrap", [True, False]),
        'min_samples_split': trial.suggest_int("min_samples_split", 2, 11, step=1),
        'no_features': trial.suggest_int("no_features", 1, 11, step=1),
    }
    #A fresh evaluation function is generated for every trial
    evaluate = generate_evaluation_function()
    return evaluate(**hyperparameters, metric=metric)

#Begin measuring the chosen quantity
start_time = None
if quantity in ('EXEC-TIME', 'CPU-TIME'):
    import time
    start_time = time.perf_counter_ns() if quantity == 'EXEC-TIME' else time.process_time_ns()
elif quantity == 'MAX-MEMORY':
    import tracemalloc
    tracemalloc.start()

#Run 100 randomly sampled trials with Optuna's logging reduced to FATAL
optuna.logging.set_verbosity(optuna.logging.FATAL)
study = optuna.create_study(sampler=RandomSampler())
study.optimize(objective, n_trials=100)

#Stop measuring and record the measured quantity in `result`
result = None
if quantity == 'MAX-MEMORY':
    _, result = tracemalloc.get_traced_memory()
    tracemalloc.stop()
elif quantity in ('EXEC-TIME', 'CPU-TIME'):
    end_time = time.perf_counter_ns() if quantity == 'EXEC-TIME' else time.process_time_ns()
    result = end_time - start_time

for report_line in (
    f'Number of trials: {len(study.trials)}',
    f'Best trial: {study.best_trial}',
    f'{quantity}: {result}',
):
    print(report_line)

### 2.C. BO-TPE

In [None]:
import optuna
from optuna.samplers import TPESampler

#Quantity to measure: 'EXEC-TIME', 'CPU-TIME' or 'MAX-MEMORY'
quantity = 'EXEC-TIME'

def objective(trial):
    """Sample one hyperparameter combination from `trial` and return its loss."""
    hyperparameters = {
        'no_trees': trial.suggest_int("no_trees", 1, 40, step=1),
        'max_tree_depth': trial.suggest_int("max_tree_depth", 1, 20, step=1),
        'bootstrap': trial.suggest_categorical("bootstrap", [True, False]),
        'min_samples_split': trial.suggest_int("min_samples_split", 2, 11, step=1),
        'no_features': trial.suggest_int("no_features", 1, 11, step=1),
    }
    #A fresh evaluation function is generated for every trial
    evaluate = generate_evaluation_function()
    return evaluate(**hyperparameters, metric=metric)

#Begin measuring the chosen quantity
start_time = None
if quantity in ('EXEC-TIME', 'CPU-TIME'):
    import time
    start_time = time.perf_counter_ns() if quantity == 'EXEC-TIME' else time.process_time_ns()
elif quantity == 'MAX-MEMORY':
    import tracemalloc
    tracemalloc.start()

#Run 60 TPE-sampled trials with Optuna's logging reduced to FATAL
optuna.logging.set_verbosity(optuna.logging.FATAL)
study = optuna.create_study(sampler=TPESampler())
study.optimize(objective, n_trials=60)

#Stop measuring and record the measured quantity in `result`
result = None
if quantity == 'MAX-MEMORY':
    _, result = tracemalloc.get_traced_memory()
    tracemalloc.stop()
elif quantity in ('EXEC-TIME', 'CPU-TIME'):
    end_time = time.perf_counter_ns() if quantity == 'EXEC-TIME' else time.process_time_ns()
    result = end_time - start_time

for report_line in (
    f'Number of trials: {len(study.trials)}',
    f'Best trial: {study.best_trial}',
    f'{quantity}: {result}',
):
    print(report_line)

### 2.D. CMA-ES

In [None]:
import optuna

#Quantity to measure: 'EXEC-TIME', 'CPU-TIME' or 'MAX-MEMORY'
quantity = 'EXEC-TIME'

def objective(trial):
    """Sample one hyperparameter combination from `trial` and return its loss."""
    hyperparameters = {
        'no_trees': trial.suggest_int("no_trees", 1, 40, step=1),
        'max_tree_depth': trial.suggest_int("max_tree_depth", 1, 20, step=1),
        'bootstrap': trial.suggest_categorical("bootstrap", [True, False]),
        'min_samples_split': trial.suggest_int("min_samples_split", 2, 11, step=1),
        'no_features': trial.suggest_int("no_features", 1, 11, step=1),
    }
    #A fresh evaluation function is generated for every trial
    evaluate = generate_evaluation_function()
    return evaluate(**hyperparameters, metric=metric)

#Begin measuring the chosen quantity
start_time = None
if quantity in ('EXEC-TIME', 'CPU-TIME'):
    import time
    start_time = time.perf_counter_ns() if quantity == 'EXEC-TIME' else time.process_time_ns()
elif quantity == 'MAX-MEMORY':
    import tracemalloc
    tracemalloc.start()

#Run 80 CMA-ES-sampled trials with Optuna's logging reduced to FATAL
optuna.logging.set_verbosity(optuna.logging.FATAL)
sampler = optuna.samplers.CmaEsSampler()
study = optuna.create_study(sampler=sampler)
study.optimize(objective, n_trials=80)

#Stop measuring and record the measured quantity in `result`
result = None
if quantity == 'MAX-MEMORY':
    _, result = tracemalloc.get_traced_memory()
    tracemalloc.stop()
elif quantity in ('EXEC-TIME', 'CPU-TIME'):
    end_time = time.perf_counter_ns() if quantity == 'EXEC-TIME' else time.process_time_ns()
    result = end_time - start_time

for report_line in (
    f'Number of trials: {len(study.trials)}',
    f'Best trial: {study.best_trial}',
    f'{quantity}: {result}',
):
    print(report_line)

### 2.E. BO-GP

In [None]:
from bayes_opt import BayesianOptimization

#Quantity to measure: 'EXEC-TIME', 'CPU-TIME' or 'MAX-MEMORY'
quantity = 'EXEC-TIME'
#Number of randomly chosen starting points evaluated before the guided iterations
init_points = 10
#Number of guided optimisation iterations run after the starting points
trials = 50
#Constant the loss is subtracted from to turn minimisation into maximisation
pval = 1

def objective(no_trees, max_tree_depth, bootstrap_ind, min_samples_split, no_features):
    """Return `pval - loss` for one point proposed by the optimiser.

    The optimiser proposes real-valued coordinates: the integer
    hyperparameters are obtained by truncation with `int`, and `bootstrap` is
    False exactly when `bootstrap_ind` is positive.
    """
    no_trees  = int(no_trees)
    max_tree_depth = int(max_tree_depth)
    min_samples_split = int(min_samples_split)
    no_features = int(no_features)
    bootstrap = not (bootstrap_ind > 0)
    func = generate_evaluation_function()
    #subtract from pval because the library only supports maximise
    return pval - func(no_trees=no_trees, 
                max_tree_depth=max_tree_depth, 
                bootstrap=bootstrap, 
                min_samples_split=min_samples_split, 
                no_features=no_features, 
                metric=metric)

#Start timer/memory profiler/CPU timer
start_time = None
if quantity == 'EXEC-TIME':
    import time
    start_time = time.perf_counter_ns()
elif quantity == 'CPU-TIME':
    import time
    start_time = time.process_time_ns()
elif quantity == 'MAX-MEMORY':
    import tracemalloc
    tracemalloc.start()

#Begin optimisation
#NOTE(review): min_samples_split and no_features stop at 10 here, while the Optuna
#and BOHB cells search up to 11 - confirm whether the difference is intended
pbounds = {'no_trees': (1, 40), 'max_tree_depth': (1, 20), 'bootstrap_ind': (-1,1), 'min_samples_split': (2,10), 'no_features': (1,10)}

optimizer = BayesianOptimization(
    f=objective,
    pbounds=pbounds,
    random_state=1,
    verbose = 0
)

optimizer.maximize(
    init_points=init_points,
    n_iter=trials,
)

#End timer/memory profiler/CPU timer
result = None
if quantity == 'EXEC-TIME':
    end_time = time.perf_counter_ns()
    result = end_time - start_time
elif quantity == 'CPU-TIME':
    end_time = time.process_time_ns()
    result = end_time - start_time
elif quantity == 'MAX-MEMORY':
    _, result = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    
best = optimizer.max
best_params = best['params']
#Undo the `pval - loss` transformation to recover the loss itself
best_score = pval - best['target']
#The objective is evaluated for the starting points as well as the guided
#iterations, so both are counted as trials
print(f'Number of trials: {init_points + trials}')
print(f'Best params: {best_params}')
print(f'Best score: {best_score}')
print(f'{quantity}: {result}')

### 2.F. Hyperband

In [None]:
import optuna
from commonfunctions import generate_range

#Quantity to measure: 'EXEC-TIME', 'CPU-TIME' or 'MAX-MEMORY'
quantity = 'EXEC-TIME'
#Fraction of the budget added at every step
resolution = 0.2
#Optuna identifies intermediate reports and pruner resources by integer steps, so
#the budget fractions are indexed 1..n_budget_steps instead of being passed directly
n_budget_steps = max(1, round(1/resolution))

def objective(trial):
    """Evaluate one sampled combination on growing budgets and return its final loss.

    After every budget step the intermediate loss is reported to `trial`, and
    `optuna.TrialPruned` is raised if the pruner decides to stop the trial.
    """
    no_trees = trial.suggest_categorical("no_trees", [1,10,20,30,40])
    max_tree_depth = trial.suggest_categorical("max_tree_depth", [1, 5, 10, 15, 20])
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])
    min_samples_split = trial.suggest_int("min_samples_split", 2, 11, step=1)
    no_features = trial.suggest_int("no_features", 1, 11, step=1)   

    for step in range(1, n_budget_steps + 1):
        #The final step always uses the full budget (fraction 1.0)
        fraction = step / n_budget_steps
        func = generate_budget_function(budget_type='samples', budget_fraction=fraction)
        metric_value = func(no_trees=no_trees, 
                max_tree_depth=max_tree_depth, 
                bootstrap=bootstrap, 
                min_samples_split=min_samples_split, 
                no_features=no_features, 
                metric=metric)
        #Check for pruning
        trial.report(metric_value, step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    #Metric obtained with the full budget
    return metric_value
    

#Start timer/memory profiler/CPU timer
start_time = None
if quantity == 'EXEC-TIME':
    import time
    start_time = time.perf_counter_ns()
elif quantity == 'CPU-TIME':
    import time
    start_time = time.process_time_ns()
elif quantity == 'MAX-MEMORY':
    import tracemalloc
    tracemalloc.start()

optuna.logging.set_verbosity(optuna.logging.FATAL)
study = optuna.create_study(
    direction="minimize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=1, max_resource=n_budget_steps, reduction_factor=2
    ),
)
study.optimize(objective, n_trials=50)

#End timer/memory profiler/CPU timer
result = None
if quantity == 'EXEC-TIME':
    end_time = time.perf_counter_ns()
    result = end_time - start_time
elif quantity == 'CPU-TIME':
    end_time = time.process_time_ns()
    result = end_time - start_time
elif quantity == 'MAX-MEMORY':
    _, result = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    
print(f'Number of trials: {len(study.trials)}')
print(f'Best trial: {study.best_trial}')
print(f'{quantity}: {result}')

### 2.G. BOHB

In [None]:
#TODO
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import hpbandster.core.nameserver as hpns
import hpbandster.core.result as hpres
from hpbandster.core.worker import Worker
from hpbandster.examples.commons import MyWorker
from hpbandster.optimizers import BOHB as BOHB

#To hide logs: this logger is handed to the worker and the optimiser below; its
#threshold of 100 is above logging.CRITICAL (50), so no standard-level record passes
import logging
logObj = logging.getLogger('noOutput')
logObj.setLevel(100)

#To hide warnings: this installs a global "ignore" filter, which stays in effect
#for the code that runs after this cell as well
import warnings
warnings.filterwarnings("ignore")

#Define the worker
class MyWorker(Worker):
    """BOHB worker that scores a configuration with the budgeted evaluation function."""

    def __init__(self, *args, sleep_interval=0, **kwargs):
        """Forward all other arguments to `Worker` and store `sleep_interval`."""
        super().__init__(*args, **kwargs)
        self.sleep_interval = sleep_interval

    def compute(self, config, budget, **kwargs):
        """Evaluate `config` at `budget` and return the loss as both 'loss' and 'info'.

        `budget` is passed on as the sample budget fraction of a freshly
        generated evaluation function.
        """
        evaluate = generate_budget_function(budget_type='samples', budget_fraction=budget)
        loss = evaluate(**config, metric=metric)
        return {'loss': loss, 'info': loss}

    @staticmethod
    def get_configspace():
        """Return the configuration space of the five tuned hyperparameters."""
        cs = CS.ConfigurationSpace()
        #The categorical hyperparameter is registered first, then the integer ones
        cs.add_hyperparameters([
            CSH.CategoricalHyperparameter('bootstrap', [True, False]),
        ])
        cs.add_hyperparameters([
            CSH.UniformIntegerHyperparameter('no_trees', lower=1, upper=40),
            CSH.UniformIntegerHyperparameter('max_tree_depth', lower=1, upper=20),
            CSH.UniformIntegerHyperparameter('min_samples_split', lower=2, upper=11),
            CSH.UniformIntegerHyperparameter('no_features', lower=1, upper=11),
        ])
        return cs

#Setup nameserver
NS = hpns.NameServer(run_id='rf-wine', host='127.0.0.1', port=None)
NS.start()

#Everything after the name server has started is guarded, so that the name server
#and the optimiser are shut down even if the run fails; otherwise they would stay
#alive in the notebook session
try:
    #Start a worker
    w = MyWorker(sleep_interval = 0, nameserver='127.0.0.1',run_id='rf-wine', logger=logObj)
    w.run(background=True)

    quantity = 'EXEC-TIME'

    #Start timer/memory profiler/CPU timer
    start_time = None
    if quantity == 'EXEC-TIME':
        import time
        start_time = time.perf_counter_ns()
    elif quantity == 'CPU-TIME':
        import time
        start_time = time.process_time_ns()
    elif quantity == 'MAX-MEMORY':
        import tracemalloc
        tracemalloc.start()

    #Run the optimiser
    MAX_BUDGET = 1.0
    MIN_BUDGET = 0.2
    bohb = BOHB(  configspace = w.get_configspace(),
                  run_id = 'rf-wine', nameserver='127.0.0.1',
                  min_budget=MIN_BUDGET, max_budget=MAX_BUDGET,
                  logger=logObj
               )
    try:
        res = bohb.run(n_iterations=50)

        #End timer/memory profiler/CPU timer (before shutdown, as shutdown is not
        #part of the measured optimisation)
        quantity_result = None
        if quantity == 'EXEC-TIME':
            end_time = time.perf_counter_ns()
            quantity_result = end_time - start_time
        elif quantity == 'CPU-TIME':
            end_time = time.process_time_ns()
            quantity_result = end_time - start_time
        elif quantity == 'MAX-MEMORY':
            _, quantity_result = tracemalloc.get_traced_memory()
            tracemalloc.stop()
    finally:
        #Shutdown the optimiser together with its workers
        bohb.shutdown(shutdown_workers=True)
finally:
    NS.shutdown()

#Look up the incumbent configuration and its last recorded run
id2config = res.get_id2config_mapping()
inc_id = res.get_incumbent_id()
inc_runs = res.get_runs_by_id(inc_id)
inc_run = inc_runs[-1]

print('Best found configuration:', id2config[inc_id]['config'])
print(f'Validation loss: {inc_run.loss}')
print('A total of %i unique configurations were sampled.' % len(id2config.keys()))
print('A total of %i runs were executed.' % len(res.get_all_runs()))
print('Total budget corresponds to %.1f full function evaluations.'%(sum([r.budget for r in res.get_all_runs()])/MAX_BUDGET))
print(f'{quantity}: {quantity_result}')

## 3. Display Background Specifications

In [None]:
!lscpu

In [None]:
!nvidia-smi -L