In [1]:
from experimentscommons.nb_parameters import EXPERIMENT_ID

In [4]:
# Default settings for this run; a pre-existing `params` mapping overrides
# individual keys when the defaults are merged further down.
DEFAULT_PARAMS = {
    # Ensemble shape
    "max_depth": 5,
    "n_trees": 5,
    # Input data
    "train_path": "../data/processed/breast-train-0-s1.csv",
    "test_path": "../data/processed/breast-test-0-s1.csv",
    # Cross-validation used to score each candidate
    "cv": 2,
    "cv_repeats": 5,
    "n_jobs": -1,
    # Genetic-algorithm budget
    "n_gen": 20,
    "pop_size": 100,
    # Print per-candidate details while evaluating
    "debug": False,
    # The key name is taken from the shared EXPERIMENT_ID constant
    EXPERIMENT_ID: "6",
}

In [6]:
from box import Box
import uuid

# Overlay an already-defined `params` mapping (if the name exists in the
# notebook namespace) on top of the defaults; otherwise use the defaults alone.
params = Box({**DEFAULT_PARAMS, **params} if 'params' in vars() else DEFAULT_PARAMS)

In [None]:
import mlflow
# Look the experiment id up through the EXPERIMENT_ID constant -- the same key
# DEFAULT_PARAMS is built with -- instead of a hard-coded attribute name, which
# only resolves when the constant's value happens to be "EXPERIMENT_ID".
# NOTE(review): the run stays open until the final `mlflow.end_run()` cell; if a
# cell in between fails, the run has to be ended manually.
mlflow.start_run(experiment_id=params[EXPERIMENT_ID])

In [None]:
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd

# Load data

In [None]:
# Read the train/test CSVs named in the parameters, then split each frame into
# a feature matrix (every column except TARGET) and a label vector (TARGET).
train_data = pd.read_csv(params["train_path"])
test_data = pd.read_csv(params["test_path"])

x_train, y_train = train_data.drop(columns="TARGET").values, train_data["TARGET"].values
x_test, y_test = test_data.drop(columns="TARGET").values, test_data["TARGET"].values

# Train base

In [None]:
from sklearn.neighbors import kneighbors_graph, KNeighborsClassifier, NearestNeighbors

In [None]:
from more_itertools import grouper
from itertools import product
from scipy.spatial.distance import euclidean
from toolz.curried import pipe, reduce, map, filter

In [None]:
def list_with_repeated_elements(input_list, n_repeated):
    """Return a list where each item of ``input_list`` occurs ``n_repeated`` times in a row.

    Input order is kept, e.g. ``[a, b]`` with ``n_repeated=2`` gives
    ``[a, a, b, b]``.
    """
    repeated = []
    for item in input_list:
        for _ in range(n_repeated):
            repeated.append(item)
    return repeated

In [None]:
from rules.classification.utils import covered_by_statements
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate, ShuffleSplit, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.metrics import make_scorer

In [None]:
from loguru import logger

In [None]:
# Per-feature minimum and maximum observed in the training matrix.
feature_min = x_train.min(axis=0)
feature_max = x_train.max(axis=0)

In [None]:
from rules.classification.competence_region_ensemble import SimpleCompetenceRegionEnsemble

In [None]:
from sklearn.neighbors import NearestCentroid, NearestNeighbors

In [None]:
def nn_wrapper(nn):
    """Expose a fitted neighbours searcher through a ``predict`` entry.

    The returned ``Box`` has a single key, ``"predict"``. Calling it with ``x``
    returns the ``nn.kneighbors`` indices (no distances) for as many neighbours
    as ``nn`` was fitted on, i.e. ``nn.n_samples_fit_``.
    """
    def predict(x):
        return nn.kneighbors(x, n_neighbors=nn.n_samples_fit_, return_distance=False)

    return Box({"predict": predict})
    
    

In [None]:
# Scratch check: np.nonzero returns a tuple of index arrays, so [0] selects the
# indices of the non-zero entries along the first axis.
np.nonzero([3,5,0])[0]

In [None]:
def create_estimator(centroids, depths):
    """Build the ensemble and the region searcher for the active trees only.

    A tree is active when its depth is non-zero; the centroids of inactive
    trees are dropped together with their depths.

    Parameters
    ----------
    centroids : array-like
        One row per tree.
    depths : array-like
        One depth per tree; 0 marks the tree as inactive.

    Returns
    -------
    (model, space_classifier)
        ``model`` is an unfitted ``SimpleCompetenceRegionEnsemble`` holding one
        ``DecisionTreeClassifier`` per active tree, keyed 0..k-1 in the order
        of the active centroids. ``space_classifier`` is a ``NearestNeighbors``
        instance fitted on the active centroids.

    Callers in this notebook skip the all-zero ``depths`` case before calling.
    """
    # Accept plain lists as well as arrays: the fancy indexing below needs ndarrays.
    centroids = np.asarray(centroids)
    depths = np.asarray(depths)

    activated_trees_indices = np.nonzero(depths)[0]
    active_centroids = centroids[activated_trees_indices]
    active_depths = depths[activated_trees_indices]

    space_classifier = NearestNeighbors()
    space_classifier.fit(active_centroids)

    # NOTE(review): the first positional argument is left as None exactly as
    # before; presumably the region classifier is supplied at fit time -- confirm.
    model = SimpleCompetenceRegionEnsemble(
        None,
        {
            label: DecisionTreeClassifier(max_depth=depth, random_state=42)
            for label, depth in enumerate(active_depths)
        },
    )

    return model, space_classifier


In [None]:
def find_closeset_val(arr, val):
    """Return the index of the entry of ``arr`` that lies nearest to ``val``.

    On ties the lowest index wins (first minimum of the absolute differences).
    """
    distances = np.abs(np.array(arr) - val)
    return distances.argmin()

# Problem

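Individual (decision vector) = $[\text{centroid coordinates}_{1..n},\ \text{depth}_{1..n}]$ — a tree whose depth decodes to 0 is switched off; there is no separate on/off gene.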

In [None]:
# def create_mask(n_trees, n_dimensions):
#     return ['real'] * n_dimensions * n_trees + ['real'] * n_trees

In [None]:
# OPERATORS = {
#     'sampling': {
#         'bin': 'bin_random',
#         'real': 'real_random',
#         'int': 'int_random'
#     }, 
#     'mutation': {
#         'bin': 'bin_bitflip',
#         'real': 'real_pm',
#         'int': 'int_pm',
#     },
#     'crossover': {
#         'bin': 'bin_one_point',
#         'real': 'real_sbx',
#         'int': 'int_sbx',
#     }
# }

In [None]:
# Problem dimensions: number of input features and number of trees encoded
# in each individual.
n_dim = x_train.shape[1]
n_trees = params["n_trees"]

In [None]:
from pymoo.factory import get_sampling, get_crossover, get_mutation
# from pymoo.operators.mixed_variable_operator import MixedVariableSampling, MixedVariableMutation, MixedVariableCrossover


In [None]:
# mask = create_mask(n_trees, n_dim)
# variable_types = np.unique(mask)


In [None]:
# sampling = MixedVariableSampling(mask, {
#     variable_type: get_sampling(OPERATORS['sampling'][variable_type]) for variable_type in variable_types
# })
# crossover = MixedVariableCrossover(mask, {
#     variable_type: get_crossover(OPERATORS['crossover'][variable_type]) for variable_type in variable_types
# })
# mutation = MixedVariableMutation(mask, {
#     variable_type: get_mutation(OPERATORS['mutation'][variable_type]) for variable_type in variable_types
# })

In [None]:
import numpy as np
from toolz.curried import pipe, map, reduce, filter
from pymoo.core.problem import ElementwiseProblem
from loguru import logger


class MyProblem(ElementwiseProblem):
    """Single-objective search over tree centroids and tree depths.

    An individual is a flat vector of ``n_trees * n_dim`` centroid coordinates
    followed by ``n_trees`` continuous depth genes. Each depth gene is snapped
    to the nearest integer in ``0..max_tree_depth``; a depth of 0 switches the
    tree off, so there is no separate on/off gene. The objective is
    ``1 - mean cross-validated accuracy`` (minimised).

    NOTE(review): ``_evaluate`` reads ``cv``, ``cv_repeats``, ``n_jobs`` and
    ``debug`` from the notebook-level ``params`` object rather than from the
    constructor arguments.
    """

    def __init__(self, n_trees, x_train, y_train, max_tree_depth, **kwargs):
        """Derive variable count and bounds from the training data.

        Parameters
        ----------
        n_trees : number of (centroid, depth) pairs encoded in an individual
        x_train, y_train : training features / labels used for scoring
        max_tree_depth : largest depth a tree may take
        **kwargs : forwarded unchanged to ``ElementwiseProblem``
        """
        n_dim = x_train.shape[1]

        # Centroid genes are bounded by the per-feature range of the training
        # data; depth genes span [-0.5, max_tree_depth + 0.5] so that every
        # integer depth owns an interval of width 1 when snapped to the nearest.
        coordinate_lower = list(np.min(x_train, axis=0)) * n_trees
        coordinate_upper = list(np.max(x_train, axis=0)) * n_trees

        super().__init__(
            n_var=n_trees * n_dim + n_trees,  # centroid coordinates + one depth per tree
            n_obj=1,  # 1 - accuracy
            n_constr=0,
            xl=coordinate_lower + n_trees * [-0.5],
            xu=coordinate_upper + n_trees * [max_tree_depth + 0.5],
            **kwargs
        )

        self.n_trees = n_trees
        self.x_train = x_train
        self.y_train = y_train
        self.n_dim = n_dim
        self.max_tree_depth = max_tree_depth

    def _split_individual(self, individual):
        """Decode a flat individual into ``(centroids, tree_depths)``."""
        individual = np.asarray(individual)
        n_coordinates = self.n_dim * self.n_trees

        # Row i holds the n_dim consecutive coordinates of centroid i.
        centroids = np.nan_to_num(
            individual[:n_coordinates].reshape(self.n_trees, self.n_dim)
        )

        # Snap every continuous depth gene to the nearest allowed integer depth.
        possible_tree_depths = list(range(self.max_tree_depth + 1))
        tree_depths = np.array([
            find_closeset_val(possible_tree_depths, gene)
            for gene in individual[-self.n_trees:]
        ])
        return centroids, tree_depths

    def _evaluate(self, individual, out, *args, **kwargs):
        """Score one individual and store the objective in ``out["F"]``."""
        centroids, tree_depths = self._split_individual(individual)

        if np.all(tree_depths == 0):
            # No tree is switched on: assign the worst objective value.
            out["F"] = 1
            return

        model, space_classifier = create_estimator(centroids, tree_depths)

        # Plain repeated K-fold (not stratified), seeded so that every
        # individual is scored on the same splits.
        cv_splitter = RepeatedKFold(
            n_splits=params['cv'], n_repeats=params['cv_repeats'], random_state=42
        )
        scores = cross_validate(
            model, self.x_train, self.y_train,
            n_jobs=params['n_jobs'], scoring='accuracy', cv=cv_splitter,
            fit_params={'competence_region_classifier': nn_wrapper(space_classifier)},
        )
        mean_accuracy = scores['test_score'].mean()

        if params.debug:
            print(f"Depths = {tree_depths}, acc = {mean_accuracy}")

        out["F"] = 1 - mean_accuracy



In [None]:
import sys
# Drop the existing loguru handlers, then log to stderr at INFO level and above.
logger.remove()
logger.add(sys.stderr, level="INFO")

In [None]:
from collections import defaultdict
from pymoo.optimize import minimize
from pymoo.algorithms.soo.nonconvex.ga import GA
from pymoo.core.problem import starmap_parallelized_eval
from multiprocessing.pool import ThreadPool


# One worker thread per population member.
# NOTE(review): every evaluation also runs cross_validate with
# n_jobs=params['n_jobs']; with the default of -1 this nests parallelism inside
# the thread pool -- confirm it does not oversubscribe the machine.
pool = ThreadPool(params['pop_size'])

try:
    problem = MyProblem(
        params.n_trees, x_train, y_train, params.max_depth,
        runner=pool.starmap, func_eval=starmap_parallelized_eval,
    )

    res = minimize(
        problem,
        GA(
            pop_size=params['pop_size'],
            verbose=True,
            seed=42,
            eliminate_duplicates=True,
        ),
        ("n_gen", params['n_gen']),
        verbose=True,
        save_history=True,
        seed=42,
    )
finally:
    # Release the worker threads even when the optimisation fails part-way.
    pool.close()
    pool.join()

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Execution time recorded on the optimisation result.
res.exec_time

In [None]:
import matplotlib.pyplot as plt
# Mean objective value (1 - CV accuracy) of the population stored for each
# history entry.
val = [np.average(e.pop.get("F")) for e in res.history]

# Explicit figure/axes with labels so the plot can be read on its own.
fig, ax = plt.subplots()
ax.plot(np.arange(len(val)), val)
ax.set(
    xlabel="Generation",
    ylabel="Mean population objective (1 - CV accuracy)",
    title="GA convergence",
)
plt.show()


In [None]:
# Log min / mean / max objective of every stored history entry, using its
# position in the history as the MLflow step.
for idx, entry in enumerate(res.history):
    vals = entry.pop.get("F")
    mlflow.log_metrics(
        {
            'min_function_value': np.min(vals),
            'avg_function_value': np.average(vals),
            'max_function_value': np.max(vals),
        },
        step=idx,
    )

In [None]:
# A single solution comes back as a 1-D vector; wrap it so the following code
# can always iterate over a collection of individuals.
pareto_front = [res.X] if res.X.ndim == 1 else res.X

In [None]:
# Refit every returned individual on the full training set and score it on the
# held-out test set; individuals with all trees switched off are skipped.
n_coordinates_in_individual = n_dim * n_trees
possible_tree_depths = list(range(params.max_depth + 1))

accs = []
for individual in pareto_front:
    # Leading genes: centroid coordinates, regrouped into one row per tree.
    centroid_coordinates = individual[:n_coordinates_in_individual]
    individual_as_centroids = np.nan_to_num(
        np.array(list(grouper(centroid_coordinates, n_dim)))
    )

    # Trailing genes: continuous depths snapped to the nearest allowed depth.
    tree_depths_continous = individual[-n_trees:]
    tree_depths = np.array([
        find_closeset_val(possible_tree_depths, td) for td in tree_depths_continous
    ])

    if params.debug:
        print(f"Depths = {tree_depths}")

    if np.all(tree_depths == 0):
        continue

    model, space_classifier = create_estimator(individual_as_centroids, tree_depths)
    model.fit(x_train, y_train, competence_region_classifier=nn_wrapper(space_classifier))

    accs.append(accuracy_score(y_test, model.predict(x_test)))

In [None]:
# Best cross-validated accuracy found by the search (the objective is 1 - accuracy).
mlflow.log_metric('best_training_model_acc', 1 - min(res.F))

# `accs` is empty when every returned individual had all trees switched off;
# max() would raise ValueError in that case, so report it and skip the metric.
if accs:
    mlflow.log_metric('best_model_acc', max(accs))
else:
    print("best_model_acc not logged: no individual with an active tree was evaluated")

In [None]:
# Test-set accuracy of each evaluated individual.
accs

In [None]:
# Record the effective run parameters (defaults merged with any overrides).
mlflow.log_params(params)

In [None]:
import json

In [None]:
# NOTE(review): `individual_as_centroids` is the variable left over from the
# last iteration of the test-set evaluation loop, so this logs the centroids of
# the last individual processed there (including any switched-off trees).
mlflow.log_param("centroids", json.dumps(individual_as_centroids.tolist()))

In [None]:
# Close the MLflow run opened near the top of the notebook.
mlflow.end_run()