In [2]:
import sys
import warnings
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from copy import deepcopy

from distributed import Client
from mlflow import MlflowClient
from mlflow.entities import RunStatus
from distributed import get_client

import ray
from tqdm.auto import tqdm
import mlflow
import numpy as np
from loguru import logger
from mlxtend.classifier import EnsembleVoteClassifier
from pymoo.algorithms.soo.nonconvex.ga import GA
from pymoo.optimize import minimize
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from pymoo.core.problem import DaskParallelization

from scalarizing.scalarizing import FindingBestExpressionSingleDatasetProblem, FindingBestExpressionProblemMutation, \
    FindingBestExpressionProblemCrossover, FindingBestExpressionProblemSampling, scorer_creator, top_n_indicies
from glob import glob
import pandas as pd
from box import Box
from sklearn.linear_model import Perceptron


ImportError: /home/bogdan/Projects/rules-embedding-forest-reduction/experiments/scalarizing/.venv/lib/python3.10/site-packages/scipy/spatial/transform/rotation.cpython-310-x86_64-linux-gnu.so: undefined symbol: _PyGen_Send

In [2]:
# Directory, given relative to the notebook's working directory, that is globbed for the dataset files.
DATASETS_DIR = '../../../data/processed'
# Numeric experiment identifier — presumably the MLflow experiment the runs are filed under; TODO confirm.
EXPERIMENT_ID = 1


In [3]:
# Every entry found directly inside the datasets directory.
all_processed_data_files = glob(DATASETS_DIR + "/*")

# For each path that mentions 'train': take the last path component, keep the
# text in front of the first 'train', and strip the dashes left at its end.
all_file_names = [
    train_file.rsplit('/', 1)[-1].split('train', 1)[0].rstrip('-')
    for train_file in all_processed_data_files
    if train_file.find('train') != -1
]
print(all_file_names[:5])

['tae', 'penbased', 'vowel', 'hepatitis', 'vowel']


In [4]:
# Pair each path that mentions 'train' with a sibling path obtained by
# substituting 'test' for every occurrence of 'train'.
train_and_test_paths = [
    dict(train=train_path, test=train_path.replace('train', 'test'))
    for train_path in all_processed_data_files
    if train_path.find('train') != -1
]
print(train_and_test_paths[:2])

[{'train': '../../../data/processed/tae-train-3-s1.csv', 'test': '../../../data/processed/tae-test-3-s1.csv'}, {'train': '../../../data/processed/penbased-train-3-s1.csv', 'test': '../../../data/processed/penbased-test-3-s1.csv'}]


In [5]:
def read_dataset(path):
    """Load a CSV file and split it into features and target.

    Args:
        path: location of a CSV file that contains a ``TARGET`` column.

    Returns:
        dict with two entries: ``"x"`` holds the values of every column
        except ``TARGET``; ``"y"`` holds the values of the ``TARGET`` column.
    """
    frame = pd.read_csv(path)
    feature_frame = frame.drop(columns='TARGET')
    target_column = frame['TARGET']
    return dict(x=feature_frame.values, y=target_column.values)

In [6]:
# One Box per train/test pair: the loaded train split, the loaded test split,
# and a name taken from the train file's last path component with "-train" removed.
datasets = [
    Box(dict(
        train=read_dataset(split_paths['train']),
        test=read_dataset(split_paths['test']),
        name=split_paths['train'].rsplit("/", 1)[-1].replace("-train", ''),
    ))
    for split_paths in train_and_test_paths
]

In [7]:
class predict_wrapper:
    """Callable that forwards to ``predict_func`` and uses its result to index ``labels``.

    Args:
        predict_func: callable whose return value is valid as an index into ``labels``.
        labels: indexable collection the raw predictions are looked up in.
    """

    def __init__(self, predict_func, labels):
        self.labels = labels
        self.predict_func = predict_func

    def __call__(self, *args, **kwargs):
        # Whatever the wrapped callable returns is used directly as the index.
        raw_prediction = self.predict_func(*args, **kwargs)
        return self.labels[raw_prediction]

def raise_not_implemented(*args, **kwargs):
    """Stand-in for an unsupported ``predict_proba``; always raises.

    Accepts and ignores any arguments so that it can be assigned in place of
    a method that callers invoke with data (e.g. ``predict_proba(X)``) and
    still fail with the intended error instead of an arity ``TypeError``.

    Raises:
        NotImplementedError: always.
    """
    # `NotImplemented` is a non-callable constant meant for binary-operator
    # dispatch; the exception class is `NotImplementedError`.
    raise NotImplementedError("Predict proba is not supported")
def extract_classifiers_from_bagging(bagging):
    """Return independent copies of a fitted bagging ensemble's member estimators.

    Each member of ``bagging.estimators_`` is deep-copied. On the copy,
    ``predict`` is replaced by a ``predict_wrapper`` that indexes
    ``bagging.classes_`` with the member's raw prediction, and
    ``predict_proba`` is replaced by ``raise_not_implemented``.

    Args:
        bagging: object exposing ``estimators_`` and ``classes_``.

    Returns:
        list of the patched copies, in the order of ``bagging.estimators_``.
    """

    def _standalone_copy(member):
        # Copy first so the estimator held by `bagging` is left untouched.
        clone = deepcopy(member)
        clone.predict = predict_wrapper(clone.predict, bagging.classes_)
        clone.predict_proba = raise_not_implemented
        return clone

    return [_standalone_copy(member) for member in bagging.estimators_]

In [9]:

# class DaskParallelization:
#
#     def __init__(self, client) -> None:
#         super().__init__()
#         self.client = client
#
#     def __call__(self, f, X):
#         jobs = [self.client.submit(f, x) for x in X]
#         return [job.result() for job in jobs]
#
#     def __getstate__(self):
#         state = self.__dict__.copy()
#         state.pop("client", None)
#         return state
@ray.remote
def execute_in_ray(f, x):
    """Ray remote task: apply ``f`` to ``x`` and return the result."""
    outcome = f(x)
    return outcome

class RayParallelization:
    """Callable that maps ``f`` over ``X`` by launching one ``execute_in_ray`` task per element."""

    def __init__(self) -> None:
        super().__init__()

    def __call__(self, f, X):
        # Submit every task before asking for any result.
        pending = []
        for x in X:
            pending.append(execute_in_ray.remote(f, x))
        return ray.get(pending)

    def __getstate__(self):
        # Pickled state is a shallow copy of the instance dictionary.
        return dict(self.__dict__)



In [10]:
class ExecutorParallelization:
    """Callable that maps ``f`` over ``X`` through an executor's ``submit`` method.

    Args:
        executor: object whose ``submit(f, x)`` returns a handle with ``result()``.
    """

    def __init__(self, executor) -> None:
        super().__init__()
        self.executor = executor

    def __call__(self, f, X):
        # Queue all the work first, then collect results in submission order.
        futures = []
        for x in X:
            futures.append(self.executor.submit(f, x))
        return [future.result() for future in futures]

    def __getstate__(self):
        # The executor is left out of the pickled state: it is not serializable.
        return {key: value for key, value in self.__dict__.items() if key != "executor"}

In [11]:
# Suppress every Python warning from this point on.
warnings.filterwarnings("ignore")

# Remove the loguru handlers registered so far, then log to stdout at INFO and above.
logger.remove()
# `logger.add` returns the new handler's id; as the last expression of the cell it is echoed as the cell output.
logger.add(sys.stdout, level='INFO')

1

In [23]:
def _evaluate_top_n_ensemble(estimators, ranking_values, ensemble_size, dataset):
    """Build an equally weighted voting ensemble from the top-ranked estimators and score it.

    Args:
        estimators: numpy array of already fitted estimators.
        ranking_values: one value per estimator, passed to ``top_n_indicies``
            to choose which ``ensemble_size`` estimators are kept.
        ensemble_size: number of estimators placed in the ensemble.
        dataset: object exposing ``train.x``, ``train.y``, ``test.x``, ``test.y``.

    Returns:
        tuple ``(train_accuracy, test_accuracy)`` of the ensemble.
    """
    selected_estimators = estimators[top_n_indicies(ranking_values, ensemble_size)]
    ensemble = EnsembleVoteClassifier(clfs=selected_estimators,
                                      weights=[1 for _ in range(ensemble_size)],
                                      fit_base_estimators=False)

    ensemble.fit(dataset.train.x, dataset.train.y)  # Required by design, but does nothing apart from checking labels
    train_accuracy = accuracy_score(dataset.train.y, ensemble.predict(dataset.train.x))
    test_accuracy = accuracy_score(dataset.test.y, ensemble.predict(dataset.test.x))
    return train_accuracy, test_accuracy


def run(dataset, ensemble_size, bagging_size, pop_size, n_gen, run_id, parallelization=RayParallelization(), mlflow_client=None):
    """Compare accuracy-based and scorer-based ensemble selection on one dataset.

    Steps:
      1. Fit a bagging ensemble of ``bagging_size`` perceptrons on the train split.
      2. Run a genetic algorithm over ``FindingBestExpressionSingleDatasetProblem``
         and build a scorer from ``result.X[0]``.
         NOTE(review): presumably the best expression found — confirm against
         ``scalarizing.scalarizing``.
      3. Rank every bagging member on the train split twice: by plain accuracy
         and by the scorer.
      4. Build a voting ensemble from the top ``ensemble_size`` members of each
         ranking, then keep the test accuracy of whichever ensemble has the
         higher train accuracy (ties go to the scorer-based one).
      5. Log the resulting metrics to MLflow under ``run_id``.

    Args:
        dataset: object exposing ``train.x``, ``train.y``, ``test.x``,
            ``test.y`` and ``name``.
        ensemble_size: number of estimators in each voting ensemble.
        bagging_size: number of perceptrons in the bagging ensemble.
        pop_size: population size of the genetic algorithm.
        n_gen: number of generations the genetic algorithm runs.
        run_id: MLflow run the metrics are logged to.
        parallelization: elementwise runner handed to the optimisation problem.
        mlflow_client: client used for logging. When ``None`` a new
            ``MlflowClient`` is created at call time, so that it is not frozen
            when the function is defined.

    Returns:
        None. Results are reported through MLflow metrics and one log line.
    """
    from loguru import logger

    if mlflow_client is None:
        mlflow_client = MlflowClient()

    bagging = BaggingClassifier(base_estimator=Perceptron(), n_estimators=bagging_size, max_samples=0.3)
    bagging.fit(dataset.train.x, dataset.train.y)
    problem = FindingBestExpressionSingleDatasetProblem(dataset.train, extract_classifiers_from_bagging(bagging), ensemble_size=ensemble_size, elementwise_runner=parallelization)
    result = minimize(problem,
                      GA(
                          pop_size=pop_size,
                          verbose=True,
                          seed=42,
                          eliminate_duplicates=False,
                          mutation=FindingBestExpressionProblemMutation(),
                          crossover=FindingBestExpressionProblemCrossover(),
                          sampling=FindingBestExpressionProblemSampling()
                      ),
                      ("n_gen", n_gen),
                      verbose=False,
                      save_history=False,
                      seed=42)

    # A fresh set of copies, kept as an array so it can be indexed with the top-n indices.
    bagging_estimators = np.array(extract_classifiers_from_bagging(bagging))

    scorer = scorer_creator(result.X[0], labels=np.unique(dataset.train.y))
    estimator_accuracies = []
    estimator_scores = []

    for estimator in bagging_estimators:
        predictions = estimator.predict(dataset.train.x)

        estimator_accuracies.append(accuracy_score(dataset.train.y, predictions))
        estimator_scores.append(scorer(dataset.train.y, predictions))

    accuracy_ensemble_train_accuracy, accuracy_ensemble_accuracy = _evaluate_top_n_ensemble(
        bagging_estimators, estimator_accuracies, ensemble_size, dataset)
    score_ensemble_train_accuracy, score_ensemble_accuracy = _evaluate_top_n_ensemble(
        bagging_estimators, estimator_scores, ensemble_size, dataset)

    # The scorer-based ensemble is discarded only when it is strictly worse on the train split.
    accuracy_ensemble_selected = accuracy_ensemble_train_accuracy > score_ensemble_train_accuracy
    if accuracy_ensemble_selected:
        mlflow_client.log_metric(run_id, "accuracy_ensemble_selected", True)
        selected_ensemble_accuracy = accuracy_ensemble_accuracy
    else:
        selected_ensemble_accuracy = score_ensemble_accuracy

    mlflow_client.log_metric(run_id, "selected_ensemble_accuracy", selected_ensemble_accuracy)
    mlflow_client.log_metric(run_id, "method_selection_accuracy", score_ensemble_accuracy)
    mlflow_client.log_metric(run_id, "accuracy_selection_accuracy", accuracy_ensemble_accuracy)

    # The line is labelled with the dataset name rather than a loop counter read
    # from the notebook's global namespace, so the function works when called on its own.
    logger.info(f"{dataset.name} method={selected_ensemble_accuracy} normal={accuracy_ensemble_accuracy}, discarded={accuracy_ensemble_selected}, diff={score_ensemble_accuracy-accuracy_ensemble_accuracy}")

In [24]:
def run_experiment(dataset, ensemble_size=10, bagging_size=200, n_gen=20, pop_size=5, parallelization=RayParallelization()):
    """Create an MLflow run for one dataset, execute ``run`` inside it and close the run.

    The run is created in the experiment identified by ``EXPERIMENT_ID``. The
    hyper-parameters and the dataset name are logged as run parameters before
    the computation starts. Any exception raised by ``run`` is logged and the
    MLflow run is marked ``FAILED``; the exception is not re-raised, so a loop
    over several datasets keeps going.

    Args:
        dataset: object exposing ``name`` plus whatever ``run`` reads from it.
        ensemble_size: number of estimators in each voting ensemble.
        bagging_size: number of estimators in the bagging ensemble.
        n_gen: number of generations of the genetic algorithm.
        pop_size: population size of the genetic algorithm.
        parallelization: elementwise runner forwarded to ``run``.

    Returns:
        None.
    """
    from loguru import logger
    mlflow_client = MlflowClient()

    # MLflow experiment ids are strings; take the value from the notebook's config constant.
    mlflow_run = mlflow_client.create_run(str(EXPERIMENT_ID))
    run_id = mlflow_run.info.run_id

    run_params = {
        "dataset": dataset.name,
        "ensemble_size": ensemble_size,
        "bagging_size": bagging_size,
        "n_gen": n_gen,
        "pop_size": pop_size,
    }
    for param_name, param_value in run_params.items():
        mlflow_client.log_param(run_id, param_name, param_value)

    try:
        # Hand over the client that owns the run so parameters and metrics go through the same client.
        run(dataset, ensemble_size=ensemble_size, bagging_size=bagging_size, pop_size=pop_size, n_gen=n_gen, run_id=run_id, parallelization=parallelization, mlflow_client=mlflow_client)
        mlflow_client.set_terminated(run_id=run_id)
    except Exception as ex:
        logger.exception(ex)
        mlflow_client.set_terminated(run_id=run_id, status="FAILED")



In [21]:

%%time
# Run the experiment on the first three datasets only, with n_gen=5 and pop_size=5.
for idx, dataset in enumerate(datasets[:3]):
    run_experiment(dataset, ensemble_size=10, bagging_size=200, n_gen=5, pop_size=5, parallelization=RayParallelization())


[2m[36m(execute_in_ray pid=19260)[0m 2022-10-23 21:03:55.799 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.56 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:03:55.874 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.56 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19257)[0m 2022-10-23 21:03:55.882 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.5333333333333333 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19258)[0m 2022-10-23 21:03:55.943 | INFO     | scalarizing.scalarizing:_evaluate:204 - [accuracy] -> 0.52 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19261)[0m 2022-10-23 21:03:55.987 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.5333333333333333 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:04:02.448 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.56 (0.37333333333333

2022-10-23 21:04:42.455 | ERROR    | __main__:run_experiment:19 - log_metric() missing 1 required positional argument: 'value'
Traceback (most recent call last):

  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
           │         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
           │         └ <code object <module> at 0x7f5ce340ec90, file "/home/bogdan/Projects/rules-embedding-forest-reduction/experiments/scalarizing...
           └ <function _run_code at 0x7f5ce2de09d0>
  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
         └ <code object <module> at 0x7f5ce340ec90, file "/home

[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:05:24.935 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.08806404657933042 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19257)[0m 2022-10-23 21:05:25.110 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.08515283842794759 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19258)[0m 2022-10-23 21:05:25.099 | INFO     | scalarizing.scalarizing:_evaluate:204 - [accuracy] -> 0.08642649199417755 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19260)[0m 2022-10-23 21:05:25.167 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.08806404657933042 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19261)[0m 2022-10-23 21:05:25.142 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.08515283842794759 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19260)[0m 2022-10-23 21:05:44.842 | INFO     | scalarizing.scalarizing:_evaluate:204 

2022-10-23 21:07:17.872 | ERROR    | __main__:run_experiment:19 - log_metric() missing 1 required positional argument: 'value'
Traceback (most recent call last):

  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
           │         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
           │         └ <code object <module> at 0x7f5ce340ec90, file "/home/bogdan/Projects/rules-embedding-forest-reduction/experiments/scalarizing...
           └ <function _run_code at 0x7f5ce2de09d0>
  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
         └ <code object <module> at 0x7f5ce340ec90, file "/home

[2m[36m(execute_in_ray pid=19260)[0m 2022-10-23 21:07:31.820 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.5838383838383838 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:07:31.842 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.5838383838383838 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19257)[0m 2022-10-23 21:07:31.863 | INFO     | scalarizing.scalarizing:_evaluate:204 - [accuracy] -> 0.5838383838383838 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19261)[0m 2022-10-23 21:07:31.953 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.5717171717171717 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19258)[0m 2022-10-23 21:07:31.983 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.5717171717171717 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:07:37.983 | INFO     | scalarizing.scalarizing:_evaluate:204 - [ba

2022-10-23 21:08:08.904 | ERROR    | __main__:run_experiment:19 - log_metric() missing 1 required positional argument: 'value'
Traceback (most recent call last):

  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
           │         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
           │         └ <code object <module> at 0x7f5ce340ec90, file "/home/bogdan/Projects/rules-embedding-forest-reduction/experiments/scalarizing...
           └ <function _run_code at 0x7f5ce2de09d0>
  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
         └ <code object <module> at 0x7f5ce340ec90, file "/home