In [8]:
import mlflow
from sklearn.model_selection import ParameterGrid
from mlflow import MlflowClient
from pathlib import Path
import pandas as pd
import tempfile
from glob import glob

In [3]:
# Look the experiment up before creating it: `create_experiment` fails when the
# name already exists, which would break "Restart Kernel -> Run All".
# The cell still evaluates to the experiment id.
_existing_experiment = mlflow.get_experiment_by_name("diversity-functions")
(
    _existing_experiment.experiment_id
    if _existing_experiment is not None
    else mlflow.create_experiment("diversity-functions")
)

'1'

In [4]:
# Project layout, resolved relative to the directory the kernel was started in.
PROJECT_ROOT = Path.cwd().parents[0]
NOTEBOOKS_ROOT = PROJECT_ROOT.joinpath("notebooks")
PAPERMILL_PATH = PROJECT_ROOT.joinpath(".venv", "bin", "papermill")

In [5]:
# MLflow experiment id (kept as a string) and the folder the processed data files are globbed from.
EXPERIMENT_ID, DATASETS_DIR = "1", "../../datasets/processed/"


In [9]:
# Every entry of the processed-data folder, in whatever order glob returns it.
all_processed_data_files = glob("{}/*".format(DATASETS_DIR))

# For each path containing "train": take the file name, keep the part before
# "train", and strip the trailing dash.
all_file_names = [
    data_file.rsplit('/', 1)[-1].partition('train')[0].rstrip('-')
    for data_file in all_processed_data_files
    if data_file.find('train') != -1
]
print(all_file_names[:5])

['tae', 'penbased', 'vowel', 'hepatitis', 'vowel']


In [None]:
# Preview only the first few paths; displaying the whole listing bloats the saved notebook.
all_processed_data_files[:5]

In [10]:
# Pair every path containing "train" with the same path where "train" is replaced by "test".
train_and_test_paths = [
    dict(train=train_path, test=train_path.replace('train', 'test'))
    for train_path in filter(lambda candidate: 'train' in candidate, all_processed_data_files)
]
print(train_and_test_paths[:2])

[{'train': '../../datasets/processed/tae-train-3-s1.csv', 'test': '../../datasets/processed/tae-test-3-s1.csv'}, {'train': '../../datasets/processed/penbased-train-3-s1.csv', 'test': '../../datasets/processed/penbased-test-3-s1.csv'}]


In [11]:
# Full factorial grid over training files, ensemble/bagging sizes and scoring method;
# pop_size and n_gen are held at a single value each.
params = ParameterGrid(dict(
    ensemble_size=[5, 10, 20],
    train_path=list(filter(lambda path: 'train' in path, all_processed_data_files)),
    bagging_size=[50, 100, 200, 500],
    pop_size=[100],
    n_gen=[100],
    scoring_method=['normal', 'diversity'],
))

In [12]:
# Low-level MLflow client, constructed with its default arguments.
client = MlflowClient()

In [14]:
# Inspect the first parameter combination produced by the grid.
list(params)[0]

{'bagging_size': 50,
 'ensemble_size': 5,
 'n_gen': 100,
 'pop_size': 100,
 'scoring_method': 'normal',
 'train_path': '../../datasets/processed/tae-train-3-s1.csv'}

In [15]:
# for param_set in params:
#     run = client.create_run(EXPERIMENT_ID)
#     for param, value in param_set.items():
#         client.log_param(run_id=run.info.run_id, key=param, value=value)

In [16]:
# `search_runs` documents `experiment_ids` as a list of ids, so wrap the single id
# explicitly instead of passing a bare value.
experiments = mlflow.search_runs(experiment_ids=[EXPERIMENT_ID], output_format='pandas')

In [17]:
# Fresh scratch directory for this session; mkdtemp returns its path as a str
# and leaves clean-up to the caller.
experiments_tmp_dir = tempfile.mkdtemp(prefix="bogul-exp-ijcnn")

In [18]:
def run_experiments_in_pbs(run_ids, notebook_path, output_dir_path):
    """Submit one `qsub` job per run id, each executing a notebook through papermill.

    Args:
        run_ids: iterable of run ids; each one is passed to papermill as a
            parameter value and used as the stem of the output file names.
        notebook_path: notebook that papermill should execute.
        output_dir_path: directory receiving `<run_id>.ipynb` and `<run_id>.out`.

    Returns:
        list of `subprocess.CompletedProcess`, one per run id, in submission order.
    """
    # Imported locally: none of the notebook's import cells brings `subprocess` into scope.
    import subprocess

    submissions = []

    for run_id in run_ids:
        # NOTE(review): EXPERIMENT_INSTANCE_ID is not defined in the visible notebook —
        # presumably it holds the papermill parameter name; confirm it is set before use.
        papermill_command = f"{str(PAPERMILL_PATH)} {str(notebook_path)} {str(output_dir_path)}/{run_id}.ipynb -p {EXPERIMENT_INSTANCE_ID} {run_id} &> {str(output_dir_path)}/{run_id}.out"

        # NOTE(review): the command is spliced into a shell string; paths or ids
        # containing quotes or spaces would break the quoting below.
        submissions.append(
            subprocess.run(f"qsub -v \"CMD='{papermill_command}'\" slurm-script.sh", shell=True)
        )

    return submissions

In [22]:
import time
def run_in_batches(run_ids, notebook_path, output_dir_path, batch_size=250, sleep_interval=25):
    """Submit `run_ids` batch by batch, throttled by the current number of jobs.

    On every iteration `how_many_jobs()` is polled; the next batch from
    `chunks(run_ids, batch_size)` is handed to `run_experiments_in_pbs` only
    while that count does not exceed `batch_size`. The loop sleeps
    `sleep_interval` seconds between polls and ends when no batch is left.
    """
    remaining_batches = chunks(run_ids, batch_size)

    while True:
        active_jobs = how_many_jobs()

        if active_jobs > batch_size:
            print(f"There are {active_jobs}, cant schedule yet!")
        else:
            print(f"There are {active_jobs} runnig jobs, scheduling next batch of {batch_size}")
            try:
                run_experiments_in_pbs(next(remaining_batches), notebook_path, output_dir_path)
            except StopIteration:
                # Nothing left to submit: stop without the trailing wait.
                print("End of batches!")
                return

        print(f"Waiting {sleep_interval}") 
        time.sleep(sleep_interval)
    
    

In [20]:
# Count runs with (True) and without (False) a recorded train_path parameter.
experiments['params.train_path'].notna().value_counts()

True     14769
False        5
Name: params.train_path, dtype: int64

In [21]:
# Keep only the runs for which a train_path parameter was logged.
viable_experiments = experiments.loc[experiments['params.train_path'].notna()]

In [71]:
# Display the filtered runs.
viable_experiments

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,params.ensemble_size,params.bagging_size,params.train_path,params.n_gen,params.scoring_method,params.pop_size,params.train_and_test_paths,tags.mlflow.runName
1,31c1227103394770b95ce77822ba2310,1,RUNNING,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:39.666000+00:00,,20,500,../../datasets/processed/pima-train-3-s1.csv,100,normal,100,,bemused-eel-384
2,88b283dbda20477bb588e07f0db4d0c8,1,RUNNING,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:39.372000+00:00,,20,500,../../datasets/processed/wdbc-train-1-s2.csv,100,normal,100,,efficient-kit-510
3,a18954e76fb144f4b8bcb90e31256a5d,1,RUNNING,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:39.089000+00:00,,20,500,../../datasets/processed/breast-train-2-s2.csv,100,normal,100,,smiling-squid-923
4,48bf1cb87005434a878d6d7a1ae2639c,1,RUNNING,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:38.798000+00:00,,20,500,../../datasets/processed/haberman-train-1-s1.csv,100,normal,100,,brawny-dog-878
5,fa1926761b4f47dbab4152a405a6df91,1,RUNNING,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 22:34:38.497000+00:00,,20,500,../../datasets/processed/nursery-train-3-s1.csv,100,normal,100,,spiffy-owl-732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14765,460a47d820164879a57a63471936a711,1,RUNNING,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 21:44:46.928000+00:00,,5,50,../../datasets/processed/vowel-train-3-s1.csv,100,normal,100,,abundant-ray-707
14766,9aa3152d4faa4440a4904b5bced0164a,1,RUNNING,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 21:44:46.896000+00:00,,5,50,../../datasets/processed/hepatitis-train-3-s1.csv,100,normal,100,,brawny-fish-853
14767,bc165160695d48059253542a5ef5367d,1,RUNNING,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 21:44:46.854000+00:00,,5,50,../../datasets/processed/vowel-train-1-s1.csv,100,normal,100,,likeable-mouse-424
14768,1979d13c6d3f410088ef38c268a9e00f,1,RUNNING,file:///home/bogul/scalarizing/notebooks/mlrun...,2022-11-23 21:44:46.797000+00:00,,5,50,../../datasets/processed/penbased-train-3-s1.csv,100,normal,100,,clumsy-shad-300


In [23]:
# Per-run boolean mask: True where no train_path parameter was logged.
experiments['params.train_path'].isna()

0         True
1        False
2        False
3        False
4        False
         ...  
14769    False
14770     True
14771     True
14772     True
14773     True
Name: params.train_path, Length: 14774, dtype: bool

In [6]:
import sys
import warnings
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from copy import deepcopy

from distributed import Client
from mlflow import MlflowClient
from mlflow.entities import RunStatus
from distributed import get_client

import ray
from tqdm.auto import tqdm
import mlflow
import numpy as np
from loguru import logger
from mlxtend.classifier import EnsembleVoteClassifier
from pymoo.algorithms.soo.nonconvex.ga import GA
from pymoo.optimize import minimize
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from pymoo.core.problem import DaskParallelization

from scalarizing.scalarizing import FindingBestExpressionSingleDatasetProblem, FindingBestExpressionProblemMutation, \
    FindingBestExpressionProblemCrossover, FindingBestExpressionProblemSampling, scorer_creator
from scalarizing.utils import top_n_indicies
from glob import glob
import pandas as pd
from box import Box
from sklearn.linear_model import Perceptron


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# MLflow experiment ids are strings; keep the same type as the other places in
# this notebook that use the id so rebinding it here cannot break them.
EXPERIMENT_ID = '1'


In [10]:
# Every entry of the processed-data folder, in whatever order glob returns it.
all_processed_data_files = glob("{}/*".format(DATASETS_DIR))

# For each path containing "train": take the file name, keep the part before
# "train", and strip the trailing dash.
all_file_names = [
    data_file.rsplit('/', 1)[-1].partition('train')[0].rstrip('-')
    for data_file in all_processed_data_files
    if data_file.find('train') != -1
]
print(all_file_names[:5])

['tae', 'penbased', 'vowel', 'hepatitis', 'vowel']


In [4]:
# Pair every path containing "train" with the same path where "train" is replaced by "test".
train_and_test_paths = [
    dict(train=train_path, test=train_path.replace('train', 'test'))
    for train_path in filter(lambda candidate: 'train' in candidate, all_processed_data_files)
]
print(train_and_test_paths[:2])

[{'train': '../../../data/processed/tae-train-3-s1.csv', 'test': '../../../data/processed/tae-test-3-s1.csv'}, {'train': '../../../data/processed/penbased-train-3-s1.csv', 'test': '../../../data/processed/penbased-test-3-s1.csv'}]


In [11]:
def read_dataset(path):
    """Load a CSV and split it into features and target.

    Args:
        path: anything `pd.read_csv` accepts; the data must have a `TARGET` column.

    Returns:
        dict with key "x" (values of all columns except `TARGET`) and
        key "y" (values of the `TARGET` column).
    """
    frame = pd.read_csv(path)
    target_column = frame['TARGET']
    feature_columns = frame.drop(columns='TARGET')

    return dict(x=feature_columns.values, y=target_column.values)

In [6]:
# Load both splits of every pair into memory; `name` is the train file name
# with "-train" removed.
datasets = [
    Box(dict(
        train=read_dataset(split_paths['train']),
        test=read_dataset(split_paths['test']),
        name=split_paths['train'].rsplit("/", 1)[-1].replace("-train", ''),
    ))
    for split_paths in train_and_test_paths
]

In [7]:
class predict_wrapper:
    """Callable that translates the output of a predict function through `labels`.

    The wrapped function's return value is used as an index into `labels`,
    and the indexed result is what the wrapper returns.
    """

    def __init__(self, predict_func, labels):
        self.predict_func, self.labels = predict_func, labels

    def __call__(self, *args, **kwargs):
        # Forward the call unchanged, then look the raw output up in `labels`.
        label_positions = self.predict_func(*args, **kwargs)
        return self.labels[label_positions]

def raise_not_implemented(*args, **kwargs):
    """Placeholder that always raises; installed where `predict_proba` is unsupported.

    Any positional or keyword arguments are accepted and ignored, so the
    function can stand in for a method that callers invoke with data.

    Raises:
        NotImplementedError: always.
    """
    # `NotImplemented` is a comparison sentinel, not an exception class: calling it
    # raised an unrelated TypeError instead of the intended error.
    raise NotImplementedError("Predict proba is not supported")
def extract_classifiers_from_bagging(bagging):
    """Return patched deep copies of the estimators in `bagging.estimators_`.

    On every copy, `predict` is wrapped in `predict_wrapper` so its output is
    looked up in `bagging.classes_`, and `predict_proba` is replaced by
    `raise_not_implemented`. The estimators held by `bagging` are not modified.
    """

    def _detach(member):
        # Work on a deep copy so the patches never leak back into the ensemble.
        standalone = deepcopy(member)
        standalone.predict = predict_wrapper(standalone.predict, bagging.classes_)
        standalone.predict_proba = raise_not_implemented
        return standalone

    return [_detach(member) for member in bagging.estimators_]

In [9]:

# class DaskParallelization:
#
#     def __init__(self, client) -> None:
#         super().__init__()
#         self.client = client
#
#     def __call__(self, f, X):
#         jobs = [self.client.submit(f, x) for x in X]
#         return [job.result() for job in jobs]
#
#     def __getstate__(self):
#         state = self.__dict__.copy()
#         state.pop("client", None)
#         return state
@ray.remote
def execute_in_ray(f, x):
    """Ray remote function: call `f` on the single input `x` and return the result."""
    return f(x)

class RayParallelization:
    """Runner that evaluates a function over a batch of inputs as Ray tasks."""

    def __init__(self) -> None:
        super().__init__()

    def __call__(self, f, X):
        # Launch one remote task per input first, then block once for all results.
        pending = []
        for x in X:
            pending.append(execute_in_ray.remote(f, x))
        return ray.get(pending)

    def __getstate__(self):
        # State is a shallow copy of the instance dict.
        return dict(self.__dict__)



In [10]:
class ExecutorParallelization:
    """Runner that evaluates a function over a batch of inputs via `executor.submit`."""

    def __init__(self, executor) -> None:
        super().__init__()
        self.executor = executor

    def __call__(self, f, X):
        # Submit every input before waiting on any result.
        pending = list(map(lambda x: self.executor.submit(f, x), X))
        return list(map(lambda job: job.result(), pending))

    def __getstate__(self):
        # The executor is not serializable, so it is left out of the state.
        return {attr: value for attr, value in self.__dict__.items() if attr != "executor"}

In [11]:
# Silence all Python warnings from here on.
warnings.filterwarnings("ignore")

# Drop loguru's existing handlers, then log to stdout at INFO level and above.
logger.remove()
logger.add(sys.stdout, level='INFO')

1

In [23]:
def _evaluate_selection(dataset, estimators, member_scores, ensemble_size):
    """Build an equal-weight voting ensemble from the estimators picked by
    `top_n_indicies(member_scores, ensemble_size)`; return (train_accuracy, test_accuracy)."""
    selected = estimators[top_n_indicies(member_scores, ensemble_size)]
    ensemble = EnsembleVoteClassifier(clfs=selected,
                                      weights=[1] * ensemble_size,
                                      fit_base_estimators=False)

    ensemble.fit(dataset.train.x, dataset.train.y)  # Required by design, but does nothing apart from checking labels
    train_accuracy = accuracy_score(dataset.train.y, ensemble.predict(dataset.train.x))
    test_accuracy = accuracy_score(dataset.test.y, ensemble.predict(dataset.test.x))
    return train_accuracy, test_accuracy


def run(dataset, ensemble_size, bagging_size, pop_size, n_gen, run_id, parallelization=None, mlflow_client=None):
    """Run one experiment on `dataset` and log its metrics to the MLflow run `run_id`.

    Steps: fit a bagging ensemble of Perceptrons on the train split, run the GA
    over `FindingBestExpressionSingleDatasetProblem`, build a scorer from
    `result.X[0]`, then compare two voting ensembles of `ensemble_size` members —
    one selected by plain accuracy, one selected by the evolved scorer.

    Args:
        dataset: object exposing `train.x`, `train.y`, `test.x`, `test.y`.
        ensemble_size: number of estimators in each voting ensemble.
        bagging_size: number of estimators in the bagging pool.
        pop_size: GA population size.
        n_gen: number of GA generations (termination criterion).
        run_id: MLflow run the metrics are logged to.
        parallelization: elementwise runner for the problem; a new
            `RayParallelization` is created when omitted.
        mlflow_client: client used for logging; a new `MlflowClient` is
            created when omitted.

    Returns:
        None. Results are logged as MLflow metrics and one INFO log line.
    """
    from loguru import logger

    # Build the defaults per call: instantiating them in the signature would create
    # them once, when the cell defining `run` is executed, and share them afterwards.
    if parallelization is None:
        parallelization = RayParallelization()
    if mlflow_client is None:
        mlflow_client = MlflowClient()

    bagging = BaggingClassifier(base_estimator=Perceptron(), n_estimators=bagging_size, max_samples=0.3)
    bagging.fit(dataset.train.x, dataset.train.y)
    problem = FindingBestExpressionSingleDatasetProblem(dataset.train, extract_classifiers_from_bagging(bagging), ensemble_size=ensemble_size, elementwise_runner=parallelization)
    result = minimize(problem,
                      GA(
                          pop_size=pop_size,
                          verbose=True,
                          seed=42,
                          eliminate_duplicates=False,
                          mutation=FindingBestExpressionProblemMutation(),
                          crossover=FindingBestExpressionProblemCrossover(),
                          sampling=FindingBestExpressionProblemSampling()
                      ),
                      ("n_gen", n_gen),
                      verbose=False,
                      save_history=False,
                      seed=42)

    # Fresh copies of the pool, as a numpy array so it can be indexed by the selection.
    bagging_estimators = np.array(extract_classifiers_from_bagging(bagging))

    scorer = scorer_creator(result.X[0], labels=np.unique(dataset.train.y))
    estimator_accuracies = []
    estimator_scores = []

    # Score every pool member on the training split with both criteria.
    for estimator in bagging_estimators:
        predictions = estimator.predict(dataset.train.x)

        estimator_accuracies.append(accuracy_score(dataset.train.y, predictions))
        estimator_scores.append(scorer(dataset.train.y, predictions))

    accuracy_ensemble_train_accuracy, accuracy_ensemble_accuracy = _evaluate_selection(
        dataset, bagging_estimators, estimator_accuracies, ensemble_size)
    score_ensemble_train_accuracy, score_ensemble_accuracy = _evaluate_selection(
        dataset, bagging_estimators, estimator_scores, ensemble_size)

    # Fall back to the accuracy-selected ensemble only when it is strictly better on train.
    accuracy_ensemble_selected = accuracy_ensemble_train_accuracy > score_ensemble_train_accuracy
    if accuracy_ensemble_selected:
        # Metric values are numeric, so log the flag as 1.0 rather than a bool.
        mlflow_client.log_metric(run_id, "accuracy_ensemble_selected", 1.0)
        selected_ensemble_accuracy = accuracy_ensemble_accuracy
    else:
        selected_ensemble_accuracy = score_ensemble_accuracy

    mlflow_client.log_metric(run_id, "selected_ensemble_accuracy", selected_ensemble_accuracy)
    mlflow_client.log_metric(run_id, "method_selection_accuracy", score_ensemble_accuracy)
    mlflow_client.log_metric(run_id, "accuracy_selection_accuracy", accuracy_ensemble_accuracy)

    # Identify the line by run_id: the previous `idx` was a notebook global that only
    # existed while a particular loop cell had been executed.
    logger.info(f"{run_id} method={selected_ensemble_accuracy} normal={accuracy_ensemble_accuracy}, discarded={accuracy_ensemble_selected}, diff={score_ensemble_accuracy-accuracy_ensemble_accuracy}")

In [24]:
def run_experiment(dataset, ensemble_size=10, bagging_size=200, n_gen=20, pop_size=5, parallelization=None):
    """Create an MLflow run, log its parameters, execute `run`, and close the run.

    Args:
        dataset: object with a `name` attribute plus whatever `run` reads from it.
        ensemble_size, bagging_size, n_gen, pop_size: forwarded to `run` and
            logged as run parameters.
        parallelization: elementwise runner forwarded to `run`; a new
            `RayParallelization` is created when omitted.

    Returns:
        None. Any exception is logged and the run is marked "FAILED" instead of
        being re-raised.
    """
    from loguru import logger

    # Created per call rather than once in the signature.
    if parallelization is None:
        parallelization = RayParallelization()

    mlflow_client = MlflowClient()

    # NOTE(review): the experiment id is hard-coded here; confirm it should track
    # the notebook-level EXPERIMENT_ID.
    mlflow_run = mlflow_client.create_run("1")
    run_id = mlflow_run.info.run_id

    logged_params = {
        "dataset": dataset.name,
        "ensemble_size": ensemble_size,
        "bagging_size": bagging_size,
        "n_gen": n_gen,
        "pop_size": pop_size,
    }

    try:
        # Inside the try so a logging failure also terminates the run as FAILED.
        for key, value in logged_params.items():
            mlflow_client.log_param(run_id, key, value)

        # Hand over the same client so metrics are logged through the client that created the run.
        run(dataset, ensemble_size=ensemble_size, bagging_size=bagging_size, pop_size=pop_size, n_gen=n_gen, run_id=run_id, parallelization=parallelization, mlflow_client=mlflow_client)
        mlflow_client.set_terminated(run_id=run_id)
    except Exception as ex:
        logger.exception(ex)
        mlflow_client.set_terminated(run_id=run_id, status="FAILED")



In [21]:

%%time
# Timed trial run: only the first three datasets, with n_gen=5 and pop_size=5.
for idx, dataset in enumerate(datasets[:3]):
    run_experiment(dataset, ensemble_size=10, bagging_size=200, n_gen=5, pop_size=5, parallelization=RayParallelization())


[2m[36m(execute_in_ray pid=19260)[0m 2022-10-23 21:03:55.799 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.56 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:03:55.874 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.56 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19257)[0m 2022-10-23 21:03:55.882 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.5333333333333333 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19258)[0m 2022-10-23 21:03:55.943 | INFO     | scalarizing.scalarizing:_evaluate:204 - [accuracy] -> 0.52 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19261)[0m 2022-10-23 21:03:55.987 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.5333333333333333 (0.37333333333333335)
[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:04:02.448 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.56 (0.37333333333333

2022-10-23 21:04:42.455 | ERROR    | __main__:run_experiment:19 - log_metric() missing 1 required positional argument: 'value'
Traceback (most recent call last):

  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
           │         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
           │         └ <code object <module> at 0x7f5ce340ec90, file "/home/bogdan/Projects/rules-embedding-forest-reduction/experiments/scalarizing...
           └ <function _run_code at 0x7f5ce2de09d0>
  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
         └ <code object <module> at 0x7f5ce340ec90, file "/home

[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:05:24.935 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.08806404657933042 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19257)[0m 2022-10-23 21:05:25.110 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.08515283842794759 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19258)[0m 2022-10-23 21:05:25.099 | INFO     | scalarizing.scalarizing:_evaluate:204 - [accuracy] -> 0.08642649199417755 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19260)[0m 2022-10-23 21:05:25.167 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.08806404657933042 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19261)[0m 2022-10-23 21:05:25.142 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.08515283842794759 (0.08133187772925765)
[2m[36m(execute_in_ray pid=19260)[0m 2022-10-23 21:05:44.842 | INFO     | scalarizing.scalarizing:_evaluate:204 

2022-10-23 21:07:17.872 | ERROR    | __main__:run_experiment:19 - log_metric() missing 1 required positional argument: 'value'
Traceback (most recent call last):

  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
           │         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
           │         └ <code object <module> at 0x7f5ce340ec90, file "/home/bogdan/Projects/rules-embedding-forest-reduction/experiments/scalarizing...
           └ <function _run_code at 0x7f5ce2de09d0>
  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
         └ <code object <module> at 0x7f5ce340ec90, file "/home

[2m[36m(execute_in_ray pid=19260)[0m 2022-10-23 21:07:31.820 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.5838383838383838 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:07:31.842 | INFO     | scalarizing.scalarizing:_evaluate:204 - [balanced_accuracy] -> 0.5838383838383838 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19257)[0m 2022-10-23 21:07:31.863 | INFO     | scalarizing.scalarizing:_evaluate:204 - [accuracy] -> 0.5838383838383838 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19261)[0m 2022-10-23 21:07:31.953 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.5717171717171717 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19258)[0m 2022-10-23 21:07:31.983 | INFO     | scalarizing.scalarizing:_evaluate:204 - [precision] -> 0.5717171717171717 (0.44646464646464645)
[2m[36m(execute_in_ray pid=19262)[0m 2022-10-23 21:07:37.983 | INFO     | scalarizing.scalarizing:_evaluate:204 - [ba

2022-10-23 21:08:08.904 | ERROR    | __main__:run_experiment:19 - log_metric() missing 1 required positional argument: 'value'
Traceback (most recent call last):

  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
           │         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
           │         └ <code object <module> at 0x7f5ce340ec90, file "/home/bogdan/Projects/rules-embedding-forest-reduction/experiments/scalarizing...
           └ <function _run_code at 0x7f5ce2de09d0>
  File "/home/bogdan/.pyenv/versions/3.8.9/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
         └ <code object <module> at 0x7f5ce340ec90, file "/home