In [1]:
%load_ext autoreload
%autoreload 2


In [38]:
import mlflow
from sklearn.model_selection import ParameterGrid
from mlflow.tracking import MlflowClient
from pathlib import Path
from copy import deepcopy
import pandas as pd
import subprocess
import tempfile
from box import Box
from glob import glob
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from PyPruning.RandomPruningClassifier import RandomPruningClassifier
from PyPruning.GreedyPruningClassifier import GreedyPruningClassifier
from PyPruning.RankPruningClassifier import RankPruningClassifier 
from PyPruning.ProxPruningClassifier import ProxPruningClassifier
from PyPruning.ClusterPruningClassifier import ClusterPruningClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [3]:
from mlutils.ensemble.extract_ensemble import extract_classifiers_from_bagging

In [51]:
PROJECT_ROOT = Path.cwd().parents[0]
NOTEBOOKS_ROOT = PROJECT_ROOT / "notebooks"
DATASETS_DIR = PROJECT_ROOT / ".." / "datasets" / "processed"
PAPERMILL_PATH = "/home/bogul/.cache/pypoetry/virtualenvs/pypruning-new-1Ojj1SPs-py3.10/bin/papermill"


In [13]:
DATASETS_DIR.exists()

True

In [14]:
# mlflow.create_experiment('pypruning-base-new')
mlflow_id = '2'

In [15]:
all_processed_data_files = glob(f"{DATASETS_DIR}/*")
all_file_names = [
    file.split('/')[-1].split('train')[0].rstrip('-') for file in all_processed_data_files if 'train' in file
]
print(all_file_names[:5])

['tae', 'penbased', 'vowel', 'hepatitis', 'vowel']


In [16]:
train_and_test_paths = [
    {
        'train': path,
        'test': path.replace('train', 'test')
    } for path in all_processed_data_files if 'train' in path
]
print(train_and_test_paths[:2])

[{'train': '/home/bogul/pypruning-experiments/../datasets/processed/tae-train-3-s1.csv', 'test': '/home/bogul/pypruning-experiments/../datasets/processed/tae-test-3-s1.csv'}, {'train': '/home/bogul/pypruning-experiments/../datasets/processed/penbased-train-3-s1.csv', 'test': '/home/bogul/pypruning-experiments/../datasets/processed/penbased-test-3-s1.csv'}]


In [17]:
base_params = ParameterGrid({
    "train_path": [path for path in all_processed_data_files if 'train' in path],
    "bagging_size": [100, 200, 500],
    "ensemble_size": [5, 10, 20],
})

In [18]:
client = MlflowClient()

In [19]:
def read_dataset(path):
    data = pd.read_csv(path)
    x = data.drop('TARGET', axis=1).values
    y = data['TARGET'].values

    return Box({
        "x": x,
        "y": y
    })

In [20]:
len(base_params)

5940

In [21]:
for param_set in base_params:
    run = client.create_run(experiment_id=str(mlflow_id))
    for param, value in param_set.items():
        client.log_param(run_id=run.info.run_id, key=param, value=value)

In [22]:
experiments_tmp_dir = tempfile.mkdtemp(prefix="bogul-exp-ijcnn")


In [23]:
TMP_DIR = Path(experiments_tmp_dir)
TMP_DIR

PosixPath('/home/bogul/tmp/bogul-exp-ijcnnvp_p2lfe')

In [27]:
experiments_df = mlflow.search_runs(experiment_ids=mlflow_id)

In [29]:
experiments_df

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,params.train_path,params.ensemble_size,params.bagging_size
0,4c7b8ef6771a40ffa871eacee7854ac0,2,RUNNING,file:///home/bogul/pypruning-experiments/noteb...,2023-03-05 22:52:12.085000+00:00,,/home/bogul/pypruning-experiments/../datasets/...,20,500
1,d3bf76a3119e4e4980e0223769012531,2,RUNNING,file:///home/bogul/pypruning-experiments/noteb...,2023-03-05 22:52:11.924000+00:00,,/home/bogul/pypruning-experiments/../datasets/...,20,500
2,931e25fd49c440a7af06128a44d12d27,2,RUNNING,file:///home/bogul/pypruning-experiments/noteb...,2023-03-05 22:52:11.748000+00:00,,/home/bogul/pypruning-experiments/../datasets/...,20,500
3,6f5d31940b44473182313f5b895002b8,2,RUNNING,file:///home/bogul/pypruning-experiments/noteb...,2023-03-05 22:52:11.564000+00:00,,/home/bogul/pypruning-experiments/../datasets/...,20,500
4,25cca4ffba7944e8960813bd736deb2c,2,RUNNING,file:///home/bogul/pypruning-experiments/noteb...,2023-03-05 22:52:11.388000+00:00,,/home/bogul/pypruning-experiments/../datasets/...,20,500
...,...,...,...,...,...,...,...,...,...
5935,f83198a5e5ce476ead4b10bf63cdc9fc,2,RUNNING,file:///home/bogul/pypruning-experiments/noteb...,2023-03-05 22:41:30.991000+00:00,,/home/bogul/pypruning-experiments/../datasets/...,5,100
5936,3325fbbcac044171bbd9d4b3015abeee,2,RUNNING,file:///home/bogul/pypruning-experiments/noteb...,2023-03-05 22:41:30.917000+00:00,,/home/bogul/pypruning-experiments/../datasets/...,5,100
5937,5de78a756f524d1bb3563ddae54fe140,2,RUNNING,file:///home/bogul/pypruning-experiments/noteb...,2023-03-05 22:41:30.847000+00:00,,/home/bogul/pypruning-experiments/../datasets/...,5,100
5938,6d6ecc7d4028429295f6a57f79b309f9,2,RUNNING,file:///home/bogul/pypruning-experiments/noteb...,2023-03-05 22:41:30.786000+00:00,,/home/bogul/pypruning-experiments/../datasets/...,5,100


In [57]:
def run_experiments_in_slurm(run_ids, notebook_path, output_dir_path):
    futures = []
    
    for run_id in run_ids:
        # print(f"Running file {file_name}")
        papermill_command = f"{str(PAPERMILL_PATH)} {str(notebook_path)} {str(output_dir_path)}/{run_id}.ipynb -p EXPERIMENT_INSTANCE_ID {run_id}"
        std_out_path = output_dir_path / f"{run_id}.out"
        std_err_path = output_dir_path / f"{run_id}.err"
        
        slurm_command = f"sbatch -o {str(std_out_path)} -e {str(std_err_path)} slurm-script.sh \"{papermill_command}\"" 

        # print(slurm_command)
        subprocess.run(slurm_command, shell=True)

        
    return futures

In [47]:
TMP_DIR

PosixPath('/home/bogul/tmp/bogul-exp-ijcnnvp_p2lfe')

In [56]:
how_many_jobs = lambda: int(subprocess.run("squeue | grep bogul | wc -l", stdout=subprocess.PIPE, shell=True).stdout.decode('utf-8').strip())


In [58]:
import time

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
        
        
def run_in_batches(run_ids, notebook_path, output_dir_path, batch_size=250, sleep_interval=25):
    chunked_run_ids = chunks(run_ids, batch_size)
    
    while(True):
        jobs_currently = how_many_jobs()
        if jobs_currently <= batch_size:
            print(f"There are {jobs_currently} runnig jobs, scheduling next batch of {batch_size}")
            try:
                next_batch = next(chunked_run_ids)
                run_experiments_in_slurm(next_batch, notebook_path, output_dir_path)
            except StopIteration:
                print("End of batches!")
                break
        else:
            print(f"There are {jobs_currently}, cant schedule yet!")
        print(f"Waiting {sleep_interval}") 
        time.sleep(sleep_interval)
    
    

In [35]:
NOTEBOOK_PATH = NOTEBOOKS_ROOT / 'single-experiment-run.ipynb'
NOTEBOOK_PATH

PosixPath('/home/bogul/pypruning-experiments/notebooks/single-experiment-run.ipynb')

In [48]:
TMP_DIR

PosixPath('/home/bogul/tmp/bogul-exp-ijcnnvp_p2lfe')

In [54]:
run_experiments_in_slurm(experiments_df.run_id[:5], Path(NOTEBOOK_PATH), TMP_DIR)


Submitted batch job 1161314
Submitted batch job 1161315
Submitted batch job 1161316
Submitted batch job 1161317
Submitted batch job 1161318


[]

In [59]:
run_in_batches(experiments_df.run_id, Path(NOTEBOOK_PATH), TMP_DIR, sleep_interval=100)

There are 0 runnig jobs, scheduling next batch of 250
Submitted batch job 1161319
Submitted batch job 1161320
Submitted batch job 1161321
Submitted batch job 1161322
Submitted batch job 1161323
Submitted batch job 1161324
Submitted batch job 1161325
Submitted batch job 1161326
Submitted batch job 1161327
Submitted batch job 1161328
Submitted batch job 1161329
Submitted batch job 1161330
Submitted batch job 1161331
Submitted batch job 1161332
Submitted batch job 1161333
Submitted batch job 1161334
Submitted batch job 1161335
Submitted batch job 1161336
Submitted batch job 1161337
Submitted batch job 1161338
Submitted batch job 1161339
Submitted batch job 1161340
Submitted batch job 1161341
Submitted batch job 1161342
Submitted batch job 1161343
Submitted batch job 1161344
Submitted batch job 1161345
Submitted batch job 1161346
Submitted batch job 1161347
Submitted batch job 1161348
Submitted batch job 1161349
Submitted batch job 1161350
Submitted batch job 1161351
Submitted batch job 11