In [1]:
# MLflow run that this notebook resumes and logs its metrics to.
RUN_ID = "4c7b8ef6771a40ffa871eacee7854ac0"

In [2]:
# Parameters
# Identifier of the experiment instance for this execution. No other cell shown
# in this notebook reads it.
# NOTE(review): the "# Parameters" header suggests this cell is injected by a
# notebook-parameterisation tool (e.g. papermill) — confirm before editing by hand.
EXPERIMENT_INSTANCE_ID = "6f5d31940b44473182313f5b895002b8"


In [3]:
import mlflow
import pandas as pd
from box import Box
from PyPruning.RandomPruningClassifier import RandomPruningClassifier
from PyPruning.GreedyPruningClassifier import GreedyPruningClassifier
from PyPruning.RankPruningClassifier import RankPruningClassifier 
from PyPruning.ProxPruningClassifier import ProxPruningClassifier
from PyPruning.ClusterPruningClassifier import ClusterPruningClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlutils.ensemble.extract_ensemble import extract_classifiers_from_bagging
from mlutils.encoding.refittable_label_encoder import RefitableLabelEncoder

In [4]:
# Point MLflow at the local file-based tracking store.
# NOTE(review): this is an absolute, machine-specific path; the notebook can only
# find its runs on a host where this directory exists.
mlflow.set_tracking_uri(
    uri="file:///home/bogul/pypruning-experiments/notebooks/mlruns"
)


In [5]:
# Re-open the existing MLflow run identified by RUN_ID so that its stored
# parameters can be read and new metrics are attached to it.
run = mlflow.start_run(
    run_id=RUN_ID,
)


In [6]:
# Wrap the run's logged parameters in a Box for attribute-style access.
# `box_recast` converts the two size parameters to int; every other entry is
# kept exactly as MLflow returned it.
params = Box(
    run.data.params,
    box_recast={
        'bagging_size': int,
        'ensemble_size': int,
    },
)

In [7]:
# Derive the test-set path from the training-set path by swapping "train" for
# "test" in the file name only. Replacing across the whole path would also
# rewrite any directory component that happens to contain "train".
# rpartition yields an empty directory part when the path contains no "/".
_train_dir, _sep, _train_file = params.train_path.rpartition('/')
params.test_path = _train_dir + _sep + _train_file.replace('train', 'test')


In [8]:
def read_dataset(path):
    """Load a CSV file and split it into features and labels.

    Parameters
    ----------
    path
        Location of the CSV file; handed directly to ``pd.read_csv``.

    Returns
    -------
    dict
        ``"x"`` holds the values of every column except ``TARGET`` (column
        order preserved) and ``"y"`` holds the values of the ``TARGET`` column.

    Raises
    ------
    KeyError
        If the file has no ``TARGET`` column.
    """
    frame = pd.read_csv(path)
    # pop() removes the label column from the frame and hands it back, so what
    # remains in `frame` is exactly the feature matrix.
    target = frame.pop('TARGET')
    return {"x": frame.values, "y": target.values}

In [9]:
# Bundle both splits together with a short dataset name. The name is the
# training file's base name with the "-train" marker removed (the file
# extension is kept).
dataset = Box(
    {
        'train': read_dataset(params.train_path),
        'test': read_dataset(params.test_path),
        'name': params.train_path.rsplit("/", 1)[-1].replace("-train", ''),
    }
)


In [10]:
# Show the resolved run parameters, including the derived test_path.
params

Box({'bagging_size': 500, 'ensemble_size': 20, 'train_path': '/home/bogul/pypruning-experiments/../datasets/processed/texture-train-2-s2.csv', 'test_path': '/home/bogul/pypruning-experiments/../datasets/processed/texture-test-2-s2.csv'})

In [11]:

# Encode the class labels. The encoder is fitted on the training labels and the
# same mapping is then applied to the test labels: every model below is trained
# on encoded labels and therefore predicts in the encoded space, so the test
# labels used for scoring must be in that space as well.
# NOTE(review): assumes RefitableLabelEncoder offers a scikit-learn-style
# transform() alongside fit_transform() — confirm.
le = RefitableLabelEncoder()
dataset.train.y = le.fit_transform(dataset.train.y)
dataset.test.y = le.transform(dataset.test.y)

# Pool of candidate classifiers: a bagging ensemble of 3-NN models, each fitted
# on a 30% sample of the training data. random_state is fixed for repeatability.
bagging = BaggingClassifier(
    estimator=KNeighborsClassifier(n_neighbors=3),
    n_estimators=params.bagging_size,
    max_samples=0.3,
    random_state=42,
)
bagging.fit(dataset.train.x, dataset.train.y)
bagging_clfs = extract_classifiers_from_bagging(bagging)

# Each pruner is asked for `ensemble_size` members. Requesting `bagging_size`
# members (the size of the whole pool) would leave nothing to prune.
rank_pruning = RankPruningClassifier(n_estimators=params.ensemble_size)
greedy_pruning = GreedyPruningClassifier(n_estimators=params.ensemble_size)
cluster_pruning = ClusterPruningClassifier(n_estimators=params.ensemble_size)
# NOTE(review): ProxPruningClassifier is created with its defaults and is not
# given a target size here — confirm how its ensemble size should be controlled.
prox_pruning = ProxPruningClassifier()
random_pruning = RandomPruningClassifier(n_estimators=params.ensemble_size)


In [12]:
# All pruning strategies under comparison; later cells iterate over this list.
prunings = [
    rank_pruning,
    greedy_pruning,
    cluster_pruning,
    prox_pruning,
    random_pruning,
]

In [13]:
# Run every pruner on the training data, choosing members from the shared pool
# of bagged classifiers. The list of prune() return values is the cell's
# displayed result (one pruner object per entry).
[
    pruner.prune(dataset.train.x, dataset.train.y, estimators=bagging_clfs)
    for pruner in prunings
]

[<PyPruning.RankPruningClassifier.RankPruningClassifier at 0x14f8fce684f0>,
 <PyPruning.GreedyPruningClassifier.GreedyPruningClassifier at 0x14f8fce69ba0>,
 <PyPruning.ClusterPruningClassifier.ClusterPruningClassifier at 0x14f8fce6a2f0>,
 <PyPruning.ProxPruningClassifier.ProxPruningClassifier at 0x14f8fce6bc10>,
 <PyPruning.RandomPruningClassifier.RandomPruningClassifier at 0x14f8b0d08c40>]

In [14]:
# Test accuracy of every pruned ensemble, keyed by the pruner's class name.
# accuracy_score expects (y_true, y_pred), so the reference labels go first.
acc_by_name = {
    type(pruning).__name__: accuracy_score(
        dataset.test.y, pruning.predict(dataset.test.x)
    )
    for pruning in prunings
}

In [15]:
# Record one accuracy metric per pruning method on the active MLflow run.
mlflow.log_metrics(metrics=acc_by_name)

In [16]:
# Baseline: test accuracy of the full, unpruned bagging ensemble.
# accuracy_score expects (y_true, y_pred), so the reference labels go first.
mlflow.log_metric(
    'bagging_acc',
    accuracy_score(dataset.test.y, bagging.predict(dataset.test.x)),
)

In [17]:
# Close the run opened earlier, marking it as finished.
mlflow.end_run(status="FINISHED")