In [None]:
# ID of the existing MLflow run that this notebook re-opens and logs metrics to.
RUN_ID = '4c7b8ef6771a40ffa871eacee7854ac0'

In [None]:
import mlflow
import pandas as pd
from box import Box
from PyPruning.RandomPruningClassifier import RandomPruningClassifier
from PyPruning.GreedyPruningClassifier import GreedyPruningClassifier
from PyPruning.RankPruningClassifier import RankPruningClassifier 
from PyPruning.ProxPruningClassifier import ProxPruningClassifier
from PyPruning.ClusterPruningClassifier import ClusterPruningClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlutils.ensemble.extract_ensemble import extract_classifiers_from_bagging
from mlutils.encoding.refittable_label_encoder import RefitableLabelEncoder

In [None]:
# Point MLflow at the local file-based tracking store.
# NOTE(review): "pypruning-experimentspypruning-experiments" looks like an accidentally
# doubled directory name, and the absolute path is machine-specific — confirm the location.
mlflow.set_tracking_uri("file:///home/bogul/pypruning-experimentspypruning-experiments/notebooks/mlruns")


In [None]:
# Re-open the run identified by RUN_ID; it stays active until mlflow.end_run() is called.
run = mlflow.start_run(run_id=RUN_ID)


In [None]:
# Wrap the run's logged parameters in a Box for attribute-style access,
# recasting the two size parameters to int.
int_recasts = {'bagging_size': int, 'ensemble_size': int}
params = Box(run.data.params, box_recast=int_recasts)

In [None]:
# Derive the test-set path from the training-set path.
# NOTE(review): str.replace swaps every occurrence of 'train', including any in
# directory names — confirm this matches the data layout.
params.test_path = params.train_path.replace('train', 'test')


In [None]:
def read_dataset(path, target_column='TARGET'):
    """Load a CSV file and split it into a feature matrix and a target vector.

    Args:
        path: Location of the CSV file; passed straight to ``pd.read_csv``.
        target_column: Name of the column holding the labels. Defaults to
            ``'TARGET'``, the column name this notebook's datasets use.

    Returns:
        A dict with two NumPy arrays: ``"x"`` holds every column except the
        target, ``"y"`` holds the target column.

    Raises:
        KeyError: If ``target_column`` is not a column of the CSV.
    """
    data = pd.read_csv(path)
    features = data.drop(columns=[target_column])

    return {
        "x": features.values,
        "y": data[target_column].values
    }

In [None]:
# Collect both splits and a short dataset name in one attribute-accessible Box.
# The name is the last path component of the training file with "-train" removed.
train_file = params.train_path.split("/")[-1]
dataset = Box({
    'train': read_dataset(params.train_path),
    'test': read_dataset(params.test_path),
    'name': train_file.replace("-train", ''),
})


In [None]:
# Display the resolved run parameters (including the derived test_path) for inspection.
params

In [None]:

# Encode the class labels. The encoder is fitted on the training labels and then
# applied to the test labels, so the predictions scored later are compared
# against targets in the same label space (previously the test labels stayed raw).
# NOTE(review): assumes RefitableLabelEncoder exposes a scikit-learn style
# transform() — confirm against mlutils.
le = RefitableLabelEncoder()
dataset.train.y = le.fit_transform(dataset.train.y)
dataset.test.y = le.transform(dataset.test.y)

# Base ensemble: bagged 3-nearest-neighbour classifiers, max_samples=0.3 per member.
bagging = BaggingClassifier(
    estimator=KNeighborsClassifier(n_neighbors=3),
    n_estimators=params.bagging_size,
    max_samples=0.3,
    random_state=42,
)
bagging.fit(dataset.train.x, dataset.train.y)
bagging_clfs = extract_classifiers_from_bagging(bagging)

# Number of members each pruner should keep. This used to be params.bagging_size,
# i.e. the size of the full ensemble, which leaves nothing to prune. Use the run's
# ensemble_size instead, falling back to the old value when the run did not log it.
n_selected = params.get('ensemble_size', params.bagging_size)

rank_pruning = RankPruningClassifier(n_estimators=n_selected)
greedy_pruning = GreedyPruningClassifier(n_estimators=n_selected)
cluster_pruning = ClusterPruningClassifier(n_estimators=n_selected)
# ProxPruningClassifier is built with its defaults; no target size is passed to it.
prox_pruning = ProxPruningClassifier()
random_pruning = RandomPruningClassifier(n_estimators=n_selected)


In [None]:
# All pruning strategies under comparison; later cells iterate over this list.
prunings =  [rank_pruning, greedy_pruning, cluster_pruning, prox_pruning, random_pruning]

In [None]:
# Run every pruning strategy on the training data, selecting from the bagged members.
# A plain loop is used because prune() is called for its effect on the pruner;
# a list comprehension would build a throwaway list and echo it as cell output.
for pruning in prunings:
    pruning.prune(dataset.train.x, dataset.train.y, estimators=bagging_clfs)

In [None]:
# Test-set accuracy of each pruned ensemble, keyed by the pruner's class name.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# NOTE(review): dataset.test.y must be in the same label space as the predictions.
acc_by_name = {
    pruning.__class__.__name__: accuracy_score(dataset.test.y, pruning.predict(dataset.test.x))
    for pruning in prunings
}

In [None]:
# Log one accuracy metric per pruning strategy to the active MLflow run.
mlflow.log_metrics(acc_by_name)

In [None]:
# Baseline: test-set accuracy of the full, unpruned bagging ensemble.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
bagging_acc = accuracy_score(dataset.test.y, bagging.predict(dataset.test.x))
mlflow.log_metric('bagging_acc', bagging_acc)

In [None]:
# Close the run that was re-opened with mlflow.start_run at the top of the notebook.
mlflow.end_run()