In [2]:
%load_ext autoreload
%autoreload 2


In [4]:
# import mlflow
from sklearn.model_selection import ParameterGrid
from mlflow.tracking import MlflowClient
from pathlib import Path
from copy import deepcopy
import pandas as pd
import tempfile
from box import Box
from glob import glob
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from PyPruning.RandomPruningClassifier import RandomPruningClassifier
from PyPruning.GreedyPruningClassifier import GreedyPruningClassifier
from PyPruning.RankPruningClassifier import RankPruningClassifier 
from PyPruning.ProxPruningClassifier import ProxPruningClassifier
from PyPruning.ClusterPruningClassifier import ClusterPruningClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [7]:
from mlutils.ensemble.extract_ensemble import extract_classifiers_from_bagging

In [12]:
PROJECT_ROOT = Path.cwd().parents[0]
NOTEBOOKS_ROOT = PROJECT_ROOT / "notebooks"
DATASETS_DIR = PROJECT_ROOT / ".." / "rules-embedding-forest-reduction" / "data" / "processed"

In [13]:
DATASETS_DIR.exists()

True

In [14]:
# mlflow.create_experiment('pypruning-base')
mlflow_id = 1

In [15]:
all_processed_data_files = glob(f"{DATASETS_DIR}/*")
all_file_names = [
    file.split('/')[-1].split('train')[0].rstrip('-') for file in all_processed_data_files if 'train' in file
]
print(all_file_names[:5])

['tae', 'penbased', 'vowel', 'hepatitis', 'vowel']


In [16]:
train_and_test_paths = [
    {
        'train': path,
        'test': path.replace('train', 'test')
    } for path in all_processed_data_files if 'train' in path
]
print(train_and_test_paths[:2])

[{'train': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/tae-train-3-s1.csv', 'test': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/tae-test-3-s1.csv'}, {'train': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/penbased-train-3-s1.csv', 'test': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/penbased-test-3-s1.csv'}]


In [17]:
base_params = ParameterGrid({
    "train_path": [path for path in all_processed_data_files if 'train' in path],
    "bagging_size": [100, 200, 500],
    "ensemble_size": [5, 10, 20],
})

In [18]:
client = MlflowClient()

In [19]:
def read_dataset(path):
    data = pd.read_csv(path)
    x = data.drop('TARGET', axis=1).values
    y = data['TARGET'].values

    return Box({
        "x": x,
        "y": y
    })

In [20]:
len(base_params)

5940

In [None]:
for param_set in base_params:
    run = client.create_run(experiment_id=str(mlflow_id))
    for param, value in param_set.items():
        client.log_param(run_id=run.info.run_id, key=param, value=value)

In [12]:
for idx, params in enumerate(list(base_params)[:30]):
    params = Box(params)
    print(params)
    train = read_dataset(params.train_path)
    test = read_dataset(params.train_path.replace('train', 'test'))
    
    le = LabelEncoder()
    le.fit(np.concatenate([train.y, test.y]))
    train.y = le.transform(train.y)
    test.y = le.transform(test.y)
    
    bagging = BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=3), n_estimators=params.bagging_size, max_samples=0.3, random_state=42)
    bagging.fit(train.x,train.y) 
    bagging_clfs = extract_classifiers_from_bagging(bagging)    
    
    voting = VotingClassifier(estimators=bagging_clfs)
    clf_random_pruned = RankPruningClassifier(n_estimators = int(np.ceil(params.bagging_size)))
    clf_random_pruned.prune(train.x, train.y, bagging_clfs)
    voting_clf = EnsembleVoteClassifier(clfs=bagging_clfs,
                                  weights=[1 for _ in range(len(bagging_clfs))],
                                  fit_base_estimators=False,
                                       voting='soft')
    voting_clf.fit(train.x, train.y)

    acc_pruned = accuracy_score(clf_random_pruned.predict(test.x), test.y)
    acc_bagging = accuracy_score(bagging.predict(test.x), test.y)
    voting_clf_score = accuracy_score(voting_clf.predict(test.x), test.y)
    
#     for e in bagging.estimators_:
#         if len(bagging.classes_) > 2 and len(e.classes_) != len(bagging.classes_):
#             raise Exception
        
    print(f"{acc_pruned} vs {voting_clf_score} vs {acc_bagging}. Delta={acc_pruned-acc_bagging} ({voting_clf_score - acc_bagging})")

{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/tae-train-3-s1.csv'}




0.35526315789473684 vs 0.35526315789473684 vs 0.35526315789473684. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/penbased-train-3-s1.csv'}




0.9847161572052402 vs 0.9847161572052402 vs 0.9847161572052402. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/vowel-train-1-s1.csv'}




0.5292929292929293 vs 0.5292929292929293 vs 0.5292929292929293. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/hepatitis-train-3-s1.csv'}




0.85 vs 0.85 vs 0.85. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/vowel-train-3-s1.csv'}




0.5212121212121212 vs 0.5151515151515151 vs 0.5151515151515151. Delta=0.0060606060606061 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/hayes-roth-train-0-s2.csv'}




0.5875 vs 0.5875 vs 0.5875. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/coil2000-train-1-s1.csv'}




0.9403380166972104 vs 0.9403380166972104 vs 0.9403380166972104. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/wdbc-train-3-s2.csv'}




0.9084507042253521 vs 0.9084507042253521 vs 0.9084507042253521. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/flare-train-1-s1.csv'}




0.7335834896810507 vs 0.7335834896810507 vs 0.7335834896810507. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/cleveland-train-2-s2.csv'}




0.5405405405405406 vs 0.5405405405405406 vs 0.5405405405405406. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/segment-train-4-s2.csv'}




0.8995670995670996 vs 0.8995670995670996 vs 0.8995670995670996. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/german-train-1-s2.csv'}




0.678 vs 0.672 vs 0.672. Delta=0.006000000000000005 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/flare-train-0-s1.csv'}




0.7373358348968105 vs 0.7373358348968105 vs 0.7373358348968105. Delta=0.0 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/german-train-2-s1.csv'}




0.712 vs 0.714 vs 0.714. Delta=-0.0020000000000000018 (0.0)
{'bagging_size': 100, 'ensemble_size': 5, 'train_path': '/home/bogdan/Projects/pypruning-new/../rules-embedding-forest-reduction/data/processed/abalone-train-2-s2.csv'}


IndexError: index 26 is out of bounds for axis 2 with size 24

In [None]:
data, target = load_digits(return_X_y = True)

# Perform a test / train / prune split
XTP, Xtest, ytp, ytest = train_test_split(data, target, test_size=0.25, random_state=42)
Xtrain, Xprune, ytrain, yprune = train_test_split(XTP, ytp, test_size=0.25, random_state=42)

n_base = 128
n_prune = 8

# Train a "large" initial random forest
model = RandomForestClassifier(n_estimators=n_base)
model.fit(XTP, ytp)
pred = model.predict(Xtest)

print("Accuracy of RF trained on XTrain + XPrune with {} estimators: {} %".format(n_base, 100.0 * accuracy_score(ytest, pred)))

# Train a "small" initial random forest for reference
model = RandomForestClassifier(n_estimators=n_base)
model.fit(Xtrain, ytrain)
pred = model.predict(Xtest)

print("Accuracy of RF trained on XTrain only with {} estimators: {} %".format(n_base, 100.0 * accuracy_score(ytest, pred)))

# Use different pruning methods to prune the large forest
pruned_model = RandomPruningClassifier(n_estimators = n_prune)
pruned_model.prune(Xprune, yprune, model.estimators_)
pred = pruned_model.predict(Xtest)
print("Accuracy of RandomPruningClassifier with {} estimators: {} %".format(n_prune, 100.0 * accuracy_score(ytest, pred)))