In [8]:
import scipy
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:


import numpy as np
import pandas as pd
import problexity as px
from box import Box
from loguru import logger as log
import mlflow

from mlflow import MlflowClient
from datasetz.core.load_dataset import load_embedded_dataset
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from mlutils.mlflow.utils import get_run_params, terminate_run, finish_run_and_print_exception
from sklearn.base import clone
from mlutils.scikit.ovo import ovo

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from wrapt_timeout_decorator import timeout
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from quadsplits.quadsplits import recursive_cutoff
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from datasetz.core.load_dataset import load_embedded_dataset
from imblearn.metrics import geometric_mean_score, classification_report_imbalanced
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score

from sklearn.model_selection import StratifiedKFold
from mlutils.scikit.utils import is_fitted
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from mlutils.scikit.utils import is_fitted
from sklearn.metrics import classification_report, confusion_matrix


In [10]:
# Keyword arguments forwarded to the decision-tree and random-forest constructors below.
DT_PARAMS = dict(ccp_alpha=0.011538226894236229, criterion='gini', max_features=None)
# Same as DT_PARAMS but without the `ccp_alpha` entry.
DT_PARAMS_WO_DEPTH = dict(criterion='gini', max_features=None)
# Forest size used when the random forest is built with fixed settings.
RF_PARAMS = dict(n_estimators=32)

In [11]:
def calculate_imb_metrics(y_true, y_pred):
    """Score predictions with the three imbalance-oriented metrics used in this notebook.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        dict with keys ``'balacc'`` (balanced accuracy), ``'gmean'`` (geometric
        mean score) and ``'f1'`` (F1 score with the scorer's default settings).
    """
    scorers = (
        ('balacc', balanced_accuracy_score),
        ('gmean', geometric_mean_score),
        ('f1', f1_score),
    )
    return {metric_name: scorer(y_true, y_pred) for metric_name, scorer in scorers}

In [12]:
def calculate_and_log_classification_metrics(y_true, y_pred, client, run_id, prefix = ""):
    """Compute classification metrics and record them on an MLflow run.

    The two classification reports are stored as JSON artifacts, while the
    confusion-matrix counts and the imbalance scores are logged as metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        client: Object exposing ``log_dict`` and ``log_metric`` (an MLflow client).
        run_id: Identifier of the run that receives the artifacts and metrics.
        prefix: String prepended to every artifact file name and metric key.
    """
    # Everything is computed first so that nothing is logged if a metric fails.
    confusion_counts = calculate_confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, output_dict=True)
    imbalanced_report = classification_report_imbalanced(y_true, y_pred, output_dict=True)
    imbalance_scores = calculate_imb_metrics(y_true, y_pred)

    artifacts = (
        (report, f'{prefix}cls_report.json'),
        (imbalanced_report, f'{prefix}imb_cls_report.json'),
    )
    for payload, artifact_name in artifacts:
        client.log_dict(run_id, payload, artifact_name)

    # Imbalance scores are merged last, so they win on (unexpected) key clashes.
    scalar_metrics = dict(confusion_counts)
    scalar_metrics.update(imbalance_scores)
    for metric_name, metric_value in scalar_metrics.items():
        client.log_metric(run_id, f"{prefix}{metric_name}", metric_value)

In [32]:
def calculate_metrics_dt(model):
    """Report the size of a fitted decision tree.

    Args:
        model: Object exposing ``get_n_leaves()`` and ``get_depth()``.

    Returns:
        dict with keys ``'n_leaves'`` and ``'depth'``.
    """
    leaf_count = model.get_n_leaves()
    tree_depth = model.get_depth()
    return dict(n_leaves=leaf_count, depth=tree_depth)

def calculate_metrics_qs(model):
    """Summarise the structure of a fitted quad-split model.

    Args:
        model: Object with a ``clf_by_rules_`` mapping of rule string -> fitted
            classifier.

    Returns:
        dict describing the model: per-tree sizes (``dts``), classifier counts
        (``dummy_clfs``, ``all_clfs``, ``complex_clfs``), the number of rules
        (``rules_no``), the predicate count of every rule (``rules``) and the
        mean / median / max / min of those counts. All numbers are built-in
        ``int`` / ``float`` values so the dict can be dumped to JSON as-is
        (callers pass it to ``client.log_dict``).

    Raises:
        ValueError: If the model has no rules (max/min of an empty sequence).
    """
    clf_by_rules = model.clf_by_rules_
    clfs = list(clf_by_rules.values())

    dummy_classifiers_no = sum(isinstance(clf, DummyClassifier) for clf in clfs)

    dts = [
        {key: int(value) for key, value in calculate_metrics_dt(clf).items()}
        for clf in clfs
        if isinstance(clf, DecisionTreeClassifier)
    ]

    # Every occurrence of the substring "and" is counted as one joiner, so a
    # rule with k predicates is expected to contain it k - 1 times.
    no_of_predicates = np.array([rule.count("and") + 1 for rule in clf_by_rules.keys()])

    return {
        "dts": dts,
        "dummy_clfs": dummy_classifiers_no,
        "all_clfs": len(clfs),
        "complex_clfs": len(clfs) - dummy_classifiers_no,
        "rules_no": len(clf_by_rules),
        "rules": no_of_predicates.tolist(),
        # NumPy scalars (np.int64 in particular) are not JSON serialisable.
        "no_of_predicates_avg": float(no_of_predicates.mean()),
        "no_of_predicates_median": float(np.median(no_of_predicates)),
        "no_of_predicates_max": int(np.max(no_of_predicates)),
        "no_of_predicates_min": int(np.min(no_of_predicates)),
    }

def calculate_confusion_matrix(y_pred, y_true):
    """Return the four cells of a binary confusion matrix as a dict.

    The two arguments are forwarded to ``confusion_matrix`` positionally, in the
    order they are declared here.

    NOTE(review): the parameter names are reversed with respect to
    ``confusion_matrix(y_true, y_pred)``. The caller visible in this notebook
    passes ``(y_true, y_pred)`` positionally, so the result is correct for that
    call, but a keyword call would swap ``fp`` and ``fn`` - confirm before
    renaming.

    Returns:
        dict with keys ``'tn'``, ``'fp'``, ``'fn'``, ``'tp'``.

    Raises:
        ValueError: If the flattened matrix does not have exactly four cells.
    """
    flattened_counts = confusion_matrix(y_pred, y_true).ravel()
    tn, fp, fn, tp = flattened_counts
    return dict(tn=tn, fp=fp, fn=fn, tp=tp)
def get_params_and_client(run_id, tracking_uri="http://192.168.1.181:5010"):
    """Create an MLflow client and fetch the typed parameters of a run.

    Args:
        run_id: Identifier of the MLflow run whose parameters are read.
        tracking_uri: Address of the MLflow tracking server. It is used both for
            the returned client and for the global ``mlflow`` tracking URI.

    Returns:
        Tuple ``(params, client)``. On ``params`` the ``should_take_test`` entry
        is converted to ``bool`` (true only for the case-insensitive text
        "true") and ``data_shuffle_random_state`` is converted to ``int``.
    """
    client = MlflowClient(tracking_uri=tracking_uri)
    mlflow.set_tracking_uri(tracking_uri)
    params = get_run_params(run_id, client)
    params.should_take_test = params.should_take_test.lower() == "true"
    params.data_shuffle_random_state = int(params.data_shuffle_random_state)
    return params, client


def get_dataset(params):
    """Load the configured dataset and split it into two stratified halves.

    Args:
        params: Object with ``dataset`` (dataset name), ``data_shuffle_random_state``
            (seed of the split) and ``should_take_test`` (when true, the two
            halves swap roles).

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``.
    """
    dataset = load_embedded_dataset('keel-binary-fast', params.dataset).encode_x_to_labels()
    features, labels = dataset.x(), dataset.y()

    splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=0.5,
        random_state=params.data_shuffle_random_state,
    )
    first_half, second_half = next(splitter.split(features, labels))

    # With `should_take_test` set, the model is fitted on the second half instead.
    if params.should_take_test:
        fit_idx, eval_idx = second_half, first_half
    else:
        fit_idx, eval_idx = first_half, second_half

    return features[fit_idx], labels[fit_idx], features[eval_idx], labels[eval_idx]


In [14]:
def exclude_indices(arr, indices):
    """Return the rows of `arr` whose position is not listed in `indices`."""
    positions = np.arange(arr.shape[0])
    keep_mask = np.isin(positions, indices, invert=True)
    return arr[keep_mask]

def find_nearest_neighbors_indices(arr, point, n):
    """Return the positions (within `arr`) of the `n` rows closest to `point`.

    Args:
        arr: 2-D array of candidate samples.
        point: A single sample with the same number of features as `arr`.
        n: Number of neighbours to look up.

    Returns:
        Flat array of `n` row positions relative to `arr`.
    """
    searcher = NearestNeighbors(n_neighbors=n).fit(arr)
    _, neighbor_positions = searcher.kneighbors([point], n)
    return neighbor_positions.reshape(-1)

In [15]:
def as_classifier_from_rf(clf_by_rule, rf):
    """Bundle per-rule classifiers into an object with a ``predict`` function.

    Args:
        clf_by_rule: Mapping of pandas query string -> fitted classifier. The
            queries are evaluated against columns named ``col0``, ``col1``, ...
        rf: Unused; kept so existing callers keep working.

    Returns:
        Box with ``clf_by_rule`` (the mapping) and ``predict`` (callable taking a
        2-D array and returning one prediction per row).
    """
    def predict(X):
        df = pd.DataFrame(X, columns=[f"col{i}" for i in range(X.shape[1])])

        for rule, clf in clf_by_rule.items():
            # The rule is evaluated once per classifier. This assumes rules only
            # reference the colN feature columns, never the bookkeeping columns
            # added below.
            matched = df.query(rule).index
            if len(matched) == 0:
                continue

            features = df.loc[matched] \
                .drop(columns=['prediction', 'rule'], errors='ignore') \
                .to_numpy()

            df.loc[matched, 'prediction'] = clf.predict(features)
            df.loc[matched, 'rule'] = rule

        if 'prediction' not in df.columns:
            # No rule matched any row: report every row as unpredicted (NaN)
            # instead of failing on the missing column.
            return np.full(len(df), np.nan)

        # Rows not covered by any rule stay NaN.
        return df['prediction'].to_numpy()

    return Box({
        "clf_by_rule": clf_by_rule,
        "predict": predict
    })

In [16]:
def as_classifier(clf_by_rule):
    """Bundle per-rule classifiers into an object with a ``predict`` function.

    Args:
        clf_by_rule: Mapping of pandas query string -> fitted classifier. The
            queries are evaluated against columns named ``col0``, ``col1``, ...

    Returns:
        Box with ``clf_by_rule`` (the mapping) and ``predict`` (callable taking a
        2-D array and returning one prediction per row).
    """
    def predict(X):
        df = pd.DataFrame(X, columns=[f"col{i}" for i in range(X.shape[1])])

        for rule, clf in clf_by_rule.items():
            # The rule is evaluated once per classifier. This assumes rules only
            # reference the colN feature columns, never the bookkeeping columns
            # added below.
            matched = df.query(rule).index
            if len(matched) == 0:
                continue

            features = df.loc[matched] \
                .drop(columns=['prediction', 'rule'], errors='ignore') \
                .to_numpy()

            df.loc[matched, 'prediction'] = clf.predict(features)
            df.loc[matched, 'rule'] = rule

        if 'prediction' not in df.columns:
            # No rule matched any row: report every row as unpredicted (NaN)
            # instead of failing on the missing column.
            return np.full(len(df), np.nan)

        # Rows not covered by any rule stay NaN.
        return df['prediction'].to_numpy()

    return Box({
        "clf_by_rule": clf_by_rule,
        "predict": predict
    })

In [17]:
def dont_invoke_for_single_class(func):
    """Wrap `func(x, y)` so it is skipped when `y` holds exactly one distinct value.

    The wrapper returns 0 in that case and the result of `func(x, y)` otherwise.
    """
    def wrapped(x, y):
        has_single_class = np.unique(y).size == 1
        return 0 if has_single_class else func(x, y)

    return wrapped


In [18]:
def wrap_in_ovo(func):
    """Wrap `func` so that it is run through `ovo` and its results are averaged.

    The wrapper returns the mean of whatever `ovo(func, x, y)` produces
    (presumably one score per class pair - confirm against `mlutils`).
    """
    def wrapped(x, y):
        pairwise_scores = ovo(func, x, y)
        return np.mean(pairwise_scores)

    return wrapped
    

In [19]:

# Complexity measures selectable by name. Each one is guarded so that it yields 0
# instead of being invoked on data that contains a single class.
COMPLEXITY_METRICS = dict(
    f2=dont_invoke_for_single_class(px.f2),
    t4=dont_invoke_for_single_class(px.t4),
    c1=dont_invoke_for_single_class(px.c1),
    n3=dont_invoke_for_single_class(px.n3),
    l2=dont_invoke_for_single_class(px.l2),
    density=dont_invoke_for_single_class(px.density),
)

# Base classifiers selectable by name; the experiments look them up with MODELS.get(...).
MODELS = dict(
    knn=KNeighborsClassifier(n_neighbors=3),
    bayes=GaussianNB(),
    dt=DecisionTreeClassifier(random_state=42, **DT_PARAMS),
    svm=LinearSVC(random_state=42),
)


In [20]:
def train_quad_clf(train_x, train_y, base_clf = Perceptron(random_state=42), min_samples = 10, recursion_limit = -1, minimal_split_percentage = 0.1, complexity_measure = px.f2, oversampling_in_splitting=None, neighbors_in_learning = None, log_metric = lambda x, y: None):
    """Split the training data into regions and train one classifier per region.

    Args:
        train_x: 2-D array of training samples.
        train_y: 1-D array of training labels.
        base_clf: Estimator cloned for every region that holds more than one class.
        min_samples: Forwarded to ``recursive_cutoff``.
        recursion_limit: Forwarded to ``recursive_cutoff``.
        minimal_split_percentage: Starting value forwarded to ``recursive_cutoff``;
            it is reduced by 10% after every attempt that yields no usable
            statements.
        complexity_measure: Forwarded to ``recursive_cutoff``.
        oversampling_in_splitting: ``"SMOTE"`` to oversample inside the splitting
            procedure; any other value disables oversampling.
        neighbors_in_learning: When not ``None``, the number of samples from
            outside a region (the ones nearest to the region centroid) that are
            added to that region's training set.
        log_metric: Callable ``(name, value)`` used to report progress metrics.

    Returns:
        The object built by ``as_classifier`` for the per-region classifiers.
    """
    if oversampling_in_splitting == "SMOTE":
        smote = SMOTE(random_state=42, k_neighbors=1)

        def resample_x_y(x, y):
            # Resample only when there are at least two classes and each of them
            # has more than 3 samples; otherwise hand the data back untouched.
            classes, counts = np.unique(y, return_counts=True)
            if len(classes) <= 1 or not (counts > 3).all():
                return x, y
            log.info("Will resample with size {} {} and classes = {}", len(x), len(y), classes)
            return smote.fit_resample(x, y)

        oversampling_in_splitting_function = resample_x_y
    else:
        oversampling_in_splitting_function = lambda x, y: (x, y)

    statements = {}
    used_split_percentage = minimal_split_percentage

    # Retry with a smaller split percentage until the splitter returns something
    # other than nothing or the single empty statement ''.
    # NOTE(review): there is no lower bound, so this does not terminate if
    # `recursive_cutoff` never produces usable statements.
    while len(statements) == 0 or (len(statements) == 1 and '' in statements):
        used_split_percentage = minimal_split_percentage
        statements = recursive_cutoff(
            Box(x=train_x, y=train_y),
            min_samples=min_samples,
            recursion_limit=recursion_limit,
            minimal_split_percentage=minimal_split_percentage,
            complexity_measure=complexity_measure,
            oversampling_function=oversampling_in_splitting_function,
        )
        minimal_split_percentage = minimal_split_percentage - minimal_split_percentage * 0.1
        log.info("Stepping down minimial split percentage = {}", minimal_split_percentage)

    log.info("Statements {}", statements)

    if not statements:
        # NOTE(review): unreachable with the loop condition above; kept as the
        # fallback for a future bounded retry loop.
        log.info("No statements! Training base clf")
        clf = clone(base_clf)
        clf.fit(train_x, train_y)
        log_metric("no_statements", True)
        return clf

    # Report the percentage that actually produced the statements, not the
    # already stepped-down value prepared for a further attempt.
    log_metric("actual_min_split_percentage", used_split_percentage)
    log_metric("statements_size", len(statements))

    # Statements are pandas queries over columns named col0, col1, ...
    rule_frame = pd.DataFrame(train_x, columns=[f"col{i}" for i in range(train_x.shape[1])])
    indices_by_each_statement = {
        query: rule_frame.query(query).index for query in statements
    }

    clf_by_rules = {}
    simple_areas = 0
    for query, idx in indices_by_each_statement.items():
        x_train = train_x[idx]
        y_train = train_y[idx]

        log.info("Before enhancing with nn = {}", len(x_train))

        if neighbors_in_learning is not None:
            centroid = x_train.mean(axis=0)
            # The neighbour search runs on the samples outside the region, so the
            # returned positions refer to these filtered arrays, not to train_x.
            outside_x = exclude_indices(train_x, idx)
            outside_y = exclude_indices(train_y, idx)
            nn_indices = find_nearest_neighbors_indices(outside_x, centroid, neighbors_in_learning)
            x_train = np.append(x_train, outside_x[nn_indices], axis=0)
            y_train = np.append(y_train, outside_y[nn_indices], axis=0)

            log.info("After enhancing with nn = {}", len(x_train))

        if len(np.unique(y_train)) == 1:
            # A single-class region only needs a constant predictor.
            clf_by_rules[query] = DummyClassifier(strategy="constant", constant=y_train[0]).fit(x_train, y_train)
            simple_areas = simple_areas + 1
        else:
            clf = clone(base_clf)
            clf.fit(x_train, y_train)
            clf_by_rules[query] = clf

    log_metric("simple_areas", simple_areas)

    return as_classifier(clf_by_rules)

In [21]:
def find_best_tree(rf, x, y):
    """Pick the tree of a fitted forest that is most accurate on ``(x, y)``.

    Args:
        rf: Fitted ensemble exposing ``estimators_``.
        x: Samples to predict.
        y: Labels the predictions are compared with.

    Returns:
        Tuple ``(best_tree, best_score)``. On ties the earliest tree wins. For an
        ensemble without estimators the result is ``(None, 0)``.

    NOTE(review): each tree's raw ``predict`` output is compared with ``y``
    directly - this assumes the trees emit the same label values as ``y``;
    confirm for label sets other than 0..n-1.
    """
    best_score = 0
    best_tree = None
    for tree in rf.estimators_:
        acc = accuracy_score(y, tree.predict(x))
        # The first tree is always a candidate, so a tree is returned even when
        # every tree scores 0.
        if best_tree is None or acc > best_score:
            best_tree = tree
            best_score = acc

    return best_tree, best_score

In [22]:
def rf_complexity_measure(rf):
    """Build a complexity measure backed by the trees of `rf`.

    The returned callable takes ``(x, y)`` and yields one minus the accuracy of
    the forest's best tree on that data, as reported by `find_best_tree`.
    """
    def measure(x, y):
        _, best_accuracy = find_best_tree(rf, x, y)
        return 1 - best_accuracy

    return measure

In [23]:
from sklearn.base import BaseEstimator, ClassifierMixin


class QuadSplitClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that partitions the feature space and uses one model per region.

    ``fit`` asks ``recursive_cutoff`` for a set of statements (pandas query
    strings over columns ``col0``, ``col1``, ...). Every statement selects a
    region of the training data; a region with a single class gets a constant
    ``DummyClassifier``, any other region gets either the best tree of
    ``rf_to_explain`` or a freshly fitted clone of ``base_clf``.

    Args:
        minimal_split_percentage: Float in the open interval (0.1, 0.5); starting
            value forwarded to ``recursive_cutoff`` and reduced by 10% per retry.
        min_samples: Integer >= 1 forwarded to ``recursive_cutoff``.
        complexity_measure: Callable forwarded to ``recursive_cutoff``.
        base_clf: Estimator cloned for mixed-class regions (used when
            ``rf_to_explain`` is ``None``).
        recursion_limit: Forwarded to ``recursive_cutoff``.
        rf_to_explain: Optional fitted forest; when given, the most accurate of
            its trees is used for each mixed-class region instead of ``base_clf``.

    Attributes set by ``fit``:
        clf_by_rules_: Mapping of statement -> classifier.
        classes_: Sorted unique training labels.
        fallback_clf_: Single classifier used when no statements are available,
            otherwise ``None``.
    """

    def __init__(self, minimal_split_percentage, min_samples, complexity_measure, base_clf=None, recursion_limit=20, rf_to_explain=None):
        self.minimal_split_percentage = minimal_split_percentage
        self.min_samples = min_samples
        self.recursion_limit = recursion_limit
        self.complexity_measure = complexity_measure
        self.base_clf = base_clf
        self.rf_to_explain = rf_to_explain

    def fit(self, x, y):
        """Fit the per-region classifiers and return ``self``.

        Raises:
            AssertionError: If the constructor arguments are outside the
                supported ranges or neither ``base_clf`` nor ``rf_to_explain``
                is given.
        """
        x, y = check_X_y(x, y)

        assert self.base_clf is not None or self.rf_to_explain is not None
        assert self.minimal_split_percentage < 0.5
        assert self.minimal_split_percentage > 0.1
        assert isinstance(self.minimal_split_percentage, float)
        assert self.min_samples >= 1
        assert isinstance(self.min_samples, int)

        # No oversampling is applied while splitting.
        oversampling_in_splitting_function = lambda xs, ys: (xs, ys)
        statements = {}

        minimal_split_percentage = self.minimal_split_percentage
        # Retry with a smaller split percentage until the splitter returns
        # something other than nothing or the single empty statement ''.
        # NOTE(review): there is no lower bound, so this does not terminate if
        # `recursive_cutoff` never produces usable statements.
        while len(statements) == 0 or (len(statements) == 1 and '' in statements):
            statements = recursive_cutoff(
                Box(x=x, y=y),
                min_samples=self.min_samples,
                recursion_limit=self.recursion_limit,
                minimal_split_percentage=minimal_split_percentage,
                complexity_measure=self.complexity_measure,
                oversampling_function=oversampling_in_splitting_function,
            )
            minimal_split_percentage = minimal_split_percentage - minimal_split_percentage * 0.1
            log.info("Stepping down minimial split percentage = {}", minimal_split_percentage)

        log.info("Statements {}", statements)

        self.classes_ = np.unique(y)
        self.fallback_clf_ = None

        if not statements:
            # NOTE(review): unreachable with the loop condition above; kept as
            # the fallback for a future bounded retry loop. The fallback model is
            # stored on the estimator so that fit() still returns `self`.
            if self.rf_to_explain is not None:
                log.info("No statements! getting best tree")
                fallback, _ = find_best_tree(self.rf_to_explain, x, y)
            else:
                log.info("No statements! Training base clf")
                fallback = clone(self.base_clf)
                fallback.fit(x, y)
            self.clf_by_rules_ = {}
            self.fallback_clf_ = fallback
            return self

        # Statements are pandas queries over columns named col0, col1, ...
        rule_frame = pd.DataFrame(x, columns=[f"col{i}" for i in range(x.shape[1])])
        indices_by_each_statement = {
            query: rule_frame.query(query).index for query in statements
        }

        clf_by_rules = {}
        for query, idx in indices_by_each_statement.items():
            x_train = x[idx]
            y_train = y[idx]

            if len(np.unique(y_train)) == 1:
                # A single-class region only needs a constant predictor.
                clf_by_rules[query] = DummyClassifier(strategy="constant", constant=y_train[0]).fit(x_train, y_train)
            elif self.rf_to_explain is not None:
                tree, _ = find_best_tree(self.rf_to_explain, x_train, y_train)
                clf_by_rules[query] = tree
            else:
                clf = clone(self.base_clf)
                clf.fit(x_train, y_train)
                clf_by_rules[query] = clf

        self.clf_by_rules_ = clf_by_rules
        return self

    def predict(self, x):
        """Predict one label per row of `x`.

        Every rule is applied in insertion order; rows matched by several rules
        keep the prediction of the last one, and rows matched by no rule are
        reported as NaN.
        """
        check_is_fitted(self)
        x = check_array(x)

        fallback = getattr(self, 'fallback_clf_', None)
        if fallback is not None:
            return fallback.predict(x)

        df = pd.DataFrame(x, columns=[f"col{i}" for i in range(x.shape[1])])

        for rule, clf in self.clf_by_rules_.items():
            # The rule is evaluated once per classifier. This assumes rules only
            # reference the colN feature columns, never the bookkeeping columns
            # added below.
            matched = df.query(rule).index
            if len(matched) == 0:
                continue

            features = df.loc[matched] \
                .drop(columns=['prediction', 'rule'], errors='ignore') \
                .to_numpy()

            df.loc[matched, 'prediction'] = clf.predict(features)
            df.loc[matched, 'rule'] = rule

        if 'prediction' not in df.columns:
            # No rule matched any row: report every row as unpredicted (NaN)
            # instead of failing on the missing column.
            return np.full(len(df), np.nan)

        return df['prediction'].to_numpy()

In [24]:
def experiment_quad_split(run_id):
    """Run the quad-split experiment described by the parameters of an MLflow run.

    The run parameters select the dataset, the base classifier (``base_clf``, a
    key of ``MODELS``) and the complexity measure (``complexity_measure``, a key
    of ``COMPLEXITY_METRICS``). A ``QuadSplitClassifier`` is fitted on the
    training half, evaluated on the other half, and the model summary plus the
    classification metrics are logged to the run. Any failure inside the
    experiment is reported through ``finish_run_and_print_exception``.

    Args:
        run_id: Identifier of the MLflow run to read parameters from and log to.
    """
    params, client = get_params_and_client(run_id)
    params = Box(params,  box_recast={
        'min_split_percentage': float,
        'min_samples': int
    })
    log.info(params)

    try:
        # Looked up inside the try block so that a missing parameter also ends
        # the run through the error handler below.
        base_clf = MODELS.get(params.base_clf)
        complexity_measure_func = COMPLEXITY_METRICS.get(params.complexity_measure)

        x_train, y_train, x_test, y_test = get_dataset(params)

        # model
        model = QuadSplitClassifier(
            base_clf=base_clf,
            min_samples=params.min_samples,
            minimal_split_percentage=params.min_split_percentage,
            complexity_measure=complexity_measure_func,
            recursion_limit=50
        )
        model.fit(x_train, y_train)

        y_pred = model.predict(x_test)

        metrics = calculate_metrics_qs(model)
        client.log_dict(run_id, metrics, 'quad_split.json')

        calculate_and_log_classification_metrics(y_test, y_pred, client, run_id)

        terminate_run(run_id, client=client)
        log.info("Run finished")
    except Exception as e:
        finish_run_and_print_exception(run_id, e, client = client)

In [25]:
def experiment_quad_split_explain(run_id):
    """Run the quad-split experiment that reuses the trees of a random forest.

    A random forest with ``n_estimators`` taken from the run parameters is fitted
    on the training half and handed to ``QuadSplitClassifier`` as
    ``rf_to_explain``. The resulting model is evaluated on the other half and its
    summary and classification metrics are logged to the run. Any failure inside
    the experiment is reported through ``finish_run_and_print_exception``.

    Args:
        run_id: Identifier of the MLflow run to read parameters from and log to.
    """
    raw_params, client = get_params_and_client(run_id)
    recast_rules = {
        'min_split_percentage': float,
        'min_samples': int,
        'n_estimators': int,
    }
    params = Box(raw_params, box_recast=recast_rules)
    log.info(params)

    try:
        x_train, y_train, x_test, y_test = get_dataset(params)

        measure = COMPLEXITY_METRICS.get(params.complexity_measure)

        # Forest whose individual trees serve as the per-region classifiers.
        forest = RandomForestClassifier(random_state=42, **DT_PARAMS, n_estimators=params.n_estimators)
        forest.fit(x_train, y_train)

        quad_split = QuadSplitClassifier(
            rf_to_explain=forest,
            min_samples=params.min_samples,
            minimal_split_percentage=params.min_split_percentage,
            complexity_measure=measure,
            recursion_limit=50,
        )
        quad_split.fit(x_train, y_train)

        predictions = quad_split.predict(x_test)

        structure_summary = calculate_metrics_qs(quad_split)
        client.log_dict(run_id, structure_summary, 'quad_split.json')

        calculate_and_log_classification_metrics(y_test, predictions, client, run_id)

        terminate_run(run_id, client=client)
        log.info("Run finished")
    except Exception as exc:
        finish_run_and_print_exception(run_id, exc, client = client)

In [26]:
def experiment_quad_split_explain_2(run_id):
    """Run the quad-split experiment trained on a random forest's predictions.

    A random forest with the fixed ``RF_PARAMS`` / ``DT_PARAMS`` settings is
    fitted on the training half; the ``QuadSplitClassifier`` is then fitted on
    the same samples but with the forest's predictions as labels. The model is
    evaluated against the true labels of the other half, and its summary and
    classification metrics are logged to the run. Any failure inside the
    experiment is reported through ``finish_run_and_print_exception``.

    Args:
        run_id: Identifier of the MLflow run to read parameters from and log to.
    """
    raw_params, client = get_params_and_client(run_id)
    recast_rules = {
        'min_split_percentage': float,
        'min_samples': int,
    }
    params = Box(raw_params, box_recast=recast_rules)
    log.info(params)

    try:
        x_train, y_train, x_test, y_test = get_dataset(params)

        region_clf = MODELS.get(params.base_clf)
        measure = COMPLEXITY_METRICS.get(params.complexity_measure)

        forest = RandomForestClassifier(random_state=42, **RF_PARAMS, **DT_PARAMS)
        forest.fit(x_train, y_train)

        quad_split = QuadSplitClassifier(
            base_clf=region_clf,
            min_samples=params.min_samples,
            minimal_split_percentage=params.min_split_percentage,
            complexity_measure=measure,
            recursion_limit=50,
        )
        # The forest's own training predictions are the targets to imitate.
        forest_labels = forest.predict(x_train)
        quad_split.fit(x_train, forest_labels)

        predictions = quad_split.predict(x_test)

        structure_summary = calculate_metrics_qs(quad_split)
        client.log_dict(run_id, structure_summary, 'quad_split.json')

        calculate_and_log_classification_metrics(y_test, predictions, client, run_id)

        terminate_run(run_id, client=client)
        log.info("Run finished")
    except Exception as exc:
        finish_run_and_print_exception(run_id, exc, client = client)

In [27]:
import sys

# Remove every handler currently attached to the logger (the default one on a
# fresh kernel, or the one added by a previous run of this cell) so the cell can
# be re-run without raising and without duplicating output.
log.remove()
log.add(sys.stderr, level="INFO")

1