In [1]:
%load_ext autoreload
%autoreload 2


In [2]:


import numpy as np
import pandas as pd
import problexity as px
from box import Box
from loguru import logger as log
from mlflow import MlflowClient
from mlutils.datasets.dataset import Dataset
from mlutils.mlflow.utils import get_run_params, terminate_run, finish_run_and_print_exception
from sklearn.base import clone
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from wrapt_timeout_decorator import timeout
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


In [3]:
def _mean_one_vs_rest_complexity(x, y, cutoff_function):
    """Average `cutoff_function` over the one-vs-rest problems of one split side.

    For every distinct label in `y`, a binary target (1 = that label, 0 = any
    other label) is built and scored with `cutoff_function(x, binary_target)`.

    Returns 0.0 when `y` holds fewer than two distinct labels: there is no
    binary problem to score, and `find_best_cutoff_for` minimises this value,
    so a single-class side is ranked as the simplest outcome rather than NaN.
    """
    labels = np.unique(y)
    if len(labels) < 2:
        return 0.0

    complexities = []
    for label in labels:
        log.debug("Main label {}", label)
        # Build the target from a comparison. Relabelling in place
        # (others -> 0, then label -> 1) turns everything into 1 when the
        # label itself is 0, which silently drops that class.
        one_vs_rest_y = (y == label).astype(int)
        complexities.append(cutoff_function(x, one_vs_rest_y))
    return np.mean(complexities)


def find_best_cutoff_for(dataset, dimension, minimal_split_percentage = 0.1, cutoff_function=px.f2):
    """Find the threshold on one feature whose two sides are jointly least complex.

    Every distinct value of `dataset.x[:, dimension]` is tried as a cutoff.
    Rows with `value <= cutoff` form the left side, rows with `value > cutoff`
    the right side. A cutoff is only considered when both sides are non-empty
    and each holds at least `len(dataset.x) * minimal_split_percentage` rows.

    Args:
        dataset: Object exposing `x` (2-D array, indexed as `x[:, dimension]`)
            and `y` (array indexable with a boolean mask).
        dimension: Column index of the feature to split on.
        minimal_split_percentage: Minimal fraction of all rows required on
            each side of the split.
        cutoff_function: Callable `(x, binary_y) -> number` used as the
            complexity score; lower is treated as better.

    Returns:
        `(cutoff, left_complexity + right_complexity)` for the cutoff with the
        lowest summed complexity (the smallest such cutoff on ties), or `None`
        when no cutoff satisfies the size constraints.

    Raises:
        Any exception raised by `cutoff_function`; it is logged and re-raised.
    """
    column = dataset.x[:, dimension]
    possible_cutoffs = np.unique(column)  # sorted distinct values
    log.trace("Possible cutoffs = {}", possible_cutoffs)

    total_size = len(dataset.x)
    min_split_size = total_size * minimal_split_percentage

    best = None  # (cutoff, summed complexity) of the best candidate so far
    for cutoff in possible_cutoffs:
        left_mask = column <= cutoff
        right_mask = column > cutoff
        left_size = int(np.count_nonzero(left_mask))
        right_size = int(np.count_nonzero(right_mask))

        # The largest value always yields an empty right side; never accept an
        # empty side, even when minimal_split_percentage is 0.
        large_enough = (
            min(left_size, right_size) > 0
            and left_size >= min_split_size
            and right_size >= min_split_size
        )
        log.debug(
            "Checking left and right split sizes l={} r={}, total_dataset_size={}, test={}",
            left_size, right_size, total_size, large_enough,
        )
        if not large_enough:
            continue

        log.debug("Computing complexity for cutoff {}", cutoff)
        try:
            left_complexity = _mean_one_vs_rest_complexity(
                dataset.x[left_mask], dataset.y[left_mask], cutoff_function
            )
            right_complexity = _mean_one_vs_rest_complexity(
                dataset.x[right_mask], dataset.y[right_mask], cutoff_function
            )
        except Exception:
            log.exception("Complexity computation failed for dimension {} at cutoff {}", dimension, cutoff)
            raise

        total_complexity = left_complexity + right_complexity
        # Strict comparison keeps the first (smallest) cutoff on ties.
        if best is None or total_complexity < best[1]:
            best = (cutoff, total_complexity)

    return best


In [4]:
def recursive_cutoff(dataset, current_conditions=None, recursion_level=0, min_samples = 10, recursion_limit = 4, minimal_split_percentage = 0.1, complexity_measure = px.f2):
    """Recursively partition `dataset` and return the rules describing the leaves.

    At each level the best cutoff of every feature is obtained from
    `find_best_cutoff_for`; the feature whose cutoff has the lowest complexity
    value is used to split the data into `colN <= cutoff` and `colN > cutoff`,
    and both halves are partitioned further with the same settings.

    Args:
        dataset: Object exposing `x` (2-D array) and `y` (array indexable with
            a boolean mask).
        current_conditions: Conditions accumulated on the path to this node
            (list of strings); `None` means the root.
        recursion_level: Depth of this node; the root is level 0.
        min_samples: Nodes with fewer rows than this are not split.
        recursion_limit: Nodes at this level or deeper are not split;
            -1 disables the limit.
        minimal_split_percentage: Forwarded to `find_best_cutoff_for`.
        complexity_measure: Forwarded to `find_best_cutoff_for` as
            `cutoff_function`.

    Returns:
        A set of strings, one per leaf, each the leaf's conditions joined with
        " and ". When the root itself is not split the set is `{""}`.
    """
    log.info("Recursion level = {}", recursion_level)

    if current_conditions is None:
        current_conditions = []

    # What this node contributes when it turns out to be a leaf.
    leaf = {" and ".join(current_conditions)}

    if recursion_limit != -1 and recursion_level >= recursion_limit:
        log.info("Recursion limit reached {}", recursion_level)
        return leaf

    if len(dataset.x) < min_samples:
        log.info("min_samples limit reached {} < {}", len(dataset.x), min_samples)
        return leaf

    best = None  # (dimension, cutoff, complexity value)
    for feature_idx in range(dataset.x.shape[1]):
        cutoff_and_value = find_best_cutoff_for(
            dataset,
            feature_idx,
            minimal_split_percentage=minimal_split_percentage,
            cutoff_function=complexity_measure,
        )
        if cutoff_and_value is None:
            continue

        cutoff, value = cutoff_and_value
        # Rank dimensions by complexity value, not by the cutoff threshold;
        # strict comparison keeps the lowest feature index on ties.
        if best is None or value < best[2]:
            best = (feature_idx, cutoff, value)

    if best is None:
        log.debug("No best cutoff found")
        return leaf

    best_cutoff_dimension, best_cutoff, best_cutoff_value = best
    log.debug("Best cutoff value = {} ({} at dim {})", best_cutoff_value, best_cutoff, best_cutoff_dimension)

    left_condition = f"col{best_cutoff_dimension} <= {best_cutoff}"
    right_condition = f"col{best_cutoff_dimension} > {best_cutoff}"
    log.debug(left_condition)
    log.debug(right_condition)

    split_column = dataset.x[:, best_cutoff_dimension]
    left_mask = split_column <= best_cutoff
    right_mask = split_column > best_cutoff

    # A split with an empty side makes no progress and would recurse forever
    # when recursion_limit is -1.
    if not left_mask.any() or not right_mask.any():
        log.debug("Degenerate split with an empty side, stopping")
        return leaf

    # Children must use the caller's settings and sit exactly one level deeper.
    child_settings = dict(
        recursion_level=recursion_level + 1,
        min_samples=min_samples,
        recursion_limit=recursion_limit,
        minimal_split_percentage=minimal_split_percentage,
        complexity_measure=complexity_measure,
    )
    left_statements = recursive_cutoff(
        Box(x=dataset.x[left_mask], y=dataset.y[left_mask]),
        current_conditions + [left_condition],
        **child_settings,
    )
    right_statements = recursive_cutoff(
        Box(x=dataset.x[right_mask], y=dataset.y[right_mask]),
        current_conditions + [right_condition],
        **child_settings,
    )
    log.debug("Left statements {}", left_statements)
    log.debug("Right statments {}", right_statements)

    return left_statements.union(right_statements)


In [5]:
def as_classifier(clf_by_rule):
    """Wrap per-rule classifiers into a single object with a `predict` method.

    Args:
        clf_by_rule: Mapping from a rule (a `DataFrame.query` expression over
            columns named `col0`, `col1`, ...) to a fitted classifier exposing
            `predict`. An empty rule string selects every row.

    Returns:
        A `Box` whose `predict(X)` routes each row of `X` to the classifier of
        the rule it satisfies and returns the predictions as a numpy array.
        Rows that satisfy no rule are predicted as NaN.
    """
    def predict(X):
        feature_columns = [f"col{i}" for i in range(X.shape[1])]
        df = pd.DataFrame(X, columns=feature_columns)

        for rule, clf in clf_by_rule.items():
            # DataFrame.query rejects an empty expression, so an empty rule
            # (an unsplit root) is handled as "all rows".
            idx_to_predict = df.query(rule).index if rule else df.index
            if len(idx_to_predict) == 0:
                continue

            # Select only the feature columns so the 'prediction' column
            # added below is never fed back into a classifier.
            to_predict = df.loc[idx_to_predict, feature_columns].to_numpy()
            df.loc[idx_to_predict, 'prediction'] = clf.predict(to_predict)

        if 'prediction' not in df.columns:
            # No rule matched any row (or X has no rows at all).
            return np.full(len(df), np.nan)
        return df['prediction'].to_numpy()

    return Box({
        "predict": predict
    })

In [6]:
def train_quad_clf(train_x, train_y, base_clf_id = "perceptron", min_samples = 10, recursion_limit = -1, minimal_split_percentage = 0.1, complexity_measure = "f2"):
    """Partition the training data with `recursive_cutoff` and fit one classifier per region.

    Args:
        train_x: 2-D numpy array of features.
        train_y: 1-D numpy array of labels aligned with `train_x`.
        base_clf_id: Classifier fitted in every region that contains more than
            one class: "perceptron", "dt", "knn" or "nb".
        min_samples: Forwarded to `recursive_cutoff`.
        recursion_limit: Forwarded to `recursive_cutoff` (-1 = no limit).
        minimal_split_percentage: Forwarded to `recursive_cutoff`.
        complexity_measure: Name of the problexity measure that scores the
            splits: "f2", "f4", "l2", "l3", "n1", "n2", "n3", "t1", "clsCoef"
            or "density".

    Returns:
        The object produced by `as_classifier`, exposing `predict(X)`.

    Raises:
        ValueError: If `base_clf_id` or `complexity_measure` is not one of the
            supported names.
    """
    # Factories rather than instances: only the requested estimator is built,
    # and every region gets its own fresh, unfitted copy.
    base_clf_factories = {
        "perceptron": lambda: Perceptron(random_state=42),
        "dt": lambda: DecisionTreeClassifier(random_state=42),
        "knn": lambda: KNeighborsClassifier(),
        "nb": lambda: GaussianNB(),
    }
    if base_clf_id not in base_clf_factories:
        raise ValueError(
            f"Unknown base_clf_id {base_clf_id!r}; expected one of {sorted(base_clf_factories)}"
        )
    make_base_clf = base_clf_factories[base_clf_id]

    complexity_measures = {
        "f2": px.f2,
        "f4": px.f4,
        "l2": px.l2,
        "l3": px.l3,
        "n1": px.n1,
        "n2": px.n2,
        "n3": px.n3,
        "t1": px.t1, # slow
        "clsCoef": px.clsCoef,
        "density": px.density,
    }
    if complexity_measure not in complexity_measures:
        raise ValueError(
            f"Unknown complexity_measure {complexity_measure!r}; expected one of {sorted(complexity_measures)}"
        )
    complexity_measure_func = complexity_measures[complexity_measure]

    statements = recursive_cutoff(Box(
        x=train_x,
        y=train_y
    ), min_samples=min_samples, recursion_limit=recursion_limit, minimal_split_percentage=minimal_split_percentage, complexity_measure=complexity_measure_func)

    log.info("Statements {}", statements)

    # Frame with the col0, col1, ... names that the rules refer to.
    feature_frame = pd.DataFrame(
        train_x, columns=[f"col{col}" for col in range(train_x.shape[1])]
    )

    clf_by_rules = {}
    # `statements` is a set; sorting gives a reproducible training and log order.
    for query in sorted(statements):
        # DataFrame.query rejects an empty expression, so an empty rule
        # (no split was made) is handled as "all rows".
        idx = feature_frame.query(query).index if query else feature_frame.index
        if len(idx) == 0:
            log.warning("No training samples match rule {!r}; skipping it", query)
            continue

        x_train = train_x[idx]
        y_train = train_y[idx]

        log.debug(len(x_train))
        log.debug(query)

        if len(np.unique(y_train)) == 1:
            # Single-class region: a constant predictor is enough, and some
            # base classifiers cannot be fitted on one class.
            clf_by_rules[query] = DummyClassifier(strategy="constant", constant=y_train[0]).fit(x_train, y_train)
        else:
            clf = make_base_clf()
            clf.fit(x_train, y_train)
            clf_by_rules[query] = clf

    return as_classifier(clf_by_rules)

In [11]:
def do_experiment_v1(run_id, timeout_seconds=200):
    """Run one rule-partitioned classifier experiment for an MLflow run.

    Reads the run's parameters (`train_path`, `base_clf`, `min_samples`,
    `min_split_percentage`, `complexity_measure`), trains `train_quad_clf` on
    the training split under a time limit, logs the test accuracy as metric
    "acc" and terminates the run. Any exception raised after the parameters
    are read is handed to `finish_run_and_print_exception` instead of
    propagating.

    Args:
        run_id: Identifier of the MLflow run to execute.
        timeout_seconds: Limit passed to `timeout(...)` around the training
            call.
    """
    client = MlflowClient(tracking_uri="sqlite:///experiments.db")
    param = get_run_params(run_id, client=client)
    log.info(param)

    try:
        # DATA and preprocessing
        # Swap only the last "tra" marker: a plain str.replace would also
        # rewrite "tra" inside directory or dataset names (e.g. "australian").
        # NOTE(review): assumes the train/test marker is the last "tra" in the path — confirm.
        test_path = "tst".join(param.train_path.rsplit("tra", 1))
        name = param.train_path.split("/")[-1].split('-')[0]
        dataset = Dataset.read_dataset(param.train_path, test_path, name) \
            .encode_x_to_labels() \
            .encode_y_to_numeric_labels()

        # model
        train_with_timeout = timeout(timeout_seconds)(train_quad_clf)
        model = train_with_timeout(
            dataset.train.x,
            dataset.train.y,
            base_clf_id=param.base_clf,
            min_samples=int(param.min_samples),
            minimal_split_percentage=float(param.min_split_percentage),
            complexity_measure=param.complexity_measure,
        )

        # accuracy_score expects (y_true, y_pred).
        acc = accuracy_score(dataset.test.y, model.predict(dataset.test.x))
        client.log_metric(run_id, "acc", acc)
        log.info("Acc = {}", acc)

        terminate_run(run_id, client=client)
    except Exception as e:
        finish_run_and_print_exception(run_id, e, client = client)

In [12]:
def do_experiment_base(run_id):
    """Run the baseline experiment for an MLflow run.

    Reads the run's `train_path` parameter, fits a decision tree, a random
    forest and a perceptron on the training split, logs their test accuracies
    as metrics "dt_acc", "rf_acc" and "perceptron_acc" and terminates the run.
    Any exception raised after the parameters are read is handed to
    `finish_run_and_print_exception` instead of propagating.

    Args:
        run_id: Identifier of the MLflow run to execute.
    """
    client = MlflowClient(tracking_uri="sqlite:///experiments.db")
    param = get_run_params(run_id, client=client)
    from loguru import logger
    logger.info(param)

    try:
        # DATA and preprocessing
        # Swap only the last "tra" marker: a plain str.replace would also
        # rewrite "tra" inside directory or dataset names (e.g. "australian").
        # NOTE(review): assumes the train/test marker is the last "tra" in the path — confirm.
        test_path = "tst".join(param.train_path.rsplit("tra", 1))
        name = param.train_path.split("/")[-1].split('-')[0]
        dataset = Dataset.read_dataset(param.train_path, test_path, name) \
            .encode_x_to_labels() \
            .encode_y_to_numeric_labels()

        # model — keyed by metric name, in the order the metrics are logged
        models_by_metric = {
            "dt_acc": DecisionTreeClassifier(random_state=42),
            "rf_acc": RandomForestClassifier(random_state=42),
            "perceptron_acc": Perceptron(random_state=42),
        }

        # Fit everything before logging anything, so a failing fit leaves no
        # partial set of metrics on the run.
        for model in models_by_metric.values():
            model.fit(dataset.train.x, dataset.train.y)

        for metric_name, model in models_by_metric.items():
            # accuracy_score expects (y_true, y_pred).
            accuracy = accuracy_score(dataset.test.y, model.predict(dataset.test.x))
            client.log_metric(run_id, metric_name, accuracy)

        terminate_run(run_id, client=client)
    except Exception as e:
        finish_run_and_print_exception(run_id, e, client = client)

In [9]:
import sys

# Drop every sink currently registered and install a single INFO-level stderr
# sink. Removing by id 0 raises ValueError once that handler is gone, so the
# cell could not be re-run; remove() without arguments keeps it idempotent.
log.remove()
log.add(sys.stderr, level="INFO")