In [2]:
%load_ext autoreload
%autoreload 2


In [1]:


import numpy as np
import pandas as pd
import problexity as px
from box import Box
from loguru import logger as log
from mlflow import MlflowClient
from datasetz.core.load_dataset import load_embedded_dataset
from sklearn.model_selection import ShuffleSplit
from mlutils.datasets.dataset import Dataset
from mlutils.mlflow.utils import get_run_params, terminate_run, finish_run_and_print_exception
from sklearn.base import clone
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from wrapt_timeout_decorator import timeout
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from quadsplits.quadsplits import recursive_cutoff
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors
from datasetz.core.load_dataset import load_embedded_dataset
from sklearn.model_selection import StratifiedKFold

In [3]:
def exclude_indices(arr, indices):
    """Return the entries of ``arr`` along axis 0 whose position is not in ``indices``.

    Args:
        arr: array indexed along its first axis.
        indices: collection of integer positions to leave out.

    Returns:
        ``arr`` filtered with a boolean mask, original order preserved.
    """
    row_positions = np.arange(arr.shape[0])
    # invert=True yields "position is NOT one of `indices`" directly.
    keep_mask = np.isin(row_positions, indices, invert=True)
    return arr[keep_mask]

def find_nearest_neighbors_indices(arr, point, n):
    """Return the positions in ``arr`` of the ``n`` nearest neighbours of ``point``.

    Args:
        arr: samples the neighbour search is fitted on.
        point: a single query sample.
        n: number of neighbours to look up.

    Returns:
        Flat array of ``n`` positions into ``arr`` (not into any larger array
        ``arr`` may have been sliced from).
    """
    searcher = NearestNeighbors(n_neighbors=n).fit(arr)
    # kneighbors returns (distances, positions); only the positions are needed.
    _, neighbor_positions = searcher.kneighbors([point], n)
    return neighbor_positions.reshape(-1)

In [ ]:
def as_classifier_from_rf(clf_by_rule, rf):
    """Wrap per-rule classifiers into a single object exposing ``predict``.

    The previous body was a line-for-line copy of ``as_classifier``; it now
    delegates to it so the routing logic lives in one place.

    Args:
        clf_by_rule: mapping from a ``DataFrame.query`` expression over columns
            named ``col0``, ``col1``, ... to a fitted estimator.
        rf: currently unused; kept so existing callers keep working.
            NOTE(review): presumably meant as a fallback model for samples no
            rule covers - confirm before wiring it in.

    Returns:
        The ``Box`` built by ``as_classifier`` (a single ``predict`` entry).
    """
    return as_classifier(clf_by_rule)

In [4]:
def as_classifier(clf_by_rule):
    """Wrap per-rule classifiers into a single object exposing ``predict``.

    Args:
        clf_by_rule: mapping from a ``DataFrame.query`` expression over columns
            named ``col0``, ``col1``, ... to a fitted estimator with a
            ``predict`` method.

    Returns:
        A ``Box`` with a single ``predict`` entry.
    """
    def predict(X):
        """Route every row of ``X`` to the classifier of the rule it satisfies.

        Rules are applied in the iteration order of ``clf_by_rule``; a row that
        satisfies several rules keeps the prediction of the last one. Rows that
        satisfy no rule come back as NaN.
        """
        feature_columns = [f"col{i}" for i in range(X.shape[1])]
        df = pd.DataFrame(X, columns=feature_columns)

        for rule, clf in clf_by_rule.items():
            # Evaluate the rule once and reuse the matched labels.
            matched = df.query(rule).index
            if len(matched) == 0:
                continue

            # Select only the feature columns so the 'prediction' column added
            # by earlier iterations never leaks into the model input.
            features = df.loc[matched, feature_columns].to_numpy()
            df.loc[matched, 'prediction'] = clf.predict(features)

        if 'prediction' not in df.columns:
            # No rule matched any row (or X is empty): report "no prediction"
            # the same way partially unmatched input does, instead of failing
            # with an AttributeError on the missing column.
            return np.full(len(df), np.nan)

        return df['prediction'].to_numpy()

    return Box({
        "predict": predict
    })

In [10]:
# Unfitted prototype estimators, keyed by the identifier that `train_quad_clf`
# receives as `base_clf_id`; they are cloned there before fitting.
BASE_CLFS = dict(
    perceptron=Perceptron(random_state=42),
    dt=DecisionTreeClassifier(random_state=42),
    knn=KNeighborsClassifier(),
    nb=GaussianNB(),
    svm=LinearSVC(random_state=42),
)

# problexity complexity measures, keyed by the name of the `px` function.
COMPLEXITIES = {
    measure: getattr(px, measure)
    for measure in (
        "f1", "f2", "f3",
        "l1", "l2", "l3",
        "n1", "n2", "n3",
        "t1",  # slow
        "t2",  # slow
        "t3",
    )
}

In [13]:
def _make_oversampling_function(oversampling_in_splitting):
    """Build the ``(x, y) -> (x, y)`` resampler handed to ``recursive_cutoff``."""
    if oversampling_in_splitting != "SMOTE":
        return lambda x, y: (x, y)

    smote = SMOTE(random_state=42, k_neighbors=1)

    def resample_x_y(x, y):
        classes, counts = np.unique(y, return_counts=True)
        # Leave the data alone unless there are at least two classes and each
        # of them has more than 3 samples.
        if len(classes) <= 1 or not (counts > 3).all():
            return x, y
        log.info("Will resample with size {} {} and classes = {}", len(x), len(y), classes)
        return smote.fit_resample(x, y)

    return resample_x_y


def _add_outside_neighbors(train_x, train_y, idx, x_region, y_region, n_neighbors):
    """Append to a region the outside samples closest to the region's centroid."""
    outside_x = exclude_indices(train_x, idx)
    outside_y = exclude_indices(train_y, idx)

    # Never ask for more neighbours than there are samples outside the region.
    k = min(n_neighbors, len(outside_x))
    if k <= 0:
        return x_region, y_region

    centroid = x_region.mean(axis=0)
    # The returned positions refer to `outside_x`, so they must index the
    # filtered arrays - not the full `train_x` / `train_y`.
    nn_indices = find_nearest_neighbors_indices(outside_x, centroid, k)
    return (
        np.append(x_region, outside_x[nn_indices], axis=0),
        np.append(y_region, outside_y[nn_indices], axis=0),
    )


def train_quad_clf(train_x, train_y, base_clf_id = "perceptron", min_samples = 10, recursion_limit = -1, minimal_split_percentage = 0.1, complexity_measure = "f2", oversampling_in_splitting=None, neighbors_in_learning = None, log_metric=lambda x,y: None, log_param=lambda x,y: None):
    """Split the training data into regions and fit one classifier per region.

    Args:
        train_x: 2-D numpy array of training features.
        train_y: 1-D numpy array of training labels aligned with ``train_x``.
        base_clf_id: key into ``BASE_CLFS``; the selected estimator is cloned
            for every region that contains more than one class.
        min_samples: forwarded to ``recursive_cutoff``.
        recursion_limit: forwarded to ``recursive_cutoff``.
        minimal_split_percentage: starting value forwarded to
            ``recursive_cutoff``; reduced by 10% after every attempt.
        complexity_measure: key into ``COMPLEXITIES``.
        oversampling_in_splitting: ``"SMOTE"`` to oversample inside the
            splitting procedure; any other value disables oversampling.
        neighbors_in_learning: if not None, number of samples from outside each
            region (nearest to the region centroid) added to its training set.
            Capped at the number of samples available outside the region.
        log_metric: callback ``(name, value)`` used to report metrics.
        log_param: callback ``(name, value)``; accepted but currently unused.

    Returns:
        The object built by ``as_classifier`` for the per-region classifiers,
        or a fitted clone of the base classifier when there are no statements.
    """
    base_clf = BASE_CLFS.get(base_clf_id)
    complexity_measure_func = COMPLEXITIES.get(complexity_measure)
    oversampling_in_splitting_function = _make_oversampling_function(oversampling_in_splitting)

    statements = {}
    used_split_percentage = minimal_split_percentage

    # Retry with a smaller split percentage until the result is neither empty
    # nor a single empty rule.
    # NOTE(review): there is no lower bound on the percentage, so this loop only
    # ends once a usable split is found; do_experiment_v1 relies on its
    # timeout(400) wrapper to stop it otherwise.
    while not statements or (len(statements) == 1 and '' in statements):
        # Remember the value actually used for this attempt so the metric
        # logged below is not off by one step.
        used_split_percentage = minimal_split_percentage
        statements = recursive_cutoff(Box(
            x=train_x,
            y=train_y
        ), min_samples=min_samples, recursion_limit=recursion_limit, minimal_split_percentage=minimal_split_percentage, complexity_measure=complexity_measure_func, oversampling_function=oversampling_in_splitting_function)
        minimal_split_percentage = minimal_split_percentage - minimal_split_percentage * 0.1
        log.info("Stepping down minimial split percentage = {}", minimal_split_percentage)

    log.info("Statements {}", statements)

    if not statements:
        # Defensive fallback: the loop above does not exit with an empty result.
        log.info("No statements! Training base clf")
        clf = clone(base_clf)
        clf.fit(train_x, train_y)
        log_metric("no_statements", True)
        return clf

    log_metric("actual_min_split_percentage", used_split_percentage)
    log_metric("statements_size", len(statements))

    # Same column naming as as_classifier, so the statements can be evaluated
    # with DataFrame.query; the default RangeIndex makes labels == positions.
    feature_frame = pd.DataFrame(train_x, columns=[f"col{i}" for i in range(train_x.shape[1])])
    indices_by_each_statement = {
        query: feature_frame.query(query).index for query in statements
    }

    clf_by_rules = {}
    simple_areas = 0
    for query, idx in indices_by_each_statement.items():
        x_train = train_x[idx]
        y_train = train_y[idx]

        log.info("Before enhancing with nn = {}", len(x_train))

        if neighbors_in_learning is not None:
            x_train, y_train = _add_outside_neighbors(
                train_x, train_y, idx, x_train, y_train, neighbors_in_learning
            )
            log.info("After enhancing with nn = {}", len(x_train))

        if len(np.unique(y_train)) == 1:
            # Single-class region: a constant predictor is enough.
            clf_by_rules[query] = DummyClassifier(strategy="constant", constant=y_train[0]).fit(x_train, y_train)
            simple_areas = simple_areas + 1
        else:
            clf = clone(base_clf)
            clf.fit(x_train, y_train)
            clf_by_rules[query] = clf

    log_metric("simple_areas", simple_areas)

    return as_classifier(clf_by_rules)

In [8]:
def do_experiment_v1(run_id):
    """Train and evaluate the quad-split classifier for one MLflow run.

    Reads the run's parameters from the local tracking database, trains
    ``train_quad_clf`` on the first stratified fold of the configured dataset,
    logs the test accuracy as ``acc`` and terminates the run. Any exception
    raised during the experiment is handed to ``finish_run_and_print_exception``.

    Args:
        run_id: id of an existing MLflow run holding the experiment parameters.

    Raises:
        AssertionError: if the run's ``min_split_percentage`` is not below 0.5
            (checked before the experiment starts, so the run is left as-is).
    """
    def optional_param(raw, cast):
        # Parameters arrive as strings, so "not set" shows up as the text 'None'.
        return None if raw is None or raw == 'None' else cast(raw)

    client = MlflowClient(tracking_uri="sqlite:///experiments.db")
    params = get_run_params(run_id, client)

    assert float(params.min_split_percentage) < 0.5

    log.info(params)

    try:
        dataset = load_embedded_dataset('keel-embedded', params.dataset_name)

        # DATA and preprocessing
        splitter = StratifiedKFold()
        train_test_dataset = (
            dataset
            .encode_x_to_labels()
            .encode_y_to_numeric_labels()
            .train_test_split(splitter)[0]
        )

        # model (training is aborted after 400 seconds)
        model = timeout(400)(train_quad_clf)(
            train_test_dataset.train.x, train_test_dataset.train.y,
            base_clf_id=params.base_clf,
            min_samples=int(params.min_samples),
            minimal_split_percentage=float(params.min_split_percentage),
            complexity_measure=params.complexity_measure,
            recursion_limit=30,
            neighbors_in_learning=optional_param(params.neighbors_in_learning, int),
            oversampling_in_splitting=optional_param(params.oversampling_in_splitting, lambda value: value),
            log_metric=lambda title, value: client.log_metric(run_id, title, value),
            log_param=lambda title, value: client.log_param(run_id, title, value))

        # accuracy_score expects (y_true, y_pred).
        acc = accuracy_score(train_test_dataset.test.y, model.predict(train_test_dataset.test.x))
        client.log_metric(run_id, "acc", acc)
        log.info("Acc = {}", acc)

        terminate_run(run_id, client=client)
    except Exception as e:
        finish_run_and_print_exception(run_id, e, client = client)

In [9]:
# Re-run a single, previously registered MLflow run by its id.
do_experiment_v1(run_id='b5675b2ddf654008b23dd66183324080')

2024-03-25 19:26:30.614 | INFO     | __main__:do_experiment_v1:8 - {'base_clf': 'svm', 'complexity_measure': 'f2', 'dataset_name': 'ecoli', 'min_samples': '10', 'min_split_percentage': '0.3', 'neighbors_in_learning': '20', 'oversampling_in_splitting': 'None'}
2024-03-25 19:26:30.640 | INFO     | quadsplits.quadsplits:recursive_cutoff:55 - Recursion level = 0
2024-03-25 19:26:30.641 | DEBUG    | quadsplits.quadsplits:recursive_cutoff:65 - Recursion level = 1
2024-03-25 19:26:30.646 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split percentage=0.3, size=80.39999999999999
  return np.nanprod(f_overlap/f_range)
2024-03-25 19:26:30.662 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split percentage=0.3, size=80.39999999999999
2024-03-25 19:26:30.673 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split percentage=0.3, size=80.39999999999999
2024-03-25 19:26:30.674 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split pe

/Users/bgulowaty/studia/projekty/datasets/definitions
{PosixPath('/Users/bgulowaty/studia/projekty/datasets/definitions/keel-embedded-splits.yml'), PosixPath('/Users/bgulowaty/studia/projekty/datasets/definitions/keel-splits.yml'), PosixPath('/Users/bgulowaty/studia/projekty/datasets/definitions/keel-embedded.yml')}


2024-03-25 19:26:30.822 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split percentage=0.3, size=17.7
2024-03-25 19:26:30.826 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split percentage=0.3, size=17.7
2024-03-25 19:26:30.826 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split percentage=0.3, size=17.7
2024-03-25 19:26:30.828 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split percentage=0.3, size=17.7
2024-03-25 19:26:30.835 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split percentage=0.3, size=17.7
2024-03-25 19:26:30.845 | INFO     | quadsplits.quadsplits:find_best_cutoff_for:20 - min split percentage=0.3, size=17.7
2024-03-25 19:26:30.854 | DEBUG    | quadsplits.quadsplits:recursive_cutoff:89 - Cutoffs = {0: {'cutoff': 19, 'value': 0.000791391061819862}, 1: {'cutoff': 15, 'value': 0.0}, 4: {'cutoff': 21, 'value': 0.0}, 5: {'cutoff': 18, 'value': 0.0}, 6: {'cutoff': 25, 'value': 0.0001

In [7]:
def do_experiment_base(run_id):
    """Train and evaluate the baseline classifiers for one MLflow run.

    Fits a decision tree, a random forest and a perceptron on the train part of
    the split selected by the run's ``dataset_split`` parameter and logs their
    test accuracies as ``dt_acc``, ``rf_acc`` and ``perceptron_acc``. Any
    exception raised during the experiment is handed to
    ``finish_run_and_print_exception``.

    Args:
        run_id: id of an existing MLflow run holding the experiment parameters.
    """
    client = MlflowClient(tracking_uri="sqlite:///experiments.db")
    # Bound as `params`: the body below reads `params.*`, which previously only
    # worked if a global of that name happened to exist in the kernel.
    params = get_run_params(run_id, client=client)

    log.info(params)

    try:
        dataset = load_embedded_dataset('keel-embedded', params.dataset_name)

        # DATA and preprocessing
        splitter = ShuffleSplit(random_state=42, n_splits=2, train_size=0.5)
        train_test_dataset = (
            dataset
            .encode_x_to_labels()
            .encode_y_to_numeric_labels()
            .train_test_split(splitter)
        )[int(params.dataset_split)]

        # Fit and score on the encoded split, not on the raw `dataset`.
        train = train_test_dataset.train
        test = train_test_dataset.test

        # model
        baselines = {
            "dt": DecisionTreeClassifier(random_state=42),
            "rf": RandomForestClassifier(random_state=42),
            "perceptron": Perceptron(random_state=42),
        }

        # Fit everything first so no metric is logged if any model fails to train.
        for clf in baselines.values():
            clf.fit(train.x, train.y)

        for name, clf in baselines.items():
            # accuracy_score expects (y_true, y_pred).
            client.log_metric(run_id, f"{name}_acc", accuracy_score(test.y, clf.predict(test.x)))

        terminate_run(run_id, client=client)
    except Exception as e:
        finish_run_and_print_exception(run_id, e, client = client)

In [8]:
import sys

# Route loguru output to stderr at TRACE level.
# `log.remove()` without an id drops every registered handler, so this cell can
# be re-run: `log.remove(0)` raises ValueError once the default handler is gone,
# and each extra `log.add` would otherwise stack a duplicate sink.
log.remove()
log.add(sys.stderr, level="TRACE")