In [1]:
%load_ext autoreload
%autoreload 2


In [230]:
from glob import glob

import numpy as np
import problexity as px
from box import Box
import pandas as pd
from loguru import logger as log
from mlutils.datasets.dataset import Dataset
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.base import clone
from toolz.curried import filter
from sklearn.metrics import  accuracy_score


In [3]:
all_processed_data_files = glob(f"../datasets/notebooks/processed/*.csv")

all_file_names = [
    file.split('/')[-1].split('train')[0].rstrip('-') for file in all_processed_data_files if 'tra.csv' in file
]
print(all_file_names[:5])

['shuttle-5-5tra.csv', 'pima-5-5tra.csv', 'wisconsin-5-4tra.csv', 'automobile-5-2tra.csv', 'spambase-5-1tra.csv']


In [4]:
train_file_paths = [it for it in all_processed_data_files if 'tra.csv' in it]
dataset_names = {path.split('/')[-1].split('-')[0] for path in train_file_paths}
selected_datasets = []
for name in dataset_names:
    selected_datasets.append([p for p in train_file_paths if name in p][0])

In [5]:
param = Box(
    train_path = train_file_paths[5],
    ensemble_size=20
)

In [6]:

train_path = param.train_path.replace('tra', 'tst')
name = param.train_path.split("/")[-1].split('-')[0]
dataset = Dataset.read_dataset(param.train_path, train_path, name) \
    .encode_x_to_labels() \
    .encode_y_to_numeric_labels()

# bagging
bagging = BaggingClassifier(
    estimator=Perceptron(random_state=42),
    n_estimators=int(param.ensemble_size),
    max_samples=0.3,
    random_state=42)
bagging.fit(dataset.train.x, dataset.train.y)


In [7]:
import warnings
warnings.filterwarnings("ignore")


In [8]:
test_x = [
    [0, 0], [10,10], [0, 10], [10, 0], [5, 5], [5, 7]
]
test_y = [1, 1, 1, 1, 0, 0]
px.f2(test_x, test_y)

0.0

In [215]:
labels = np.array([1,3,2,2,2,2,1,2,3,4,5,5,5,5,6])

In [217]:
labels_c = labels.copy()

In [218]:
labels_c[labels_c != 2] = 0

In [220]:
labels_c

array([0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0])

In [213]:
np.putmask(labels, labels!=2, 1)

In [214]:
labels

array([1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1])

In [203]:
np.array([1,2,2,2,2,1]) != 2

array([ True, False, False, False, False,  True])

In [228]:
def find_best_cutoff_for(dataset, dimension, cutoff_function=px.f2):
    log.debug("Cutoff calculation dataset size {}", len(dataset.x))
    possible_cutoffs = sorted(set(dataset.x[:, dimension]))
    log.debug("Possible cutoffs size {}", len(possible_cutoffs))
    samples_in_cutoffs = [
        Box({
            'cutoff': cutoff,
            'left': {
                'x': dataset.x[dataset.x[:, dimension] <= cutoff],
                'y': dataset.y[dataset.x[:, dimension] <= cutoff],
            },
            'right': {
                'x': dataset.x[dataset.x[:, dimension] > cutoff],
                'y': dataset.y[dataset.x[:, dimension] > cutoff],
            }
         })
        for cutoff in possible_cutoffs
    ]

    def should_pass(cutoff_samples):
        left_size = len(cutoff_samples.left.x)
        right_size = len(cutoff_samples.right.x)
        return left_size > 5 and right_size > 5 and left_size/len(dataset.x) > 0.1 and right_size/len(dataset.x) > 0.1

    samples_in_cutoffs_filtered = [
        cutoff_samples for cutoff_samples in samples_in_cutoffs if should_pass(cutoff_samples)
    ]

    for it in samples_in_cutoffs_filtered:
        try:

            left_complexities = []
            for label in np.unique(it.left.y):
                ovo_y = it.left.y.copy()
                ovo_y[ovo_y != label] = 0
                ovo_y[ovo_y == label] = 1
                if len(np.unique(ovo_y)) == 1:
                    continue

                log.debug(ovo_y)
                left_complexity = cutoff_function(it.left.x, ovo_y)
                left_complexities.append(left_complexity)


            right_complexities = []
            for label in np.unique(it.right.y):
                ovo_y = it.left.y.copy()
                ovo_y[ovo_y != label] = 0
                ovo_y[ovo_y == label] = 1
                if len(np.unique(ovo_y)) == 1:
                    continue
                log.debug(ovo_y)
                right_complexity = cutoff_function(it.left.x, ovo_y)
                right_complexities.append(right_complexity)
                
        except Exception as e:
            raise e
            log.exception(e)
            return None

        it.left_complexity = np.mean(left_complexities)
        it.right_complexity = np.mean(right_complexities)

    lowest_complexity = min(samples_in_cutoffs_filtered, key = lambda it: it.left_complexity + it.right_complexity)

    if lowest_complexity is None:
        return None


    return lowest_complexity.cutoff, lowest_complexity.left_complexity + lowest_complexity.right_complexity


In [195]:
def recursive_cutoff(dataset, current_conditions=None, recursion_level=0, min_samples = 10, recursion_limit = 4, cutoff_function = px.f2):
    log.debug("Recursion {}", recursion_level)

    if current_conditions is None:
        current_conditions = list()

    log.debug("Current conditions = {}", current_conditions)

    if recursion_limit != -1 and recursion_level >= recursion_limit:
        log.debug("Recursion limit reached")
        log.debug(current_conditions)
        return {" and ".join(current_conditions)}
    recursion_level = recursion_level + 1
    log.debug("Recursion level = {}", recursion_level)

    if len(dataset.x) < min_samples:
        log.debug("Min samples reached")
        log.debug(current_conditions)
        return {" and ".join(current_conditions)}

    features_count = dataset.x.shape[1]

    best_cutoff_by_dimension = {}

    for feature_idx in range(features_count):
        cutoff_and_value = find_best_cutoff_for(Box(x=dataset.x, y=dataset.y), feature_idx, cutoff_function=cutoff_function)
        if cutoff_and_value is None:
            continue

        cutoff, value = cutoff_and_value

        best_cutoff_by_dimension[feature_idx] = {
            'cutoff': cutoff,
            'value': value
        }

    log.debug("Values={}", best_cutoff_by_dimension)

    if not best_cutoff_by_dimension:
        log.debug("Returning")
        log.debug(current_conditions)
        return {" and ".join(current_conditions)}

    best_cutoff_entry = min(best_cutoff_by_dimension.items(), key=lambda it: it[1]['cutoff'])
    best_cutoff_dimension = best_cutoff_entry[0]
    best_cutoff = best_cutoff_entry[1]['cutoff']
    best_cutoff_value = best_cutoff_entry[1]['value']

    log.debug("Best cutoff value = {} ({} at dim {})", best_cutoff_value, best_cutoff, best_cutoff_dimension)

    left_conditions = f"col{best_cutoff_dimension} <= {best_cutoff}"
    right_conditions = f"col{best_cutoff_dimension} > {best_cutoff}"
    log.debug(left_conditions)
    log.debug(right_conditions)

    left_indicies = dataset.x[:, best_cutoff_dimension] <= best_cutoff
    right_indicies = dataset.x[:, best_cutoff_dimension] > best_cutoff

    left_statements = recursive_cutoff(Box(x=dataset.x[left_indicies], y=dataset.y[left_indicies]), current_conditions + [left_conditions], recursion_level + 1)
    right_statements = recursive_cutoff(Box(x=dataset.x[right_indicies], y=dataset.y[right_indicies]), current_conditions + [right_conditions], recursion_level + 1)
    log.debug("Left statements {}", left_statements)
    log.debug("Right statments {}", right_statements)

    return left_statements.union(right_statements)


In [196]:
def as_classifier(clf_by_rule):
    def predict(X):
        df = pd.DataFrame(X, columns=[f"col{i}" for i in range(X.shape[1])])

        for rule, clf in clf_by_rule.items():
            idx_to_predict = df.query(rule).index
            to_predict = df.loc[idx_to_predict] \
                .drop('prediction', axis=1, errors='ignore') \
                .drop('rule', axis=1, errors='ignore') \
                .to_numpy()

            if len(to_predict) == 0:
                continue

            df.loc[idx_to_predict, 'prediction'] = clf.predict(df.loc[df.query(rule).index] \
                                                               .drop('prediction', axis=1, errors='ignore') \
                                                               .drop('rule', axis=1, errors='ignore') \
                                                               .to_numpy())
            df.loc[df.query(rule).index, 'rule'] = rule

        return df.prediction.to_numpy()

    return Box({
        "predict": predict
    })

In [235]:
def train_quad_clf(train_x, train_y, base_clf = Perceptron(random_state=42), min_samples = 10, recursion_limit = -1):
    statements = recursive_cutoff(Box(
        x=train_x,
        y=train_y
    ), min_samples=min_samples, recursion_limit=recursion_limit)
    
    clf_by_rules = {}
    x_for_indices_calculation = pd.DataFrame(dataset.train.x)
    for col in range(x_for_indices_calculation.shape[1]):
        x_for_indices_calculation[f"col{col}"] = x_for_indices_calculation[col]
    indices_by_each_statement = {
        query: x_for_indices_calculation.query(query).index for query in statements
    }
    
    for query, idx in indices_by_each_statement.items():
        x_train = dataset.train.x[idx]
        y_train = dataset.train.y[idx]
    
        if len(np.unique(y_train)) == 1:
            clf_by_rules[query] = DummyClassifier(strategy="constant", constant=y_train[0]).fit(x_train, y_train)
        else:
            clf = clone(base_clf)
            clf.fit(x_train, y_train)
            clf_by_rules[query] = clf
    
    return as_classifier(clf_by_rules)

In [232]:
train_quad_clf(dataset.train.x, dataset.train.y).predict

2023-08-12 17:47:41.318 | DEBUG    | __main__:recursive_cutoff:2 - Recursion 0
2023-08-12 17:47:41.319 | DEBUG    | __main__:recursive_cutoff:7 - Current conditions = []
2023-08-12 17:47:41.319 | DEBUG    | __main__:recursive_cutoff:14 - Recursion level = 1
2023-08-12 17:47:41.319 | DEBUG    | __main__:find_best_cutoff_for:2 - Cutoff calculation dataset size 142
2023-08-12 17:47:41.320 | DEBUG    | __main__:find_best_cutoff_for:4 - Possible cutoffs size 107
2023-08-12 17:47:41.327 | DEBUG    | __main__:find_best_cutoff_for:40 - [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1]
2023-08-12 17:47:41.327 | DEBUG    | __main__:find_best_cutoff_for:40 - [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
2023-08-12 17:47:41.328 | DEBUG    | __main__:find_best_cutoff_for:52 - [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1]
2023-08-12 17:47:41.328 | DEBUG    | __main__:find_best_cutoff_for:52 - [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
2023-08-12 17:47:41.329 | DEBUG    | __mai

<function __main__.as_classifier.<locals>.predict(X)>

In [159]:
dataset.train.x

array([[ 54,  62,  56, ...,  43,  73,  87],
       [102,  50,  42, ...,  25,  91, 100],
       [104,  45,  39, ...,  42,  98,  95],
       ...,
       [ 38,  89,  31, ...,  19,   1,  40],
       [ 61,  83,  33, ...,   1,  13,  46],
       [ 65,  99,  41, ...,  13,   9,  56]])

In [236]:
accuracy_score(
    train_quad_clf(dataset.train.x, dataset.train.y).predict(dataset.test.x),
    dataset.test.y
)

2023-08-12 17:48:58.124 | DEBUG    | __main__:recursive_cutoff:2 - Recursion 0
2023-08-12 17:48:58.125 | DEBUG    | __main__:recursive_cutoff:7 - Current conditions = []
2023-08-12 17:48:58.125 | DEBUG    | __main__:recursive_cutoff:14 - Recursion level = 1
2023-08-12 17:48:58.125 | DEBUG    | __main__:find_best_cutoff_for:2 - Cutoff calculation dataset size 142
2023-08-12 17:48:58.126 | DEBUG    | __main__:find_best_cutoff_for:4 - Possible cutoffs size 107
2023-08-12 17:48:58.132 | DEBUG    | __main__:find_best_cutoff_for:40 - [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1]
2023-08-12 17:48:58.133 | DEBUG    | __main__:find_best_cutoff_for:40 - [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
2023-08-12 17:48:58.133 | DEBUG    | __main__:find_best_cutoff_for:52 - [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1]
2023-08-12 17:48:58.134 | DEBUG    | __main__:find_best_cutoff_for:52 - [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
2023-08-12 17:48:58.134 | DEBUG    | __mai

0.9722222222222222

In [240]:
accuracy_score(
    DecisionTreeClassifier(random_state=42).fit(dataset.train.x, dataset.train.y).predict(dataset.test.x),
    dataset.test.y
)

0.9444444444444444

In [237]:
accuracy_score(
    Perceptron(random_state=42).fit(dataset.train.x, dataset.train.y).predict(dataset.test.x),
    dataset.test.y
)

0.9444444444444444

In [None]:
original_ds = dataset

In [None]:

for statement in statements:
    

In [76]:
ds.query("col3 > 31 and col3 > 39").index

Index([ 16,  51,  52,  53,  57,  59,  60,  61,  62,  66,  72,  74,  78,  80,
        81,  84,  85,  86,  87,  88,  89,  90,  91,  94,  97,  98,  99, 100,
       102, 105, 106, 107, 108, 109, 112, 113, 114, 116, 118, 119, 120, 121,
       126, 133, 136, 137, 138, 139, 140, 141],
      dtype='int64')

In [77]:
ds.query("col3 > 31 and col3 > 39")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12
16,50,54,69,52,41,57,76,26,57,29,...,52,41,57,76,26,57,29,53,76,60
51,88,22,56,52,6,70,82,5,55,27,...,52,6,70,82,5,55,27,66,72,14
52,49,33,51,57,46,80,84,5,59,26,...,57,46,80,84,5,59,26,65,94,73
53,14,6,27,42,20,81,55,0,43,23,...,42,20,81,55,0,43,23,39,71,68
57,38,96,35,47,20,65,71,24,58,10,...,47,20,65,71,24,58,10,57,71,22
59,6,55,40,44,4,32,42,27,26,15,...,44,4,32,42,27,26,15,40,51,51
60,18,13,27,49,0,40,39,23,29,0,...,49,0,40,39,23,29,0,47,77,43
61,15,22,37,45,6,9,29,28,42,30,...,45,6,9,29,28,42,30,45,47,21
62,37,24,23,40,2,5,37,32,41,20,...,40,2,5,37,32,41,20,36,29,27
66,10,57,62,43,44,16,24,1,40,9,...,43,44,16,24,1,40,9,35,35,43


In [78]:
dataset.train.y[ds.query("col3 > 31 and col3 > 39").index]

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 2, 2, 2])

In [58]:
sections

{'col3 <= 31 and col7 <= 14 and col3 <= 6',
 'col3 <= 31 and col7 <= 14 and col3 > 6',
 'col3 <= 31 and col7 > 14 and col4 <= 14',
 'col3 <= 31 and col7 > 14 and col4 > 14',
 'col3 > 31 and col3 <= 39 and col7 <= 9',
 'col3 > 31 and col3 <= 39 and col7 > 9',
 'col3 > 31 and col3 > 39'}

In [23]:
ds = pd.DataFrame(dataset.train.x)

In [26]:
for col in range(ds.shape[1]):
    ds[f"col{col}"] = ds[col]

In [69]:
ds.query("col3 > 31 and col3 > 39").index

Index([ 16,  51,  52,  53,  57,  59,  60,  61,  62,  66,  72,  74,  78,  80,
        81,  84,  85,  86,  87,  88,  89,  90,  91,  94,  97,  98,  99, 100,
       102, 105, 106, 107, 108, 109, 112, 113, 114, 116, 118, 119, 120, 121,
       126, 133, 136, 137, 138, 139, 140, 141],
      dtype='int64')

In [61]:
len(ds)

142

In [60]:
{
    query: len(ds.query(query)) for query in sections
}

{'col3 > 31 and col3 <= 39 and col7 <= 9': 6,
 'col3 > 31 and col3 > 39': 50,
 'col3 > 31 and col3 <= 39 and col7 > 9': 24,
 'col3 <= 31 and col7 <= 14 and col3 > 6': 32,
 'col3 <= 31 and col7 > 14 and col4 <= 14': 7,
 'col3 <= 31 and col7 > 14 and col4 > 14': 17,
 'col3 <= 31 and col7 <= 14 and col3 <= 6': 6}

In [17]:
pd.DataFrame(dataset.train.x).iloc[0]

0      54
1      62
2      56
3      29
4      20
5      64
6     102
7      13
8      78
9      68
10     43
11     73
12     87
Name: 0, dtype: int64

In [14]:
import pandas as pd
pd.DataFrame(dataset.train.x).query('str(0) < 34', engine="python")

ValueError: "str" is not a supported function

In [79]:
np.array([None, 1, 4, 5, 2, None ]) == None

array([ True, False, False, False, False,  True])

In [None]:
recursive_cutoff

In [126]:
dataset.train

Box({'x': array([[ 20, 113,  25, ...,  43,  75,  33],
       [ 18, 113,  15, ...,  36,  13,   6],
       [  2, 113,   9, ...,  47,  31,   9],
       ...,
       [ 14, 116,  10, ...,  35,  13,   6],
       [  6, 113,   9, ...,  42,  17,   5],
       [  2, 113,  39, ...,  75,  55,   7]]), 'y': array([3, 0, 0, ..., 0, 0, 0])})

In [146]:
best_cutoff, values = find_best_cutoff_for(dataset.train, 1)

In [None]:
best_cutoff

In [128]:
values

161

In [None]:
values

In [107]:
for cutoff in possible_cutoffs:
    left_samples_idx = dataset.train.x[:, dimension_considered] <= cutoff
    right_samples_idx = dataset.train.x[:, dimension_considered] > cutoff
    left_samples_x = dataset.train.x[left_samples_idx]
    left_samples_y = dataset.train.y[left_samples_idx]

    right_samples_x = dataset.train.x[right_samples_idx]
    right_samples_y = dataset.train.y[right_samples_idx]

    if len(left_samples_idx) < 2 or len(right_samples_idx) < 2:
            print("nope")

    if len(set(left_samples_y)) == 1 or len(set(right_samples_y)) == 1:
        print("nope")
    else:

        left_complexity = px.l1(left_samples_x, left_samples_y)
        right_complexity = px.l1(right_samples_x, right_samples_y)
        print(f"l:{left_complexity} r:{right_complexity}")

        complexities.append((left_complexity, right_complexity))




nope
nope




ValueError: shapes (27,) and (9,39) not aligned: 27 (dim 0) != 9 (dim 0)

In [76]:
complexities_summed_for_both = [l+r for (l, r) in complexities]


In [80]:
lowest_complexity_idx = np.argmin(complexities_summed_for_both)

In [81]:
lowest_complexity_idx

17

In [82]:
complexities[17]

(0.0042562119713455025, 0.0006903428702922451)

In [83]:
current_split_conditions = []