# Classification Tree Program

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score

class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1, criterion='entropy'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        self.tree = None                                                          
        self.feature_importances = None                                          

    def entropy(self, y):
        counts = np.bincount(y)                                                  
        probabilities = counts / len(y)                                          
        return -np.sum([p * np.log2(p) for p in probabilities if p > 0])         

    def gini(self, y):
        counts = np.bincount(y)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def information_gain(self, y, left_indices, right_indices):
        if self.criterion == 'entropy':                                          
            impurity_func = self.entropy
        elif self.criterion == 'gini':
            impurity_func = self.gini
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

        parent_impurity = impurity_func(y)                                       
        left_impurity = impurity_func(y[left_indices])
        right_impurity = impurity_func(y[right_indices])

        n, n_left, n_right = len(y), len(left_indices), len(right_indices)
        weighted_impurity = (n_left / n) * left_impurity + (n_right / n) * right_impurity
        inf_gain = parent_impurity - weighted_impurity
        
        # print(f'Inf. gain "{self.criterion}": {inf_gain}')
        return inf_gain                                                          
    
    
    def custom_1(self, y_oh, left_indices, right_indices):
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N
        num_classes = y_oh.shape[1]                                               

        sum_total = 0
        
        for l in range(num_classes):
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            b = 1
            
            # Избегаем деления на ноль
            # if p_1 > 0:
            sum_total += ((p_1l - p_1 * p_l)**2) / p_1 * b**2
            # if p_2 > 0:
            sum_total += ((p_2l - p_2 * p_l)**2) / p_2 * b**2

        return N * sum_total
    
    
    def custom_2(self, y_oh, left_indices, right_indices):
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N
        num_classes = y_oh.shape[1]                                             

        sum_total = 0
        
        for l in range(num_classes):
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            
            b = np.sqrt(p_l)
            
            
            # if p_1 > 0:
            sum_total += ((p_1l - p_1 * p_l)**2) / p_1 * b**2
            # if p_2 > 0:
            sum_total += ((p_2l - p_2 * p_l)**2) / p_2 * b**2

        return N * sum_total
    

    def custom_3(self, y_oh, left_indices, right_indices):
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N
        num_classes = y_oh.shape[1]                                              

        sum_total = 0
        epsilon = 1e-10 
        
        for l in range(num_classes):
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            
            b = np.sqrt(p_l*(1 - p_l))
            
            # eps. для стабильности вычислений
            denominator_1 = max(p_1 * b**2, epsilon)
            denominator_2 = max(p_2 * b**2, epsilon)
            
            sum_total += ((p_1l - p_1 * p_l)**2) / denominator_1
            sum_total += ((p_2l - p_2 * p_l)**2) / denominator_2

        return N * sum_total
    

    def custom_4(self, y_oh, left_indices, right_indices):
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N
        num_classes = y_oh.shape[1]                                               

        sum_total = 0
        
        for l in range(num_classes):
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            
            b = p_l
            
            sum_total += ((p_1l - p_1 * p_l)**2) / p_1 * b**2
            sum_total += ((p_2l - p_2 * p_l)**2) / p_2 * b**2


        return N * sum_total
    
    
    def custom_5(self, y_oh, left_indices, right_indices):
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N
        num_classes = y_oh.shape[1]                                              

        sum_total = 0
        
        for l in range(num_classes):
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            
            b = p_l**2
            
            sum_total += ((p_1l - p_1 * p_l)**2) / p_1 * b**2
            sum_total += ((p_2l - p_2 * p_l)**2) / p_2 * b**2

        return N * sum_total
    

    def custom_6(self, y_oh, left_indices, right_indices):
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N
        num_classes = y_oh.shape[1]                                              

        sum_total = 0
        epsilon = 1e-10
        
        for l in range(num_classes):
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            
            b = -np.log(max(p_l, epsilon))
            
            sum_total += ((p_1l - p_1 * p_l)**2) / p_1 * b**2
            sum_total += ((p_2l - p_2 * p_l)**2) / p_2 * b**2

        return N * sum_total
    

    def custom_7(self, y_oh, left_indices, right_indices):
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N
        num_classes = y_oh.shape[1]                                               

        sum_total = 0
        epsilon = 1e-10
        
        for l in range(num_classes):
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            
            b = (-p_l)*np.log(max(p_l, epsilon))
            
            # denominator_1 = max(p_1 * b**2, epsilon)
            # denominator_2 = max(p_2 * b**2, epsilon)
            
            sum_total += ((p_1l - p_1 * p_l)**2) / p_1 * b**2
            sum_total += ((p_2l - p_2 * p_l)**2) / p_2 * b**2

        return N * sum_total
    
    
    def custom_8(self, y_oh, left_indices, right_indices):
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N
        num_classes = y_oh.shape[1]                                               

        sum_total = 0
        epsilon = 1e-10
        
        for l in range(num_classes):
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            
            b = -(p_l**0.5) * np.log(max(p_l, epsilon))
            
            # denominator_1 = max(p_1 * b**2, epsilon)
            # denominator_2 = max(p_2 * b**2, epsilon)
            
            sum_total += ((p_1l - p_1 * p_l)**2) / p_1 * b**2
            sum_total += ((p_2l - p_2 * p_l)**2) / p_2 * b**2

        return N * sum_total    
    

    def most_common_label(self, y):
        return Counter(y).most_common(1)[0][0]


    def find_best_split(self, X, y, num_features, y_oh=None):
        best_gain = -float('inf')                                                  
        best_split = None                                                          

        for feature_index in range(num_features):                                  
            
            feature_values = np.sort(X[:, feature_index])
            
            thresholds = (feature_values[:-1] + feature_values[1:]) / 2     
            
            for threshold in thresholds:                                          
                left_indices = np.where(X[:, feature_index] <= threshold)[0]      
                right_indices = np.where(X[:, feature_index] > threshold)[0]      

                if (len(left_indices) < self.min_samples_leaf or 
                    len(right_indices) < self.min_samples_leaf):
                    continue                                                      

                if self.criterion == 'custom_1':
                    if y_oh is None:
                        raise ValueError("y_oh required for custom_1 criterion")
                    gain = self.custom_1(y_oh, left_indices, right_indices)
                
                elif self.criterion == 'custom_2':
                    if y_oh is None:
                        raise ValueError("y_oh required for custom_2 criterion")
                    gain = self.custom_2(y_oh, left_indices, right_indices)
                
                elif self.criterion == 'custom_3':
                    if y_oh is None:
                        raise ValueError("y_oh required for custom_3 criterion")
                    gain = self.custom_3(y_oh, left_indices, right_indices)                    
                
                elif self.criterion == 'custom_4':
                    if y_oh is None:
                        raise ValueError("y_oh required for custom_4 criterion")
                    gain = self.custom_4(y_oh, left_indices, right_indices)
                
                elif self.criterion == 'custom_5':
                    if y_oh is None:
                        raise ValueError("y_oh required for custom_5 criterion")
                    gain = self.custom_5(y_oh, left_indices, right_indices)
                
                elif self.criterion == 'custom_6':
                    if y_oh is None:
                        raise ValueError("y_oh required for custom_6 criterion")
                    gain = self.custom_6(y_oh, left_indices, right_indices)    
                    
                elif self.criterion == 'custom_7':
                    if y_oh is None:
                        raise ValueError("y_oh required for custom_7 criterion")
                    gain = self.custom_7(y_oh, left_indices, right_indices)
                    
                elif self.criterion == 'custom_8':
                    if y_oh is None:
                        raise ValueError("y_oh required for custom_7 criterion")
                    gain = self.custom_8(y_oh, left_indices, right_indices)                      
                
                else:
                    gain = self.information_gain(y, left_indices, right_indices)  

                if gain > best_gain:                                               
                    best_gain = gain                                               
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'left_indices': left_indices,
                        'right_indices': right_indices,
                        'gain': gain
                    }                                                              
        
        return best_split                                                          


    def fit(self, X, y, y_oh=None):
        num_features = X.shape[1]
        self.feature_importances = np.zeros(num_features)                          
        self.tree = self.grow_tree(X, y, y_oh, depth=0)

        
        total = self.feature_importances.sum()
        if total > 0:
            self.feature_importances /= total


    def grow_tree(self, X, y, y_oh, depth):
        num_samples, num_features = X.shape
        num_classes = len(set(y))

        if (depth == self.max_depth or 
            num_classes == 1 or 
            num_samples < self.min_samples_split):
            return self.most_common_label(y)

        if self.criterion.startswith('custom_'):
            best_split = self.find_best_split(X, y, num_features, y_oh)
        else:
            best_split = self.find_best_split(X, y, num_features)

        if best_split is None:
            return self.most_common_label(y)

        left_indices, right_indices = best_split['left_indices'], best_split['right_indices']
        
        
        if self.criterion == 'custom_1':
            gain = self.custom_1(y_oh, left_indices, right_indices)
        elif self.criterion == 'custom_2':
            gain = self.custom_2(y_oh, left_indices, right_indices)
        elif self.criterion == 'custom_3':
            gain = self.custom_3(y_oh, left_indices, right_indices)
        elif self.criterion == 'custom_4':
            gain = self.custom_4(y_oh, left_indices, right_indices)
        elif self.criterion == 'custom_5':
            gain = self.custom_5(y_oh, left_indices, right_indices)
        elif self.criterion == 'custom_6':
            gain = self.custom_6(y_oh, left_indices, right_indices)
        elif self.criterion == 'custom_7':
            gain = self.custom_7(y_oh, left_indices, right_indices)
        elif self.criterion == 'custom_8':
            gain = self.custom_8(y_oh, left_indices, right_indices)            
            
        else:
            gain = self.information_gain(y, left_indices, right_indices)

        self.feature_importances[best_split['feature_index']] += gain              

        left_subtree = self.grow_tree(X[left_indices], y[left_indices], 
                                    y_oh[left_indices] if y_oh is not None else None, 
                                    depth + 1)
        right_subtree = self.grow_tree(X[right_indices], y[right_indices], 
                                     y_oh[right_indices] if y_oh is not None else None, 
                                     depth + 1)

        return {
            'feature_index': best_split['feature_index'],
            'threshold': best_split['threshold'],
            'left': left_subtree,
            'right': right_subtree
        }


    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])


    def _traverse_tree(self, x, node):
        if isinstance(node, dict):
            if x[node['feature_index']] <= node['threshold']:
                return self._traverse_tree(x, node['left'])
            else:
                return self._traverse_tree(x, node['right'])

        return node                                                           

### 1 Experiment

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def compare_metrics_train_test(max_depth, X, y, *, N=None, V=None, k=None, alpha=None, nmin=None, random_state=42):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_state)

    encoder = OneHotEncoder(sparse_output=False)
    y_oh_train = encoder.fit_transform(y_train.reshape(-1,1))


    '''Custom_1'''
    custom_1 = DecisionTree(max_depth=max_depth, criterion='custom_1')
    custom_1.fit(X_train, y_train, y_oh_train)
    y_pred = custom_1.predict(X_test)
    accuracy_1, precision_1 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_1, f1_1 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_1 = adjusted_rand_score(y_test, y_pred)

    '''GINI'''
    gini = DecisionTree(max_depth=max_depth, criterion='gini')
    gini.fit(X_train, y_train)
    y_pred = gini.predict(X_test)
    accuracy_gini, precision_gini = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_gini, f1_gini = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_gini = adjusted_rand_score(y_test, y_pred)

    '''Sklearn_GINI'''
    sk_gini = DecisionTreeClassifier(max_depth=max_depth, criterion='gini')
    sk_gini.fit(X_train, y_train)
    y_pred = sk_gini.predict(X_test)
    accuracy_gini_sk, precision_gini_sk = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_gini_sk, f1_gini_sk = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_gini_sk = adjusted_rand_score(y_test, y_pred)

    '''Entropy'''
    entropy = DecisionTree(max_depth=max_depth, criterion='entropy')
    entropy.fit(X_train, y_train)
    y_pred = entropy.predict(X_test)
    accuracy_entropy, precision_entropy = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_entropy, f1_entropy = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_entropy = adjusted_rand_score(y_test, y_pred)

    '''Sklearn_Entropy'''
    sk_entropy = DecisionTreeClassifier(max_depth=max_depth, criterion='entropy')
    sk_entropy.fit(X_train, y_train)
    y_pred = sk_entropy.predict(X_test)
    accuracy_entropy_sk, precision_entropy_sk = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_entropy_sk, f1_entropy_sk = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_entropy_sk = adjusted_rand_score(y_test, y_pred)
    
    '''Custom_2'''
    custom_2 = DecisionTree(max_depth=max_depth, criterion='custom_2')
    custom_2.fit(X_train, y_train, y_oh_train)
    y_pred = custom_2.predict(X_test)
    accuracy_2, precision_2 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_2, f1_2 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_2 = adjusted_rand_score(y_test, y_pred)

    '''Custom_3'''
    custom_3 = DecisionTree(max_depth=max_depth, criterion='custom_3')
    custom_3.fit(X_train, y_train, y_oh_train)
    y_pred = custom_3.predict(X_test)
    accuracy_3, precision_3 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_3, f1_3 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_3 = adjusted_rand_score(y_test, y_pred)

    '''Custom_4'''
    custom_4 = DecisionTree(max_depth=max_depth, criterion='custom_4')
    custom_4.fit(X_train, y_train, y_oh_train)
    y_pred = custom_4.predict(X_test)
    accuracy_4, precision_4 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_4, f1_4 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_4 = adjusted_rand_score(y_test, y_pred)

    '''Custom_5'''
    custom_5 = DecisionTree(max_depth=max_depth, criterion='custom_5')
    custom_5.fit(X_train, y_train, y_oh_train)
    y_pred = custom_5.predict(X_test)
    accuracy_5, precision_5 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_5, f1_5 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_5 = adjusted_rand_score(y_test, y_pred)

    '''Custom_6'''
    custom_6 = DecisionTree(max_depth=max_depth, criterion='custom_6')
    custom_6.fit(X_train, y_train, y_oh_train)
    y_pred = custom_6.predict(X_test)
    accuracy_6, precision_6 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_6, f1_6 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_6 = adjusted_rand_score(y_test, y_pred)

    '''Custom_7'''
    custom_7 = DecisionTree(max_depth=max_depth, criterion='custom_7')
    custom_7.fit(X_train, y_train, y_oh_train)
    y_pred = custom_7.predict(X_test)
    accuracy_7, precision_7 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_7, f1_7 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_7 = adjusted_rand_score(y_test, y_pred)
    
    '''Custom_8'''
    custom_8 = DecisionTree(max_depth=max_depth, criterion='custom_8')
    custom_8.fit(X_train, y_train, y_oh_train)
    y_pred = custom_8.predict(X_test)
    accuracy_8, precision_8 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_8, f1_8 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
    ari_8 = adjusted_rand_score(y_test, y_pred)

    results = np.round([[accuracy_1, accuracy_gini, accuracy_gini_sk, accuracy_entropy, accuracy_entropy_sk, accuracy_2, accuracy_3, accuracy_4, accuracy_5, accuracy_6, accuracy_7, accuracy_8],
                    [precision_1, precision_gini, precision_gini_sk, precision_entropy, precision_entropy_sk, precision_2, precision_3, precision_4, precision_5, precision_6, precision_7, precision_8],
                    [recall_1, recall_gini, recall_gini_sk, recall_entropy, recall_entropy_sk, recall_2, recall_3, recall_4, recall_5, recall_6, recall_7, recall_8],
                    [f1_1, f1_gini, f1_gini_sk, f1_entropy, f1_entropy_sk, f1_2, f1_3, f1_4, f1_5, f1_6, f1_7, f1_8],
                    [ari_1, ari_gini, ari_gini_sk, ari_entropy, ari_entropy_sk, ari_2, ari_3, ari_4, ari_5, ari_6, ari_7, ari_8],],4)

    column = ['b = 1','gini','gini_sklearn', 'entropy', 'entropy_sklearn', 'b = p_l ^ 0.5', 'b = (p_l*(1 - p_l)) ^ 0.5', 'b = p_l', 'b = p_l ^ 2', 'b = log(p_l)', 'b = -p_l * log(p_l)', 'b = p_l^0.5 * log(p_l)']
    table = pd.DataFrame(data=results, columns=column, index=['Accuracy', 'Precision', 'Recall','F1 score','ARI'])
    
    print(f'\nN, V, k, alpha, nmin, max_depth = {N, V, k, alpha, nmin, max_depth}')

    return table

### Mean/std of 50 exps.

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def compare_metrics_train_test(max_depth, X, y, *, N=None, V=None, k=None, alpha=None, nmin=None):
    
    all_results = []
    
    for seed in range(1,51):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

        encoder = OneHotEncoder(sparse_output=False)
        y_oh_train = encoder.fit_transform(y_train.reshape(-1,1))


        '''Custom_1'''
        custom_1 = DecisionTree(max_depth=max_depth, criterion='custom_1')
        custom_1.fit(X_train, y_train, y_oh_train)
        y_pred = custom_1.predict(X_test)
        accuracy_1, precision_1 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall_1, f1_1 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        ari_1 = adjusted_rand_score(y_test, y_pred)

        # '''GINI'''
        # gini = DecisionTree(max_depth=max_depth, criterion='gini')
        # gini.fit(X_train, y_train)
        # y_pred = gini.predict(X_test)
        # accuracy_gini, precision_gini = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        # recall_gini, f1_gini = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        # ari_gini = adjusted_rand_score(y_test, y_pred)

        # '''Sklearn_GINI'''
        # sk_gini = DecisionTreeClassifier(max_depth=max_depth, criterion='gini')
        # sk_gini.fit(X_train, y_train)
        # y_pred = sk_gini.predict(X_test)
        # accuracy_gini_sk, precision_gini_sk = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        # recall_gini_sk, f1_gini_sk = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        # ari_gini_sk = adjusted_rand_score(y_test, y_pred)

        # '''Entropy'''
        # entropy = DecisionTree(max_depth=max_depth, criterion='entropy')
        # entropy.fit(X_train, y_train)
        # y_pred = entropy.predict(X_test)
        # accuracy_entropy, precision_entropy = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        # recall_entropy, f1_entropy = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        # ari_entropy = adjusted_rand_score(y_test, y_pred)

        '''Sklearn_Entropy'''
        sk_entropy = DecisionTreeClassifier(max_depth=max_depth, criterion='entropy')
        sk_entropy.fit(X_train, y_train)
        y_pred = sk_entropy.predict(X_test)
        accuracy_entropy_sk, precision_entropy_sk = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall_entropy_sk, f1_entropy_sk = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        ari_entropy_sk = adjusted_rand_score(y_test, y_pred)
        
        '''Custom_2'''
        custom_2 = DecisionTree(max_depth=max_depth, criterion='custom_2')
        custom_2.fit(X_train, y_train, y_oh_train)
        y_pred = custom_2.predict(X_test)
        accuracy_2, precision_2 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall_2, f1_2 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        ari_2 = adjusted_rand_score(y_test, y_pred)

        '''Custom_3'''
        custom_3 = DecisionTree(max_depth=max_depth, criterion='custom_3')
        custom_3.fit(X_train, y_train, y_oh_train)
        y_pred = custom_3.predict(X_test)
        accuracy_3, precision_3 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall_3, f1_3 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        ari_3 = adjusted_rand_score(y_test, y_pred)

        '''Custom_4'''
        custom_4 = DecisionTree(max_depth=max_depth, criterion='custom_4')
        custom_4.fit(X_train, y_train, y_oh_train)
        y_pred = custom_4.predict(X_test)
        accuracy_4, precision_4 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall_4, f1_4 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        ari_4 = adjusted_rand_score(y_test, y_pred)

        '''Custom_5'''
        custom_5 = DecisionTree(max_depth=max_depth, criterion='custom_5')
        custom_5.fit(X_train, y_train, y_oh_train)
        y_pred = custom_5.predict(X_test)
        accuracy_5, precision_5 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall_5, f1_5 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        ari_5 = adjusted_rand_score(y_test, y_pred)

        '''Custom_6'''
        custom_6 = DecisionTree(max_depth=max_depth, criterion='custom_6')
        custom_6.fit(X_train, y_train, y_oh_train)
        y_pred = custom_6.predict(X_test)
        accuracy_6, precision_6 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall_6, f1_6 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        ari_6 = adjusted_rand_score(y_test, y_pred)

        '''Custom_7'''
        custom_7 = DecisionTree(max_depth=max_depth, criterion='custom_7')
        custom_7.fit(X_train, y_train, y_oh_train)
        y_pred = custom_7.predict(X_test)
        accuracy_7, precision_7 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall_7, f1_7 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        ari_7 = adjusted_rand_score(y_test, y_pred)


        '''Custom_8'''
        custom_8 = DecisionTree(max_depth=max_depth, criterion='custom_8')
        custom_8.fit(X_train, y_train, y_oh_train)
        y_pred = custom_8.predict(X_test)
        accuracy_8, precision_8 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall_8, f1_8 = recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')
        ari_8 = adjusted_rand_score(y_test, y_pred)

        results = np.round([[accuracy_1, accuracy_entropy_sk, accuracy_2, accuracy_3, accuracy_4, accuracy_5, accuracy_6, accuracy_7, accuracy_8],
                        [precision_1, precision_entropy_sk, precision_2, precision_3, precision_4, precision_5, precision_6, precision_7, precision_8],
                        [recall_1, recall_entropy_sk, recall_2, recall_3, recall_4, recall_5, recall_6, recall_7, recall_8],
                        [f1_1, f1_entropy_sk, f1_2, f1_3, f1_4, f1_5, f1_6, f1_7, f1_8],
                        [ari_1, ari_entropy_sk, ari_2, ari_3, ari_4, ari_5, ari_6, ari_7, ari_8],],4)
        
        all_results.append(results)
        print(f'Finished: {seed} iter.')
        
    print(f'\nN, V, k, alpha, nmin, max_depth = {N, V, k, alpha, nmin, max_depth}')
            
    all_results = np.array(all_results)  # shape: (4, 5, 11)

    mean_results = np.round(np.mean(all_results, axis=0),4)
    std_results = np.round(np.std(all_results, axis=0),4)
    
    # Final table (Mean/std)
    columns = ['b = 1','entropy_sklearn', 'b = p_l ^ 0.5', 'b = (p_l*(1 - p_l)) ^ 0.5', 'b = p_l', 'b = p_l ^ 2', 'b = -log(p_l)', 'b = -p_l * log(p_l)', 'b = -p_l^0.5 * log(p_l)']
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 score', 'ARI']
    
    index_tuples = []
    for metric in metrics:
        index_tuples.append((metric, 'Mean'))
        index_tuples.append((metric, 'Std'))
    
    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=['Metric', 'Statistic'])
    
    # Final table
    final_table_data = []
    for i in range(len(metrics)):
        final_table_data.append(mean_results[i])
        final_table_data.append(std_results[i])
    
    final_table = pd.DataFrame(final_table_data, 
                             columns=columns, 
                             index=multi_index)
    
    return final_table

# Experiments with Generated and Real World Datasets

## Generated Datasets

---
#### Data generator

Parameters:
- N: Total number of data points
- V: Number of dimensions/features
- k: Number of clusters
- alpha: Controls cluster center spread (centers are in [α-1, 1-α])
- nmin: Minimum points per cluster
- seed: Random seed for reproducibility
- sig_range: Tuple (min, max) for cluster standard deviations

Returns:
- Nk: Array of cluster sizes
- R: List of ranges for each cluster
- y: Cluster labels for each point
- X: Generated data (N x V array)
- cen: Cluster centers (k x V array)

In [None]:
def generdat(N, V, k, alpha, nmin, seed=None, sig_range=(0.05, 0.1)):
    if N < k * nmin:
        raise ValueError(f"N must be >= k * nmin. Got N={N}, k={k}, nmin={nmin}")
    if k < 1:
        raise ValueError("k must be at least 1")
    if alpha == 1:
        raise ValueError("alpha cannot be 1")

    if seed is not None:
        np.random.seed(seed)

    if k == 1:
        Nk = np.array([N])
    else:
        base_sizes = np.ones(k, dtype=int) * nmin
        remaining = N - k * nmin
        if remaining > 0:
            additional = np.random.multinomial(remaining, np.ones(k)/k)
            Nk = base_sizes + additional
        else:
            Nk = base_sizes

    # Cluster centers
    cen = (alpha - 1) + 2 * (1 - alpha) * np.random.rand(k, V)

    X = np.zeros((N, V))
    y = np.zeros(N, dtype=int)
    R = []
    
    sig_min, sig_max = sig_range
    start_idx = 0
    
    for k0 in range(k):
        nk = Nk[k0]
        end_idx = start_idx + nk
        
        # Range for the current cluster
        R.append(range(start_idx, end_idx))
        y[start_idx:end_idx] = k0 
        
        # Cluster data generation
        sig = sig_min + (sig_max - sig_min) * np.random.rand(V)
        X[start_idx:end_idx] = np.random.randn(nk, V) * sig + cen[k0, :]
        
        start_idx = end_idx

    return Nk, R, y, X, cen

In [45]:
N, V, k, alpha, nmin = 500, 3, 4, 0.5, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(max_depth=3, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (500, 3, 4, 0.5, 50, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.9914,0.9914,0.9914,0.9891,0.9901,0.9896,0.9912,0.9888,0.9874,0.9898,0.9915,0.9912
Accuracy,Std,0.0076,0.0076,0.0076,0.0103,0.0102,0.0104,0.0077,0.0109,0.0118,0.0099,0.0074,0.0077
Precision,Mean,0.9918,0.9918,0.9918,0.9896,0.9905,0.9902,0.9916,0.9896,0.9884,0.9902,0.9919,0.9916
Precision,Std,0.0072,0.0072,0.0072,0.0098,0.0097,0.0092,0.0073,0.0095,0.0101,0.0093,0.0069,0.0073
Recall,Mean,0.9914,0.9914,0.9914,0.9891,0.9901,0.9896,0.9912,0.9888,0.9874,0.9898,0.9915,0.9912
Recall,Std,0.0076,0.0076,0.0076,0.0103,0.0102,0.0104,0.0077,0.0109,0.0118,0.0099,0.0074,0.0077
F1 score,Mean,0.9914,0.9914,0.9914,0.9891,0.9901,0.9896,0.9912,0.9888,0.9874,0.9898,0.9915,0.9912
F1 score,Std,0.0076,0.0076,0.0076,0.0103,0.0102,0.0105,0.0077,0.0109,0.0118,0.0098,0.0074,0.0077
ARI,Mean,0.9779,0.9779,0.978,0.972,0.9743,0.9739,0.9775,0.9723,0.9691,0.974,0.9784,0.9775
ARI,Std,0.0197,0.0197,0.0194,0.0265,0.0262,0.0247,0.0198,0.0254,0.027,0.0252,0.0188,0.0198


### Generated Dataset / Tree_depth = 3

In [47]:
tables = []

for cluster in [4,8,15]:
    for feature in [6,15]:
        for squeeze in [0.5, 0.85]:
            N, V, k, alpha, nmin = 2000, feature, cluster, squeeze, 50
            Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)
            
            table = compare_metrics_train_test(max_depth=3, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)
            tables.append(table)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 3)

In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 3)
tables[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.9933,0.9933,0.9934,0.9916,0.9919,0.9933,0.9936,0.9934,0.9929,0.9935,0.9934,0.9936
Accuracy,Std,0.003,0.003,0.0028,0.0035,0.004,0.003,0.0031,0.003,0.0033,0.0028,0.0029,0.0031
Precision,Mean,0.9934,0.9934,0.9935,0.9917,0.992,0.9934,0.9937,0.9935,0.993,0.9935,0.9935,0.9936
Precision,Std,0.0029,0.0029,0.0027,0.0035,0.0039,0.0029,0.0031,0.003,0.0032,0.0028,0.0028,0.0031
Recall,Mean,0.9933,0.9933,0.9934,0.9916,0.9919,0.9933,0.9936,0.9934,0.9929,0.9935,0.9934,0.9936
Recall,Std,0.003,0.003,0.0028,0.0035,0.004,0.003,0.0031,0.003,0.0033,0.0028,0.0029,0.0031
F1 score,Mean,0.9933,0.9933,0.9934,0.9916,0.9919,0.9933,0.9936,0.9934,0.9929,0.9935,0.9934,0.9936
F1 score,Std,0.003,0.003,0.0028,0.0036,0.004,0.003,0.0031,0.003,0.0033,0.0028,0.0029,0.0031
ARI,Mean,0.9822,0.9822,0.9825,0.9778,0.9785,0.9822,0.9831,0.9824,0.9812,0.9827,0.9824,0.983
ARI,Std,0.0077,0.0077,0.0072,0.0091,0.0102,0.0077,0.0081,0.0079,0.0086,0.0072,0.0076,0.008


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.85, 50, 3)
tables[1]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7312,0.7312,0.7312,0.7195,0.7194,0.7254,0.7356,0.7245,0.7245,0.7186,0.7271,0.7334
Accuracy,Std,0.0196,0.0196,0.0196,0.022,0.0216,0.0218,0.0182,0.0212,0.0183,0.025,0.0223,0.0183
Precision,Mean,0.7408,0.7408,0.7415,0.7424,0.7432,0.74,0.744,0.7381,0.7388,0.7146,0.7386,0.7401
Precision,Std,0.0184,0.0184,0.0185,0.0195,0.0188,0.0188,0.0176,0.0203,0.0207,0.0342,0.0191,0.0178
Recall,Mean,0.7312,0.7312,0.7312,0.7195,0.7194,0.7254,0.7356,0.7245,0.7245,0.7186,0.7271,0.7334
Recall,Std,0.0196,0.0196,0.0196,0.022,0.0216,0.0218,0.0182,0.0212,0.0183,0.025,0.0223,0.0183
F1 score,Mean,0.73,0.73,0.7304,0.7203,0.7205,0.7259,0.7337,0.7246,0.7248,0.7048,0.7264,0.73
F1 score,Std,0.0191,0.0191,0.0191,0.022,0.0215,0.0198,0.0179,0.02,0.0182,0.0353,0.0206,0.0185
ARI,Mean,0.4544,0.4544,0.454,0.4255,0.4245,0.4393,0.4642,0.4383,0.4362,0.4635,0.4459,0.4629
ARI,Std,0.0377,0.0377,0.0375,0.0391,0.0383,0.0427,0.0332,0.0423,0.0376,0.0308,0.0436,0.0341


In [None]:
#N, V, k, alpha, nmin, max_depth = (2000, 15, 4, 0.5, 50, 3)
tables[2]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.9977,0.9976,0.9975,0.9992,0.9991,0.9975,0.9978,0.9974,0.9975,0.9982,0.9974,0.9978
Accuracy,Std,0.0025,0.0025,0.0027,0.0019,0.0013,0.0026,0.0024,0.0025,0.0024,0.0022,0.0026,0.0024
Precision,Mean,0.9977,0.9976,0.9975,0.9992,0.9991,0.9975,0.9978,0.9975,0.9975,0.9983,0.9974,0.9978
Precision,Std,0.0025,0.0025,0.0027,0.0019,0.0013,0.0026,0.0024,0.0025,0.0024,0.0022,0.0025,0.0024
Recall,Mean,0.9977,0.9976,0.9975,0.9992,0.9991,0.9975,0.9978,0.9974,0.9975,0.9982,0.9974,0.9978
Recall,Std,0.0025,0.0025,0.0027,0.0019,0.0013,0.0026,0.0024,0.0025,0.0024,0.0022,0.0026,0.0024
F1 score,Mean,0.9977,0.9976,0.9975,0.9992,0.9991,0.9975,0.9978,0.9974,0.9975,0.9982,0.9974,0.9978
F1 score,Std,0.0025,0.0025,0.0027,0.0019,0.0013,0.0026,0.0024,0.0025,0.0024,0.0022,0.0026,0.0024
ARI,Mean,0.9939,0.9937,0.9934,0.9978,0.9975,0.9934,0.9941,0.9933,0.9935,0.9954,0.993,0.9941
ARI,Std,0.0065,0.0067,0.0072,0.005,0.0034,0.007,0.0064,0.0066,0.0064,0.0059,0.0068,0.0063


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 4, 0.85, 50, 3)
tables[3]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.9514,0.9514,0.9516,0.9514,0.9515,0.9502,0.9521,0.949,0.9464,0.9428,0.9498,0.9509
Accuracy,Std,0.0076,0.0076,0.0077,0.0067,0.0068,0.0088,0.0079,0.0102,0.0147,0.0311,0.0085,0.0082
Precision,Mean,0.9525,0.9525,0.9526,0.9527,0.9527,0.9514,0.9532,0.9502,0.9477,0.9466,0.9512,0.9521
Precision,Std,0.0074,0.0074,0.0074,0.0065,0.0065,0.0085,0.0076,0.0101,0.0143,0.0167,0.008,0.0077
Recall,Mean,0.9514,0.9514,0.9516,0.9514,0.9515,0.9502,0.9521,0.949,0.9464,0.9428,0.9498,0.9509
Recall,Std,0.0076,0.0076,0.0077,0.0067,0.0068,0.0088,0.0079,0.0102,0.0147,0.0311,0.0085,0.0082
F1 score,Mean,0.9515,0.9515,0.9516,0.9514,0.9515,0.9502,0.9522,0.949,0.9464,0.9416,0.9499,0.951
F1 score,Std,0.0076,0.0076,0.0077,0.0067,0.0068,0.0088,0.0079,0.0102,0.0147,0.0405,0.0084,0.0081
ARI,Mean,0.8744,0.8744,0.8748,0.8743,0.8746,0.8715,0.8763,0.8686,0.8621,0.8582,0.87,0.8726
ARI,Std,0.0189,0.0189,0.019,0.0167,0.0168,0.0219,0.0196,0.0256,0.0367,0.0417,0.0212,0.0205


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 8, 0.5, 50, 3)
tables[4]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6444,0.6444,0.6446,0.864,0.864,0.6156,0.8579,0.6025,0.542,0.7398,0.6181,0.647
Accuracy,Std,0.073,0.073,0.0727,0.0157,0.0159,0.0435,0.0334,0.0309,0.0616,0.0428,0.0435,0.0757
Precision,Mean,0.5973,0.5973,0.5984,0.8034,0.8034,0.5711,0.7949,0.5722,0.5622,0.6743,0.5868,0.6136
Precision,Std,0.1088,0.1088,0.1069,0.0189,0.019,0.0917,0.0477,0.0833,0.0843,0.0644,0.1007,0.1117
Recall,Mean,0.6444,0.6444,0.6446,0.864,0.864,0.6156,0.8579,0.6025,0.542,0.7398,0.6181,0.647
Recall,Std,0.073,0.073,0.0727,0.0157,0.0159,0.0435,0.0334,0.0309,0.0616,0.0428,0.0435,0.0757
F1 score,Mean,0.5575,0.5575,0.5579,0.8237,0.8237,0.5375,0.816,0.5317,0.4821,0.6775,0.5362,0.56
F1 score,Std,0.0866,0.0866,0.0861,0.0187,0.0188,0.049,0.0434,0.0328,0.0596,0.0491,0.0495,0.0896
ARI,Mean,0.6129,0.6129,0.6131,0.8468,0.8468,0.5236,0.8406,0.4729,0.3596,0.6552,0.5399,0.6124
ARI,Std,0.0944,0.0944,0.0941,0.0182,0.0182,0.0951,0.0316,0.0717,0.0916,0.0639,0.091,0.0924


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 8, 0.85, 50, 3)
tables[5]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.5988,0.5988,0.5988,0.5947,0.5947,0.5896,0.5712,0.5791,0.5535,0.5055,0.5916,0.597
Accuracy,Std,0.0216,0.0216,0.0216,0.0224,0.0224,0.0247,0.0357,0.0321,0.0316,0.0251,0.0218,0.0212
Precision,Mean,0.542,0.542,0.542,0.5402,0.5402,0.5345,0.5063,0.5258,0.4918,0.4137,0.5386,0.5405
Precision,Std,0.0205,0.0205,0.0205,0.0252,0.0252,0.0329,0.0571,0.0398,0.0475,0.0567,0.0272,0.0192
Recall,Mean,0.5988,0.5988,0.5988,0.5947,0.5947,0.5896,0.5712,0.5791,0.5535,0.5055,0.5916,0.597
Recall,Std,0.0216,0.0216,0.0216,0.0224,0.0224,0.0247,0.0357,0.0321,0.0316,0.0251,0.0218,0.0212
F1 score,Mean,0.5571,0.5571,0.5571,0.553,0.553,0.5444,0.5145,0.533,0.4958,0.4099,0.5474,0.555
F1 score,Std,0.0226,0.0226,0.0226,0.0235,0.0235,0.0299,0.0521,0.0361,0.0424,0.0344,0.0252,0.0221
ARI,Mean,0.3718,0.3718,0.3718,0.3678,0.3678,0.364,0.3701,0.3535,0.3283,0.3697,0.3655,0.3706
ARI,Std,0.0266,0.0266,0.0266,0.0229,0.0229,0.027,0.024,0.034,0.0328,0.0232,0.0255,0.0258


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 8, 0.5, 50, 3)
tables[6]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6652,0.6652,0.6659,0.9946,0.9951,0.6709,0.6745,0.6762,0.6612,0.6432,0.6713,0.663
Accuracy,Std,0.0804,0.0803,0.0809,0.0034,0.0036,0.0874,0.0712,0.0874,0.1001,0.0704,0.0873,0.0786
Precision,Mean,0.5585,0.5585,0.5588,0.9947,0.9952,0.5851,0.554,0.5945,0.6018,0.5276,0.5821,0.553
Precision,Std,0.0861,0.086,0.0865,0.0033,0.0036,0.0833,0.0797,0.0811,0.0829,0.0833,0.0845,0.0851
Recall,Mean,0.6652,0.6652,0.6659,0.9946,0.9951,0.6709,0.6745,0.6762,0.6612,0.6432,0.6713,0.663
Recall,Std,0.0804,0.0803,0.0809,0.0034,0.0036,0.0874,0.0712,0.0874,0.1001,0.0704,0.0873,0.0786
F1 score,Mean,0.5821,0.5822,0.5826,0.9946,0.9951,0.5988,0.5867,0.6068,0.5942,0.5575,0.5974,0.5781
F1 score,Std,0.0911,0.091,0.0916,0.0034,0.0036,0.0961,0.0813,0.0955,0.1076,0.0814,0.0964,0.0895
ARI,Mean,0.5971,0.5971,0.597,0.9878,0.989,0.5678,0.6338,0.5719,0.5494,0.5572,0.5748,0.6011
ARI,Std,0.1275,0.1274,0.1274,0.0078,0.0082,0.1297,0.1158,0.1258,0.1364,0.1113,0.1309,0.1248


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 8, 0.85, 50, 3)
tables[7]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6307,0.6307,0.6305,0.6737,0.6737,0.6105,0.6224,0.5946,0.5654,0.5919,0.6148,0.6399
Accuracy,Std,0.0307,0.0307,0.0307,0.0221,0.0221,0.0362,0.0423,0.0472,0.0534,0.0468,0.0397,0.0315
Precision,Mean,0.5987,0.5987,0.5966,0.6241,0.6241,0.5828,0.5591,0.5701,0.5566,0.5462,0.5872,0.6147
Precision,Std,0.0377,0.0377,0.041,0.0252,0.0252,0.0452,0.0703,0.0636,0.0694,0.0979,0.049,0.0428
Recall,Mean,0.6307,0.6307,0.6305,0.6737,0.6737,0.6105,0.6224,0.5946,0.5654,0.5919,0.6148,0.6399
Recall,Std,0.0307,0.0307,0.0307,0.0221,0.0221,0.0362,0.0423,0.0472,0.0534,0.0468,0.0397,0.0315
F1 score,Mean,0.5871,0.5871,0.5862,0.6395,0.6395,0.5617,0.5686,0.5437,0.5194,0.5148,0.5659,0.5971
F1 score,Std,0.0359,0.0359,0.0369,0.0235,0.0235,0.0403,0.0572,0.0564,0.0645,0.0704,0.0454,0.0411
ARI,Mean,0.4296,0.4296,0.4297,0.4653,0.4653,0.4119,0.4278,0.392,0.3497,0.4266,0.4147,0.4317
ARI,Std,0.0347,0.0347,0.0346,0.0316,0.0316,0.0498,0.0404,0.0595,0.0632,0.0285,0.0521,0.035


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 15, 0.5, 50, 3)
tables[8]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.457,0.4569,0.4567,0.5121,0.5116,0.47,0.4863,0.4624,0.441,0.5077,0.4694,0.4602
Accuracy,Std,0.0305,0.0306,0.0311,0.0194,0.0199,0.0305,0.0383,0.0446,0.0559,0.0233,0.0309,0.035
Precision,Mean,0.2902,0.2887,0.289,0.3083,0.3078,0.3269,0.3022,0.3374,0.3429,0.3491,0.3254,0.2913
Precision,Std,0.0355,0.0352,0.0356,0.0204,0.0206,0.044,0.044,0.0448,0.0428,0.0244,0.0421,0.0381
Recall,Mean,0.457,0.4569,0.4567,0.5121,0.5116,0.47,0.4863,0.4624,0.441,0.5077,0.4694,0.4602
Recall,Std,0.0305,0.0306,0.0311,0.0194,0.0199,0.0305,0.0383,0.0446,0.0559,0.0233,0.0309,0.035
F1 score,Mean,0.3322,0.332,0.3317,0.3712,0.3706,0.3575,0.3546,0.3555,0.3401,0.3908,0.3578,0.3328
F1 score,Std,0.0312,0.0313,0.0316,0.021,0.0214,0.0393,0.0437,0.0492,0.0542,0.0246,0.0389,0.037
ARI,Mean,0.4315,0.4315,0.4315,0.5667,0.5668,0.4075,0.4896,0.3801,0.3386,0.49,0.4038,0.4434
ARI,Std,0.0388,0.0389,0.0389,0.031,0.0311,0.0345,0.0311,0.0591,0.0799,0.0236,0.0374,0.0389


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 15, 0.85, 50, 3)
tables[9]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.3508,0.3508,0.3518,0.3428,0.3444,0.3534,0.339,0.3477,0.3379,0.3381,0.3527,0.3489
Accuracy,Std,0.0187,0.0187,0.0191,0.0187,0.0193,0.0179,0.02,0.0185,0.0194,0.0204,0.018,0.0188
Precision,Mean,0.2074,0.2074,0.2082,0.1849,0.1858,0.2107,0.1812,0.2083,0.207,0.1776,0.2098,0.2036
Precision,Std,0.0178,0.0178,0.0181,0.0215,0.0214,0.0174,0.0243,0.0179,0.0187,0.0213,0.0167,0.0196
Recall,Mean,0.3508,0.3508,0.3518,0.3428,0.3444,0.3534,0.339,0.3477,0.3379,0.3381,0.3527,0.3489
Recall,Std,0.0187,0.0187,0.0191,0.0187,0.0193,0.0179,0.02,0.0185,0.0194,0.0204,0.018,0.0188
F1 score,Mean,0.2464,0.2464,0.2474,0.2315,0.2328,0.2489,0.226,0.2448,0.2385,0.2223,0.2485,0.2429
F1 score,Std,0.0178,0.0178,0.0181,0.0193,0.0193,0.017,0.0212,0.0177,0.0177,0.0191,0.0173,0.0193
ARI,Mean,0.2371,0.2371,0.2371,0.2331,0.2331,0.2372,0.2301,0.2312,0.2116,0.2339,0.2383,0.2387
ARI,Std,0.0171,0.0171,0.0171,0.0179,0.0179,0.0167,0.0184,0.0168,0.0227,0.0196,0.0176,0.0173


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.5, 50, 3)
tables[10]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.39,0.39,0.3906,0.5087,0.5083,0.4049,0.3906,0.3807,0.3215,0.3948,0.4012,0.3942
Accuracy,Std,0.0434,0.0434,0.0431,0.0178,0.0171,0.0567,0.0191,0.0708,0.0678,0.0193,0.0592,0.0438
Precision,Mean,0.295,0.295,0.2927,0.2979,0.2977,0.3126,0.2933,0.3083,0.3096,0.3018,0.3086,0.2944
Precision,Std,0.0406,0.0407,0.0402,0.0197,0.0186,0.0537,0.0233,0.058,0.0512,0.0278,0.0509,0.0394
Recall,Mean,0.39,0.39,0.3906,0.5087,0.5083,0.4049,0.3906,0.3807,0.3215,0.3948,0.4012,0.3942
Recall,Std,0.0434,0.0434,0.0431,0.0178,0.0171,0.0567,0.0191,0.0708,0.0678,0.0193,0.0592,0.0438
F1 score,Mean,0.3045,0.3045,0.3047,0.3654,0.3651,0.3227,0.3066,0.2989,0.257,0.3117,0.3168,0.3085
F1 score,Std,0.0385,0.0385,0.0383,0.02,0.0189,0.0549,0.0182,0.0698,0.0597,0.0213,0.0571,0.038
ARI,Mean,0.2347,0.2346,0.2351,0.5637,0.5641,0.2426,0.1957,0.2269,0.1505,0.2166,0.2471,0.2452
ARI,Std,0.0685,0.0685,0.0684,0.0135,0.0135,0.0792,0.0289,0.0818,0.0775,0.0481,0.0817,0.0738


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.85, 50, 3)
tables[11]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.3825,0.3825,0.3823,0.3973,0.3972,0.3836,0.3289,0.3826,0.3801,0.3549,0.3817,0.3832
Accuracy,Std,0.0228,0.0228,0.0226,0.0251,0.025,0.0197,0.033,0.0202,0.0238,0.025,0.0193,0.0227
Precision,Mean,0.271,0.271,0.2708,0.2377,0.2376,0.2768,0.1873,0.2769,0.2816,0.2387,0.2744,0.2722
Precision,Std,0.0255,0.0255,0.0252,0.0205,0.0204,0.0268,0.0365,0.0249,0.028,0.0234,0.0267,0.0253
Recall,Mean,0.3825,0.3825,0.3823,0.3973,0.3972,0.3836,0.3289,0.3826,0.3801,0.3549,0.3817,0.3832
Recall,Std,0.0228,0.0228,0.0226,0.0251,0.025,0.0197,0.033,0.0202,0.0238,0.025,0.0193,0.0227
F1 score,Mean,0.2984,0.2984,0.2982,0.2884,0.2883,0.301,0.2183,0.3001,0.2991,0.2575,0.299,0.2991
F1 score,Std,0.0213,0.0213,0.021,0.0223,0.0222,0.0197,0.0361,0.0192,0.0229,0.0261,0.0195,0.0208
ARI,Mean,0.2066,0.2066,0.2066,0.2612,0.2612,0.2035,0.2198,0.2024,0.1957,0.2281,0.2034,0.2077
ARI,Std,0.0223,0.0223,0.0223,0.0216,0.0216,0.0243,0.0238,0.0236,0.0207,0.0232,0.0233,0.0255


### Generated Dataset / Tree_depth = 4

In [28]:
tables = []

for cluster in [4,8,15]:
    for feature in [6,15]:
        for squeeze in [0.5, 0.85]:
            N, V, k, alpha, nmin = 2000, feature, cluster, squeeze, 50
            Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)
            
            table = compare_metrics_train_test(max_depth=4, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)
            tables.append(table)


N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 1 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 2 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 3 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 4 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 5 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 6 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 7 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 8 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 9 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 10 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 11 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
Finished: 12 iter.

N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
F

In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.5, 50, 4)
tables[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.9932,0.9932,0.9933,0.9927,0.9926,0.9931,0.9933,0.9935,0.9935,0.9931,0.9933,0.9932
Accuracy,Std,0.0038,0.0037,0.0038,0.0035,0.0037,0.0037,0.0037,0.0033,0.0035,0.0034,0.0036,0.0037
Precision,Mean,0.9933,0.9932,0.9934,0.9928,0.9926,0.9932,0.9934,0.9936,0.9935,0.9931,0.9933,0.9932
Precision,Std,0.0038,0.0037,0.0038,0.0035,0.0037,0.0036,0.0036,0.0033,0.0034,0.0033,0.0036,0.0037
Recall,Mean,0.9932,0.9932,0.9933,0.9927,0.9926,0.9931,0.9933,0.9935,0.9935,0.9931,0.9933,0.9932
Recall,Std,0.0038,0.0037,0.0038,0.0035,0.0037,0.0037,0.0037,0.0033,0.0035,0.0034,0.0036,0.0037
F1 score,Mean,0.9932,0.9932,0.9933,0.9927,0.9926,0.9931,0.9933,0.9935,0.9935,0.9931,0.9933,0.9932
F1 score,Std,0.0038,0.0037,0.0039,0.0035,0.0037,0.0037,0.0037,0.0033,0.0035,0.0034,0.0036,0.0037
ARI,Mean,0.9821,0.9819,0.9823,0.9806,0.9803,0.9817,0.9823,0.9828,0.9827,0.9817,0.9821,0.9819
ARI,Std,0.01,0.0096,0.0101,0.0092,0.0097,0.0095,0.0095,0.0087,0.009,0.0086,0.0094,0.0096


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 4, 0.85, 50, 4)
tables[1]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7576,0.7576,0.7571,0.7455,0.7451,0.7529,0.7567,0.7518,0.7493,0.7446,0.7517,0.7529
Accuracy,Std,0.016,0.016,0.0163,0.0165,0.0166,0.0173,0.0161,0.0185,0.0187,0.0171,0.0161,0.016
Precision,Mean,0.7628,0.7628,0.7625,0.7568,0.7567,0.7616,0.7608,0.7627,0.7617,0.7565,0.7565,0.7566
Precision,Std,0.0175,0.0175,0.0174,0.0185,0.0187,0.0196,0.0175,0.0207,0.0204,0.0201,0.0181,0.017
Recall,Mean,0.7576,0.7576,0.7571,0.7455,0.7451,0.7529,0.7567,0.7518,0.7493,0.7446,0.7517,0.7529
Recall,Std,0.016,0.016,0.0163,0.0165,0.0166,0.0173,0.0161,0.0185,0.0187,0.0171,0.0161,0.016
F1 score,Mean,0.7571,0.7571,0.7568,0.7431,0.743,0.7533,0.7553,0.7529,0.7508,0.7448,0.7503,0.7507
F1 score,Std,0.0164,0.0164,0.016,0.0182,0.0181,0.0173,0.0163,0.018,0.0183,0.0173,0.0172,0.0175
ARI,Mean,0.4937,0.4937,0.492,0.4762,0.4753,0.481,0.496,0.4769,0.4707,0.4798,0.4838,0.4894
ARI,Std,0.0273,0.0273,0.0278,0.0329,0.0332,0.0316,0.0286,0.0337,0.0356,0.03,0.0287,0.0268


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 4, 0.5, 50, 4)
tables[2]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.9977,0.9976,0.9975,0.9992,0.9991,0.9975,0.9978,0.9975,0.9976,0.9984,0.9974,0.9978
Accuracy,Std,0.0025,0.0025,0.0027,0.0019,0.0013,0.0026,0.0024,0.0025,0.0024,0.0023,0.0026,0.0024
Precision,Mean,0.9977,0.9976,0.9975,0.9992,0.9991,0.9975,0.9978,0.9975,0.9976,0.9984,0.9974,0.9978
Precision,Std,0.0025,0.0025,0.0027,0.0019,0.0013,0.0025,0.0024,0.0024,0.0023,0.0022,0.0025,0.0024
Recall,Mean,0.9977,0.9976,0.9975,0.9992,0.9991,0.9975,0.9978,0.9975,0.9976,0.9984,0.9974,0.9978
Recall,Std,0.0025,0.0025,0.0027,0.0019,0.0013,0.0026,0.0024,0.0025,0.0024,0.0023,0.0026,0.0024
F1 score,Mean,0.9977,0.9976,0.9975,0.9992,0.9991,0.9975,0.9978,0.9975,0.9976,0.9984,0.9974,0.9978
F1 score,Std,0.0025,0.0025,0.0027,0.0019,0.0013,0.0026,0.0024,0.0025,0.0024,0.0023,0.0026,0.0024
ARI,Mean,0.9939,0.9937,0.9934,0.9978,0.9975,0.9935,0.9941,0.9935,0.9937,0.9957,0.993,0.9941
ARI,Std,0.0065,0.0067,0.0072,0.005,0.0034,0.0068,0.0064,0.0065,0.0063,0.0059,0.0068,0.0063


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 4, 0.85, 50, 4)
tables[3]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.964,0.964,0.964,0.9655,0.9661,0.9627,0.9655,0.9627,0.9618,0.9572,0.9608,0.9608
Accuracy,Std,0.0084,0.0083,0.0083,0.0077,0.0079,0.0087,0.0089,0.0089,0.0092,0.0087,0.0083,0.0085
Precision,Mean,0.9646,0.9646,0.9646,0.9662,0.9668,0.9634,0.9661,0.9633,0.9624,0.9582,0.9616,0.9615
Precision,Std,0.0082,0.0081,0.0081,0.0076,0.0077,0.0085,0.0087,0.0088,0.0091,0.0084,0.0083,0.008
Recall,Mean,0.964,0.964,0.964,0.9655,0.9661,0.9627,0.9655,0.9627,0.9618,0.9572,0.9608,0.9608
Recall,Std,0.0084,0.0083,0.0083,0.0077,0.0079,0.0087,0.0089,0.0089,0.0092,0.0087,0.0083,0.0085
F1 score,Mean,0.964,0.964,0.964,0.9656,0.9662,0.9628,0.9656,0.9627,0.9618,0.9573,0.9609,0.9608
F1 score,Std,0.0083,0.0083,0.0083,0.0077,0.0079,0.0087,0.0089,0.0089,0.0092,0.0087,0.0083,0.0084
ARI,Mean,0.9061,0.9061,0.9061,0.9101,0.9117,0.9029,0.9101,0.9029,0.9006,0.8892,0.898,0.8977
ARI,Std,0.0212,0.021,0.0212,0.0197,0.0201,0.0221,0.0225,0.0229,0.0236,0.0219,0.021,0.0218


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 8, 0.5, 50, 4)
tables[4]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.8782,0.8784,0.8784,0.9897,0.9896,0.802,0.9888,0.7602,0.678,0.8769,0.8166,0.88
Accuracy,Std,0.0676,0.0679,0.0674,0.0044,0.0043,0.0723,0.0049,0.0556,0.0634,0.0423,0.0703,0.0699
Precision,Mean,0.8998,0.9,0.8974,0.99,0.9898,0.8498,0.9892,0.8239,0.7611,0.8395,0.8561,0.9052
Precision,Std,0.0874,0.0876,0.0857,0.0042,0.0041,0.0929,0.0047,0.0954,0.0963,0.0723,0.0945,0.0881
Recall,Mean,0.8782,0.8784,0.8784,0.9897,0.9896,0.802,0.9888,0.7602,0.678,0.8769,0.8166,0.88
Recall,Std,0.0676,0.0679,0.0674,0.0044,0.0043,0.0723,0.0049,0.0556,0.0634,0.0423,0.0703,0.0699
F1 score,Mean,0.8486,0.8488,0.849,0.9897,0.9896,0.7598,0.9888,0.714,0.63,0.8399,0.7748,0.8506
F1 score,Std,0.0876,0.0879,0.0873,0.0044,0.0043,0.0856,0.0049,0.0624,0.0652,0.0548,0.0844,0.0905
ARI,Mean,0.8556,0.8561,0.8552,0.9766,0.9763,0.7482,0.9745,0.6763,0.5446,0.8574,0.7719,0.8563
ARI,Std,0.0661,0.0668,0.0662,0.0097,0.0095,0.1088,0.0111,0.0981,0.1018,0.045,0.1005,0.0688


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 8, 0.85, 50, 4)
tables[5]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6496,0.6496,0.6493,0.6433,0.6434,0.6495,0.638,0.6453,0.6303,0.5874,0.6512,0.651
Accuracy,Std,0.0199,0.0199,0.02,0.0192,0.0191,0.0196,0.0309,0.0219,0.0262,0.0294,0.0191,0.0196
Precision,Mean,0.66,0.66,0.6594,0.6603,0.6599,0.6619,0.6444,0.6574,0.6458,0.5783,0.6626,0.6606
Precision,Std,0.0218,0.0218,0.0217,0.021,0.0209,0.0223,0.0394,0.0209,0.0255,0.0551,0.0222,0.0202
Recall,Mean,0.6496,0.6496,0.6493,0.6433,0.6434,0.6495,0.638,0.6453,0.6303,0.5874,0.6512,0.651
Recall,Std,0.0199,0.0199,0.02,0.0192,0.0191,0.0196,0.0309,0.0219,0.0262,0.0294,0.0191,0.0196
F1 score,Mean,0.6439,0.6439,0.6436,0.6387,0.6388,0.643,0.6297,0.6399,0.6263,0.5548,0.6451,0.6446
F1 score,Std,0.0206,0.0206,0.0206,0.0196,0.0194,0.0194,0.0384,0.0212,0.0269,0.0419,0.0201,0.0204
ARI,Mean,0.3923,0.3923,0.3921,0.3842,0.3846,0.3926,0.3869,0.3879,0.3709,0.3856,0.394,0.3945
ARI,Std,0.026,0.026,0.0265,0.025,0.0246,0.0279,0.026,0.0295,0.033,0.0271,0.0259,0.0265


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 8, 0.5, 50, 4)
tables[6]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.8502,0.8503,0.8502,0.9963,0.9969,0.8134,0.8852,0.8119,0.7981,0.8253,0.8216,0.8556
Accuracy,Std,0.0997,0.0996,0.0999,0.0031,0.003,0.0963,0.0938,0.0931,0.097,0.0907,0.0989,0.0988
Precision,Mean,0.7971,0.7969,0.7942,0.9964,0.9969,0.7778,0.8347,0.7908,0.7905,0.7543,0.7897,0.8002
Precision,Std,0.1217,0.1216,0.1276,0.003,0.0029,0.1169,0.1294,0.1205,0.1215,0.1255,0.1229,0.1222
Recall,Mean,0.8502,0.8503,0.8502,0.9963,0.9969,0.8134,0.8852,0.8119,0.7981,0.8253,0.8216,0.8556
Recall,Std,0.0997,0.0996,0.0999,0.0031,0.003,0.0963,0.0938,0.0931,0.097,0.0907,0.0989,0.0988
F1 score,Mean,0.806,0.806,0.8059,0.9963,0.9969,0.7668,0.8494,0.766,0.7519,0.7757,0.7759,0.8116
F1 score,Std,0.1253,0.1251,0.1256,0.0031,0.003,0.1112,0.1221,0.1062,0.1083,0.1153,0.1171,0.1257
ARI,Mean,0.8335,0.833,0.8331,0.9917,0.993,0.7662,0.8799,0.764,0.7399,0.7923,0.7796,0.8439
ARI,Std,0.1158,0.1157,0.116,0.007,0.0067,0.1403,0.0975,0.1353,0.1436,0.1152,0.1396,0.1077


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 8, 0.85, 50, 4)
tables[7]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7662,0.7662,0.7661,0.7676,0.7667,0.7462,0.7284,0.7306,0.7039,0.6804,0.7598,0.7682
Accuracy,Std,0.0198,0.0198,0.0199,0.0171,0.0176,0.0369,0.0368,0.0398,0.0503,0.03,0.0404,0.0208
Precision,Mean,0.786,0.7861,0.7855,0.7864,0.7861,0.7709,0.7508,0.7569,0.7355,0.7126,0.7813,0.7848
Precision,Std,0.0186,0.0186,0.0192,0.0174,0.0174,0.028,0.0424,0.0363,0.0529,0.0442,0.0324,0.0195
Recall,Mean,0.7662,0.7662,0.7661,0.7676,0.7667,0.7462,0.7284,0.7306,0.7039,0.6804,0.7598,0.7682
Recall,Std,0.0198,0.0198,0.0199,0.0171,0.0176,0.0369,0.0368,0.0398,0.0503,0.03,0.0404,0.0208
F1 score,Mean,0.768,0.7681,0.7678,0.7695,0.7686,0.7481,0.7205,0.7316,0.7015,0.6536,0.7607,0.7691
F1 score,Std,0.0193,0.0193,0.0196,0.017,0.0175,0.0381,0.0472,0.0448,0.0629,0.0366,0.042,0.0203
ARI,Mean,0.5399,0.54,0.5401,0.5431,0.5415,0.515,0.5065,0.4977,0.4616,0.4968,0.5352,0.5452
ARI,Std,0.034,0.034,0.034,0.0287,0.0295,0.0487,0.0409,0.0515,0.0553,0.0331,0.0547,0.0361


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 15, 0.5, 50, 4)
tables[8]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7271,0.7272,0.7267,0.8999,0.9004,0.6941,0.8026,0.6805,0.6535,0.7158,0.692,0.7435
Accuracy,Std,0.0601,0.06,0.0607,0.029,0.0289,0.0388,0.0443,0.0532,0.0675,0.0603,0.0413,0.0585
Precision,Mean,0.6603,0.6605,0.6611,0.859,0.8595,0.6531,0.731,0.6545,0.6648,0.6242,0.6537,0.6718
Precision,Std,0.073,0.073,0.0714,0.0414,0.0412,0.0614,0.0698,0.0665,0.0666,0.0744,0.064,0.0644
Recall,Mean,0.7271,0.7272,0.7267,0.8999,0.9004,0.6941,0.8026,0.6805,0.6535,0.7158,0.692,0.7435
Recall,Std,0.0601,0.06,0.0607,0.029,0.0289,0.0388,0.0443,0.0532,0.0675,0.0603,0.0413,0.0585
F1 score,Mean,0.6605,0.6607,0.6604,0.8726,0.8731,0.6272,0.7479,0.6155,0.5935,0.6361,0.6221,0.6785
F1 score,Std,0.0698,0.0697,0.07,0.038,0.0379,0.046,0.0572,0.0557,0.0686,0.0782,0.0487,0.0667
ARI,Mean,0.6716,0.6719,0.672,0.8571,0.8581,0.6239,0.7599,0.5892,0.5153,0.7082,0.627,0.6903
ARI,Std,0.0648,0.0646,0.0648,0.037,0.0368,0.0518,0.0477,0.0831,0.1131,0.0466,0.0532,0.0672


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 6, 15, 0.85, 50, 4)
tables[9]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.5151,0.5151,0.5152,0.4992,0.4993,0.5116,0.4835,0.5052,0.4783,0.4888,0.514,0.5153
Accuracy,Std,0.0196,0.0196,0.0192,0.0205,0.0206,0.019,0.0258,0.0211,0.0335,0.0295,0.021,0.0198
Precision,Mean,0.4842,0.4842,0.4843,0.4559,0.4565,0.491,0.4274,0.4882,0.4671,0.4284,0.4928,0.4825
Precision,Std,0.0245,0.0245,0.024,0.0266,0.0267,0.0233,0.0436,0.0249,0.033,0.0466,0.0241,0.026
Recall,Mean,0.5151,0.5151,0.5152,0.4992,0.4993,0.5116,0.4835,0.5052,0.4783,0.4888,0.514,0.5153
Recall,Std,0.0196,0.0196,0.0192,0.0205,0.0206,0.019,0.0258,0.0211,0.0335,0.0295,0.021,0.0198
F1 score,Mean,0.4812,0.4812,0.4814,0.4612,0.4615,0.4803,0.433,0.474,0.4466,0.4369,0.4825,0.4801
F1 score,Std,0.0203,0.0203,0.0202,0.0223,0.0225,0.0184,0.0335,0.02,0.0335,0.0377,0.0206,0.022
ARI,Mean,0.2918,0.2918,0.2918,0.2924,0.2926,0.2852,0.2805,0.2804,0.2632,0.2867,0.2868,0.2946
ARI,Std,0.0212,0.0212,0.021,0.02,0.02,0.0208,0.023,0.0218,0.0272,0.0255,0.0224,0.0204


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.5, 50, 4)
tables[10]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.5426,0.5427,0.5437,0.9015,0.902,0.5552,0.5296,0.5362,0.4595,0.5228,0.5557,0.5494
Accuracy,Std,0.0667,0.0666,0.0667,0.0143,0.0143,0.0793,0.0321,0.0843,0.0891,0.029,0.0797,0.0728
Precision,Mean,0.4791,0.4806,0.4785,0.8722,0.8729,0.509,0.4541,0.5087,0.4975,0.4379,0.5129,0.4852
Precision,Std,0.0699,0.0705,0.0689,0.0253,0.0258,0.0748,0.0415,0.0761,0.0653,0.0425,0.0673,0.0692
Recall,Mean,0.5426,0.5427,0.5437,0.9015,0.902,0.5552,0.5296,0.5362,0.4595,0.5228,0.5557,0.5494
Recall,Std,0.0667,0.0666,0.0667,0.0143,0.0143,0.0793,0.0321,0.0843,0.0891,0.029,0.0797,0.0728
F1 score,Mean,0.4712,0.4713,0.4722,0.8788,0.8791,0.4903,0.4605,0.4699,0.4023,0.4476,0.4895,0.4779
F1 score,Std,0.069,0.069,0.0687,0.0189,0.0188,0.0804,0.0375,0.0882,0.0832,0.0376,0.0812,0.0736
ARI,Mean,0.3928,0.3928,0.3936,0.8759,0.8759,0.4045,0.3082,0.3656,0.2599,0.3338,0.404,0.412
ARI,Std,0.0963,0.0962,0.0963,0.0209,0.0204,0.1129,0.0599,0.1117,0.1135,0.0732,0.1177,0.1016


In [None]:
# N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.85, 50, 4)
tables[11]

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.515,0.515,0.5146,0.5573,0.5573,0.5042,0.4682,0.5032,0.4984,0.473,0.5088,0.5171
Accuracy,Std,0.0254,0.0254,0.0253,0.0311,0.0311,0.0249,0.0479,0.025,0.0268,0.0384,0.0262,0.0289
Precision,Mean,0.5298,0.5298,0.5304,0.5278,0.5278,0.5132,0.4113,0.5157,0.5089,0.4343,0.5195,0.5364
Precision,Std,0.0327,0.0327,0.0329,0.043,0.043,0.0341,0.0737,0.036,0.0368,0.0605,0.0369,0.0387
Recall,Mean,0.515,0.515,0.5146,0.5573,0.5573,0.5042,0.4682,0.5032,0.4984,0.473,0.5088,0.5171
Recall,Std,0.0254,0.0254,0.0253,0.0311,0.0311,0.0249,0.0479,0.025,0.0268,0.0384,0.0262,0.0289
F1 score,Mean,0.49,0.49,0.4899,0.5251,0.5251,0.4778,0.403,0.4759,0.4704,0.4082,0.4795,0.4914
F1 score,Std,0.0266,0.0266,0.0266,0.0363,0.0363,0.026,0.0592,0.0255,0.0257,0.0507,0.0272,0.029
ARI,Mean,0.2821,0.2821,0.2815,0.3485,0.3485,0.2793,0.298,0.2772,0.2735,0.3072,0.2819,0.2824
ARI,Std,0.0308,0.0308,0.0308,0.0263,0.0263,0.0304,0.0364,0.0279,0.0288,0.0318,0.0313,0.0346


## Real World Datasets (UCI Repository)

### Glass

In [12]:
df = pd.read_csv('../Datasets/glass.csv')
X = df.drop('Type', axis=1).to_numpy()
y = df['Type'].to_numpy()

In [13]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Finished: 51 iter.
Finished: 52 iter.
Finished: 53 iter.
Finished: 54 iter.
Finished: 55 iter.
Finished: 56 iter.
Finished: 57 iter.
Finished: 58 iter.
Finished: 59 iter.
Finished: 60 iter.
Finished: 61 iter.
Finished: 62 iter.
Finished: 63 iter.
Finished: 64 iter.
Finished: 65 iter.
Finished: 66 iter.
Finished: 67 iter.
Finished: 68 iter.
Finished: 69 iter.
Finished: 70 iter.
Finished: 71 iter.
Finished: 72 iter.
Finished: 73 iter.
Finished: 74 iter.
Finished: 75 iter.
Finished: 76 iter.
Finished: 77 iter.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Finished: 78 iter.
Finished: 79 iter.
Finished: 80 iter.
Finished: 81 iter.
Finished: 82 iter.
Finished: 83 iter.
Finished: 84 iter.
Finished: 85 iter.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Finished: 86 iter.
Finished: 87 iter.
Finished: 88 iter.
Finished: 89 iter.
Finished: 90 iter.
Finished: 91 iter.
Finished: 92 iter.
Finished: 93 iter.
Finished: 94 iter.
Finished: 95 iter.
Finished: 96 iter.
Finished: 97 iter.
Finished: 98 iter.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Finished: 99 iter.
Finished: 100 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = -log(p_l),b = -p_l * log(p_l),b = -p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Accuracy,Mean,0.6548,0.6185,0.643,0.6093,0.6281,0.6189,0.567,0.6633,0.6526
Accuracy,Std,0.0608,0.0647,0.0601,0.0962,0.055,0.0672,0.0985,0.0641,0.0589
Precision,Mean,0.6054,0.6024,0.6273,0.5723,0.6119,0.5971,0.4913,0.6188,0.5955
Precision,Std,0.0703,0.0862,0.0708,0.1369,0.0746,0.0823,0.1728,0.08,0.0703
Recall,Mean,0.6548,0.6185,0.643,0.6093,0.6281,0.6189,0.567,0.6633,0.6526
Recall,Std,0.0608,0.0647,0.0601,0.0962,0.055,0.0672,0.0985,0.0641,0.0589
F1 score,Mean,0.6155,0.5964,0.6203,0.5575,0.6051,0.5922,0.4907,0.6276,0.6096
F1 score,Std,0.0642,0.0736,0.0651,0.1244,0.0615,0.07,0.1414,0.0713,0.062
ARI,Mean,0.3179,0.2858,0.3134,0.3027,0.2909,0.2778,0.2938,0.3327,0.316
ARI,Std,0.0783,0.0857,0.0825,0.0796,0.0706,0.085,0.0664,0.0891,0.0828


In [9]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6578,0.6585,0.6615,0.6552,0.6607,0.6544,0.6393,0.6315,0.627,0.5907,0.6692,0.6652
Accuracy,Std,0.0569,0.0568,0.0558,0.0557,0.0595,0.0692,0.0763,0.0545,0.0567,0.105,0.0576,0.0641
Precision,Mean,0.6515,0.6517,0.6527,0.6711,0.6767,0.6649,0.6286,0.6455,0.6504,0.5599,0.6654,0.6459
Precision,Std,0.079,0.079,0.0765,0.0588,0.0586,0.0777,0.1033,0.0577,0.0552,0.168,0.0732,0.0842
Recall,Mean,0.6578,0.6585,0.6615,0.6552,0.6607,0.6544,0.6393,0.6315,0.627,0.5907,0.6692,0.6652
Recall,Std,0.0569,0.0568,0.0558,0.0557,0.0595,0.0692,0.0763,0.0545,0.0567,0.105,0.0576,0.0641
F1 score,Mean,0.6386,0.6393,0.6417,0.6449,0.6515,0.6414,0.6138,0.6208,0.6174,0.5373,0.6529,0.6425
F1 score,Std,0.0632,0.0633,0.0626,0.0583,0.0618,0.0736,0.0926,0.0551,0.0533,0.1437,0.0623,0.0698
ARI,Mean,0.3119,0.3126,0.3159,0.3061,0.3171,0.3043,0.3047,0.2794,0.2718,0.3,0.3308,0.3285
ARI,Std,0.0825,0.0823,0.0822,0.0689,0.0776,0.0954,0.0763,0.0727,0.0674,0.0838,0.0816,0.083


### Breast-cancer-prognostic

In [187]:
df = pd.read_csv('../DATASETS/breast_cancer_prognostic.csv')
df = df.drop('Unnamed: 0', axis=1)
df['Outcome'] = df['Outcome'].map({'R': 1, 'N': 0})
X = df.drop('Outcome', axis = 1).to_numpy()
y = df['Outcome'].to_numpy()
df

Unnamed: 0,Outcome,Mean_Radius,Mean_Texture,Mean_Perimeter,Mean_Area,Mean_Smoothness,Mean_Compactness,Mean_Concavity,Mean_Concave_Points,Mean_Symmetry,...,Worst_Perimeter,Worst_Area,Worst_Smoothness,Worst_Compactness,Worst_Concavity,Worst_Concave_Points,Worst_Symmetry,Worst_Fractal_Dimension,Tumor_Size,Lymph_Node_Status
0,0,18.02,27.60,117.50,1013.0,0.09489,0.10360,0.10860,0.07055,0.1865,...,139.70,1436.0,0.11950,0.1926,0.3140,0.11700,0.2677,0.08113,5.0,5.0
1,0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,184.60,2019.0,0.16220,0.6656,0.7119,0.26540,0.4601,0.11890,3.0,2.0
2,0,21.37,17.44,137.50,1373.0,0.08836,0.11890,0.12550,0.08180,0.2333,...,159.10,1949.0,0.11880,0.3449,0.3414,0.20320,0.4334,0.09067,2.5,0.0
3,0,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,98.87,567.7,0.20980,0.8663,0.6869,0.25750,0.6638,0.17300,2.0,0.0
4,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,152.20,1575.0,0.13740,0.2050,0.4000,0.16250,0.2364,0.07678,3.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,0,14.72,25.26,99.28,657.5,0.11740,0.21120,0.17290,0.09465,0.2079,...,111.60,814.8,0.14640,0.5352,0.5655,0.19740,0.3778,0.11320,1.7,21.0
190,0,22.52,21.92,146.90,1597.0,0.07592,0.09162,0.06862,0.06367,0.1728,...,162.10,1902.0,0.08191,0.1319,0.1056,0.09378,0.2061,0.05788,6.0,2.0
191,0,15.44,31.18,101.00,740.4,0.09399,0.10620,0.13750,0.06500,0.1735,...,112.60,929.0,0.12720,0.2362,0.2975,0.12860,0.2914,0.08024,1.5,0.0
192,0,17.17,29.19,110.00,915.3,0.08952,0.06655,0.06583,0.05068,0.1793,...,132.50,1295.0,0.12610,0.1572,0.2141,0.09520,0.3362,0.06033,3.7,0.0


In [63]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6955,0.6959,0.6947,0.711,0.711,0.6967,0.6955,0.6959,0.6967,0.6947,0.6959,0.6963
Accuracy,Std,0.0731,0.072,0.0721,0.0817,0.077,0.0726,0.0723,0.072,0.072,0.0736,0.0708,0.0725
Precision,Mean,0.6558,0.6563,0.6557,0.6492,0.6507,0.6565,0.6561,0.6563,0.6572,0.6559,0.6562,0.6547
Precision,Std,0.0874,0.0869,0.0876,0.0819,0.083,0.0887,0.087,0.0869,0.0872,0.0874,0.0866,0.0873
Recall,Mean,0.6955,0.6959,0.6947,0.711,0.711,0.6967,0.6955,0.6959,0.6967,0.6947,0.6959,0.6963
Recall,Std,0.0731,0.072,0.0721,0.0817,0.077,0.0726,0.0723,0.072,0.072,0.0736,0.0708,0.0725
F1 score,Mean,0.6646,0.665,0.6649,0.6657,0.6672,0.6659,0.6647,0.665,0.6658,0.6644,0.665,0.6648
F1 score,Std,0.0783,0.0776,0.0787,0.0737,0.0709,0.0782,0.0778,0.0776,0.0779,0.0784,0.077,0.0777
ARI,Mean,0.0175,0.0176,0.0184,0.0055,0.0055,0.0207,0.0174,0.0176,0.0192,0.0177,0.017,0.0177
ARI,Std,0.0887,0.0887,0.0899,0.0609,0.0623,0.0913,0.0888,0.0887,0.0869,0.0889,0.0883,0.0899


In [169]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6629,0.6629,0.6698,0.6882,0.6841,0.6633,0.6625,0.662,0.6625,0.66,0.6633,0.6608
Accuracy,Std,0.0639,0.0655,0.0621,0.0868,0.0832,0.065,0.0657,0.0658,0.0652,0.0665,0.0634,0.0661
Precision,Mean,0.6516,0.652,0.6485,0.6641,0.6546,0.6528,0.6517,0.6516,0.652,0.6505,0.6511,0.6508
Precision,Std,0.08,0.0779,0.0826,0.0842,0.0848,0.0789,0.078,0.078,0.0779,0.0783,0.0784,0.0782
Recall,Mean,0.6629,0.6629,0.6698,0.6882,0.6841,0.6633,0.6625,0.662,0.6625,0.66,0.6633,0.6608
Recall,Std,0.0639,0.0655,0.0621,0.0868,0.0832,0.065,0.0657,0.0658,0.0652,0.0665,0.0634,0.0661
F1 score,Mean,0.6499,0.6503,0.6528,0.6628,0.6575,0.6512,0.65,0.6497,0.6502,0.6483,0.65,0.6489
F1 score,Std,0.0696,0.0696,0.0709,0.0811,0.0822,0.0699,0.0698,0.0698,0.0695,0.0704,0.069,0.0701
ARI,Mean,0.0079,0.0078,0.0042,0.0257,0.0167,0.009,0.0073,0.0073,0.0077,0.0061,0.0065,0.0064
ARI,Std,0.065,0.0622,0.0655,0.0726,0.069,0.0614,0.0621,0.0621,0.0621,0.0652,0.0623,0.0617


### Congressional voting records

In [234]:
df = pd.read_csv('../DATASETS/congressional-voting-records.csv')
X = df.drop('Class Name', axis=1).to_numpy()
y = df['Class Name'].to_numpy()
df

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0
2,0,1,0,0,0,1,1,1,1,1,0,1,0,1,1,0,0
3,0,1,0,0,0,1,1,1,1,1,0,0,0,1,1,0,0
4,0,1,1,0,0,1,1,1,1,1,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,0,0,0,0,0,1,0,1,1,1,0,1,0,1,0,0,0
228,0,1,1,0,0,1,1,1,1,1,1,0,0,1,1,0,0
229,1,0,1,1,1,0,0,1,1,0,1,0,1,0,0,1,0
230,1,0,1,0,1,0,0,0,0,1,1,0,1,0,0,1,0


In [65]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.9493,0.9493,0.95,0.9504,0.9507,0.9486,0.9493,0.9493,0.9493,0.9493,0.9493,0.9493
Accuracy,Std,0.027,0.027,0.0265,0.0273,0.0272,0.0266,0.027,0.027,0.027,0.027,0.027,0.027
Precision,Mean,0.9524,0.9524,0.9527,0.9534,0.9535,0.9517,0.9524,0.9524,0.9524,0.9524,0.9524,0.9524
Precision,Std,0.0241,0.0241,0.0237,0.0243,0.0244,0.0237,0.0241,0.0241,0.0241,0.0241,0.0241,0.0241
Recall,Mean,0.9493,0.9493,0.95,0.9504,0.9507,0.9486,0.9493,0.9493,0.9493,0.9493,0.9493,0.9493
Recall,Std,0.027,0.027,0.0265,0.0273,0.0272,0.0266,0.027,0.027,0.027,0.027,0.027,0.027
F1 score,Mean,0.9493,0.9493,0.9499,0.9503,0.9506,0.9486,0.9493,0.9493,0.9493,0.9493,0.9493,0.9493
F1 score,Std,0.0271,0.0271,0.0266,0.0274,0.0272,0.0267,0.0271,0.0271,0.0271,0.0271,0.0271,0.0271
ARI,Mean,0.807,0.807,0.8095,0.8109,0.8121,0.8044,0.807,0.807,0.807,0.807,0.807,0.807
ARI,Std,0.0965,0.0965,0.0948,0.0977,0.0973,0.0948,0.0965,0.0965,0.0965,0.0965,0.0965,0.0965


In [66]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.9483,0.9486,0.9455,0.9493,0.95,0.9493,0.9493,0.9486,0.9486,0.9486,0.9483,0.9483
Accuracy,Std,0.0272,0.0275,0.0281,0.0279,0.0281,0.0277,0.0272,0.0275,0.0275,0.0275,0.0272,0.0272
Precision,Mean,0.9513,0.9517,0.9486,0.9521,0.953,0.9523,0.9521,0.9517,0.9517,0.9517,0.9513,0.9513
Precision,Std,0.0241,0.0244,0.0247,0.0249,0.0242,0.0247,0.0243,0.0244,0.0244,0.0244,0.0241,0.0241
Recall,Mean,0.9483,0.9486,0.9455,0.9493,0.95,0.9493,0.9493,0.9486,0.9486,0.9486,0.9483,0.9483
Recall,Std,0.0272,0.0275,0.0281,0.0279,0.0281,0.0277,0.0272,0.0275,0.0275,0.0275,0.0272,0.0272
F1 score,Mean,0.9483,0.9486,0.9455,0.9493,0.95,0.9493,0.9493,0.9486,0.9486,0.9486,0.9483,0.9483
F1 score,Std,0.0272,0.0275,0.0281,0.0279,0.0281,0.0277,0.0273,0.0275,0.0275,0.0275,0.0272,0.0272
ARI,Mean,0.8033,0.8046,0.7935,0.8073,0.8098,0.8072,0.8071,0.8046,0.8046,0.8046,0.8033,0.8033
ARI,Std,0.0969,0.0982,0.0994,0.0999,0.1006,0.099,0.0975,0.0982,0.0982,0.0982,0.0969,0.0969


### Balance-scale

In [186]:
df = pd.read_csv('../Datasets/balance-scale-preprocessed.csv')
X = df.drop('Class', axis=1).to_numpy()
y = df['Class'].to_numpy()
df

Unnamed: 0,Class,L-Weight,L-Distance,R-Weight,R-Distance
0,0,1,1,1,1
1,2,1,1,1,2
2,2,1,1,1,3
3,2,1,1,1,4
4,2,1,1,1,5
...,...,...,...,...,...
620,1,5,5,5,1
621,1,5,5,5,2
622,1,5,5,5,3
623,1,5,5,5,4


In [68]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7092,0.7092,0.7092,0.6989,0.6989,0.7055,0.7085,0.7056,0.7046,0.7118,0.7108,0.7121
Accuracy,Std,0.0326,0.0326,0.0326,0.0266,0.0266,0.0319,0.031,0.0314,0.0317,0.0304,0.0307,0.0351
Precision,Mean,0.6612,0.6612,0.6614,0.6509,0.6511,0.6573,0.66,0.6571,0.6574,0.6651,0.6639,0.6648
Precision,Std,0.037,0.037,0.037,0.0316,0.0316,0.0353,0.0355,0.0342,0.0351,0.0337,0.0335,0.0364
Recall,Mean,0.7092,0.7092,0.7092,0.6989,0.6989,0.7055,0.7085,0.7056,0.7046,0.7118,0.7108,0.7121
Recall,Std,0.0326,0.0326,0.0326,0.0266,0.0266,0.0319,0.031,0.0314,0.0317,0.0304,0.0307,0.0351
F1 score,Mean,0.6811,0.6811,0.6811,0.6707,0.6707,0.6774,0.6804,0.6774,0.6764,0.6835,0.6825,0.6839
F1 score,Std,0.0344,0.0344,0.0344,0.0291,0.0292,0.0338,0.0326,0.0331,0.0339,0.0316,0.0316,0.0352
ARI,Mean,0.2427,0.2427,0.2427,0.2219,0.2219,0.2361,0.2414,0.2364,0.2339,0.2473,0.2454,0.2494
ARI,Std,0.0606,0.0606,0.0606,0.0457,0.0457,0.0581,0.0568,0.0581,0.0574,0.06,0.0608,0.0713


In [69]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7825,0.7825,0.7818,0.7434,0.7442,0.7767,0.7782,0.7753,0.772,0.7761,0.7833,0.7847
Accuracy,Std,0.032,0.032,0.0316,0.0269,0.0261,0.0334,0.031,0.0308,0.0315,0.0276,0.0255,0.0279
Precision,Mean,0.7334,0.7334,0.7357,0.7032,0.7067,0.7331,0.7295,0.7302,0.7288,0.7277,0.7341,0.7341
Precision,Std,0.0357,0.0357,0.0373,0.0311,0.0326,0.0383,0.0328,0.0391,0.0395,0.0299,0.0316,0.0305
Recall,Mean,0.7825,0.7825,0.7818,0.7434,0.7442,0.7767,0.7782,0.7753,0.772,0.7761,0.7833,0.7847
Recall,Std,0.032,0.032,0.0316,0.0269,0.0261,0.0334,0.031,0.0308,0.0315,0.0276,0.0255,0.0279
F1 score,Mean,0.7551,0.7551,0.7556,0.7179,0.7199,0.7502,0.7508,0.7485,0.7454,0.7487,0.7558,0.7567
F1 score,Std,0.0332,0.0332,0.0337,0.0283,0.028,0.0342,0.031,0.0324,0.0334,0.0277,0.0274,0.0283
ARI,Mean,0.4173,0.4173,0.4169,0.3209,0.3239,0.4031,0.4063,0.3981,0.39,0.4,0.4176,0.4212
ARI,Std,0.0757,0.0757,0.0751,0.0649,0.0633,0.0773,0.0744,0.0726,0.0745,0.0672,0.0625,0.0697


### Blood-transfusion

In [70]:
df = pd.read_csv('../Datasets/blood-transfusion.csv')
X = df.drop('whether he/she donated blood in March 2007', axis=1).to_numpy()
y = df['whether he/she donated blood in March 2007'].to_numpy()

In [71]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7757,0.7757,0.7751,0.7752,0.7744,0.7757,0.7757,0.7757,0.7757,0.7757,0.7757,0.7757
Accuracy,Std,0.0295,0.0295,0.0303,0.0314,0.0319,0.0295,0.0295,0.0295,0.0295,0.0295,0.0295,0.0295
Precision,Mean,0.7496,0.7496,0.7489,0.7522,0.7516,0.7496,0.7496,0.7496,0.7496,0.7496,0.7496,0.7496
Precision,Std,0.0417,0.0417,0.0423,0.049,0.049,0.0417,0.0417,0.0417,0.0417,0.0417,0.0417,0.0417
Recall,Mean,0.7757,0.7757,0.7751,0.7752,0.7744,0.7757,0.7757,0.7757,0.7757,0.7757,0.7757,0.7757
Recall,Std,0.0295,0.0295,0.0303,0.0314,0.0319,0.0295,0.0295,0.0295,0.0295,0.0295,0.0295,0.0295
F1 score,Mean,0.7473,0.7473,0.7468,0.7517,0.7512,0.7473,0.7473,0.7473,0.7473,0.7473,0.7473,0.7473
F1 score,Std,0.0453,0.0453,0.046,0.044,0.0442,0.0453,0.0453,0.0453,0.0453,0.0453,0.0453,0.0453
ARI,Mean,0.1829,0.1829,0.1821,0.1937,0.1925,0.1829,0.1829,0.1829,0.1829,0.1829,0.1829,0.1829
ARI,Std,0.0856,0.0856,0.0863,0.0832,0.0836,0.0856,0.0856,0.0856,0.0856,0.0856,0.0856,0.0856


In [72]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7761,0.7761,0.7752,0.7726,0.7725,0.7761,0.7761,0.7761,0.7761,0.7761,0.7761,0.7761
Accuracy,Std,0.0299,0.0299,0.0294,0.0308,0.0306,0.0299,0.0299,0.0299,0.0299,0.0299,0.0299,0.0299
Precision,Mean,0.7635,0.7635,0.7622,0.7607,0.7601,0.7635,0.7635,0.7635,0.7635,0.7635,0.7635,0.7635
Precision,Std,0.0353,0.0353,0.0352,0.0338,0.0335,0.0353,0.0354,0.0353,0.0353,0.0353,0.0353,0.0353
Recall,Mean,0.7761,0.7761,0.7752,0.7726,0.7725,0.7761,0.7761,0.7761,0.7761,0.7761,0.7761,0.7761
Recall,Std,0.0299,0.0299,0.0294,0.0308,0.0306,0.0299,0.0299,0.0299,0.0299,0.0299,0.0299,0.0299
F1 score,Mean,0.7599,0.7599,0.7585,0.7589,0.7585,0.7599,0.7599,0.7599,0.7599,0.7599,0.7599,0.7599
F1 score,Std,0.0382,0.0382,0.0375,0.0357,0.0354,0.0382,0.0382,0.0382,0.0382,0.0382,0.0382,0.0382
ARI,Mean,0.2122,0.2122,0.2087,0.2097,0.2084,0.2122,0.2122,0.2122,0.2122,0.2122,0.2122,0.2122
ARI,Std,0.0752,0.0752,0.0732,0.068,0.0679,0.0752,0.0751,0.0752,0.0752,0.0752,0.0752,0.0752


### Car-evaluation

In [16]:
import pandas as pd

with open('/Users/user/HSE 24:25/A Family of Classifiying criteria/Datasets/car+evaluation/car.names', 'r') as f:
    column_names = []
    for line in f:
        line = line.strip()
        if ':' in line and not line.startswith(('|', '#')):
            column_names.append(line.split(':')[0].strip())

print("Columns:", column_names)

df = pd.read_csv('/Users/user/HSE 24:25/A Family of Classifiying criteria/Datasets/car+evaluation/car.data', 
                 header=None, 
                 names=column_names,
                 delimiter=',')

df = df[['1. Title', '2. Sources', '(a) Creator', '(b) Donors', '(c) Date',
       '3. Past Usage','M. Bohanec and V. Rajkovic']]
df.columns = ['buying', 'maint', 'doors', 'persons','lug_boot','safety','class']

for column in df.columns:
    df[column], _ = pd.factorize(df[column])
    
X = df.drop('class', axis=1).to_numpy()
y = df['class'].to_numpy()
X

Columns: ['1. Title', '2. Sources', '(a) Creator', '(b) Donors', '(c) Date', '3. Past Usage', 'M. Bohanec and V. Rajkovic', 'B. Zupan, M. Bohanec, I. Bratko, J. Demsar', '4. Relevant Information Paragraph', '(M. Bohanec, V. Rajkovic', 'cars according to the following concept structure', 'concept (CAR), the model includes three intermediate concepts', 'these examples sets see http', 'attributes', '5. Number of Instances', '6. Number of Attributes', '7. Attribute Values', '8. Missing Attribute Values']


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 2],
       ...,
       [3, 3, 3, 2, 2, 0],
       [3, 3, 3, 2, 2, 1],
       [3, 3, 3, 2, 2, 2]], shape=(1728, 6))

In [14]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = -log(p_l),b = -p_l * log(p_l),b = -p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Accuracy,Mean,0.7878,0.7887,0.7863,0.7887,0.7896,0.7969,0.7887,0.7872,0.7887
Accuracy,Std,0.019,0.019,0.0166,0.019,0.015,0.0147,0.019,0.0181,0.019
Precision,Mean,0.73,0.73,0.733,0.73,0.7452,0.767,0.73,0.7301,0.73
Precision,Std,0.0271,0.0271,0.0278,0.0271,0.0306,0.0207,0.0271,0.0274,0.0271
Recall,Mean,0.7878,0.7887,0.7863,0.7887,0.7896,0.7969,0.7887,0.7872,0.7887
Recall,Std,0.019,0.019,0.0166,0.019,0.015,0.0147,0.019,0.0181,0.019
F1 score,Mean,0.7549,0.7554,0.7554,0.7554,0.7627,0.7759,0.7554,0.7545,0.7554
F1 score,Std,0.017,0.0172,0.0164,0.0172,0.0197,0.0164,0.0172,0.0161,0.0172
ARI,Mean,0.5061,0.5088,0.4982,0.5088,0.5008,0.5109,0.5088,0.5041,0.5088
ARI,Std,0.053,0.0527,0.048,0.0527,0.0389,0.0379,0.0527,0.0532,0.0527


In [15]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = -log(p_l),b = -p_l * log(p_l),b = -p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Accuracy,Mean,0.8443,0.8339,0.8422,0.8443,0.8363,0.8304,0.8443,0.8444,0.8443
Accuracy,Std,0.0151,0.0183,0.0183,0.0151,0.0193,0.0182,0.0151,0.0152,0.0151
Precision,Mean,0.8451,0.8523,0.8434,0.8458,0.8385,0.8319,0.8458,0.8445,0.8458
Precision,Std,0.0223,0.0207,0.0214,0.0222,0.0181,0.0144,0.0222,0.0224,0.0222
Recall,Mean,0.8443,0.8339,0.8422,0.8443,0.8363,0.8304,0.8443,0.8444,0.8443
Recall,Std,0.0151,0.0183,0.0183,0.0151,0.0193,0.0182,0.0151,0.0152,0.0151
F1 score,Mean,0.8371,0.8308,0.8339,0.8376,0.8254,0.816,0.8376,0.8367,0.8376
F1 score,Std,0.0186,0.0198,0.0219,0.0181,0.0224,0.0176,0.0181,0.0188,0.0181
ARI,Mean,0.6821,0.6506,0.6665,0.6836,0.6351,0.6047,0.6836,0.6794,0.6836
ARI,Std,0.0489,0.0554,0.0671,0.0464,0.072,0.0553,0.0464,0.0534,0.0464


### Connectionist-bench-sonar

In [179]:
df = pd.read_csv('../Datasets/connectionist-bench-sonar.csv')
df

Unnamed: 0,Freq_1,Freq_2,Freq_3,Freq_4,Freq_5,Freq_6,Freq_7,Freq_8,Freq_9,Freq_10,...,Freq_52,Freq_53,Freq_54,Freq_55,Freq_56,Freq_57,Freq_58,Freq_59,Freq_60,Label
0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.0140,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.2280,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.0180,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.0100,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.0150,0.0085,0.0073,0.0050,0.0044,0.0040,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.0590,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.0110,0.0015,0.0072,0.0048,0.0107,0.0094,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.0187,0.0346,0.0168,0.0177,0.0393,0.1630,0.2028,0.1694,0.2328,0.2684,...,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157,M
204,0.0323,0.0101,0.0298,0.0564,0.0760,0.0958,0.0990,0.1018,0.1030,0.2154,...,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067,M
205,0.0522,0.0437,0.0180,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.0160,0.0029,0.0051,0.0062,0.0089,0.0140,0.0138,0.0077,0.0031,M
206,0.0303,0.0353,0.0490,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048,M


In [180]:
df['Label'] = df['Label'].map({'R':0, 'M':1})
X = df.drop('Label', axis=1).to_numpy()
y = df['Label'].to_numpy()

In [181]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7162,0.7138,0.7173,0.7231,0.7212,0.7158,0.7165,0.7135,0.7138,0.7138,0.7158,0.7127
Accuracy,Std,0.0671,0.0661,0.0674,0.0673,0.0642,0.0659,0.0667,0.066,0.0661,0.0661,0.0681,0.0651
Precision,Mean,0.7292,0.7274,0.7303,0.7466,0.7446,0.7301,0.73,0.727,0.7275,0.7274,0.7294,0.7263
Precision,Std,0.0703,0.069,0.0706,0.0703,0.0703,0.0678,0.0697,0.0681,0.0689,0.069,0.0707,0.0679
Recall,Mean,0.7162,0.7138,0.7173,0.7231,0.7212,0.7158,0.7165,0.7135,0.7138,0.7138,0.7158,0.7127
Recall,Std,0.0671,0.0661,0.0674,0.0673,0.0642,0.0659,0.0667,0.066,0.0661,0.0661,0.0681,0.0651
F1 score,Mean,0.7133,0.7108,0.7151,0.719,0.7167,0.7125,0.7136,0.7106,0.7108,0.7108,0.7128,0.7097
F1 score,Std,0.0688,0.0679,0.0677,0.0682,0.0656,0.0675,0.0685,0.0674,0.0679,0.0679,0.0698,0.0668
ARI,Mean,0.1892,0.1846,0.1913,0.2011,0.196,0.1879,0.1897,0.1839,0.1846,0.1846,0.1891,0.1821
ARI,Std,0.1148,0.1125,0.1136,0.1188,0.1097,0.1128,0.1141,0.1122,0.1125,0.1125,0.1172,0.1101


In [182]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7131,0.7115,0.7185,0.7212,0.7181,0.7135,0.7115,0.7119,0.7108,0.7131,0.7127,0.7119
Accuracy,Std,0.0655,0.0644,0.0646,0.0655,0.0716,0.0662,0.0658,0.0635,0.0659,0.0647,0.0644,0.0638
Precision,Mean,0.7229,0.7211,0.7289,0.7321,0.7312,0.7239,0.7215,0.7216,0.7206,0.7222,0.7228,0.7214
Precision,Std,0.0655,0.0642,0.0646,0.0672,0.0706,0.065,0.0663,0.063,0.0657,0.0646,0.0641,0.0632
Recall,Mean,0.7131,0.7115,0.7185,0.7212,0.7181,0.7135,0.7115,0.7119,0.7108,0.7131,0.7127,0.7119
Recall,Std,0.0655,0.0644,0.0646,0.0655,0.0716,0.0662,0.0658,0.0635,0.0659,0.0647,0.0644,0.0638
F1 score,Mean,0.7118,0.7101,0.7177,0.7196,0.7165,0.7119,0.7103,0.7106,0.7095,0.7118,0.7114,0.7106
F1 score,Std,0.0667,0.0658,0.0654,0.0661,0.0715,0.0676,0.0669,0.0648,0.0672,0.0659,0.0657,0.0652
ARI,Mean,0.183,0.1797,0.1919,0.1971,0.1951,0.184,0.1804,0.1799,0.1792,0.1825,0.1817,0.1801
ARI,Std,0.1125,0.1118,0.1147,0.1125,0.121,0.1173,0.1142,0.1106,0.1126,0.1112,0.1127,0.111


### Contraceptive-method-choice

In [196]:
import pandas as pd

# Quick column extraction without function
with open('/Users/user/HSE 24:25/Term Paper 2025/Datasets/contraceptive+method+choice/cmc.names', 'r') as f:
    column_names = []
    for line in f:
        line = line.strip()
        if ':' in line and not line.startswith(('|', '#')):
            column_names.append(line.split(':')[0].strip())

print("Columns:", column_names)

# Read data
df = pd.read_csv('/Users/user/HSE 24:25/Term Paper 2025/Datasets/contraceptive+method+choice/cmc.data', 
                 header=None, 
                 names=column_names,
                 delimiter=',')

df = df[['1. Title', '2. Sources', '(a) Origin', '(b) Creator', '(c) Donor', '(c) Date', '3. Past Usage', '(ftp', '(http', '4. Relevant Information']]
df.columns = ['w_age', 'w_education','h_education', 'N_children','w_religion','w_working','h_occupation','sol_index','media_exposure','class']

X = df.drop('class', axis=1).to_numpy()
y = df['class'].to_numpy()

Columns: ['1. Title', '2. Sources', '(a) Origin', '(b) Creator', '(c) Donor', '(c) Date', '3. Past Usage', '(ftp', '(http', '4. Relevant Information', '5. Number of Instances', '6. Number of Attributes', '7. Attribute Information', '8. Missing Attribute Values']


In [199]:
df

Unnamed: 0,w_age,w_education,h_education,N_children,w_religion,w_working,h_occupation,sol_index,media_exposure,class
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1
...,...,...,...,...,...,...,...,...,...,...
1468,33,4,4,2,1,0,2,4,0,3
1469,33,4,4,3,1,1,1,4,0,3
1470,39,3,3,8,1,0,1,4,0,3
1471,33,3,3,4,1,0,2,2,0,3


In [145]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.5118,0.5118,0.5119,0.5022,0.5018,0.5128,0.5131,0.5133,0.513,0.5215,0.5109,0.5143
Accuracy,Std,0.0269,0.0269,0.0271,0.0288,0.0291,0.0291,0.0279,0.0316,0.0296,0.0246,0.0257,0.0267
Precision,Mean,0.5677,0.5677,0.5677,0.5505,0.5462,0.5374,0.5754,0.5147,0.5043,0.5677,0.5604,0.5764
Precision,Std,0.0564,0.0564,0.0566,0.0746,0.0771,0.0677,0.0524,0.0656,0.0734,0.0604,0.0581,0.051
Recall,Mean,0.5118,0.5118,0.5119,0.5022,0.5018,0.5128,0.5131,0.5133,0.513,0.5215,0.5109,0.5143
Recall,Std,0.0269,0.0269,0.0271,0.0288,0.0291,0.0291,0.0279,0.0316,0.0296,0.0246,0.0257,0.0267
F1 score,Mean,0.4992,0.4992,0.4993,0.4867,0.4857,0.4894,0.5052,0.4886,0.4873,0.5146,0.4942,0.5069
F1 score,Std,0.0317,0.0317,0.032,0.0417,0.0427,0.0389,0.0348,0.0428,0.0437,0.0387,0.0343,0.0345
ARI,Mean,0.0676,0.0676,0.0677,0.0607,0.0606,0.0687,0.0698,0.0735,0.0732,0.0808,0.0659,0.0709
ARI,Std,0.0229,0.0229,0.023,0.0245,0.0247,0.0284,0.0244,0.0279,0.0258,0.0223,0.023,0.0241


In [146]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.5512,0.5512,0.5505,0.5442,0.5433,0.5433,0.5497,0.5373,0.5306,0.5492,0.5477,0.5524
Accuracy,Std,0.0237,0.0237,0.0235,0.0259,0.026,0.0252,0.0227,0.0241,0.0268,0.0223,0.0246,0.0209
Precision,Mean,0.5648,0.5648,0.5638,0.5634,0.562,0.5594,0.5618,0.56,0.5565,0.5652,0.5625,0.5628
Precision,Std,0.0284,0.0284,0.028,0.0361,0.0368,0.0291,0.026,0.0295,0.0401,0.0263,0.0291,0.0262
Recall,Mean,0.5512,0.5512,0.5505,0.5442,0.5433,0.5433,0.5497,0.5373,0.5306,0.5492,0.5477,0.5524
Recall,Std,0.0237,0.0237,0.0235,0.0259,0.026,0.0252,0.0227,0.0241,0.0268,0.0223,0.0246,0.0209
F1 score,Mean,0.5455,0.5455,0.5452,0.5378,0.5371,0.5363,0.545,0.5306,0.522,0.5469,0.5419,0.5472
F1 score,Std,0.0244,0.0244,0.0244,0.0323,0.0323,0.0258,0.0242,0.0238,0.0305,0.0232,0.025,0.0232
ARI,Mean,0.1115,0.1115,0.1108,0.103,0.1021,0.1035,0.1111,0.0953,0.0886,0.1115,0.1084,0.1141
ARI,Std,0.0266,0.0266,0.0261,0.0304,0.03,0.0294,0.026,0.0262,0.0266,0.0259,0.0281,0.0244


### Haberman-survival

In [147]:
import pandas as pd

with open('/Users/user/HSE 24:25/Term Paper 2025/Datasets/haberman+s+survival/haberman.names', 'r') as f:
    column_names = []
    for line in f:
        line = line.strip()
        if ':' in line and not line.startswith(('|', '#')):
            column_names.append(line.split(':')[0].strip())

print("Columns:", column_names)

df = pd.read_csv('/Users/user/HSE 24:25/Term Paper 2025/Datasets/haberman+s+survival/haberman.data', 
                 header=None, 
                 names=column_names,
                 delimiter=',')

df = df[['1. Title', '2. Sources', '(a) Donor', '(b) Date']]
df.columns = ['age', 'operation_year','positive_auxillary_nodes','survival_status']

X = df.drop('survival_status', axis=1).to_numpy()
y = df['survival_status'].to_numpy()

Columns: ['1. Title', '2. Sources', '(a) Donor', '(b) Date', '3. Past Usage', 'discussion), Journal of the American Statistical Association 79', '4. Relevant Information', '5. Number of Instances', '6. Number of Attributes', '7. Attribute Information', '8. Missing Attribute Values']


In [148]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7343,0.7343,0.7358,0.7322,0.734,0.7343,0.7343,0.7348,0.7343,0.7343,0.7343,0.7343
Accuracy,Std,0.0462,0.0462,0.0463,0.047,0.0467,0.0462,0.0462,0.0459,0.0462,0.0462,0.0462,0.0462
Precision,Mean,0.715,0.715,0.7158,0.7137,0.7171,0.715,0.715,0.7155,0.715,0.715,0.715,0.715
Precision,Std,0.0697,0.0697,0.07,0.0679,0.0693,0.0697,0.0697,0.0698,0.0697,0.0697,0.0697,0.0697
Recall,Mean,0.7343,0.7343,0.7358,0.7322,0.734,0.7343,0.7343,0.7348,0.7343,0.7343,0.7343,0.7343
Recall,Std,0.0462,0.0462,0.0463,0.047,0.0467,0.0462,0.0462,0.0459,0.0462,0.0462,0.0462,0.0462
F1 score,Mean,0.7095,0.7095,0.7109,0.7103,0.7102,0.7095,0.7095,0.71,0.7095,0.7095,0.7095,0.7095
F1 score,Std,0.06,0.06,0.0594,0.0614,0.0641,0.06,0.06,0.0601,0.06,0.06,0.06,0.06
ARI,Mean,0.1244,0.1244,0.1268,0.1265,0.1284,0.1244,0.1244,0.1253,0.1244,0.1244,0.1244,0.1244
ARI,Std,0.0856,0.0856,0.0866,0.0883,0.092,0.0856,0.0856,0.0857,0.0856,0.0856,0.0856,0.0856


In [149]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7322,0.7319,0.7322,0.7239,0.7249,0.7319,0.7306,0.7319,0.7319,0.7317,0.7317,0.7317
Accuracy,Std,0.0472,0.047,0.0439,0.0464,0.0434,0.047,0.0467,0.047,0.047,0.0472,0.0472,0.0472
Precision,Mean,0.7239,0.7237,0.7185,0.7083,0.7053,0.7237,0.7216,0.7237,0.7237,0.7233,0.7233,0.7233
Precision,Std,0.0556,0.0555,0.0552,0.0598,0.0585,0.0555,0.0561,0.0555,0.0555,0.0555,0.0555,0.0555
Recall,Mean,0.7322,0.7319,0.7322,0.7239,0.7249,0.7319,0.7306,0.7319,0.7319,0.7317,0.7317,0.7317
Recall,Std,0.0472,0.047,0.0439,0.0464,0.0434,0.047,0.0467,0.047,0.047,0.0472,0.0472,0.0472
F1 score,Mean,0.7178,0.7176,0.7139,0.7085,0.7069,0.7176,0.7156,0.7176,0.7176,0.7173,0.7173,0.7173
F1 score,Std,0.0493,0.0491,0.0487,0.053,0.0515,0.0491,0.0497,0.0491,0.0491,0.0491,0.0491,0.0491
ARI,Mean,0.1375,0.137,0.1281,0.1191,0.1143,0.137,0.1334,0.137,0.137,0.1363,0.1363,0.1363
ARI,Std,0.0776,0.0772,0.0753,0.0893,0.0831,0.0772,0.0766,0.0772,0.0772,0.078,0.078,0.078


### Hayes-roth

In [208]:
df = pd.read_csv('/Users/user/HSE 24:25/Term Paper 2025/Datasets/hayes+roth/hayes-roth.data', header=None)
df.columns = ['name','hobby','age','educational level','marital status','class']

X = df.drop(['class','name'],axis=1).to_numpy()
y = df['class'].to_numpy()

In [209]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.5503,0.5503,0.5503,0.5528,0.5528,0.537,0.5528,0.5364,0.5479,0.5528,0.5358,0.5528
Accuracy,Std,0.0628,0.0628,0.0628,0.0602,0.0602,0.0437,0.0602,0.0536,0.0635,0.0602,0.0443,0.0602
Precision,Mean,0.373,0.373,0.373,0.3704,0.3704,0.5486,0.3704,0.5613,0.5848,0.3704,0.5169,0.3704
Precision,Std,0.0605,0.0605,0.0605,0.0585,0.0585,0.1309,0.0585,0.1251,0.1242,0.0585,0.1364,0.0585
Recall,Mean,0.5503,0.5503,0.5503,0.5528,0.5528,0.537,0.5528,0.5364,0.5479,0.5528,0.5358,0.5528
Recall,Std,0.0628,0.0628,0.0628,0.0602,0.0602,0.0437,0.0602,0.0536,0.0635,0.0602,0.0443,0.0602
F1 score,Mean,0.4243,0.4243,0.4243,0.4241,0.4241,0.5131,0.4241,0.5189,0.5337,0.4241,0.4952,0.4241
F1 score,Std,0.063,0.063,0.063,0.063,0.063,0.0843,0.063,0.0886,0.0895,0.063,0.0843,0.063
ARI,Mean,0.4584,0.4584,0.4584,0.4657,0.4657,0.1953,0.4657,0.1766,0.1751,0.4657,0.2334,0.4657
ARI,Std,0.0893,0.0893,0.0893,0.0698,0.0698,0.1582,0.0698,0.1362,0.1309,0.0698,0.1777,0.0698


In [207]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6564,0.6564,0.6564,0.6564,0.6564,0.5952,0.6564,0.6194,0.6303,0.6564,0.6,0.6564
Accuracy,Std,0.0448,0.0448,0.0448,0.0448,0.0448,0.0907,0.0448,0.0964,0.0913,0.0448,0.0831,0.0448
Precision,Mean,0.6759,0.6759,0.6759,0.6759,0.6759,0.6597,0.6759,0.7019,0.7028,0.6759,0.6529,0.6759
Precision,Std,0.0625,0.0625,0.0625,0.0625,0.0625,0.083,0.0625,0.092,0.0964,0.0625,0.0763,0.0625
Recall,Mean,0.6564,0.6564,0.6564,0.6564,0.6564,0.5952,0.6564,0.6194,0.6303,0.6564,0.6,0.6564
Recall,Std,0.0448,0.0448,0.0448,0.0448,0.0448,0.0907,0.0448,0.0964,0.0913,0.0448,0.0831,0.0448
F1 score,Mean,0.6551,0.6551,0.6551,0.6551,0.6551,0.5896,0.6551,0.6095,0.62,0.6551,0.5979,0.6551
F1 score,Std,0.0476,0.0476,0.0476,0.0476,0.0476,0.0942,0.0476,0.0967,0.0941,0.0476,0.0872,0.0476
ARI,Mean,0.3449,0.3449,0.3449,0.3449,0.3449,0.2539,0.3449,0.255,0.2709,0.3449,0.2583,0.3449
ARI,Std,0.0757,0.0757,0.0757,0.0757,0.0757,0.118,0.0757,0.1244,0.1087,0.0757,0.1053,0.0757


### Heart-disease-Cleveland

In [153]:
df = pd.read_csv('../Datasets/Heart_disease_cleveland_new.csv')
X = df.drop('target', axis=1).to_numpy()
y = df['target'].to_numpy()

In [154]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7858,0.7858,0.7861,0.7832,0.7824,0.7858,0.7858,0.7858,0.7858,0.7858,0.7858,0.7858
Accuracy,Std,0.0412,0.0412,0.0418,0.0433,0.0449,0.0412,0.0412,0.0412,0.0412,0.0412,0.0412,0.0412
Precision,Mean,0.7904,0.7904,0.791,0.7891,0.7888,0.7904,0.7904,0.7904,0.7904,0.7904,0.7904,0.7905
Precision,Std,0.0429,0.0429,0.0432,0.0428,0.043,0.0429,0.0429,0.0429,0.0429,0.0429,0.0429,0.0429
Recall,Mean,0.7858,0.7858,0.7861,0.7832,0.7824,0.7858,0.7858,0.7858,0.7858,0.7858,0.7858,0.7858
Recall,Std,0.0412,0.0412,0.0418,0.0433,0.0449,0.0412,0.0412,0.0412,0.0412,0.0412,0.0412,0.0412
F1 score,Mean,0.7846,0.7846,0.7848,0.7816,0.7806,0.7846,0.7846,0.7846,0.7846,0.7846,0.7846,0.7846
F1 score,Std,0.0412,0.0412,0.0419,0.0437,0.0458,0.0412,0.0412,0.0412,0.0412,0.0412,0.0412,0.0412
ARI,Mean,0.3244,0.3244,0.3252,0.3191,0.3179,0.3244,0.3244,0.3244,0.3244,0.3244,0.3244,0.3244
ARI,Std,0.0955,0.0955,0.0948,0.0969,0.0992,0.0955,0.0955,0.0955,0.0955,0.0955,0.0955,0.0955


In [155]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.7466,0.7482,0.7487,0.7579,0.7597,0.7479,0.7482,0.7482,0.7482,0.7476,0.7476,0.7479
Accuracy,Std,0.0427,0.0442,0.037,0.043,0.0423,0.0441,0.0447,0.0442,0.0442,0.0438,0.0438,0.0443
Precision,Mean,0.7565,0.758,0.7583,0.7664,0.7698,0.7577,0.7581,0.758,0.758,0.7574,0.7574,0.7577
Precision,Std,0.0433,0.0445,0.0376,0.0432,0.042,0.0445,0.0451,0.0445,0.0445,0.0443,0.0443,0.0447
Recall,Mean,0.7466,0.7482,0.7487,0.7579,0.7597,0.7479,0.7482,0.7482,0.7482,0.7476,0.7476,0.7479
Recall,Std,0.0427,0.0442,0.037,0.043,0.0423,0.0441,0.0447,0.0442,0.0442,0.0438,0.0438,0.0443
F1 score,Mean,0.7444,0.746,0.7464,0.7558,0.7571,0.7457,0.746,0.746,0.746,0.7454,0.7454,0.7457
F1 score,Std,0.0433,0.0449,0.0376,0.0436,0.0429,0.0448,0.0453,0.0449,0.0449,0.0445,0.0445,0.0449
ARI,Mean,0.2404,0.2441,0.2428,0.2636,0.2673,0.2436,0.2443,0.2441,0.2441,0.243,0.243,0.2436
ARI,Std,0.0823,0.0863,0.0713,0.0903,0.0894,0.0862,0.0868,0.0863,0.0863,0.0855,0.0855,0.0865


### Hepatitis

In [3]:
df = pd.read_csv('../Datasets/hepatitis_csv.csv')
df = df.dropna()
df = df.replace({True: 1, False: 0})
df['sex'] = df['sex'].map({'female': 0, 'male': 1})
df['class'] = df['class'].map({'live': 1, 'die': 0})

X = df.drop('class',axis=1).to_numpy()
y = df['class'].to_numpy()
df

  df = df.replace({True: 1, False: 0})


Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
5,34,0,1,0,0,0,0,1,0,0,0,0,0,0.9,95.0,28.0,4.0,75.0,0,1
10,39,0,0,1,0,0,0,0,1,0,0,0,0,1.3,78.0,30.0,4.4,85.0,0,1
11,32,0,1,1,1,0,0,1,1,0,1,0,0,1.0,59.0,249.0,3.7,54.0,0,1
12,41,0,1,1,1,0,0,1,1,0,0,0,0,0.9,81.0,60.0,3.9,52.0,0,1
13,30,0,1,0,1,0,0,1,1,0,0,0,0,2.2,57.0,144.0,4.9,78.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,45,0,1,1,0,0,0,1,0,0,0,0,0,1.3,85.0,44.0,4.2,85.0,1,1
143,49,0,0,0,1,1,0,1,0,1,1,0,0,1.4,85.0,70.0,3.5,35.0,1,0
145,31,0,0,0,1,0,0,1,0,0,0,0,0,1.2,75.0,173.0,4.2,54.0,1,1
153,53,1,0,0,1,0,0,1,0,1,1,0,1,1.5,81.0,19.0,4.1,48.0,1,1


In [4]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = -log(p_l),b = -p_l * log(p_l),b = -p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Accuracy,Mean,0.811,0.815,0.81,0.808,0.809,0.81,0.813,0.817,0.809
Accuracy,Std,0.0666,0.0763,0.07,0.0681,0.0676,0.0693,0.0699,0.0704,0.0683
Precision,Mean,0.8166,0.8372,0.8134,0.8137,0.8152,0.8138,0.8215,0.8245,0.8138
Precision,Std,0.0977,0.0925,0.1014,0.1001,0.0991,0.1002,0.0988,0.0973,0.1002
Recall,Mean,0.811,0.815,0.81,0.808,0.809,0.81,0.813,0.817,0.809
Recall,Std,0.0666,0.0763,0.07,0.0681,0.0676,0.0693,0.0699,0.0704,0.0683
F1 score,Mean,0.8071,0.8126,0.805,0.8038,0.8053,0.8049,0.8098,0.8136,0.8044
F1 score,Std,0.0783,0.0815,0.0822,0.0805,0.0794,0.0814,0.0808,0.0798,0.0809
ARI,Mean,0.1992,0.2306,0.1947,0.1937,0.197,0.1941,0.2141,0.2256,0.1938
ARI,Std,0.1894,0.2384,0.1916,0.1905,0.1902,0.1901,0.193,0.1989,0.1905


In [5]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = -log(p_l),b = -p_l * log(p_l),b = -p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Accuracy,Mean,0.81,0.811,0.808,0.806,0.808,0.809,0.806,0.809,0.808
Accuracy,Std,0.0686,0.0673,0.0717,0.0712,0.0703,0.0705,0.0712,0.0705,0.0703
Precision,Mean,0.828,0.835,0.8242,0.8233,0.8267,0.8252,0.8257,0.8283,0.8252
Precision,Std,0.0919,0.0899,0.0964,0.0961,0.0935,0.0949,0.0961,0.0941,0.0948
Recall,Mean,0.81,0.811,0.808,0.806,0.808,0.809,0.806,0.809,0.808
Recall,Std,0.0686,0.0673,0.0717,0.0712,0.0703,0.0705,0.0712,0.0705,0.0703
F1 score,Mean,0.8129,0.8104,0.8101,0.808,0.8109,0.8108,0.8088,0.8118,0.8101
F1 score,Std,0.0763,0.0792,0.0807,0.0797,0.0777,0.0797,0.0794,0.0771,0.0791
ARI,Mean,0.2294,0.2148,0.2242,0.2178,0.226,0.2257,0.2221,0.2296,0.224
ARI,Std,0.1911,0.2136,0.1944,0.1987,0.1926,0.1931,0.1943,0.192,0.1931


### Ionosphere

In [18]:
df = pd.read_csv('/Users/user/HSE 24:25/A Family of Classifiying criteria/Datasets/ionosphere/ionosphere.data', 
                 header=None,
                 delimiter=',')
df = pd.get_dummies(df, drop_first=True, dtype=int)
X = df.drop('34_g', axis=1).to_numpy()
y = df['34_g'].to_numpy()

In [230]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.8866,0.8866,0.8877,0.8941,0.8959,0.887,0.8868,0.8866,0.8866,0.8859,0.8866,0.8866
Accuracy,Std,0.0418,0.0418,0.0373,0.0422,0.036,0.0408,0.0415,0.0418,0.0418,0.0421,0.0418,0.0418
Precision,Mean,0.8924,0.8924,0.8937,0.8993,0.9012,0.8931,0.8927,0.8924,0.8924,0.8917,0.8924,0.8924
Precision,Std,0.0368,0.0368,0.0326,0.0362,0.0293,0.0351,0.0362,0.0368,0.0368,0.0373,0.0368,0.0368
Recall,Mean,0.8866,0.8866,0.8877,0.8941,0.8959,0.887,0.8868,0.8866,0.8866,0.8859,0.8866,0.8866
Recall,Std,0.0418,0.0418,0.0373,0.0422,0.036,0.0408,0.0415,0.0418,0.0418,0.0421,0.0418,0.0418
F1 score,Mean,0.8857,0.8857,0.8872,0.893,0.8951,0.8861,0.8859,0.8857,0.8857,0.885,0.8857,0.8857
F1 score,Std,0.0445,0.0445,0.0394,0.0455,0.0385,0.0435,0.0442,0.0445,0.0445,0.0448,0.0445,0.0445
ARI,Mean,0.5963,0.5963,0.5986,0.6201,0.624,0.5974,0.597,0.5963,0.5963,0.5942,0.5963,0.5963
ARI,Std,0.1265,0.1265,0.1148,0.1259,0.1102,0.1242,0.1255,0.1265,0.1265,0.1271,0.1265,0.1265


In [231]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.882,0.8816,0.8818,0.8798,0.875,0.8816,0.882,0.8816,0.8816,0.8811,0.882,0.8823
Accuracy,Std,0.0322,0.0322,0.0301,0.038,0.0365,0.0315,0.0318,0.0322,0.0322,0.0321,0.0321,0.0317
Precision,Mean,0.886,0.8854,0.886,0.8854,0.8799,0.8857,0.8862,0.8854,0.8854,0.8851,0.8858,0.8861
Precision,Std,0.0323,0.0323,0.0291,0.0372,0.0376,0.0313,0.0316,0.0323,0.0323,0.0321,0.0324,0.0319
Recall,Mean,0.882,0.8816,0.8818,0.8798,0.875,0.8816,0.882,0.8816,0.8816,0.8811,0.882,0.8823
Recall,Std,0.0322,0.0322,0.0301,0.038,0.0365,0.0315,0.0318,0.0322,0.0322,0.0321,0.0321,0.0317
F1 score,Mean,0.8795,0.8791,0.8793,0.877,0.872,0.8791,0.8795,0.8791,0.8791,0.8785,0.8796,0.8798
F1 score,Std,0.0335,0.0334,0.0312,0.0387,0.0375,0.0327,0.033,0.0334,0.0334,0.0333,0.0333,0.033
ARI,Mean,0.5757,0.5743,0.575,0.5706,0.5551,0.5742,0.5756,0.5743,0.5743,0.5728,0.5757,0.5763
ARI,Std,0.1016,0.1016,0.0933,0.1142,0.1108,0.1002,0.0998,0.1016,0.1016,0.1013,0.1013,0.1003


### Mammographic Mass

In [19]:
df = pd.read_csv('../Datasets/Mammographic Mass.csv')
X = df.drop('Severity', axis=1).to_numpy()
y = df['Severity'].to_numpy()

In [163]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.843,0.8428,0.8439,0.8398,0.8394,0.8428,0.8428,0.8428,0.8428,0.8428,0.8428,0.8428
Accuracy,Std,0.0228,0.0226,0.0214,0.0215,0.0213,0.0226,0.0226,0.0226,0.0226,0.0226,0.0226,0.0226
Precision,Mean,0.8485,0.8484,0.8495,0.8451,0.8449,0.8484,0.8484,0.8484,0.8484,0.8484,0.8484,0.8484
Precision,Std,0.0214,0.0212,0.0201,0.0201,0.02,0.0212,0.0212,0.0212,0.0212,0.0212,0.0212,0.0212
Recall,Mean,0.843,0.8428,0.8439,0.8398,0.8394,0.8428,0.8428,0.8428,0.8428,0.8428,0.8428,0.8428
Recall,Std,0.0228,0.0226,0.0214,0.0215,0.0213,0.0226,0.0226,0.0226,0.0226,0.0226,0.0226,0.0226
F1 score,Mean,0.8421,0.8419,0.8429,0.8389,0.8385,0.8419,0.8419,0.8419,0.8419,0.8419,0.8419,0.8419
F1 score,Std,0.0231,0.0229,0.0218,0.0218,0.0216,0.0229,0.0229,0.0229,0.0229,0.0229,0.0229,0.0229
ARI,Mean,0.47,0.4695,0.4722,0.4611,0.46,0.4695,0.4695,0.4695,0.4695,0.4695,0.4695,0.4695
ARI,Std,0.0618,0.0613,0.0583,0.0575,0.0569,0.0613,0.0613,0.0613,0.0613,0.0613,0.0613,0.0613


In [164]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.8262,0.8262,0.8268,0.8304,0.8311,0.8261,0.8262,0.8262,0.8262,0.8262,0.8262,0.8262
Accuracy,Std,0.0232,0.0232,0.0235,0.0245,0.0243,0.0233,0.0231,0.0232,0.0232,0.0232,0.0233,0.0232
Precision,Mean,0.8314,0.8313,0.8321,0.8353,0.8359,0.8312,0.8314,0.8313,0.8313,0.8313,0.8314,0.8313
Precision,Std,0.0228,0.0228,0.0231,0.0238,0.0238,0.0231,0.0228,0.0228,0.0228,0.0228,0.0229,0.0228
Recall,Mean,0.8262,0.8262,0.8268,0.8304,0.8311,0.8261,0.8262,0.8262,0.8262,0.8262,0.8262,0.8262
Recall,Std,0.0232,0.0232,0.0235,0.0245,0.0243,0.0233,0.0231,0.0232,0.0232,0.0232,0.0233,0.0232
F1 score,Mean,0.8257,0.8256,0.8262,0.8298,0.8304,0.8255,0.8256,0.8256,0.8256,0.8256,0.8257,0.8256
F1 score,Std,0.0232,0.0232,0.0236,0.0246,0.0245,0.0234,0.0231,0.0232,0.0232,0.0232,0.0233,0.0232
ARI,Mean,0.4251,0.4249,0.4267,0.4363,0.438,0.4247,0.4249,0.4249,0.4249,0.4249,0.4252,0.4249
ARI,Std,0.0596,0.0596,0.0602,0.0636,0.0634,0.0601,0.0594,0.0596,0.0596,0.0596,0.0599,0.0596


### Monks

In [165]:
df = pd.read_csv('../Datasets/monk.csv')
df = df.drop('id', axis=1)
X = df.drop("'class'", axis=1).to_numpy()
y = df["'class'"].to_numpy()

In [166]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6119,0.6119,0.613,0.6146,0.6156,0.6119,0.6119,0.6119,0.6119,0.6119,0.6119,0.6119
Accuracy,Std,0.0352,0.0352,0.0355,0.0357,0.0359,0.0352,0.0352,0.0352,0.0352,0.0352,0.0352,0.0352
Precision,Mean,0.5455,0.5455,0.5453,0.5435,0.5433,0.5455,0.5455,0.5455,0.5455,0.5455,0.5455,0.5455
Precision,Std,0.0815,0.0815,0.0815,0.084,0.084,0.0815,0.0815,0.0815,0.0815,0.0815,0.0815,0.0815
Recall,Mean,0.6119,0.6119,0.613,0.6146,0.6156,0.6119,0.6119,0.6119,0.6119,0.6119,0.6119,0.6119
Recall,Std,0.0352,0.0352,0.0355,0.0357,0.0359,0.0352,0.0352,0.0352,0.0352,0.0352,0.0352,0.0352
F1 score,Mean,0.5585,0.5585,0.5578,0.5575,0.5568,0.5585,0.5585,0.5585,0.5585,0.5585,0.5585,0.5585
F1 score,Std,0.0511,0.0511,0.0512,0.0514,0.0515,0.0511,0.0511,0.0511,0.0511,0.0511,0.0511,0.0511
ARI,Mean,0.0114,0.0114,0.0115,0.0116,0.0117,0.0114,0.0114,0.0114,0.0114,0.0114,0.0114,0.0114
ARI,Std,0.0227,0.0227,0.0228,0.0224,0.0225,0.0227,0.0227,0.0227,0.0227,0.0227,0.0227,0.0227


In [167]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.6151,0.6151,0.6147,0.6128,0.6127,0.6151,0.6151,0.6151,0.6151,0.6151,0.6151,0.6151
Accuracy,Std,0.0376,0.0376,0.0352,0.0365,0.035,0.0376,0.0376,0.0376,0.0376,0.0376,0.0376,0.0376
Precision,Mean,0.6137,0.6137,0.6089,0.6095,0.605,0.6137,0.6137,0.6137,0.6137,0.6137,0.6137,0.6137
Precision,Std,0.0468,0.0468,0.0485,0.0455,0.0478,0.0468,0.0468,0.0468,0.0468,0.0468,0.0468,0.0468
Recall,Mean,0.6151,0.6151,0.6147,0.6128,0.6127,0.6151,0.6151,0.6151,0.6151,0.6151,0.6151,0.6151
Recall,Std,0.0376,0.0376,0.0352,0.0365,0.035,0.0376,0.0376,0.0376,0.0376,0.0376,0.0376,0.0376
F1 score,Mean,0.6022,0.6022,0.5975,0.5992,0.5948,0.6022,0.6022,0.6022,0.6022,0.6022,0.6022,0.6022
F1 score,Std,0.0449,0.0449,0.0457,0.043,0.0446,0.0449,0.0449,0.0449,0.0449,0.0449,0.0449,0.0449
ARI,Mean,0.0387,0.0387,0.0352,0.0351,0.0322,0.0387,0.0387,0.0387,0.0387,0.0387,0.0387,0.0387
ARI,Std,0.0312,0.0312,0.031,0.03,0.0307,0.0312,0.0312,0.0312,0.0312,0.0312,0.0312,0.0312


### Spambase

In [20]:
df = pd.read_csv('../Datasets/spambase.csv')
X = df.drop('spam', axis=1).to_numpy()
y = df['spam'].to_numpy()

In [None]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

In [None]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l),b = p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Accuracy,Mean,0.8976,0.8976,0.8975,0.8983,0.898,0.8976,0.8976,0.8976,0.8976,0.8976,0.8976,0.8976
Accuracy,Std,0.0104,0.0104,0.0105,0.0085,0.0086,0.0104,0.0104,0.0104,0.0104,0.0104,0.0104,0.0104
Precision,Mean,0.8995,0.8995,0.8994,0.8998,0.8995,0.8995,0.8995,0.8995,0.8995,0.8995,0.8996,0.8996
Precision,Std,0.0105,0.0105,0.0105,0.0084,0.0085,0.0105,0.0105,0.0105,0.0105,0.0105,0.0105,0.0105
Recall,Mean,0.8976,0.8976,0.8975,0.8983,0.898,0.8976,0.8976,0.8976,0.8976,0.8976,0.8976,0.8976
Recall,Std,0.0104,0.0104,0.0105,0.0085,0.0086,0.0104,0.0104,0.0104,0.0104,0.0104,0.0104,0.0104
F1 score,Mean,0.8964,0.8964,0.8962,0.8971,0.8968,0.8964,0.8964,0.8964,0.8964,0.8964,0.8964,0.8964
F1 score,Std,0.0104,0.0104,0.0104,0.0087,0.0088,0.0104,0.0104,0.0104,0.0104,0.0104,0.0104,0.0104
ARI,Mean,0.631,0.631,0.6306,0.6331,0.632,0.631,0.6309,0.631,0.631,0.631,0.631,0.631
ARI,Std,0.033,0.033,0.0331,0.027,0.0274,0.033,0.033,0.033,0.033,0.033,0.033,0.033


### Abalone

In [22]:
df = pd.read_csv('../Datasets/abalone/abalone.data', header=None)
df.columns = ['Sex','Length','Diameter','Height','Whole_weight','Shucked_weight','Viscera_weight','Shell_weight','Rings']

### Mushrooms

In [24]:
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('../Datasets/mushroom/agaricus-lepiota.data', header=None)
df.columns = ['cap-shape','cap-surface','cap-color','bruises?','odor','gill-attachment','gill-spacing','gill-size','gill-color','stalk-shape',
              'stalk-root','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring',
              'veil-type','veil-color','ring-number','ring-type','spore-print-color','population','habitat', 'class']

for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    
X = df.drop('class', axis=1).to_numpy()
y = df['class'].to_numpy()

In [5]:
compare_metrics_train_test(max_depth=3, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = -log(p_l),b = -p_l * log(p_l),b = -p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Accuracy,Mean,0.6008,0.6069,0.6012,0.583,0.6181,0.6181,0.5807,0.6003,0.597
Accuracy,Std,0.0112,0.013,0.011,0.0133,0.0098,0.0098,0.0093,0.011,0.0133
Precision,Mean,0.5078,0.5392,0.5063,0.5275,0.4701,0.4701,0.5154,0.5177,0.5334
Precision,Std,0.0635,0.0418,0.0658,0.049,0.0124,0.0124,0.0344,0.069,0.0627
Recall,Mean,0.6008,0.6069,0.6012,0.583,0.6181,0.6181,0.5807,0.6003,0.597
Recall,Std,0.0112,0.013,0.011,0.0133,0.0098,0.0098,0.0093,0.011,0.0133
F1 score,Mean,0.5262,0.5166,0.5254,0.5106,0.524,0.524,0.5053,0.5292,0.5177
F1 score,Std,0.0221,0.0213,0.0229,0.023,0.0117,0.0117,0.0237,0.0242,0.0245
ARI,Mean,0.3096,0.2979,0.3232,0.2755,0.3593,0.3593,0.2476,0.3076,0.2894
ARI,Std,0.017,0.0198,0.0198,0.0306,0.012,0.012,0.0277,0.0168,0.0206


In [6]:
compare_metrics_train_test(max_depth=4, X=X, y=y)

Finished: 1 iter.
Finished: 2 iter.
Finished: 3 iter.
Finished: 4 iter.
Finished: 5 iter.
Finished: 6 iter.
Finished: 7 iter.
Finished: 8 iter.
Finished: 9 iter.
Finished: 10 iter.
Finished: 11 iter.
Finished: 12 iter.
Finished: 13 iter.
Finished: 14 iter.
Finished: 15 iter.
Finished: 16 iter.
Finished: 17 iter.
Finished: 18 iter.
Finished: 19 iter.
Finished: 20 iter.
Finished: 21 iter.
Finished: 22 iter.
Finished: 23 iter.
Finished: 24 iter.
Finished: 25 iter.
Finished: 26 iter.
Finished: 27 iter.
Finished: 28 iter.
Finished: 29 iter.
Finished: 30 iter.
Finished: 31 iter.
Finished: 32 iter.
Finished: 33 iter.
Finished: 34 iter.
Finished: 35 iter.
Finished: 36 iter.
Finished: 37 iter.
Finished: 38 iter.
Finished: 39 iter.
Finished: 40 iter.
Finished: 41 iter.
Finished: 42 iter.
Finished: 43 iter.
Finished: 44 iter.
Finished: 45 iter.
Finished: 46 iter.
Finished: 47 iter.
Finished: 48 iter.
Finished: 49 iter.
Finished: 50 iter.

N, V, k, alpha, nmin, max_depth = (None, None, None, None,

Unnamed: 0_level_0,Unnamed: 1_level_0,b = 1,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = -log(p_l),b = -p_l * log(p_l),b = -p_l^0.5 * log(p_l)
Metric,Statistic,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Accuracy,Mean,0.6373,0.6403,0.6362,0.6078,0.6377,0.6455,0.6111,0.6368,0.6352
Accuracy,Std,0.0123,0.0103,0.0119,0.0171,0.0093,0.0101,0.0114,0.0124,0.0174
Precision,Mean,0.5943,0.6432,0.603,0.6077,0.5951,0.5918,0.6431,0.595,0.6332
Precision,Std,0.0415,0.0591,0.041,0.0501,0.0702,0.0694,0.0635,0.0395,0.0465
Recall,Mean,0.6373,0.6403,0.6362,0.6078,0.6377,0.6455,0.6111,0.6368,0.6352
Recall,Std,0.0123,0.0103,0.0119,0.0171,0.0093,0.0101,0.0114,0.0124,0.0174
F1 score,Mean,0.5995,0.594,0.602,0.5418,0.5802,0.5846,0.5671,0.6,0.5863
F1 score,Std,0.0271,0.0298,0.0269,0.0264,0.0268,0.0278,0.031,0.026,0.034
ARI,Mean,0.3995,0.3921,0.3944,0.3254,0.3565,0.3712,0.3142,0.3991,0.3781
ARI,Std,0.0249,0.029,0.0257,0.0386,0.012,0.0129,0.0251,0.0249,0.0311
