In [2]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score

class DecisionTree:
    """Binary decision-tree classifier with pluggable split criteria.

    Supported ``criterion`` values:

    * ``'entropy'`` / ``'gini'`` -- impurity decrease of the parent node
      (information gain).
    * ``'custom_1'`` ... ``'custom_7'`` -- statistics built from the squared
      deviations ``(p_jl - p_j * p_l)**2`` between the observed and the
      "independent" joint frequencies of child ``j`` and class ``l``.  The
      variants differ only in the per-class weight ``b``.  They require the
      one-hot encoded labels ``y_oh`` to be passed to :meth:`fit`.

    After :meth:`fit`, ``tree`` holds the model as nested dicts with keys
    ``'feature_index'``, ``'threshold'``, ``'left'``, ``'right'``; a leaf is
    stored as the bare class label.  ``feature_importances`` holds the summed
    split gains per feature, normalised to sum to 1 when any gain is positive.

    NOTE(review): for 'gini'/'entropy' the accumulated gains are not weighted
    by node size, so the importances are not directly comparable with
    scikit-learn's ``feature_importances_``.
    """

    # Lower bound used where a logarithm or a denominator could hit zero.
    _EPSILON = 1e-10
    # Criteria that are computed from one-hot labels instead of raw labels.
    _CUSTOM_CRITERIA = ('custom_1', 'custom_2', 'custom_3', 'custom_4',
                        'custom_5', 'custom_6', 'custom_7')

    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1, criterion='entropy'):
        """Store the hyper-parameters; no validation or fitting happens here.

        Args:
            max_depth: depth at which growth stops; ``None`` means unlimited.
            min_samples_split: nodes with fewer samples become leaves.
            min_samples_leaf: minimum number of samples in each child.
            criterion: name of the split criterion (see the class docstring).
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        self.tree = None                 # fitted tree (nested dicts / leaf label)
        self.feature_importances = None  # per-feature importances, set by fit()

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the integer labels ``y``."""
        counts = np.bincount(y)            # occurrences of each class 0..max(y)
        probabilities = counts / len(y)
        # Classes with zero probability are skipped so that log2(0) never occurs.
        return -np.sum([p * np.log2(p) for p in probabilities if p > 0])

    def gini(self, y):
        """Return the Gini impurity of the integer labels ``y``."""
        counts = np.bincount(y)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def information_gain(self, y, left_indices, right_indices):
        """Return the impurity decrease obtained by splitting ``y`` in two.

        Args:
            y: labels of the parent node.
            left_indices, right_indices: positions (into ``y``) of the samples
                sent to the left / right child.

        Raises:
            ValueError: if ``self.criterion`` is neither 'entropy' nor 'gini'.
        """
        if self.criterion == 'entropy':
            impurity_func = self.entropy
        elif self.criterion == 'gini':
            impurity_func = self.gini
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

        parent_impurity = impurity_func(y)
        left_impurity = impurity_func(y[left_indices])
        right_impurity = impurity_func(y[right_indices])

        n, n_left, n_right = len(y), len(left_indices), len(right_indices)
        # Children impurities are averaged with weights equal to their share of samples.
        weighted_impurity = (n_left / n) * left_impurity + (n_right / n) * right_impurity
        return parent_impurity - weighted_impurity

    def _weighted_split_score(self, y_oh, left_indices, right_indices, weight, divide_by_weight=False):
        """Shared implementation of the ``custom_*`` criteria.

        For every class ``l`` and child ``j`` the squared deviation
        ``(p_jl - p_j * p_l)**2`` is accumulated, where all frequencies are
        relative to ``N = y_oh.sum()``.  Each term is scaled as

        * ``deviation / p_j * b**2``            when ``divide_by_weight`` is False,
        * ``deviation / max(p_j * b**2, eps)``  when ``divide_by_weight`` is True,

        with ``b = weight(p_l)``.  The result is ``N`` times the accumulated sum.

        Args:
            y_oh: one-hot label matrix of the node (rows = samples,
                columns = classes); its total sum is used as the sample count.
            left_indices, right_indices: row positions of the two children.
                Both are expected to be non-empty; ``find_best_split`` never
                passes an empty side.
            weight: callable mapping the class frequency ``p_l`` to ``b``.
            divide_by_weight: selects the second scaling form above.
        """
        N = y_oh.sum()
        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N

        sum_total = 0
        for l in range(y_oh.shape[1]):   # shape[1] = number of classes
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            b = weight(p_l)

            deviation_1 = (p_1l - p_1 * p_l) ** 2
            deviation_2 = (p_2l - p_2 * p_l) ** 2
            if divide_by_weight:
                # epsilon keeps the denominator away from zero
                sum_total += deviation_1 / max(p_1 * b**2, self._EPSILON)
                sum_total += deviation_2 / max(p_2 * b**2, self._EPSILON)
            else:
                sum_total += deviation_1 / p_1 * b**2
                sum_total += deviation_2 / p_2 * b**2

        return N * sum_total

    def custom_1(self, y_oh, left_indices, right_indices):
        """Split score with constant weight ``b = 1``."""
        return self._weighted_split_score(y_oh, left_indices, right_indices,
                                          lambda p_l: 1)

    def custom_2(self, y_oh, left_indices, right_indices):
        """Split score with weight ``b = sqrt(p_l)``."""
        return self._weighted_split_score(y_oh, left_indices, right_indices,
                                          lambda p_l: np.sqrt(p_l))

    def custom_3(self, y_oh, left_indices, right_indices):
        """Split score with ``b = sqrt(p_l * (1 - p_l))``.

        Unlike the other variants, each term is *divided* by ``p_j * b**2``
        (bounded below by epsilon) instead of being multiplied by ``b**2``.
        """
        return self._weighted_split_score(y_oh, left_indices, right_indices,
                                          lambda p_l: np.sqrt(p_l * (1 - p_l)),
                                          divide_by_weight=True)

    def custom_4(self, y_oh, left_indices, right_indices):
        """Split score with weight ``b = p_l``."""
        return self._weighted_split_score(y_oh, left_indices, right_indices,
                                          lambda p_l: p_l)

    def custom_5(self, y_oh, left_indices, right_indices):
        """Split score with weight ``b = p_l ** 2``."""
        return self._weighted_split_score(y_oh, left_indices, right_indices,
                                          lambda p_l: p_l**2)

    def custom_6(self, y_oh, left_indices, right_indices):
        """Split score with weight ``b = log(max(p_l, epsilon))``."""
        return self._weighted_split_score(y_oh, left_indices, right_indices,
                                          lambda p_l: np.log(max(p_l, self._EPSILON)))

    def custom_7(self, y_oh, left_indices, right_indices):
        """Split score with weight ``b = -p_l * log(max(p_l, epsilon))``."""
        return self._weighted_split_score(y_oh, left_indices, right_indices,
                                          lambda p_l: (-p_l) * np.log(max(p_l, self._EPSILON)))

    def most_common_label(self, y):
        """Return the most frequent label in ``y``."""
        return Counter(y).most_common(1)[0][0]

    def _make_scorer(self, y, y_oh):
        """Return ``score(left_indices, right_indices)`` for the configured criterion.

        Raises:
            ValueError: if the criterion is unknown, or a ``custom_*``
                criterion is selected but ``y_oh`` is ``None``.
        """
        name = self.criterion
        if name in self._CUSTOM_CRITERIA:
            if y_oh is None:
                raise ValueError(f"y_oh required for {name} criterion")
            criterion_method = getattr(self, name)
            return lambda left, right: criterion_method(y_oh, left, right)
        if name in ('entropy', 'gini'):
            return lambda left, right: self.information_gain(y, left, right)
        raise ValueError(f"Unknown criterion: {name}")

    def find_best_split(self, X, y, num_features, y_oh=None):
        """Search all features and thresholds for the highest-scoring split.

        Candidate thresholds are the midpoints between consecutive sorted
        feature values.  Samples with ``value <= threshold`` go left, the
        rest go right.  On equal scores the first candidate found wins.

        Args:
            X: 2-D array of the node's samples.
            y: labels of the node's samples.
            num_features: number of leading columns of ``X`` to examine.
            y_oh: one-hot labels of the node; required by ``custom_*`` criteria.

        Returns:
            A dict with keys ``'feature_index'``, ``'threshold'``,
            ``'left_indices'``, ``'right_indices'``, ``'gain'``, or ``None``
            when no candidate satisfies ``min_samples_leaf``.

        Raises:
            ValueError: for an unknown criterion or a missing ``y_oh``.  The
                configuration is checked once, before any candidate is scored.
        """
        score_split = self._make_scorer(y, y_oh)
        # A child must never be empty, whatever min_samples_leaf says: an empty
        # child yields an unusable split and zero denominators in the criteria.
        min_leaf = max(self.min_samples_leaf, 1)

        best_gain = -float('inf')
        best_split = None

        for feature_index in range(num_features):
            column = X[:, feature_index]
            feature_values = np.sort(column)
            # Tied feature values produce repeated midpoints.  np.unique keeps
            # the ascending order, so dropping the repeats cannot change which
            # candidate is found first.
            thresholds = np.unique((feature_values[:-1] + feature_values[1:]) / 2)

            for threshold in thresholds:
                left_indices = np.where(column <= threshold)[0]   # [0]: unpack the 1-tuple
                right_indices = np.where(column > threshold)[0]

                if len(left_indices) < min_leaf or len(right_indices) < min_leaf:
                    continue

                gain = score_split(left_indices, right_indices)

                if gain > best_gain:
                    best_gain = gain
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'left_indices': left_indices,
                        'right_indices': right_indices,
                        'gain': gain
                    }

        return best_split

    def fit(self, X, y, y_oh=None):
        """Grow the tree on ``(X, y)`` and compute normalised feature importances.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of labels ('entropy'/'gini' need non-negative
                integers because they rely on ``np.bincount``).
            y_oh: one-hot encoding of ``y``; required by ``custom_*`` criteria.
        """
        # Accept lists / pandas objects: the tree indexes with X[:, i] and y[idx].
        X = np.asarray(X)
        y = np.asarray(y)

        num_features = X.shape[1]
        self.feature_importances = np.zeros(num_features)
        self.tree = self.grow_tree(X, y, y_oh, depth=0)

        # Normalise so that the importances sum to 1.
        total = self.feature_importances.sum()
        if total > 0:
            self.feature_importances /= total

    def grow_tree(self, X, y, y_oh, depth):
        """Recursively build the subtree for the samples ``(X, y)``.

        Returns a leaf label when the depth limit is reached, the node is
        pure, the node is smaller than ``min_samples_split`` or no valid split
        exists; otherwise returns a dict describing the internal node.
        """
        num_samples, num_features = X.shape
        num_classes = len(set(y))

        if (depth == self.max_depth or
                num_classes == 1 or
                num_samples < self.min_samples_split):
            return self.most_common_label(y)

        best_split = self.find_best_split(X, y, num_features, y_oh)
        if best_split is None:
            return self.most_common_label(y)

        left_indices, right_indices = best_split['left_indices'], best_split['right_indices']

        # The score of the chosen split was already computed during the search.
        self.feature_importances[best_split['feature_index']] += best_split['gain']

        left_subtree = self.grow_tree(X[left_indices], y[left_indices],
                                      y_oh[left_indices] if y_oh is not None else None,
                                      depth + 1)
        right_subtree = self.grow_tree(X[right_indices], y[right_indices],
                                       y_oh[right_indices] if y_oh is not None else None,
                                       depth + 1)

        return {
            'feature_index': best_split['feature_index'],
            'threshold': best_split['threshold'],
            'left': left_subtree,
            'right': right_subtree
        }

    def predict(self, X):
        """Return the predicted label for every row of ``X`` as a NumPy array.

        Raises:
            RuntimeError: if called before :meth:`fit`.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree is not fitted yet; call fit() first")
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return the leaf label."""
        # Internal nodes are dicts; anything else is a leaf label.
        while isinstance(node, dict):
            if x[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node

---
### Generated dataset

---
#### Data generator

Parameters:
- N: Total number of data points
- V: Number of dimensions/features
- k: Number of clusters
- alpha: Controls cluster center spread (centers are in [α-1, 1-α])
- nmin: Minimum points per cluster
- seed: Random seed for reproducibility
- sig_range: Tuple (min, max); each cluster draws one standard deviation per feature uniformly from this interval

Returns:
- Nk: Array of cluster sizes
- R: List of row-index ranges into X, one per cluster
- y: Cluster labels for each point
- X: Generated data (N x V array)
- cen: Cluster centers (k x V array)

In [3]:
def generdat(N, V, k, alpha, nmin, seed=None, sig_range=(0.05, 0.1)):
    """Generate ``k`` axis-aligned Gaussian clusters in ``V`` dimensions.

    Args:
        N: total number of points.
        V: number of features.
        k: number of clusters.
        alpha: centres are drawn uniformly from ``[alpha - 1, 1 - alpha)``
            in every coordinate.
        nmin: minimum number of points per cluster.
        seed: if given, the global NumPy random state is re-seeded with it.
        sig_range: ``(min, max)`` interval from which each cluster draws one
            standard deviation per feature.

    Returns:
        ``(Nk, R, y, X, cen)``: cluster sizes, list of row ranges per cluster,
        integer labels (starting at 0), the ``N x V`` data matrix and the
        ``k x V`` matrix of cluster centres.

    Raises:
        ValueError: if ``N < k * nmin``, ``k < 1`` or ``alpha == 1``.
    """
    if N < k * nmin:
        raise ValueError(f"N must be >= k * nmin. Got N={N}, k={k}, nmin={nmin}")
    if k < 1:
        raise ValueError("k must be at least 1")
    if alpha == 1:
        raise ValueError("alpha cannot be 1")

    if seed is not None:
        np.random.seed(seed)

    # Cluster sizes: every cluster gets nmin points, the surplus is spread
    # by a uniform multinomial draw.
    if k == 1:
        Nk = np.array([N])
    else:
        Nk = np.ones(k, dtype=int) * nmin
        surplus = N - k * nmin
        if surplus > 0:
            Nk = Nk + np.random.multinomial(surplus, np.ones(k) / k)

    # Cluster centres.
    span = 2 * (1 - alpha)
    cen = (alpha - 1) + span * np.random.rand(k, V)

    # Pre-allocated outputs, filled cluster by cluster.
    X = np.zeros((N, V))
    y = np.zeros(N, dtype=int)
    R = []

    sig_min, sig_max = sig_range
    offset = 0

    for label, size in enumerate(Nk):
        stop = offset + size
        rows = slice(offset, stop)

        R.append(range(offset, stop))
        y[rows] = label  # labels start at 0

        # One standard deviation per feature, then the cluster's points.
        sig = sig_min + (sig_max - sig_min) * np.random.rand(V)
        X[rows] = np.random.randn(size, V) * sig + cen[label, :]

        offset = stop

    return Nk, R, y, X, cen


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def compare_metrics_train_test(max_depth, X, y, *, N=None, V=None, k=None, alpha=None, nmin=None):
    """Train every tree variant on a 70/30 split and tabulate test-set metrics.

    Eleven models are compared: the seven ``custom_*`` criteria, 'gini' and
    'entropy' of the local ``DecisionTree``, and scikit-learn's
    ``DecisionTreeClassifier`` with 'gini' and 'entropy'.

    Args:
        max_depth: depth limit applied to every model.
        X: feature matrix.
        y: NumPy array of labels (it is reshaped for one-hot encoding).
        N, V, k, alpha, nmin: data-generation parameters; they are only echoed
            in the printed header line and do not affect the computation.

    Returns:
        ``pandas.DataFrame`` with rows Accuracy / Precision / Recall /
        F1 score / ARI (weighted averages, rounded to 4 decimals) and one
        column per model.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # The custom criteria work on one-hot encoded training labels.
    encoder = OneHotEncoder(sparse_output=False)
    y_oh_train = encoder.fit_transform(y_train.reshape(-1, 1))

    def own_tree(criterion):
        return DecisionTree(max_depth=max_depth, criterion=criterion)

    def sklearn_tree(criterion):
        # A fixed random_state makes sklearn's tie-breaking independent of
        # whatever state the global NumPy RNG happens to be in.
        return DecisionTreeClassifier(max_depth=max_depth, criterion=criterion, random_state=42)

    # (column label, model, whether fit() needs the one-hot labels)
    candidates = [
        ('b = 1', own_tree('custom_1'), True),
        ('gini', own_tree('gini'), False),
        ('gini_sklearn', sklearn_tree('gini'), False),
        ('entropy', own_tree('entropy'), False),
        ('entropy_sklearn', sklearn_tree('entropy'), False),
        ('b = p_l ^ 0.5', own_tree('custom_2'), True),
        ('b = (p_l*(1 - p_l)) ^ 0.5', own_tree('custom_3'), True),
        ('b = p_l', own_tree('custom_4'), True),
        ('b = p_l ^ 2', own_tree('custom_5'), True),
        ('b = log(p_l)', own_tree('custom_6'), True),
        ('b = -p_l * log(p_l)', own_tree('custom_7'), True),
    ]

    column = []
    per_model_scores = []
    for label, model, needs_one_hot in candidates:
        if needs_one_hot:
            model.fit(X_train, y_train, y_oh_train)
        else:
            model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        column.append(label)
        # zero_division=0: a class that is never predicted scores 0 silently
        # instead of emitting a warning for every model.
        per_model_scores.append([
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='weighted', zero_division=0),
            recall_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='weighted', zero_division=0),
            adjusted_rand_score(y_test, y_pred),
        ])

    # Transpose so that metrics are rows and models are columns.
    results = np.round(np.array(per_model_scores).T, 4)
    table = pd.DataFrame(data=results, columns=column,
                         index=['Accuracy', 'Precision', 'Recall', 'F1 score', 'ARI'])

    print(f'\nN, V, k, alpha, nmin, max_depth = {N, V, k, alpha, nmin, max_depth}')

    return table

---
#### Глубина 3, alpha = [0.75, 0.85], V = [10, 15], k = [5, 10, 15]

In [41]:
# Run the comparison at max_depth=3 for every (k, V, alpha) combination and
# collect one metrics table per run, in loop order.
tables = []

for cluster in [5,10,15]:
    for feature in [10,15]:
        for squeeze in [0.75, 0.85]:
            N, V, k, alpha, nmin = 2000, feature, cluster, squeeze, 50
            Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)
            
            table = compare_metrics_train_test(max_depth=3, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)
            tables.append(table)


# tables[0], tables[1], ... now hold the individual tables



N, V, k, alpha, nmin, max_depth = (2000, 10, 5, 0.75, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 10, 5, 0.85, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 15, 5, 0.75, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 15, 5, 0.85, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 10, 10, 0.75, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 10, 10, 0.85, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.75, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.85, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 10, 15, 0.75, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 10, 15, 0.85, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.75, 50, 3)

N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.85, 50, 3)


In [46]:
# max_depth=3, V=10, k=5, alpha=0.75
tables[0]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.97,0.97,0.97,0.9817,0.9817,0.97,0.97,0.76,0.7467,0.9683,0.97
Precision,0.9706,0.9706,0.9708,0.9821,0.9821,0.9706,0.9706,0.8661,0.8856,0.969,0.9706
Recall,0.97,0.97,0.97,0.9817,0.9817,0.97,0.97,0.76,0.7467,0.9683,0.97
F1 score,0.97,0.97,0.9701,0.9817,0.9817,0.97,0.97,0.7033,0.7006,0.9683,0.97
ARI,0.9285,0.9285,0.9289,0.9557,0.9557,0.9285,0.9285,0.6783,0.6203,0.9246,0.9285


In [47]:
# max_depth=3, V=10, k=5, alpha=0.85
tables[1]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8733,0.8733,0.8733,0.88,0.88,0.8533,0.8267,0.8533,0.8533,0.7033,0.8767
Precision,0.8761,0.8761,0.8761,0.8805,0.8805,0.8596,0.8511,0.8596,0.8596,0.7502,0.8771
Recall,0.8733,0.8733,0.8733,0.88,0.88,0.8533,0.8267,0.8533,0.8533,0.7033,0.8767
F1 score,0.8735,0.8735,0.8735,0.8798,0.8798,0.8547,0.8281,0.8547,0.8547,0.6455,0.8764
ARI,0.7098,0.7098,0.7098,0.7248,0.7248,0.6667,0.6147,0.6667,0.6667,0.5411,0.7175


In [48]:
# max_depth=3, V=15, k=5, alpha=0.75
tables[2]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9967,0.9967,0.9967,0.9833,0.9833,0.9967,0.9883,0.9967,0.9967,0.9883,0.9967
Precision,0.9967,0.9967,0.9967,0.9835,0.9835,0.9967,0.9885,0.9967,0.9967,0.9885,0.9967
Recall,0.9967,0.9967,0.9967,0.9833,0.9833,0.9967,0.9883,0.9967,0.9967,0.9883,0.9967
F1 score,0.9967,0.9967,0.9967,0.9834,0.9834,0.9967,0.9884,0.9967,0.9967,0.9884,0.9967
ARI,0.9923,0.9923,0.9923,0.9593,0.9593,0.9923,0.9716,0.9923,0.9923,0.9716,0.9922


In [49]:
# max_depth=3, V=15, k=5, alpha=0.85
tables[3]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9183,0.9183,0.9183,0.9217,0.9217,0.8,0.92,0.8,0.8,0.9,0.81
Precision,0.9199,0.9199,0.9199,0.9251,0.9251,0.8197,0.9234,0.8197,0.8197,0.9017,0.8315
Recall,0.9183,0.9183,0.9183,0.9217,0.9217,0.8,0.92,0.8,0.8,0.9,0.81
F1 score,0.9186,0.9186,0.9186,0.9223,0.9223,0.7895,0.9207,0.7895,0.7895,0.8998,0.7992
ARI,0.8071,0.8071,0.8071,0.8141,0.8141,0.6318,0.8107,0.6318,0.6318,0.7679,0.652


In [50]:
# max_depth=3, V=10, k=10, alpha=0.75
tables[4]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.635,0.635,0.635,0.6117,0.6117,0.6333,0.6217,0.63,0.63,0.5517,0.63
Precision,0.6126,0.6126,0.6126,0.5481,0.5481,0.5253,0.5968,0.5234,0.5234,0.4915,0.6057
Recall,0.635,0.635,0.635,0.6117,0.6117,0.6333,0.6217,0.63,0.63,0.5517,0.63
F1 score,0.5657,0.5657,0.5657,0.5202,0.5202,0.5556,0.55,0.5529,0.5529,0.4517,0.5593
ARI,0.5753,0.5753,0.5753,0.5891,0.5891,0.5642,0.5668,0.5603,0.5603,0.5418,0.5645


In [51]:
# max_depth=3, V=10, k=10, alpha=0.85
tables[5]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.52,0.52,0.52,0.515,0.515,0.52,0.4467,0.5217,0.5183,0.4083,0.52
Precision,0.4512,0.4512,0.4512,0.41,0.41,0.4512,0.3611,0.4523,0.4114,0.333,0.4512
Recall,0.52,0.52,0.52,0.515,0.515,0.52,0.4467,0.5217,0.5183,0.4083,0.52
F1 score,0.4513,0.4513,0.4513,0.4394,0.4394,0.4513,0.3582,0.4529,0.442,0.2887,0.4513
ARI,0.3592,0.3592,0.3592,0.3692,0.3692,0.3592,0.3482,0.3612,0.3475,0.356,0.3592


In [52]:
# max_depth=3, V=15, k=10, alpha=0.75
tables[6]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.4417,0.4417,0.4417,0.7483,0.7483,0.4417,0.6467,0.4417,0.485,0.5467,0.4383
Precision,0.427,0.427,0.3718,0.6096,0.6096,0.427,0.5196,0.427,0.4035,0.3654,0.3094
Recall,0.4417,0.4417,0.4417,0.7483,0.7483,0.4417,0.6467,0.4417,0.485,0.5467,0.4383
F1 score,0.3497,0.3497,0.3509,0.667,0.667,0.3497,0.556,0.3497,0.3527,0.4217,0.3421
ARI,0.3179,0.3179,0.3185,0.6569,0.6569,0.3179,0.6444,0.3179,0.3828,0.6151,0.3174


In [53]:
# max_depth=3, V=15, k=10, alpha=0.85
tables[7]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.5433,0.5433,0.5433,0.59,0.59,0.5683,0.6,0.5683,0.42,0.5233,0.5683
Precision,0.4263,0.4263,0.4263,0.5117,0.5117,0.4604,0.5174,0.4604,0.4653,0.3908,0.4604
Recall,0.5433,0.5433,0.5433,0.59,0.59,0.5683,0.6,0.5683,0.42,0.5233,0.5683
F1 score,0.4616,0.4616,0.4616,0.5343,0.5343,0.4933,0.5316,0.4933,0.3402,0.426,0.4933
ARI,0.4126,0.4126,0.4126,0.3892,0.3892,0.385,0.4729,0.385,0.239,0.4391,0.385


In [54]:
# max_depth=3, V=10, k=15, alpha=0.75
tables[8]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.4333,0.4333,0.4267,0.4367,0.4367,0.3833,0.4783,0.3467,0.3467,0.3917,0.385
Precision,0.324,0.324,0.3225,0.2708,0.2708,0.3054,0.3256,0.2863,0.2863,0.2105,0.3141
Recall,0.4333,0.4333,0.4267,0.4367,0.4367,0.3833,0.4783,0.3467,0.3467,0.3917,0.385
F1 score,0.3331,0.3331,0.3302,0.3164,0.3164,0.2895,0.3666,0.2557,0.2557,0.2556,0.2908
ARI,0.2809,0.2809,0.2809,0.4314,0.4314,0.2504,0.386,0.2526,0.2526,0.3638,0.2515


In [55]:
# max_depth=3, V=10, k=15, alpha=0.85
tables[9]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.4,0.4,0.4,0.3417,0.3417,0.3233,0.3433,0.3233,0.3233,0.3233,0.3233
Precision,0.2458,0.2458,0.2458,0.1991,0.1991,0.222,0.2316,0.222,0.2193,0.1643,0.222
Recall,0.4,0.4,0.4,0.3417,0.3417,0.3233,0.3433,0.3233,0.3233,0.3233,0.3233
F1 score,0.2893,0.2893,0.2893,0.2365,0.2365,0.2246,0.2373,0.2246,0.2242,0.2069,0.2246
ARI,0.2337,0.2337,0.2337,0.2739,0.2739,0.1854,0.2319,0.1854,0.1889,0.2581,0.1854


In [56]:
# Display the result table stored at index 10 of `tables`.
# NOTE(review): `tables` is filled by an earlier cell; confirm which (V, k, alpha) run this index maps to.
tables[10]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.4567,0.4567,0.4633,0.46,0.46,0.4617,0.3933,0.4617,0.4467,0.3317,0.4617
Precision,0.3344,0.3344,0.3391,0.2948,0.2948,0.3165,0.3134,0.3165,0.3242,0.2375,0.3173
Recall,0.4567,0.4567,0.4633,0.46,0.46,0.4617,0.3933,0.4617,0.4467,0.3317,0.4617
F1 score,0.3577,0.3577,0.3646,0.3372,0.3372,0.3515,0.3231,0.3515,0.3436,0.2162,0.3512
ARI,0.343,0.343,0.343,0.3831,0.3831,0.3476,0.2084,0.3476,0.3272,0.3164,0.348


In [57]:
# Display the result table stored at index 11 of `tables`.
# NOTE(review): `tables` is filled by an earlier cell; confirm which (V, k, alpha) run this index maps to.
tables[11]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.3867,0.3867,0.3867,0.3667,0.3667,0.3833,0.2967,0.3717,0.3683,0.3817,0.3833
Precision,0.2412,0.2412,0.2412,0.2107,0.2107,0.2418,0.1562,0.2465,0.2449,0.251,0.2418
Recall,0.3867,0.3867,0.3867,0.3667,0.3667,0.3833,0.2967,0.3717,0.3683,0.3817,0.3833
F1 score,0.2831,0.2831,0.2831,0.2607,0.2607,0.2823,0.1832,0.2795,0.2767,0.2846,0.2823
ARI,0.2373,0.2373,0.2373,0.2353,0.2353,0.23,0.1788,0.2033,0.2006,0.2091,0.23


---
#### Глубина 4, a = [0.75, 0.85], V = [10, 15], k = [5, 10, 15]

In [None]:
# Generated data with V = 10, k = 5, a = 0.75; metrics table for max_depth = 4.
N = 2000
V = 10
k = 5
alpha = 0.75
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 10, 5, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.985,0.985,0.985,0.985,0.9883,0.9783,0.985,0.9733,0.9517,0.985,0.985
Precision,0.9854,0.9854,0.9854,0.985,0.9884,0.9795,0.9854,0.9743,0.9589,0.9855,0.9855
Recall,0.985,0.985,0.985,0.985,0.9883,0.9783,0.985,0.9733,0.9517,0.985,0.985
F1 score,0.985,0.985,0.985,0.985,0.9883,0.9784,0.985,0.9734,0.9521,0.9851,0.9851
ARI,0.9628,0.9628,0.9628,0.9627,0.9704,0.9463,0.9628,0.9355,0.8845,0.9643,0.9643


In [14]:
# Generated data with V = 10, k = 5, a = 0.85; metrics table for max_depth = 4.
N = 2000
V = 10
k = 5
alpha = 0.85
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 10, 5, 0.85, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9017,0.9017,0.9017,0.89,0.895,0.9,0.8733,0.9,0.9,0.7483,0.8967
Precision,0.9027,0.9027,0.9027,0.8936,0.8993,0.9029,0.886,0.9029,0.9029,0.8007,0.8976
Recall,0.9017,0.9017,0.9017,0.89,0.895,0.9,0.8733,0.9,0.9,0.7483,0.8967
F1 score,0.9019,0.9019,0.9019,0.8905,0.8956,0.9001,0.8732,0.9001,0.9001,0.7283,0.8964
ARI,0.7722,0.7722,0.7722,0.7444,0.7559,0.7673,0.7041,0.7673,0.7673,0.6008,0.7591


In [15]:
# Generated data with V = 15, k = 5, a = 0.75; metrics table for max_depth = 4.
N = 2000
V = 15
k = 5
alpha = 0.75
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 5, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9967,0.9967,0.9933,0.9867,0.9867,0.9967,0.9917,0.9967,0.9967,0.9917,0.9967
Precision,0.9967,0.9967,0.9934,0.9868,0.9868,0.9967,0.9918,0.9967,0.9967,0.9918,0.9967
Recall,0.9967,0.9967,0.9933,0.9867,0.9867,0.9967,0.9917,0.9967,0.9967,0.9917,0.9967
F1 score,0.9967,0.9967,0.9933,0.9867,0.9867,0.9967,0.9917,0.9967,0.9967,0.9917,0.9967
ARI,0.9923,0.9923,0.9836,0.9677,0.9677,0.9923,0.9801,0.9923,0.9923,0.9801,0.9922


In [16]:
# Generated data with V = 15, k = 5, a = 0.85; metrics table for max_depth = 4.
N = 2000
V = 15
k = 5
alpha = 0.85
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 5, 0.85, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9317,0.9317,0.93,0.9233,0.9267,0.9217,0.9283,0.9217,0.9217,0.915,0.9217
Precision,0.9325,0.9325,0.931,0.9265,0.9298,0.9257,0.9303,0.9257,0.9257,0.9168,0.9243
Recall,0.9317,0.9317,0.93,0.9233,0.9267,0.9217,0.9283,0.9217,0.9217,0.915,0.9217
F1 score,0.9317,0.9317,0.9301,0.9241,0.9275,0.9219,0.9288,0.9219,0.9219,0.9154,0.9217
ARI,0.8394,0.8394,0.8353,0.8219,0.8299,0.8141,0.8326,0.8141,0.8141,0.8026,0.8152


In [21]:
# Generated data with V = 10, k = 10, a = 0.75; metrics table for max_depth = 4.
N = 2000
V = 10
k = 10
alpha = 0.75
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 10, 10, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.825,0.825,0.825,0.9,0.9017,0.8167,0.8283,0.825,0.8283,0.8233,0.8267
Precision,0.8653,0.8653,0.9016,0.9023,0.9041,0.8055,0.7968,0.9028,0.769,0.7914,0.8666
Recall,0.825,0.825,0.825,0.9,0.9017,0.8167,0.8283,0.825,0.8283,0.8233,0.8267
F1 score,0.8013,0.8013,0.8005,0.9002,0.9018,0.7935,0.8018,0.8011,0.7892,0.7969,0.803
ARI,0.7586,0.7586,0.7573,0.794,0.7973,0.7501,0.7577,0.7591,0.7288,0.749,0.7624


In [22]:
# Generated data with V = 10, k = 10, a = 0.85; metrics table for max_depth = 4.
N = 2000
V = 10
k = 10
alpha = 0.85
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 10, 10, 0.85, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6783,0.6783,0.6783,0.6967,0.6967,0.6683,0.6683,0.655,0.6117,0.5633,0.6767
Precision,0.683,0.683,0.683,0.697,0.697,0.6907,0.6798,0.6661,0.6169,0.5968,0.6807
Recall,0.6783,0.6783,0.6783,0.6967,0.6967,0.6683,0.6683,0.655,0.6117,0.5633,0.6767
F1 score,0.6588,0.6588,0.6588,0.6915,0.6915,0.6503,0.6476,0.6359,0.6013,0.5094,0.6568
ARI,0.4797,0.4797,0.4797,0.4619,0.4619,0.4595,0.4602,0.4323,0.3896,0.4516,0.4785


In [23]:
# Generated data with V = 15, k = 10, a = 0.75; metrics table for max_depth = 4.
N = 2000
V = 15
k = 10
alpha = 0.75
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6517,0.6517,0.6517,0.95,0.95,0.6517,0.9517,0.6517,0.7567,0.87,0.6467
Precision,0.7891,0.7891,0.7345,0.9516,0.9516,0.7891,0.955,0.7891,0.7434,0.9288,0.6687
Recall,0.6517,0.6517,0.6517,0.95,0.95,0.6517,0.9517,0.6517,0.7567,0.87,0.6467
F1 score,0.612,0.612,0.6134,0.9502,0.9502,0.612,0.9517,0.612,0.7213,0.8446,0.6027
ARI,0.4756,0.4756,0.4768,0.8934,0.8934,0.4756,0.8936,0.4756,0.6084,0.8459,0.475


In [24]:
# Generated data with V = 15, k = 10, a = 0.85; metrics table for max_depth = 4.
N = 2000
V = 15
k = 10
alpha = 0.85
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.85, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7633,0.7633,0.7667,0.7417,0.7417,0.7067,0.8017,0.7067,0.58,0.68,0.7
Precision,0.779,0.779,0.7813,0.7649,0.7639,0.7279,0.8158,0.7279,0.614,0.649,0.7131
Recall,0.7633,0.7633,0.7667,0.7417,0.7417,0.7067,0.8017,0.7067,0.58,0.68,0.7
F1 score,0.765,0.765,0.7683,0.7455,0.7454,0.7005,0.8046,0.7005,0.5341,0.6321,0.6917
ARI,0.5475,0.5475,0.5537,0.5147,0.515,0.5224,0.608,0.5224,0.3888,0.5375,0.5113


In [25]:
# Generated data with V = 10, k = 15, a = 0.75; metrics table for max_depth = 4.
N = 2000
V = 10
k = 15
alpha = 0.75
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 10, 15, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6133,0.6133,0.6133,0.7333,0.7333,0.5567,0.695,0.5867,0.5867,0.5167,0.56
Precision,0.5449,0.5459,0.5449,0.6725,0.6725,0.4911,0.7086,0.595,0.595,0.4373,0.4992
Recall,0.6133,0.6133,0.6133,0.7333,0.7333,0.5567,0.695,0.5867,0.5867,0.5167,0.56
F1 score,0.55,0.55,0.55,0.6869,0.6869,0.4824,0.6576,0.5304,0.5304,0.4075,0.4846
ARI,0.4821,0.4822,0.4821,0.6353,0.6353,0.4273,0.5653,0.4443,0.4443,0.4585,0.4296


In [26]:
# Generated data with V = 10, k = 15, a = 0.85; metrics table for max_depth = 4.
N = 2000
V = 10
k = 15
alpha = 0.85
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 10, 15, 0.85, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.53,0.53,0.53,0.5717,0.5717,0.4783,0.44,0.48,0.4883,0.4433,0.4783
Precision,0.545,0.545,0.545,0.5344,0.5344,0.4057,0.3751,0.4162,0.4084,0.3214,0.4057
Recall,0.53,0.53,0.53,0.5717,0.5717,0.4783,0.44,0.48,0.4883,0.4433,0.4783
F1 score,0.5045,0.5045,0.5045,0.5322,0.5322,0.4001,0.3567,0.403,0.4024,0.3577,0.4001
ARI,0.3212,0.3212,0.3212,0.3821,0.3821,0.2735,0.2831,0.2738,0.3009,0.3183,0.2735


In [27]:
# Generated data with V = 15, k = 15, a = 0.75; metrics table for max_depth = 4.
N = 2000
V = 15
k = 15
alpha = 0.75
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.675,0.6733,0.6767,0.7,0.7,0.685,0.5233,0.685,0.675,0.5633,0.6833
Precision,0.6493,0.6461,0.6599,0.6662,0.6662,0.687,0.4845,0.687,0.657,0.4596,0.716
Recall,0.675,0.6733,0.6767,0.7,0.7,0.685,0.5233,0.685,0.675,0.5633,0.6833
F1 score,0.6299,0.6258,0.626,0.6631,0.6631,0.6499,0.4652,0.6499,0.6306,0.4794,0.646
ARI,0.5381,0.5366,0.5419,0.5811,0.5811,0.5449,0.3745,0.5449,0.469,0.4735,0.5381


In [28]:
# Generated data with V = 15, k = 15, a = 0.85; metrics table for max_depth = 4.
N = 2000
V = 15
k = 15
alpha = 0.85
nmin = 50

# Fixed seed so the generated sample is the same on every run.
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4,
    X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.85, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.5317,0.5317,0.5317,0.5117,0.5117,0.525,0.4233,0.5233,0.5267,0.505,0.525
Precision,0.5182,0.5182,0.5182,0.5001,0.5001,0.5105,0.3646,0.5094,0.5217,0.4595,0.5637
Recall,0.5317,0.5317,0.5317,0.5117,0.5117,0.525,0.4233,0.5233,0.5267,0.505,0.525
F1 score,0.4975,0.4975,0.4975,0.4848,0.4848,0.4949,0.3577,0.4911,0.4947,0.4567,0.5052
ARI,0.3212,0.3212,0.3212,0.3045,0.3045,0.2895,0.2306,0.2873,0.2742,0.2842,0.2841


---
#### Глубина 6, a = [0.75, 0.85], V = [10, 15], k = [5, 10, 15]

In [59]:
# Depth-6 sweep over every (cluster, feature, squeeze) combination.
# The grid is enumerated with `cluster` varying slowest and `squeeze` fastest,
# which fixes the order in which results are appended to `tables`.
param_grid = [
    (cluster, feature, squeeze)
    for cluster in [5, 10, 15]
    for feature in [10, 15]
    for squeeze in [0.75, 0.85]
]

tables = []
for cluster, feature, squeeze in param_grid:
    N, V, k, alpha, nmin = 2000, feature, cluster, squeeze, 50
    # Fixed seed: each configuration is generated identically on every run.
    Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

    table = compare_metrics_train_test(
        max_depth=6,
        X=X, y=y,
        N=N, V=V, k=k, alpha=alpha, nmin=nmin,
    )
    tables.append(table)


# tables[0], tables[1], ... now hold the individual result tables, in grid order


N, V, k, alpha, nmin, max_depth = (2000, 10, 5, 0.75, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 10, 5, 0.85, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 15, 5, 0.75, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 15, 5, 0.85, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 10, 10, 0.75, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 10, 10, 0.85, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.75, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.85, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 10, 15, 0.75, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 10, 15, 0.85, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.75, 50, 6)

N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.85, 50, 6)


In [60]:
# Depth-6 results for the 1st sweep configuration: V = 10, k = 5, alpha = 0.75
tables[0]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9883,0.9883,0.99,0.985,0.9917,0.985,0.9883,0.9867,0.9883,0.99,0.9883
Precision,0.9885,0.9885,0.9902,0.985,0.9917,0.9852,0.9885,0.9871,0.9884,0.9901,0.9884
Recall,0.9883,0.9883,0.99,0.985,0.9917,0.985,0.9883,0.9867,0.9883,0.99,0.9883
F1 score,0.9883,0.9883,0.99,0.985,0.9917,0.985,0.9883,0.9867,0.9884,0.99,0.9883
ARI,0.9703,0.9703,0.9744,0.9627,0.9789,0.9638,0.9703,0.9657,0.9723,0.9755,0.9713


In [61]:
# Depth-6 results for the 2nd sweep configuration: V = 10, k = 5, alpha = 0.85
tables[1]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.935,0.935,0.935,0.915,0.915,0.9183,0.925,0.9217,0.9217,0.91,0.9217
Precision,0.9355,0.9355,0.9367,0.9156,0.916,0.9196,0.9267,0.9226,0.9228,0.9119,0.9225
Recall,0.935,0.935,0.935,0.915,0.915,0.9183,0.925,0.9217,0.9217,0.91,0.9217
F1 score,0.9349,0.9349,0.9351,0.9151,0.9151,0.9184,0.9251,0.9217,0.9218,0.9099,0.9216
ARI,0.8439,0.8439,0.8414,0.7976,0.7976,0.8059,0.8218,0.8144,0.8155,0.785,0.8135


In [62]:
# Depth-6 results for the 3rd sweep configuration: V = 15, k = 5, alpha = 0.75
tables[2]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9933,0.995,0.995,0.9867,0.9867,0.9967,0.9917,0.9967,0.9967,0.9917,0.995
Precision,0.9934,0.995,0.995,0.9868,0.9868,0.9967,0.9918,0.9967,0.9967,0.9918,0.995
Recall,0.9933,0.995,0.995,0.9867,0.9867,0.9967,0.9917,0.9967,0.9967,0.9917,0.995
F1 score,0.9933,0.995,0.995,0.9867,0.9867,0.9967,0.9917,0.9967,0.9967,0.9917,0.995
ARI,0.9847,0.9885,0.988,0.9677,0.9677,0.9923,0.9801,0.9923,0.9923,0.9801,0.9884


In [63]:
# Depth-6 results for the 4th sweep configuration: V = 15, k = 5, alpha = 0.85
tables[3]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.93,0.9317,0.93,0.9483,0.9417,0.93,0.94,0.93,0.93,0.9317,0.93
Precision,0.9307,0.9323,0.9305,0.9489,0.9432,0.9324,0.9424,0.9324,0.9324,0.9334,0.9323
Recall,0.93,0.9317,0.93,0.9483,0.9417,0.93,0.94,0.93,0.93,0.9317,0.93
F1 score,0.93,0.9317,0.9297,0.9484,0.942,0.9301,0.9407,0.9301,0.9301,0.9321,0.9305
ARI,0.8382,0.8423,0.8393,0.8766,0.8628,0.8331,0.8603,0.8331,0.8331,0.8398,0.8355


In [134]:
# Depth-6 results for the 5th sweep configuration: V = 10, k = 10, alpha = 0.75
tables[4]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9333,0.9333,0.9267,0.9317,0.9317,0.9283,0.9233,0.935,0.9367,0.9217,0.935
Precision,0.9354,0.9354,0.9295,0.9334,0.9339,0.9328,0.9267,0.9395,0.9402,0.9272,0.937
Recall,0.9333,0.9333,0.9267,0.9317,0.9317,0.9283,0.9233,0.935,0.9367,0.9217,0.935
F1 score,0.9336,0.9336,0.9271,0.9316,0.932,0.929,0.9235,0.9356,0.9371,0.9224,0.9352
ARI,0.8581,0.8581,0.8464,0.8535,0.8544,0.8476,0.8371,0.8594,0.8644,0.8335,0.8613


In [65]:
# Depth-6 results for the 6th sweep configuration: V = 10, k = 10, alpha = 0.85
tables[5]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7483,0.7483,0.75,0.7883,0.7883,0.75,0.7683,0.7717,0.7583,0.7633,0.7517
Precision,0.7693,0.7693,0.7719,0.7983,0.8019,0.7684,0.7887,0.7916,0.7759,0.7707,0.773
Recall,0.7483,0.7483,0.75,0.7883,0.7883,0.75,0.7683,0.7717,0.7583,0.7633,0.7517
F1 score,0.7498,0.7498,0.7515,0.7881,0.7887,0.7508,0.77,0.7715,0.7599,0.7631,0.755
ARI,0.5177,0.5177,0.5203,0.5868,0.5858,0.5237,0.5498,0.5559,0.5345,0.5518,0.5201


In [66]:
# Depth-6 results for the 7th sweep configuration: V = 15, k = 10, alpha = 0.75
tables[6]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.96,0.96,0.9583,0.97,0.96,0.96,0.9767,0.9433,0.955,0.97,0.9567
Precision,0.9631,0.9631,0.9611,0.9712,0.9608,0.9631,0.977,0.9476,0.9554,0.9702,0.9584
Recall,0.96,0.96,0.9583,0.97,0.96,0.96,0.9767,0.9433,0.955,0.97,0.9567
F1 score,0.9604,0.9604,0.9586,0.9701,0.96,0.9604,0.9768,0.9437,0.955,0.97,0.9568
ARI,0.9114,0.9114,0.9074,0.9359,0.9136,0.9114,0.9502,0.8767,0.9061,0.9324,0.9065


In [67]:
# Depth-6 results for the 8th sweep configuration: V = 15, k = 10, alpha = 0.85
tables[7]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8117,0.8117,0.8083,0.8267,0.8167,0.8083,0.8383,0.7917,0.775,0.8217,0.8017
Precision,0.8241,0.8241,0.8202,0.8346,0.8245,0.8113,0.8492,0.7935,0.7999,0.8332,0.813
Recall,0.8117,0.8117,0.8083,0.8267,0.8167,0.8083,0.8383,0.7917,0.775,0.8217,0.8017
F1 score,0.8134,0.8134,0.8094,0.8272,0.8169,0.8086,0.8399,0.7913,0.7781,0.8228,0.8035
ARI,0.6253,0.6253,0.619,0.651,0.6339,0.6235,0.6739,0.5938,0.5651,0.6443,0.6101


In [68]:
# Depth-6 results for the 9th sweep configuration: V = 10, k = 15, alpha = 0.75
tables[8]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8717,0.8733,0.8717,0.915,0.9017,0.8467,0.8983,0.855,0.8467,0.825,0.855
Precision,0.8858,0.888,0.8844,0.9179,0.9056,0.8578,0.9017,0.8632,0.8538,0.8532,0.8682
Recall,0.8717,0.8733,0.8717,0.915,0.9017,0.8467,0.8983,0.855,0.8467,0.825,0.855
F1 score,0.8709,0.8725,0.872,0.9156,0.9014,0.8438,0.8968,0.8492,0.8412,0.8116,0.8518
ARI,0.7486,0.7512,0.7486,0.8243,0.799,0.7033,0.7939,0.7342,0.7189,0.7091,0.72


In [70]:
# Depth-6 results for the 10th sweep configuration: V = 10, k = 15, alpha = 0.85
tables[9]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6817,0.6817,0.685,0.69,0.6867,0.6367,0.635,0.625,0.6383,0.6333,0.6367
Precision,0.7081,0.7088,0.7124,0.6987,0.6924,0.6481,0.6399,0.6439,0.6543,0.605,0.6504
Recall,0.6817,0.6817,0.685,0.69,0.6867,0.6367,0.635,0.625,0.6383,0.6333,0.6367
F1 score,0.686,0.6862,0.6899,0.6858,0.6813,0.634,0.6105,0.6243,0.6387,0.605,0.6344
ARI,0.4472,0.4466,0.4517,0.4631,0.457,0.3984,0.4128,0.3779,0.3887,0.4371,0.3955


In [71]:
# Depth-6 results for the 11th sweep configuration: V = 15, k = 15, alpha = 0.75
tables[10]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.905,0.905,0.9133,0.8933,0.8867,0.8967,0.7767,0.895,0.8533,0.7733,0.905
Precision,0.9121,0.9104,0.9177,0.8976,0.8957,0.9043,0.7568,0.903,0.8961,0.7884,0.9119
Recall,0.905,0.905,0.9133,0.8933,0.8867,0.8967,0.7767,0.895,0.8533,0.7733,0.905
F1 score,0.9058,0.9057,0.9136,0.8929,0.8874,0.8972,0.7516,0.8955,0.8584,0.7417,0.9059
ARI,0.8027,0.8045,0.8219,0.7834,0.7672,0.7922,0.6635,0.7884,0.6893,0.6912,0.803


In [72]:
# Depth-6 results for the 12th sweep configuration: V = 15, k = 15, alpha = 0.85
tables[11]

Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.68,0.68,0.6733,0.67,0.665,0.6767,0.62,0.645,0.6533,0.6067,0.6883
Precision,0.6998,0.6998,0.6954,0.6897,0.6835,0.6959,0.644,0.6679,0.6858,0.6483,0.6973
Recall,0.68,0.68,0.6733,0.67,0.665,0.6767,0.62,0.645,0.6533,0.6067,0.6883
F1 score,0.6825,0.6825,0.6758,0.6737,0.6688,0.6775,0.6039,0.6476,0.6581,0.5916,0.6893
ARI,0.4339,0.4339,0.4243,0.4271,0.4201,0.4339,0.388,0.387,0.3906,0.3945,0.4538


---
### Sklearn datasets

#### Iris

In [73]:
# Iris data: metrics table for trees limited to depth 3.
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data      # feature matrix
y = iris.target    # class labels

compare_metrics_train_test(max_depth=3, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 3)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9333,0.9333,1.0,0.9333,0.9778,0.9333,0.9333,1.0,1.0,0.9333,0.9333
Precision,0.9458,0.9458,1.0,0.9458,0.9794,0.9458,0.9458,1.0,1.0,0.9458,0.9458
Recall,0.9333,0.9333,1.0,0.9333,0.9778,0.9333,0.9333,1.0,1.0,0.9333,0.9333
F1 score,0.9324,0.9324,1.0,0.9324,0.9777,0.9324,0.9324,1.0,1.0,0.9324,0.9324
ARI,0.8436,0.8436,1.0,0.8436,0.943,0.8436,0.8436,1.0,1.0,0.8436,0.8436


In [74]:
# Iris data: metrics table for trees limited to depth 4.
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data      # feature matrix
y = iris.target    # class labels

compare_metrics_train_test(max_depth=4, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9556,0.9556,1.0,0.9556,1.0,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
Precision,0.9615,0.9615,1.0,0.9615,1.0,0.9615,0.9615,0.9615,0.9615,0.9615,0.9615
Recall,0.9556,0.9556,1.0,0.9556,1.0,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
F1 score,0.9553,0.9553,1.0,0.9553,1.0,0.9553,0.9553,0.9553,0.9553,0.9553,0.9553
ARI,0.8907,0.8907,1.0,0.8907,1.0,0.8907,0.8907,0.8907,0.8907,0.8907,0.8907


In [75]:
# Iris data: metrics table for trees limited to depth 6.
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data      # feature matrix
y = iris.target    # class labels

compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9556,0.9556,1.0,0.9111,0.9778,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
Precision,0.9615,0.9615,1.0,0.9111,0.9794,0.9615,0.9615,0.9615,0.9615,0.9615,0.9615
Recall,0.9556,0.9556,1.0,0.9111,0.9778,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
F1 score,0.9553,0.9553,1.0,0.9111,0.9777,0.9553,0.9553,0.9553,0.9553,0.9553,0.9553
ARI,0.8907,0.8907,1.0,0.7991,0.943,0.8907,0.8907,0.8907,0.8907,0.8907,0.8907


#### Wine

In [76]:
# Wine data: metrics table for trees limited to depth 3.
from sklearn.datasets import load_wine

wine = load_wine()
X = wine.data      # feature matrix
y = wine.target    # class labels

compare_metrics_train_test(max_depth=3, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 3)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9444,0.9444,0.963,0.8333,0.8519,0.9444,0.9444,0.9074,0.9074,0.9074,0.9444
Precision,0.9514,0.9514,0.9662,0.8348,0.8558,0.9514,0.9514,0.9099,0.9099,0.9055,0.9514
Recall,0.9444,0.9444,0.963,0.8333,0.8519,0.9444,0.9444,0.9074,0.9074,0.9074,0.9444
F1 score,0.9449,0.9449,0.9632,0.8328,0.8518,0.9449,0.9449,0.9081,0.9081,0.9059,0.9449
ARI,0.8247,0.8247,0.8838,0.5462,0.5858,0.8247,0.8247,0.7198,0.7198,0.7605,0.8247


In [77]:
# Wine data: metrics table for trees limited to depth 4.
from sklearn.datasets import load_wine

wine = load_wine()
X = wine.data      # feature matrix
y = wine.target    # class labels

compare_metrics_train_test(max_depth=4, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9074,0.9074,0.9074,0.9444
Precision,0.9466,0.9466,0.9662,0.8552,0.8552,0.9466,0.9466,0.9099,0.9099,0.9055,0.9466
Recall,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9074,0.9074,0.9074,0.9444
F1 score,0.9442,0.9442,0.9632,0.8507,0.8507,0.9442,0.9442,0.9081,0.9081,0.9059,0.9442
ARI,0.8335,0.8335,0.8838,0.5939,0.5939,0.8335,0.8335,0.7198,0.7198,0.7605,0.8335


In [78]:
# Wine data: metrics table for trees limited to depth 6.
from sklearn.datasets import load_wine

wine = load_wine()
X = wine.data      # feature matrix
y = wine.target    # class labels

compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9074,0.9074,0.9074,0.9444
Precision,0.9466,0.9466,0.9638,0.8552,0.8552,0.9466,0.9466,0.9099,0.9099,0.9055,0.9466
Recall,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9074,0.9074,0.9074,0.9444
F1 score,0.9442,0.9442,0.9628,0.8507,0.8507,0.9442,0.9442,0.9081,0.9081,0.9059,0.9442
ARI,0.8335,0.8335,0.8898,0.5939,0.5939,0.8335,0.8335,0.7198,0.7198,0.7605,0.8335


#### Obesity

In [79]:
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('../DATA/Obesity.csv')

# Encode the target into integer class ids and replace the original text column.
le = LabelEncoder()
df['NObeyesdad_LabelEncoded'] = le.fit_transform(df['NObeyesdad'])
df = df.drop(columns='NObeyesdad')

# 1. Collect every remaining categorical (object-dtype) column.
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# 2. Integer-encode each of them with its own, freshly fitted encoder.
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Labels are the encoded target; features are all other columns.
y = df['NObeyesdad_LabelEncoded'].to_numpy()
X = df.drop(columns='NObeyesdad_LabelEncoded').to_numpy()

compare_metrics_train_test(max_depth=3, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 3)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6388,0.6388,0.6388,0.6309,0.6309,0.6388,0.6293,0.6278,0.6262,0.6341,0.6388
Precision,0.6898,0.6898,0.6898,0.6685,0.6685,0.6898,0.5854,0.6681,0.6654,0.589,0.6898
Recall,0.6388,0.6388,0.6388,0.6309,0.6309,0.6388,0.6293,0.6278,0.6262,0.6341,0.6388
F1 score,0.6225,0.6225,0.6225,0.63,0.63,0.6225,0.5642,0.6235,0.6223,0.5686,0.6225
ARI,0.4812,0.4812,0.4812,0.4632,0.4632,0.4812,0.5027,0.4641,0.4637,0.5081,0.4812


In [80]:
# Obesity data (X, y prepared in the loading cell above), now with max_depth = 4.
compare_metrics_train_test(max_depth=4, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7618,0.7618,0.7618,0.7445,0.7445,0.7618,0.6893,0.7729,0.7697,0.6719,0.7618
Precision,0.7994,0.7994,0.7994,0.7619,0.7619,0.7994,0.6196,0.8061,0.8006,0.673,0.7994
Recall,0.7618,0.7618,0.7618,0.7445,0.7445,0.7618,0.6893,0.7729,0.7697,0.6719,0.7618
F1 score,0.7623,0.7623,0.7623,0.7433,0.7433,0.7623,0.641,0.7785,0.775,0.6502,0.7623
ARI,0.5782,0.5782,0.5782,0.5942,0.5942,0.5782,0.5414,0.5931,0.5896,0.5208,0.5782


In [82]:
# Obesity data (X, y prepared in the loading cell above), now with max_depth = 6.
compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8612,0.8612,0.8644,0.8864,0.8849,0.8612,0.8565,0.806,0.806,0.8707,0.8644
Precision,0.8667,0.8667,0.8701,0.8985,0.8971,0.8667,0.8712,0.8245,0.8234,0.888,0.8697
Recall,0.8612,0.8612,0.8644,0.8864,0.8849,0.8612,0.8565,0.806,0.806,0.8707,0.8644
F1 score,0.8622,0.8622,0.8653,0.8874,0.8858,0.8622,0.8558,0.8088,0.8086,0.8712,0.8653
ARI,0.7284,0.7284,0.7351,0.7743,0.7709,0.7284,0.7213,0.6393,0.6404,0.7455,0.7345


#### Glass

In [90]:
# Glass data: every column except 'Type' is a feature; 'Type' is the class label.
# The path is relative, like the Obesity cell's '../DATA/Obesity.csv', so the
# notebook does not depend on one user's home directory.
# NOTE(review): assumes glass.csv sits in the same ../DATA folder as Obesity.csv — confirm.
df = pd.read_csv('../DATA/glass.csv')
X = df.drop('Type', axis=1).to_numpy()
y = df['Type'].to_numpy()
df

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


In [91]:
# Glass data (X, y from the loading cell above), max_depth = 3.
compare_metrics_train_test(max_depth=3, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 3)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6769,0.6769,0.6769,0.6462,0.6462,0.6154,0.5231,0.6,0.6,0.5385,0.7231
Precision,0.626,0.626,0.626,0.6793,0.6828,0.5697,0.4356,0.5555,0.5555,0.4386,0.7032
Recall,0.6769,0.6769,0.6769,0.6462,0.6462,0.6154,0.5231,0.6,0.6,0.5385,0.7231
F1 score,0.641,0.641,0.641,0.6384,0.6403,0.5789,0.4308,0.5624,0.5624,0.4418,0.7062
ARI,0.3451,0.3451,0.3451,0.3198,0.3245,0.3011,0.3419,0.2765,0.2765,0.3818,0.4539


In [92]:
# Glass data (X, y from the loading cell above), max_depth = 4.
compare_metrics_train_test(max_depth=4, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6923,0.6923,0.6923,0.6615,0.6769,0.5538,0.6615,0.5538,0.5538,0.6462,0.7692
Precision,0.724,0.724,0.7243,0.6698,0.6933,0.5786,0.6294,0.5682,0.5682,0.63,0.7666
Recall,0.6923,0.6923,0.6923,0.6615,0.6769,0.5538,0.6615,0.5538,0.5538,0.6462,0.7692
F1 score,0.6903,0.6903,0.6907,0.6539,0.6742,0.551,0.6315,0.5418,0.5418,0.6108,0.7621
ARI,0.3546,0.3546,0.3838,0.3011,0.3064,0.2651,0.3234,0.233,0.233,0.3162,0.47


In [93]:
# Glass data (X, y from the loading cell above), max_depth = 6.
compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6462,0.6462,0.6769,0.6923,0.6923,0.6308,0.6462,0.5846,0.5846,0.6308,0.6769
Precision,0.6469,0.6469,0.677,0.719,0.7171,0.6384,0.6529,0.5954,0.5954,0.6442,0.6786
Recall,0.6462,0.6462,0.6769,0.6923,0.6923,0.6308,0.6462,0.5846,0.5846,0.6308,0.6769
F1 score,0.6388,0.6388,0.6704,0.684,0.682,0.6229,0.6207,0.569,0.569,0.6112,0.6754
ARI,0.2832,0.2832,0.355,0.3274,0.3294,0.3473,0.3333,0.2749,0.2749,0.3134,0.3263


#### Pima Indian Diabetes

In [1]:
import numpy as np 
import pandas as pd

In [4]:
# Read the diabetes CSV, then split it into a label vector (the 'Outcome'
# column) and a feature matrix (every other column), both as NumPy arrays.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact directory exists.
df = pd.read_csv('/Users/user/HSE 24:25/Term Paper 2025/DATA/diabetes.csv')
y = df['Outcome'].to_numpy()
X = df.drop(columns='Outcome').to_numpy()

# Cell output: number of NaN entries per column (all zero in the saved output).
# NOTE(review): a NaN-free frame does not rule out sentinel-encoded missing
# values (e.g. 0 standing in for an unrecorded measurement) - confirm for this
# dataset before trusting the features as-is.
df.isna().sum()


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [115]:
# Criterion comparison with max_depth=3. When the cells are run in notebook
# order, X and y are the diabetes arrays built in the load cell above.
# Observation from the saved output: all eleven criterion columns are identical
# at this depth.
compare_metrics_train_test(max_depth=3, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 3)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186
Precision,0.7176,0.7176,0.7176,0.7176,0.7176,0.7176,0.7176,0.7176,0.7176,0.7176,0.7176
Recall,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186,0.7186
F1 score,0.6818,0.6818,0.6818,0.6818,0.6818,0.6818,0.6818,0.6818,0.6818,0.6818,0.6818
ARI,0.1505,0.1505,0.1505,0.1505,0.1505,0.1505,0.1505,0.1505,0.1505,0.1505,0.1505


In [116]:
# Same comparison with max_depth=4 on the X, y currently in the kernel (the
# diabetes arrays when the notebook is run top to bottom).
compare_metrics_train_test(max_depth=4, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.71,0.71,0.71,0.7229,0.7229,0.71,0.71,0.71,0.71,0.71,0.71
Precision,0.7457,0.7457,0.7491,0.7448,0.7448,0.7457,0.7457,0.7457,0.7457,0.7457,0.7457
Recall,0.71,0.71,0.71,0.7229,0.7229,0.71,0.71,0.71,0.71,0.71,0.71
F1 score,0.7168,0.7168,0.7169,0.7285,0.7285,0.7168,0.7168,0.7168,0.7168,0.7168,0.7168
ARI,0.1728,0.1728,0.173,0.1943,0.1943,0.1728,0.1728,0.1728,0.1728,0.1728,0.1728


In [117]:
# Same comparison with max_depth=6 on the X, y currently in the kernel (the
# diabetes arrays when the notebook is run top to bottom).
compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7013,0.7013,0.697,0.7229,0.7143,0.7013,0.7013,0.7013,0.7013,0.7013,0.7013
Precision,0.7124,0.7124,0.7117,0.7347,0.724,0.7124,0.7124,0.7124,0.7124,0.7124,0.7124
Recall,0.7013,0.7013,0.697,0.7229,0.7143,0.7013,0.7013,0.7013,0.7013,0.7013,0.7013
F1 score,0.7053,0.7053,0.7018,0.7269,0.7178,0.7053,0.7053,0.7053,0.7053,0.7053,0.7053
ARI,0.1553,0.1553,0.1492,0.1926,0.1768,0.1553,0.1553,0.1553,0.1553,0.1553,0.1553


#### Wisconsin Breast Cancer (Diagnostic)

In [118]:
# Read the Wisconsin breast-cancer CSV and one-hot encode it in one step.
# drop_first=True drops the first level of each encoded column and dtype=int
# makes the indicators integer-valued; the label used below is the resulting
# 'diagnosis_M' indicator column.
# NOTE(review): the path is absolute and machine-specific.
df = pd.get_dummies(
    pd.read_csv('/Users/user/HSE 24:25/Term Paper 2025/DATA/wisconsin breast cancer (diag).csv'),
    drop_first=True,
    dtype=int,
)

# Label vector first, then the feature matrix made of every remaining column.
# NOTE(review): X keeps *all* other columns - if the CSV carries an ID column
# or an empty trailing column it is used as a feature; verify the file.
y = df['diagnosis_M'].to_numpy()
X = df.drop(columns='diagnosis_M').to_numpy()

In [119]:
# Criterion comparison with max_depth=3. When the cells are run in notebook
# order, X and y are the breast-cancer arrays built in the load cell above.
compare_metrics_train_test(max_depth=3, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 3)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9591,0.9591,0.9591,0.9649,0.9649,0.9591,0.9591,0.9591,0.9591,0.9591,0.9591
Precision,0.959,0.959,0.959,0.965,0.965,0.959,0.959,0.959,0.959,0.959,0.959
Recall,0.9591,0.9591,0.9591,0.9649,0.9649,0.9591,0.9591,0.9591,0.9591,0.9591,0.9591
F1 score,0.959,0.959,0.959,0.9648,0.9648,0.959,0.959,0.959,0.959,0.959,0.959
ARI,0.8413,0.8413,0.8413,0.8631,0.8631,0.8413,0.8413,0.8413,0.8413,0.8413,0.8413


In [120]:
# Same comparison with max_depth=4 on the X, y currently in the kernel (the
# breast-cancer arrays when the notebook is run top to bottom).
compare_metrics_train_test(max_depth=4, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9415,0.9415,0.9532,0.9474,0.9591,0.9415,0.9474,0.9415,0.9415,0.9415,0.9415
Precision,0.9415,0.9415,0.9532,0.9474,0.9593,0.9415,0.9473,0.9415,0.9415,0.9415,0.9415
Recall,0.9415,0.9415,0.9532,0.9474,0.9591,0.9415,0.9474,0.9415,0.9415,0.9415,0.9415
F1 score,0.9415,0.9415,0.9532,0.9471,0.9588,0.9415,0.9473,0.9415,0.9415,0.9415,0.9415
ARI,0.7776,0.7776,0.8198,0.7983,0.8412,0.7776,0.7985,0.7776,0.7776,0.7776,0.7776


In [121]:
# Same comparison with max_depth=6 on the X, y currently in the kernel (the
# breast-cancer arrays when the notebook is run top to bottom).
compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9415,0.9415,0.9357,0.9357,0.9474,0.9415,0.9474,0.9415,0.9415,0.9415,0.9415
Precision,0.9415,0.9415,0.9359,0.9355,0.9474,0.9415,0.9473,0.9415,0.9415,0.9415,0.9415
Recall,0.9415,0.9415,0.9357,0.9357,0.9474,0.9415,0.9474,0.9415,0.9415,0.9415,0.9415
F1 score,0.9415,0.9415,0.9358,0.9356,0.9471,0.9415,0.9473,0.9415,0.9415,0.9415,0.9415
ARI,0.7776,0.7776,0.7569,0.7567,0.7983,0.7776,0.7985,0.7776,0.7776,0.7776,0.7776


#### Ionosphere

In [None]:
# Read the ionosphere CSV and one-hot encode it in one step (first level of
# each encoded column dropped, integer indicators). The label used below is
# the resulting 'g_g' indicator column.
# NOTE(review): the indicator name 'g_g' means the source column is itself
# called 'g'. Presumably the CSV has no header row and its first record is
# being consumed as column names - confirm, and read with header=None if so.
# NOTE(review): the path is absolute and machine-specific.
df = pd.get_dummies(
    pd.read_csv('/Users/user/HSE 24:25/Term Paper 2025/DATA/ionosphere.csv'),
    drop_first=True,
    dtype=int,
)

# Label vector first, then the feature matrix made of every remaining column.
y = df['g_g'].to_numpy()
X = df.drop(columns='g_g').to_numpy()

In [131]:
# Criterion comparison with max_depth=3. When the cells are run in notebook
# order, X and y are the ionosphere arrays built in the load cell above.
# Observation from the saved output: every column except `entropy` and
# `entropy_sklearn` carries the same values.
compare_metrics_train_test(max_depth=3, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 3)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8762,0.8762,0.8762,0.9143,0.8286,0.8762,0.8762,0.8762,0.8762,0.8762,0.8762
Precision,0.8834,0.8834,0.8834,0.915,0.8269,0.8834,0.8834,0.8834,0.8834,0.8834,0.8834
Recall,0.8762,0.8762,0.8762,0.9143,0.8286,0.8762,0.8762,0.8762,0.8762,0.8762,0.8762
F1 score,0.8777,0.8777,0.8777,0.9145,0.8274,0.8777,0.8777,0.8777,0.8777,0.8777,0.8777
ARI,0.5608,0.5608,0.5608,0.6818,0.4217,0.5608,0.5608,0.5608,0.5608,0.5608,0.5608


In [132]:
# Same comparison with max_depth=4 on the X, y currently in the kernel (the
# ionosphere arrays when the notebook is run top to bottom).
compare_metrics_train_test(max_depth=4, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8667,0.8667,0.8762,0.8762,0.8095,0.8667,0.8667,0.8667,0.8667,0.8667,0.8667
Precision,0.8667,0.8667,0.8756,0.8756,0.8078,0.8667,0.8667,0.8667,0.8667,0.8667,0.8667
Recall,0.8667,0.8667,0.8762,0.8762,0.8095,0.8667,0.8667,0.8667,0.8667,0.8667,0.8667
F1 score,0.8667,0.8667,0.8758,0.8758,0.8033,0.8667,0.8667,0.8667,0.8667,0.8667,0.8667
ARI,0.5304,0.5304,0.5588,0.5588,0.3677,0.5304,0.5304,0.5304,0.5304,0.5304,0.5304


In [133]:
# Same comparison with max_depth=6 on the X, y currently in the kernel (the
# ionosphere arrays when the notebook is run top to bottom).
compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8762,0.8762,0.8762,0.8762,0.9143,0.8762,0.8762,0.8762,0.8762,0.8762,0.8762
Precision,0.8771,0.8771,0.8771,0.8771,0.9173,0.8771,0.8771,0.8771,0.8771,0.8771,0.8771
Recall,0.8762,0.8762,0.8762,0.8762,0.9143,0.8762,0.8762,0.8762,0.8762,0.8762,0.8762
F1 score,0.8766,0.8766,0.8766,0.8766,0.915,0.8766,0.8766,0.8766,0.8766,0.8766,0.8766
ARI,0.5596,0.5596,0.5596,0.5596,0.6823,0.5596,0.5596,0.5596,0.5596,0.5596,0.5596
