In [20]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score

class DecisionTree:
    """Binary decision-tree classifier with interchangeable split criteria.

    Supported ``criterion`` values:

    * ``'entropy'`` / ``'gini'`` -- classic impurity decrease, computed from
      integer class labels ``y``.
    * ``'custom_1'`` .. ``'custom_7'`` -- contingency-style scores computed from
      one-hot labels ``y_oh`` (see ``_contingency_score``); they differ only in
      the per-class weight ``b(p_l)``.

    The fitted tree is stored as nested dicts with the keys ``'feature_index'``,
    ``'threshold'``, ``'left'`` and ``'right'``; a leaf is a bare class label.
    """

    # Criteria that are scored from one-hot labels and therefore need ``y_oh``.
    _CUSTOM_CRITERIA = ('custom_1', 'custom_2', 'custom_3', 'custom_4',
                        'custom_5', 'custom_6', 'custom_7')
    # Floor that keeps logarithms and denominators finite.
    _EPSILON = 1e-10

    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1, criterion='entropy'):
        """Store the hyper-parameters; nothing is fitted yet.

        Args:
            max_depth: depth at which nodes become leaves (``None`` = unlimited).
            min_samples_split: nodes with fewer samples become leaves.
            min_samples_leaf: minimum number of samples on each side of a split.
            criterion: name of the split criterion (see the class docstring).
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        self.tree = None                 # fitted tree (nested dicts / leaf labels)
        self.feature_importances = None  # per-feature importances, filled by fit()

    def entropy(self, y):
        """Return the base-2 Shannon entropy of the integer labels ``y``.

        ``np.bincount`` is used for counting, so labels must be non-negative
        integers.
        """
        counts = np.bincount(y)
        probabilities = counts / len(y)
        # Classes with zero probability are skipped so that log2(0) never occurs.
        return -np.sum([p * np.log2(p) for p in probabilities if p > 0])

    def gini(self, y):
        """Return the Gini impurity ``1 - sum(p_l ** 2)`` of the integer labels ``y``."""
        counts = np.bincount(y)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def information_gain(self, y, left_indices, right_indices):
        """Return the impurity decrease obtained by splitting ``y`` in two.

        Args:
            y: integer labels of the samples in the node.
            left_indices: positions (into ``y``) of the samples sent left.
            right_indices: positions (into ``y``) of the samples sent right.

        Returns:
            Parent impurity minus the size-weighted mean of the child impurities.

        Raises:
            ValueError: if ``self.criterion`` is neither ``'entropy'`` nor ``'gini'``.
        """
        if self.criterion == 'entropy':
            impurity_func = self.entropy
        elif self.criterion == 'gini':
            impurity_func = self.gini
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

        parent_impurity = impurity_func(y)
        left_impurity = impurity_func(y[left_indices])
        right_impurity = impurity_func(y[right_indices])

        n, n_left, n_right = len(y), len(left_indices), len(right_indices)
        weighted_impurity = (n_left / n) * left_impurity + (n_right / n) * right_impurity
        return parent_impurity - weighted_impurity

    def _contingency_score(self, y_oh, left_indices, right_indices, weight, divide=False):
        """Shared implementation of the ``custom_*`` split scores.

        With ``N`` the number of samples in the node, ``p_1`` / ``p_2`` the
        fractions sent left / right, ``p_1l`` / ``p_2l`` the fractions that are
        of class ``l`` *and* on that side, ``p_l = p_1l + p_2l`` and
        ``b = weight(p_l)``, the score is::

            N * sum_l [ (p_1l - p_1*p_l)**2 / p_1 + (p_2l - p_2*p_l)**2 / p_2 ] * b**2

        When ``divide`` is true the two terms are instead divided by
        ``max(p_i * b**2, eps)`` (this is what ``custom_3`` does).

        Args:
            y_oh: one-hot label matrix of the node, one row per sample and one
                column per class.
            left_indices: row positions sent to the left child.
            right_indices: row positions sent to the right child.
            weight: callable mapping ``p_l`` to the class weight ``b``.
            divide: divide by ``b**2`` (with an epsilon floor) instead of
                multiplying by it.

        Returns:
            The score; larger means a better split.
        """
        y_oh = np.asarray(y_oh)
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N

        sum_total = 0
        for l in range(y_oh.shape[1]):
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            b = weight(p_l)

            dev_1 = (p_1l - p_1 * p_l) ** 2
            dev_2 = (p_2l - p_2 * p_l) ** 2

            if divide:
                # The epsilon floor already keeps these denominators non-zero.
                sum_total += dev_1 / max(p_1 * b**2, self._EPSILON)
                sum_total += dev_2 / max(p_2 * b**2, self._EPSILON)
            else:
                # An empty side contributes nothing; without these guards the
                # expression would be 0/0 = NaN.
                if p_1 > 0:
                    sum_total += dev_1 / p_1 * b**2
                if p_2 > 0:
                    sum_total += dev_2 / p_2 * b**2

        return N * sum_total

    def custom_1(self, y_oh, left_indices, right_indices):
        """Contingency score with the constant weight ``b = 1``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: 1)

    def custom_2(self, y_oh, left_indices, right_indices):
        """Contingency score with ``b = sqrt(p_l)``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: np.sqrt(p_l))

    def custom_3(self, y_oh, left_indices, right_indices):
        """Contingency score with ``b = sqrt(p_l * (1 - p_l))``, dividing by ``b**2``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: np.sqrt(p_l * (1 - p_l)),
                                       divide=True)

    def custom_4(self, y_oh, left_indices, right_indices):
        """Contingency score with ``b = p_l``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: p_l)

    def custom_5(self, y_oh, left_indices, right_indices):
        """Contingency score with ``b = p_l ** 2``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: p_l**2)

    def custom_6(self, y_oh, left_indices, right_indices):
        """Contingency score with ``b = log(max(p_l, eps))``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: np.log(max(p_l, self._EPSILON)))

    def custom_7(self, y_oh, left_indices, right_indices):
        """Contingency score with ``b = -p_l * log(max(p_l, eps))``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: (-p_l) * np.log(max(p_l, self._EPSILON)))

    def most_common_label(self, y):
        """Return the most frequent element of ``y`` (used as the leaf label)."""
        return Counter(y).most_common(1)[0][0]

    def _split_score(self, y, y_oh, left_indices, right_indices):
        """Score one candidate split with the configured criterion.

        Raises:
            ValueError: if a ``custom_*`` criterion is selected but ``y_oh`` is
                ``None``, or if the criterion name is unknown.
        """
        if self.criterion in self._CUSTOM_CRITERIA:
            if y_oh is None:
                raise ValueError(f"y_oh required for {self.criterion} criterion")
            return getattr(self, self.criterion)(y_oh, left_indices, right_indices)
        return self.information_gain(y, left_indices, right_indices)

    def find_best_split(self, X, y, num_features, y_oh=None):
        """Search every feature/threshold pair for the highest-scoring split.

        Candidate thresholds are the midpoints between neighbouring sorted
        feature values.  Repeated values produce repeated midpoints, which are
        evaluated only once; candidates are still visited in ascending order and
        a later candidate must be strictly better to replace the current best.

        Args:
            X: 2-D feature array of the node.
            y: integer labels of the node.
            num_features: number of leading columns of ``X`` to consider.
            y_oh: one-hot labels of the node (needed by ``custom_*`` criteria).

        Returns:
            ``None`` when no candidate leaves at least ``min_samples_leaf``
            samples on both sides, otherwise a dict with the keys
            ``'feature_index'``, ``'threshold'``, ``'left_indices'``,
            ``'right_indices'`` and ``'gain'``.
        """
        best_gain = -float('inf')
        best_split = None

        for feature_index in range(num_features):
            column = X[:, feature_index]
            ordered = np.sort(column)
            thresholds = np.unique((ordered[:-1] + ordered[1:]) / 2)

            for threshold in thresholds:
                left_indices = np.where(column <= threshold)[0]
                right_indices = np.where(column > threshold)[0]

                if (len(left_indices) < self.min_samples_leaf or
                        len(right_indices) < self.min_samples_leaf):
                    continue

                gain = self._split_score(y, y_oh, left_indices, right_indices)

                if gain > best_gain:
                    best_gain = gain
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'left_indices': left_indices,
                        'right_indices': right_indices,
                        'gain': gain
                    }

        return best_split

    def fit(self, X, y, y_oh=None):
        """Grow the tree on ``(X, y)`` and compute normalised feature importances.

        Args:
            X: array-like of shape (n_samples, n_features).
            y: array-like of non-negative integer class labels.
            y_oh: one-hot encoding of ``y`` with rows aligned to ``X``; required
                for the ``custom_*`` criteria, ignored otherwise.
        """
        # Accept lists / DataFrames as well as ndarrays: everything below relies
        # on positional numpy indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        if y_oh is not None:
            y_oh = np.asarray(y_oh)

        num_features = X.shape[1]
        self.feature_importances = np.zeros(num_features)
        self.tree = self.grow_tree(X, y, y_oh, depth=0)

        # Normalise so the importances sum to 1 (left as zeros if no split was made).
        total = self.feature_importances.sum()
        if total > 0:
            self.feature_importances /= total

    def grow_tree(self, X, y, y_oh, depth):
        """Recursively build the subtree for the samples ``(X, y)`` at ``depth``.

        Returns:
            A leaf label when a stopping rule applies or no valid split exists,
            otherwise a dict node (see the class docstring).
        """
        num_samples, num_features = X.shape
        num_classes = len(set(y))

        if (depth == self.max_depth or
                num_classes == 1 or
                num_samples < self.min_samples_split):
            return self.most_common_label(y)

        best_split = self.find_best_split(X, y, num_features, y_oh)
        if best_split is None:
            return self.most_common_label(y)

        left_indices, right_indices = best_split['left_indices'], best_split['right_indices']

        # The score of the chosen split was already computed during the search.
        gain = best_split['gain']
        if self.criterion not in self._CUSTOM_CRITERIA:
            # Impurity decrease is a per-sample quantity, so weight it by the node
            # size; otherwise a split of two samples deep in the tree would count
            # as much as a split at the root.  The custom scores are already
            # multiplied by the node size N.
            gain = num_samples * gain
        self.feature_importances[best_split['feature_index']] += gain

        left_subtree = self.grow_tree(X[left_indices], y[left_indices],
                                      y_oh[left_indices] if y_oh is not None else None,
                                      depth + 1)
        right_subtree = self.grow_tree(X[right_indices], y[right_indices],
                                       y_oh[right_indices] if y_oh is not None else None,
                                       depth + 1)

        return {
            'feature_index': best_split['feature_index'],
            'threshold': best_split['threshold'],
            'left': left_subtree,
            'right': right_subtree
        }

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        return np.array([self._traverse_tree(x, self.tree) for x in np.asarray(X)])

    def _traverse_tree(self, x, node):
        """Follow the split decisions for sample ``x`` from ``node`` down to a leaf."""
        while isinstance(node, dict):
            if x[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        # Anything that is not a dict is a leaf, i.e. the class label itself.
        return node

In [21]:
def generdat(N, V, k, alpha, nmin, seed=None, sig_range=(0.05, 0.1)):
    """Generate ``N`` points in ``V`` dimensions grouped into ``k`` Gaussian clusters.

    Cluster centres are drawn uniformly from ``[alpha - 1, 1 - alpha]`` per
    coordinate, so a larger ``alpha`` squeezes the centres closer together.
    Each cluster gets its own per-feature standard deviation drawn uniformly
    from ``sig_range``.

    Args:
        N: total number of points.
        V: number of features.
        k: number of clusters.
        alpha: squeeze factor for the centres; must not equal 1.
        nmin: minimum number of points per cluster.
        seed: if given, seeds the *global* numpy generator before sampling.
        sig_range: ``(low, high)`` bounds of the per-feature standard deviation.

    Returns:
        ``(Nk, R, y, X, cen)``: cluster sizes, list of index ranges per cluster,
        integer labels starting at 0, data matrix, and cluster centres.

    Raises:
        ValueError: if ``N < k * nmin``, ``k < 1`` or ``alpha == 1``.
    """
    if N < k * nmin:
        raise ValueError(f"N must be >= k * nmin. Got N={N}, k={k}, nmin={nmin}")
    if k < 1:
        raise ValueError("k must be at least 1")
    if alpha == 1:
        raise ValueError("alpha cannot be 1")

    if seed is not None:
        np.random.seed(seed)

    # Cluster sizes: every cluster gets nmin points, the surplus is spread
    # multinomially with equal probabilities.
    if k == 1:
        Nk = np.array([N])
    else:
        Nk = np.ones(k, dtype=int) * nmin
        surplus = N - k * nmin
        if surplus > 0:
            Nk = Nk + np.random.multinomial(surplus, np.ones(k)/k)

    # Cluster centres
    cen = (alpha - 1) + 2 * (1 - alpha) * np.random.rand(k, V)

    # Output buffers are allocated once and filled cluster by cluster.
    X = np.zeros((N, V))
    y = np.zeros(N, dtype=int)
    R = []

    sig_low, sig_high = sig_range
    first = 0
    for label, size in enumerate(Nk):
        last = first + size

        R.append(range(first, last))
        y[first:last] = label  # labels start at 0

        # Per-feature spread of this cluster, then the points themselves.
        spread = sig_low + (sig_high - sig_low) * np.random.rand(V)
        X[first:last] = np.random.randn(size, V) * spread + cen[label, :]

        first = last

    return Nk, R, y, X, cen


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def compare_metrics_train_test(max_depth, X, y, *, N=None, V=None, k=None, alpha=None, nmin=None,
                               test_size=0.3, random_state=42):
    """Fit every split criterion on one train/test split and tabulate test metrics.

    Eleven trees are trained on the same training part: the hand-written
    ``DecisionTree`` with the ``gini``, ``entropy`` and ``custom_1`` ..
    ``custom_7`` criteria, plus scikit-learn's ``DecisionTreeClassifier`` with
    ``gini`` and ``entropy`` as baselines.

    Args:
        max_depth: depth limit passed to every tree.
        X: numpy feature matrix.
        y: numpy array of integer class labels (it is reshaped for one-hot
            encoding, so it must be an ndarray).
        N, V, k, alpha, nmin: purely informational; they are only echoed in the
            printed header.
        test_size: fraction of the data held out for testing.
        random_state: seed for the train/test split and for the scikit-learn
            baselines, so repeated calls yield the same table regardless of the
            global numpy random state.

    Returns:
        ``pandas.DataFrame`` with one column per model and the rows Accuracy,
        Precision, Recall, F1 score and ARI (weighted averages, rounded to four
        decimals).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # The custom criteria are scored from one-hot labels of the training part.
    encoder = OneHotEncoder(sparse_output=False)
    y_oh_train = encoder.fit_transform(y_train.reshape(-1, 1))

    def own_tree(criterion):
        return DecisionTree(max_depth=max_depth, criterion=criterion)

    def sklearn_tree(criterion):
        # Seeded explicitly: an unseeded DecisionTreeClassifier falls back to the
        # global numpy random state when ordering candidate features.
        return DecisionTreeClassifier(max_depth=max_depth, criterion=criterion,
                                      random_state=random_state)

    # (column label, model, needs one-hot labels) -- in table column order.
    contenders = [
        ('b = 1', own_tree('custom_1'), True),
        ('gini', own_tree('gini'), False),
        ('gini_sklearn', sklearn_tree('gini'), False),
        ('entropy', own_tree('entropy'), False),
        ('entropy_sklearn', sklearn_tree('entropy'), False),
        ('b = p_l ^ 0.5', own_tree('custom_2'), True),
        ('b = (p_l*(1 - p_l)) ^ 0.5', own_tree('custom_3'), True),
        ('b = p_l', own_tree('custom_4'), True),
        ('b = p_l ^ 2', own_tree('custom_5'), True),
        ('b = log(p_l)', own_tree('custom_6'), True),
        ('b = -p_l * log(p_l)', own_tree('custom_7'), True),
    ]

    column = []
    score_columns = []
    for label, model, needs_one_hot in contenders:
        if needs_one_hot:
            model.fit(X_train, y_train, y_oh_train)
        else:
            model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        column.append(label)
        # zero_division=0 on all three averaged scores keeps them consistent and
        # silences the warning for classes that are never predicted.
        score_columns.append([
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='weighted', zero_division=0),
            recall_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='weighted', zero_division=0),
            adjusted_rand_score(y_test, y_pred),
        ])

    # One column per model, one row per metric.
    results = np.round(np.column_stack(score_columns), 4)
    table = pd.DataFrame(data=results, columns=column,
                         index=['Accuracy', 'Precision', 'Recall', 'F1 score', 'ARI'])

    print(f'\nN, V, k, alpha, nmin, max_depth = {N, V, k, alpha, nmin, max_depth}')

    return table

In [23]:
# Synthetic Gaussian clusters: 2000 points, 15 features, 7 clusters, at least 50 points each.
# Squeeze (alpha) = 0.75 -> cluster centres are drawn from [alpha - 1, 1 - alpha] per coordinate.

N, V, k, alpha, nmin = 2000, 15, 7, 0.75, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(max_depth=4, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)


N, V, k, alpha, nmin, max_depth = (2000, 15, 7, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9667,0.9667,0.9667,0.965,0.9667,0.97,0.965,0.97,0.9483,0.825,0.9667
Precision,0.9673,0.9673,0.9674,0.9662,0.9684,0.9706,0.9657,0.9706,0.9503,0.8132,0.9675
Recall,0.9667,0.9667,0.9667,0.965,0.9667,0.97,0.965,0.97,0.9483,0.825,0.9667
F1 score,0.9667,0.9667,0.9668,0.9652,0.9669,0.97,0.9651,0.97,0.9485,0.7808,0.9668
ARI,0.9238,0.9238,0.9232,0.9215,0.9251,0.9308,0.9198,0.9308,0.8844,0.771,0.9229


In [24]:
# Synthetic Gaussian clusters: 2000 points, 15 features, 10 clusters, at least 50 points each.
# Squeeze (alpha) = 0.75

N, V, k, alpha, nmin = 2000, 15, 10, 0.75, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(max_depth=4, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)


N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6517,0.6517,0.6517,0.95,0.95,0.6517,0.9517,0.6517,0.7567,0.87,0.6467
Precision,0.7891,0.7891,0.7345,0.9516,0.9516,0.7891,0.955,0.7891,0.7434,0.9288,0.6687
Recall,0.6517,0.6517,0.6517,0.95,0.95,0.6517,0.9517,0.6517,0.7567,0.87,0.6467
F1 score,0.612,0.612,0.6134,0.9502,0.9502,0.612,0.9517,0.612,0.7213,0.8446,0.6027
ARI,0.4756,0.4756,0.4768,0.8934,0.8934,0.4756,0.8936,0.4756,0.6084,0.8459,0.475


In [25]:
# Synthetic Gaussian clusters: 2000 points, 15 features, 10 clusters, at least 50 points each.
# Squeeze (alpha) = 0.85

N, V, k, alpha, nmin = 2000, 15, 10, 0.85, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(max_depth=4, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)


N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.85, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7633,0.7633,0.7667,0.7417,0.7417,0.7067,0.8017,0.7067,0.58,0.68,0.7
Precision,0.779,0.779,0.7813,0.7649,0.7639,0.7279,0.8158,0.7279,0.614,0.649,0.7131
Recall,0.7633,0.7633,0.7667,0.7417,0.7417,0.7067,0.8017,0.7067,0.58,0.68,0.7
F1 score,0.765,0.765,0.7683,0.7455,0.7454,0.7005,0.8046,0.7005,0.5341,0.6321,0.6917
ARI,0.5475,0.5475,0.5537,0.5147,0.515,0.5224,0.608,0.5224,0.3888,0.5375,0.5113


In [10]:
# Iris data (scikit-learn's bundled dataset)
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target

# The N/V/k/alpha/nmin keywords are omitted, so the printed header shows None for them.
compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9556,0.9556,1.0,0.9111,0.9778,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
Precision,0.9615,0.9615,1.0,0.9111,0.9794,0.9615,0.9615,0.9615,0.9615,0.9615,0.9615
Recall,0.9556,0.9556,1.0,0.9111,0.9778,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
F1 score,0.9553,0.9553,1.0,0.9111,0.9777,0.9553,0.9553,0.9553,0.9553,0.9553,0.9553
ARI,0.8907,0.8907,1.0,0.7991,0.943,0.8907,0.8907,0.8907,0.8907,0.8907,0.8907


In [11]:
# Wine data (scikit-learn's bundled dataset)
from sklearn.datasets import load_wine

wine = load_wine()
X,y = wine.data, wine.target

# The N/V/k/alpha/nmin keywords are omitted, so the printed header shows None for them.
compare_metrics_train_test(max_depth=4, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9074,0.9074,0.9074,0.9444
Precision,0.9466,0.9466,0.9662,0.8552,0.8552,0.9466,0.9466,0.9099,0.9099,0.9055,0.9466
Recall,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9074,0.9074,0.9074,0.9444
F1 score,0.9442,0.9442,0.9632,0.8507,0.8507,0.9442,0.9442,0.9081,0.9081,0.9059,0.9442
ARI,0.8335,0.8335,0.8838,0.5939,0.5939,0.8335,0.8335,0.7198,0.7198,0.7605,0.8335


In [12]:
from sklearn.preprocessing import LabelEncoder

# Load the Obesity data and replace the target column 'NObeyesdad' by integer codes.
df = pd.read_csv('../DATA/Obesity.csv')
le = LabelEncoder()
df['NObeyesdad_LabelEncoded'] = le.fit_transform(df['NObeyesdad'])
df = df.drop('NObeyesdad', axis=1)

# 1. Find ALL categorical (object-dtype) columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# 2. Convert EACH categorical column to integer codes
# NOTE: `le` is rebound on every iteration, so the encoder fitted on the target
# above is no longer reachable once this loop has run at least once.
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Features are every remaining column; the encoded target is the label vector.
X = df.drop('NObeyesdad_LabelEncoded', axis=1).to_numpy()
y = df['NObeyesdad_LabelEncoded'].to_numpy()

compare_metrics_train_test(max_depth=4, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7618,0.7618,0.7618,0.7445,0.7445,0.7618,0.6893,0.7729,0.7697,0.6719,0.7618
Precision,0.7994,0.7994,0.7994,0.7619,0.7619,0.7994,0.6196,0.8061,0.8006,0.673,0.7994
Recall,0.7618,0.7618,0.7618,0.7445,0.7445,0.7618,0.6893,0.7729,0.7697,0.6719,0.7618
F1 score,0.7623,0.7623,0.7623,0.7433,0.7433,0.7623,0.641,0.7785,0.775,0.6502,0.7623
ARI,0.5782,0.5782,0.5782,0.5942,0.5942,0.5782,0.5414,0.5931,0.5896,0.5208,0.5782


In [13]:
# Same comparison with deeper trees (max_depth=6). Reuses the global X, y left by an
# earlier cell -- presumably the Obesity data, if the cells were run in order.
compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8612,0.8612,0.8612,0.8864,0.8849,0.8612,0.8565,0.806,0.806,0.8707,0.8644
Precision,0.8667,0.8667,0.8668,0.8985,0.8969,0.8667,0.8712,0.8245,0.8234,0.888,0.8697
Recall,0.8612,0.8612,0.8612,0.8864,0.8849,0.8612,0.8565,0.806,0.806,0.8707,0.8644
F1 score,0.8622,0.8622,0.8622,0.8874,0.8858,0.8622,0.8558,0.8088,0.8086,0.8712,0.8653
ARI,0.7284,0.7284,0.7285,0.7743,0.771,0.7284,0.7213,0.6393,0.6404,0.7455,0.7345


In [15]:
# Synthetic Gaussian clusters: 1000 points, 10 features, 7 clusters, at least 50 points each.
# Squeeze (alpha) = 0.5

N, V, k, alpha, nmin = 1000, 10, 7, 0.5, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(max_depth=4, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)


N, V, k, alpha, nmin, max_depth = (1000, 10, 7, 0.5, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8567,0.8567,0.8567,0.9833,0.9833,0.8567,0.8567,0.8567,0.8567,0.8567,0.8567
Precision,0.7883,0.7883,0.7883,0.9851,0.9841,0.7883,0.7883,0.7883,0.7883,0.7796,0.7883
Recall,0.8567,0.8567,0.8567,0.9833,0.9833,0.8567,0.8567,0.8567,0.8567,0.8567,0.8567
F1 score,0.8109,0.8109,0.8109,0.9835,0.9833,0.8109,0.8109,0.8109,0.8109,0.8066,0.8109
ARI,0.8268,0.8268,0.8268,0.959,0.9588,0.8268,0.8268,0.8268,0.8268,0.8268,0.8268


In [16]:
# Synthetic Gaussian clusters: 1000 points, 10 features, 7 clusters, at least 50 points each.
# Squeeze (alpha) = 0.3

N, V, k, alpha, nmin = 1000, 10, 7, 0.3, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(max_depth=4, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)


N, V, k, alpha, nmin, max_depth = (1000, 10, 7, 0.3, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8633,0.8633,0.8633,0.9933,0.9933,0.86,0.8633,0.86,0.86,0.69,0.86
Precision,0.7926,0.7926,0.7926,0.9934,0.9934,0.7893,0.7926,0.7893,0.7893,0.5407,0.7893
Recall,0.8633,0.8633,0.8633,0.9933,0.9933,0.86,0.8633,0.86,0.86,0.69,0.86
F1 score,0.8167,0.8167,0.8167,0.9933,0.9933,0.8134,0.8167,0.8134,0.8134,0.5897,0.8134
ARI,0.8517,0.8517,0.8517,0.9833,0.9833,0.8441,0.8517,0.8441,0.8441,0.7125,0.8441


In [17]:
# Synthetic Gaussian clusters: 1000 points, 10 features, 7 clusters, at least 50 points each.
# Squeeze (alpha) = 0.7

N, V, k, alpha, nmin = 1000, 10, 7, 0.7, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(max_depth=4, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)


N, V, k, alpha, nmin, max_depth = (1000, 10, 7, 0.7, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8467,0.8467,0.8467,0.97,0.97,0.8433,0.8467,0.8467,0.8333,0.8433,0.8467
Precision,0.9155,0.9155,0.9155,0.9711,0.9711,0.9124,0.9155,0.9155,0.784,0.7691,0.9155
Recall,0.8467,0.8467,0.8467,0.97,0.97,0.8433,0.8467,0.8467,0.8333,0.8433,0.8467
F1 score,0.8005,0.8005,0.8005,0.9702,0.9702,0.7972,0.8005,0.8005,0.7944,0.794,0.8005
ARI,0.7996,0.7996,0.7996,0.9268,0.9268,0.7937,0.7996,0.7996,0.7654,0.7976,0.7996


In [18]:
# Synthetic Gaussian clusters: 1000 points, 10 features, 7 clusters, at least 50 points each.
# Squeeze (alpha) = 0.8

N, V, k, alpha, nmin = 1000, 10, 7, 0.8, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(max_depth=4, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)


N, V, k, alpha, nmin, max_depth = (1000, 10, 7, 0.8, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7867,0.7867,0.78,0.9033,0.9033,0.7867,0.79,0.7867,0.7867,0.7767,0.78
Precision,0.8565,0.8565,0.7051,0.9056,0.9056,0.8569,0.7309,0.8569,0.8569,0.7086,0.8505
Recall,0.7867,0.7867,0.78,0.9033,0.9033,0.7867,0.79,0.7867,0.7867,0.7767,0.78
F1 score,0.7424,0.7424,0.7316,0.9024,0.9024,0.7423,0.7492,0.7423,0.7423,0.7309,0.7352
ARI,0.6972,0.6972,0.6884,0.7873,0.7873,0.6939,0.7114,0.6939,0.6939,0.6798,0.6889


In [19]:
# Synthetic Gaussian clusters: 1000 points, 10 features, 7 clusters, at least 50 points each.
# Squeeze (alpha) = 0.9

N, V, k, alpha, nmin = 1000, 10, 7, 0.9, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(max_depth=4, X=X, y=y, N=N, V=V, k=k, alpha=alpha, nmin=nmin)


N, V, k, alpha, nmin, max_depth = (1000, 10, 7, 0.9, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.5833,0.5833,0.5833,0.57,0.57,0.5767,0.57,0.5767,0.5767,0.5833,0.5833
Precision,0.6739,0.6739,0.6739,0.6201,0.6201,0.6497,0.6726,0.6734,0.6734,0.6697,0.6715
Recall,0.5833,0.5833,0.5833,0.57,0.57,0.5767,0.57,0.5767,0.5767,0.5833,0.5833
F1 score,0.5846,0.5846,0.5846,0.5704,0.5704,0.5765,0.5659,0.5789,0.5789,0.576,0.5852
ARI,0.3043,0.3043,0.3043,0.2755,0.2755,0.2989,0.2908,0.29,0.29,0.3097,0.3011
