In [106]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score

class DecisionTree:
    """Binary decision-tree classifier with several selectable split criteria.

    Supported ``criterion`` values:

    * ``'entropy'`` / ``'gini'`` -- impurity decrease computed from integer labels.
    * ``'custom_1'`` ... ``'custom_7'`` -- a contingency-based score computed from a
      one-hot label matrix ``y_oh``; the seven variants differ only in the per-class
      weight ``b(p_l)``:

      ========  =============================
      name      weight ``b``
      ========  =============================
      custom_1  ``1``
      custom_2  ``sqrt(p_l)``
      custom_3  ``sqrt(p_l * (1 - p_l))``
      custom_4  ``p_l``
      custom_5  ``p_l ** 2``
      custom_6  ``log(p_l)``
      custom_7  ``-p_l * log(p_l)``
      ========  =============================

    NOTE(review): ``custom_3`` divides each term by ``p_k * b**2`` whereas every other
    custom criterion evaluates ``term / p_k * b**2``, i.e. multiplies by ``b**2``.
    This asymmetry is preserved exactly as found; confirm which form is intended.

    The fitted tree is stored in ``self.tree`` as nested dicts
    (``feature_index``, ``threshold``, ``left``, ``right``) whose leaves are class labels.
    """

    # Names of the criteria that are scored from the one-hot label matrix.
    _CUSTOM_CRITERIA = ('custom_1', 'custom_2', 'custom_3', 'custom_4',
                        'custom_5', 'custom_6', 'custom_7')
    # Floor used to keep logarithms and denominators away from zero.
    _EPSILON = 1e-10

    def __init__(self, max_depth=None, min_samples_split=2, criterion='entropy'):
        """Store hyper-parameters; the tree itself is built by :meth:`fit`.

        Args:
            max_depth: Depth at which growth stops (``None`` = unlimited).
            min_samples_split: Nodes with fewer samples become leaves.
            criterion: One of the criterion names listed in the class docstring.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None                  # fitted tree (nested dicts / leaf labels)
        self.feature_importances = None   # per-feature accumulated gain, normalised in fit()

    # ------------------------------------------------------------------ impurities
    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the integer label array ``y``."""
        counts = np.bincount(y)                      # occurrences of each class id
        probabilities = counts / len(y)
        # Zero-probability classes are skipped so that log2(0) is never evaluated.
        return -np.sum([p * np.log2(p) for p in probabilities if p > 0])

    def gini(self, y):
        """Return the Gini impurity ``1 - sum(p**2)`` of the integer label array ``y``."""
        counts = np.bincount(y)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def information_gain(self, y, left_indices, right_indices):
        """Return the impurity decrease obtained by splitting ``y`` into two parts.

        Raises:
            ValueError: if ``self.criterion`` is neither ``'entropy'`` nor ``'gini'``.
        """
        if self.criterion == 'entropy':
            impurity_func = self.entropy
        elif self.criterion == 'gini':
            impurity_func = self.gini
        else:
            raise ValueError(f"Unknown criterion: {self.criterion}")

        parent_impurity = impurity_func(y)
        left_impurity = impurity_func(y[left_indices])
        right_impurity = impurity_func(y[right_indices])

        n, n_left, n_right = len(y), len(left_indices), len(right_indices)
        # Children are weighted by their share of the parent's samples.
        weighted_impurity = (n_left / n) * left_impurity + (n_right / n) * right_impurity
        return parent_impurity - weighted_impurity

    # ------------------------------------------------------------ custom criteria
    def _contingency_score(self, y_oh, left_indices, right_indices, weight,
                           weight_in_denominator=False):
        """Shared implementation of the ``custom_N`` split scores.

        Args:
            y_oh: One-hot label matrix of the node (rows = samples, columns = classes).
            left_indices, right_indices: Row indices of the two candidate children.
            weight: Callable mapping the class proportion ``p_l`` to the weight ``b``.
            weight_in_denominator: When True, each term is divided by
                ``max(p_k * b**2, epsilon)`` (the ``custom_3`` form); otherwise it is
                evaluated as ``term / p_k * b**2``.

        Returns:
            ``N * sum_total`` where ``N = y_oh.sum()`` and ``sum_total`` accumulates,
            for every class ``l`` and child ``k``, the weighted squared deviation
            ``(p_kl - p_k * p_l) ** 2``.
        """
        N = y_oh.sum()

        left = y_oh[left_indices]
        right = y_oh[right_indices]
        p_1 = left.sum() / N
        p_2 = right.sum() / N

        sum_total = 0
        for l in range(y_oh.shape[1]):               # one iteration per class column
            p_1l = left[:, l].sum() / N
            p_2l = right[:, l].sum() / N
            p_l = p_1l + p_2l
            b = weight(p_l)

            dev_1 = (p_1l - p_1 * p_l) ** 2
            dev_2 = (p_2l - p_2 * p_l) ** 2
            if weight_in_denominator:
                # epsilon keeps the division finite when b is zero
                sum_total += dev_1 / max(p_1 * b**2, self._EPSILON)
                sum_total += dev_2 / max(p_2 * b**2, self._EPSILON)
            else:
                sum_total += dev_1 / p_1 * b**2
                sum_total += dev_2 / p_2 * b**2

        return N * sum_total

    def custom_1(self, y_oh, left_indices, right_indices):
        """Contingency score with weight ``b = 1``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: 1)

    def custom_2(self, y_oh, left_indices, right_indices):
        """Contingency score with weight ``b = sqrt(p_l)``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: np.sqrt(p_l))

    def custom_3(self, y_oh, left_indices, right_indices):
        """Contingency score with ``b = sqrt(p_l * (1 - p_l))`` in the denominator."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: np.sqrt(p_l*(1 - p_l)),
                                       weight_in_denominator=True)

    def custom_4(self, y_oh, left_indices, right_indices):
        """Contingency score with weight ``b = p_l``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: p_l)

    def custom_5(self, y_oh, left_indices, right_indices):
        """Contingency score with weight ``b = p_l ** 2``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: p_l**2)

    def custom_6(self, y_oh, left_indices, right_indices):
        """Contingency score with weight ``b = log(max(p_l, epsilon))``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: np.log(max(p_l, self._EPSILON)))

    def custom_7(self, y_oh, left_indices, right_indices):
        """Contingency score with weight ``b = -p_l * log(max(p_l, epsilon))``."""
        return self._contingency_score(y_oh, left_indices, right_indices,
                                       lambda p_l: (-p_l)*np.log(max(p_l, self._EPSILON)))

    # -------------------------------------------------------------- tree building
    def most_common_label(self, y):
        """Return the most frequent value in ``y`` (used as the leaf prediction)."""
        return Counter(y).most_common(1)[0][0]

    def _split_gain(self, y, y_oh, left_indices, right_indices):
        """Score one candidate split with the configured criterion.

        Raises:
            ValueError: if a ``custom_N`` criterion is selected but ``y_oh`` is None,
                or (via :meth:`information_gain`) if the criterion name is unknown.
        """
        if self.criterion in self._CUSTOM_CRITERIA:
            if y_oh is None:
                raise ValueError(f"y_oh required for {self.criterion} criterion")
            # Looked up by name so that each public custom_N method stays the
            # single source of truth for its criterion.
            return getattr(self, self.criterion)(y_oh, left_indices, right_indices)
        return self.information_gain(y, left_indices, right_indices)

    def find_best_split(self, X, y, num_features, y_oh=None):
        """Exhaustively search features and thresholds for the highest-scoring split.

        Every unique value of every feature is tried as a ``<= threshold`` cut.

        Returns:
            A dict with keys ``feature_index``, ``threshold``, ``left_indices``,
            ``right_indices`` and ``gain``, or ``None`` when no cut leaves both
            children non-empty.
        """
        best_gain = -float('inf')
        best_split = None

        for feature_index in range(num_features):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_indices = np.where(column <= threshold)[0]
                right_indices = np.where(column > threshold)[0]

                # A cut that leaves one side empty carries no information.
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gain = self._split_gain(y, y_oh, left_indices, right_indices)

                if gain > best_gain:
                    best_gain = gain
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'left_indices': left_indices,
                        'right_indices': right_indices,
                        'gain': gain
                    }

        return best_split

    def fit(self, X, y, y_oh=None):
        """Grow the tree on ``X`` / ``y`` and compute normalised feature importances.

        Args:
            X: 2-D array-like of features (rows = samples).
            y: 1-D array-like of integer class labels.
            y_oh: One-hot encoding of ``y`` (row-aligned); required by the
                ``custom_N`` criteria and ignored by ``'entropy'`` / ``'gini'``.
        """
        # Coerce to arrays so lists / DataFrames / Series support the positional
        # indexing used while growing the tree.
        X = np.asarray(X)
        y = np.asarray(y)
        if y_oh is not None:
            y_oh = np.asarray(y_oh)

        num_features = X.shape[1]
        self.feature_importances = np.zeros(num_features)
        self.tree = self.grow_tree(X, y, y_oh, depth=0)

        # Normalise so that importances sum to 1 (same convention as sklearn).
        total = self.feature_importances.sum()
        if total > 0:
            self.feature_importances /= total

    def grow_tree(self, X, y, y_oh, depth):
        """Recursively build the subtree for the samples ``X`` / ``y``.

        Returns either a leaf label or a dict node with keys ``feature_index``,
        ``threshold``, ``left`` and ``right``.
        """
        num_samples, num_features = X.shape
        num_classes = len(set(y))

        # Stop on depth limit, pure node, or too few samples to split.
        if (depth == self.max_depth or
            num_classes == 1 or
            num_samples < self.min_samples_split):
            return self.most_common_label(y)

        best_split = self.find_best_split(X, y, num_features, y_oh)
        if best_split is None:
            return self.most_common_label(y)

        left_indices, right_indices = best_split['left_indices'], best_split['right_indices']

        # The gain was already computed during the search; reuse it instead of
        # evaluating the criterion a second time.
        self.feature_importances[best_split['feature_index']] += best_split['gain']

        left_subtree = self.grow_tree(X[left_indices], y[left_indices],
                                      y_oh[left_indices] if y_oh is not None else None,
                                      depth + 1)
        right_subtree = self.grow_tree(X[right_indices], y[right_indices],
                                       y_oh[right_indices] if y_oh is not None else None,
                                       depth + 1)

        return {
            'feature_index': best_split['feature_index'],
            'threshold': best_split['threshold'],
            'left': left_subtree,
            'right': right_subtree
        }

    # ------------------------------------------------------------------ inference
    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises:
            RuntimeError: if called before :meth:`fit`.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree is not fitted yet; call fit() first")
        return np.array([self._traverse_tree(x, self.tree) for x in np.asarray(X)])

    def _traverse_tree(self, x, node):
        """Follow the ``<= threshold`` decisions from ``node`` down to a leaf label."""
        if isinstance(node, dict):
            if x[node['feature_index']] <= node['threshold']:
                return self._traverse_tree(x, node['left'])
            else:
                return self._traverse_tree(x, node['right'])

        return node                       # not a dict -> leaf: the stored class label

---
### Generated dataset

---
#### Data generator

Parameters:
- N: Total number of data points
- V: Number of dimensions/features
- k: Number of clusters
- alpha: Controls cluster center spread (centers are in [α-1, 1-α])
- nmin: Minimum points per cluster
- seed: Random seed for reproducibility
- sig_range: Tuple (min, max) for cluster standard deviations

Returns:
- Nk: Array of cluster sizes
- R: List of index ranges, one per cluster (the row indices of that cluster's points in X)
- y: Cluster labels for each point (integers starting at 0)
- X: Generated data (N x V array)
- cen: Cluster centers (k x V array)

In [107]:
def generdat(N, V, k, alpha, nmin, seed=None, sig_range=(0.05, 0.1)):
    """Generate ``N`` points in ``V`` dimensions grouped into ``k`` Gaussian clusters.

    Args:
        N: Total number of points.
        V: Number of features.
        k: Number of clusters.
        alpha: Centre coordinates are drawn uniformly from ``[alpha - 1, 1 - alpha]``.
        nmin: Minimum number of points in each cluster.
        seed: If given, passed to ``np.random.seed`` (this reseeds NumPy's global RNG).
        sig_range: ``(min, max)`` bounds for the per-feature standard deviations.

    Returns:
        ``(Nk, R, y, X, cen)``: cluster sizes, per-cluster index ranges, integer
        labels starting at 0, the ``N x V`` data matrix and the ``k x V`` centres.

    Raises:
        ValueError: if ``N < k * nmin``, ``k < 1`` or ``alpha == 1``.
    """
    if N < k * nmin:
        raise ValueError(f"N must be >= k * nmin. Got N={N}, k={k}, nmin={nmin}")
    if k < 1:
        raise ValueError("k must be at least 1")
    if alpha == 1:
        raise ValueError("alpha cannot be 1")

    if seed is not None:
        np.random.seed(seed)

    # Cluster sizes: every cluster gets nmin points, the surplus is spread
    # by a uniform multinomial draw.
    if k == 1:
        Nk = np.array([N])
    else:
        Nk = np.ones(k, dtype=int) * nmin
        surplus = N - k * nmin
        if surplus > 0:
            Nk = Nk + np.random.multinomial(surplus, np.ones(k)/k)

    # Cluster centres, uniform in [alpha - 1, 1 - alpha] along every axis.
    lower_bound = alpha - 1
    width = 2 * (1 - alpha)
    cen = lower_bound + width * np.random.rand(k, V)

    # Pre-allocate the outputs and fill them one cluster at a time.
    X = np.zeros((N, V))
    y = np.zeros(N, dtype=int)
    R = []

    sig_lo, sig_hi = sig_range
    offset = 0
    for label, size in enumerate(Nk):
        stop = offset + size

        R.append(range(offset, stop))
        y[offset:stop] = label                     # labels are 0-based

        # Each cluster draws its own per-feature standard deviations.
        spread = sig_lo + (sig_hi - sig_lo) * np.random.rand(V)
        X[offset:stop] = np.random.randn(size, V) * spread + cen[label, :]

        offset = stop

    return Nk, R, y, X, cen


---
#### Exp. 1

---
##### No train-test-split 

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def compare_metrics_full_dataset(max_depth, X, y):
    """Fit every tree variant on the full dataset and tabulate its training scores.

    Each model is trained and evaluated on the same ``X`` / ``y`` (no hold-out), so
    the numbers measure how well each criterion fits the data, not generalisation.

    Args:
        max_depth: Depth limit passed to every tree.
        X: Feature matrix (NumPy array).
        y: Integer class labels (NumPy array; it is reshaped for one-hot encoding).

    Returns:
        A ``pandas.DataFrame`` with one column per model and the rows
        Accuracy, Precision, Recall, F1 score and ARI, rounded to 4 decimals.
        Precision / recall / F1 use weighted averaging.
    """
    encoder = OneHotEncoder(sparse_output=False)
    y_oh = encoder.fit_transform(y.reshape(-1,1))

    def own(criterion):
        return DecisionTree(max_depth=max_depth, criterion=criterion)

    def sk(criterion):
        return DecisionTreeClassifier(max_depth=max_depth, criterion=criterion)

    # (column label, model, needs one-hot labels) -- listed in the order in which
    # the models are fitted, which is also the column order of the result table.
    experiments = [
        ('b = 1',                     own('custom_1'), True),
        ('gini',                      own('gini'),     False),
        ('gini_sklearn',              sk('gini'),      False),
        ('entropy',                   own('entropy'),  False),
        ('entropy_sklearn',           sk('entropy'),   False),
        ('b = p_l ^ 0.5',             own('custom_2'), True),
        ('b = (p_l*(1 - p_l)) ^ 0.5', own('custom_3'), True),
        ('b = p_l',                   own('custom_4'), True),
        ('b = p_l ^ 2',               own('custom_5'), True),
        ('b = log(p_l)',              own('custom_6'), True),
        ('b = -p_l * log(p_l)',       own('custom_7'), True),
    ]

    def score(y_pred):
        # zero_division=0 is applied to all three averaged metrics so that a class
        # which is never predicted contributes 0 without emitting warnings.
        weighted = dict(average='weighted', zero_division=0)
        return [
            accuracy_score(y, y_pred),
            precision_score(y, y_pred, **weighted),
            recall_score(y, y_pred, **weighted),
            f1_score(y, y_pred, **weighted),
            adjusted_rand_score(y, y_pred),
        ]

    per_model_scores = []
    for _, model, needs_one_hot in experiments:
        if needs_one_hot:
            model.fit(X, y, y_oh)
        else:
            model.fit(X, y)
        per_model_scores.append(score(model.predict(X)))

    # Transpose: rows = metrics, columns = models.
    results = np.round(np.array(per_model_scores).T, 4)
    column = [label for label, _, _ in experiments]
    table = pd.DataFrame(data=results, columns=column, index=['Accuracy', 'Precision', 'Recall','F1 score','ARI'])

    print('='*150)
    print(f'Full_dataset_trained: Max_depth = {max_depth}')

    return table

In [102]:
# Experiment 1, full dataset: cluster centres squeezed with alpha = 0.75
N, V, k = 2000, 15, 10
alpha, nmin = 0.75, 50
Nk, R, y, X, cen = generdat(N=N, V=V, k=k, alpha=alpha, nmin=nmin, seed=101)

compare_metrics_full_dataset(6, X, y)

Full_dataset_trained: Max_depth = 6


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.988,0.988,0.988,0.997,0.997,0.988,0.997,0.9885,0.9895,0.9955,0.987
Precision,0.9882,0.9882,0.9882,0.997,0.997,0.9882,0.997,0.9887,0.9896,0.9955,0.9872
Recall,0.988,0.988,0.988,0.997,0.997,0.988,0.997,0.9885,0.9895,0.9955,0.987
F1 score,0.988,0.988,0.988,0.997,0.997,0.988,0.997,0.9885,0.9895,0.9955,0.987
ARI,0.9729,0.9729,0.9729,0.9933,0.9933,0.9731,0.9934,0.9739,0.9766,0.9897,0.9709


---
##### Train-test-split

In [149]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def _weighted_classification_scores(y_true, y_pred):
    """Return [accuracy, precision, recall, F1, ARI] for one set of predictions.

    Precision, recall and F1 are weighted averages over classes. ``zero_division=0``
    is passed to all three so that classes missing from ``y_true`` or ``y_pred``
    contribute 0 without raising UndefinedMetricWarning.
    """
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted', zero_division=0),
        recall_score(y_true, y_pred, average='weighted', zero_division=0),
        f1_score(y_true, y_pred, average='weighted', zero_division=0),
        adjusted_rand_score(y_true, y_pred),
    ]


def compare_metrics_train_test(max_depth, X, y, *, N=None, V=None, k=None, alpha=None, nmin=None,
                               random_state=42):
    """Compare custom and sklearn decision trees on a 70/30 train/test split.

    Eleven models are fitted on the training part and scored on the held-out part:
    the file's ``DecisionTree`` with criteria 'custom_1' .. 'custom_7', 'gini' and
    'entropy', plus sklearn's ``DecisionTreeClassifier`` with 'gini' and 'entropy'.

    Parameters
    ----------
    max_depth : int or None
        Depth limit passed to every tree.
    X : array-like
        Feature matrix.
    y : numpy.ndarray
        Class labels; must support ``.reshape`` (used to build the one-hot target).
    N, V, k, alpha, nmin : optional
        Only echoed in the printed header line; they do not influence training.
    random_state : int or None, default 42
        Seed for the train/test split and for the sklearn baselines, so that the
        sklearn columns are reproducible from run to run.

    Returns
    -------
    pandas.DataFrame
        Rows are metrics (Accuracy, Precision, Recall, F1 score, ARI), columns are
        models, values are rounded to 4 decimals.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )

    # The 'custom_*' criteria are fitted with a one-hot encoded target as a third argument.
    encoder = OneHotEncoder(sparse_output=False)
    y_oh_train = encoder.fit_transform(y_train.reshape(-1, 1))

    def own_tree(criterion):
        return DecisionTree(max_depth=max_depth, criterion=criterion)

    def sklearn_tree(criterion):
        return DecisionTreeClassifier(max_depth=max_depth, criterion=criterion,
                                      random_state=random_state)

    # (column label, estimator, whether fit() receives the one-hot target).
    # The order defines both the fitting order and the column order of the table.
    models = [
        ('b = 1', own_tree('custom_1'), True),
        ('gini', own_tree('gini'), False),
        ('gini_sklearn', sklearn_tree('gini'), False),
        ('entropy', own_tree('entropy'), False),
        ('entropy_sklearn', sklearn_tree('entropy'), False),
        ('b = p_l ^ 0.5', own_tree('custom_2'), True),
        ('b = (p_l*(1 - p_l)) ^ 0.5', own_tree('custom_3'), True),
        ('b = p_l', own_tree('custom_4'), True),
        ('b = p_l ^ 2', own_tree('custom_5'), True),
        ('b = log(p_l)', own_tree('custom_6'), True),
        ('b = -p_l * log(p_l)', own_tree('custom_7'), True),
    ]

    column = []
    per_model_scores = []
    for label, model, needs_one_hot in models:
        if needs_one_hot:
            model.fit(X_train, y_train, y_oh_train)
        else:
            model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        column.append(label)
        per_model_scores.append(_weighted_classification_scores(y_test, y_pred))

    # per_model_scores is (n_models, n_metrics); transpose so that metrics become rows.
    results = np.round(np.array(per_model_scores).T, 4)
    table = pd.DataFrame(data=results, columns=column,
                         index=['Accuracy', 'Precision', 'Recall', 'F1 score', 'ARI'])

    print(f'\nN, V, k, alpha, nmin, max_depth = {N, V, k, alpha, nmin, max_depth}')

    return table

---
##### N, V, k = 2000, 15, 7

In [199]:
# Squeeze (alpha) = 0.75; trees of depth 4 evaluated on a train/test split.
N, V, k = 2000, 15, 7
alpha, nmin = 0.75, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4, X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 7, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9717,0.9717,0.9667,0.9633,0.9667,0.9733,0.965,0.9733,0.9483,0.8267,0.9683
Precision,0.9725,0.9725,0.9674,0.9646,0.9684,0.9743,0.9663,0.9743,0.9506,0.8154,0.9695
Recall,0.9717,0.9717,0.9667,0.9633,0.9667,0.9733,0.965,0.9733,0.9483,0.8267,0.9683
F1 score,0.9717,0.9717,0.9668,0.9635,0.9669,0.9734,0.9652,0.9734,0.9485,0.7823,0.9685
ARI,0.9348,0.9348,0.9232,0.9177,0.9251,0.9384,0.9197,0.9384,0.884,0.7729,0.9267


---
##### N, V, k = 2000, 15, 10

In [136]:
# Squeeze (alpha) = 0.75; trees of depth 4 evaluated on a train/test split.
N, V, k = 2000, 15, 10
alpha, nmin = 0.75, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4, X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6483,0.6483,0.6517,0.9383,0.95,0.6483,0.9467,0.6483,0.76,0.8633,0.6467
Precision,0.7872,0.7872,0.7345,0.9416,0.9516,0.7872,0.9494,0.7872,0.7455,0.9223,0.6706
Recall,0.6483,0.6483,0.6517,0.9383,0.95,0.6483,0.9467,0.6483,0.76,0.8633,0.6467
F1 score,0.6102,0.6102,0.6134,0.9387,0.9502,0.6102,0.9465,0.6102,0.7236,0.8378,0.6033
ARI,0.4767,0.4767,0.4768,0.867,0.8934,0.4767,0.8838,0.4767,0.6117,0.8323,0.4751


In [137]:
# Squeeze (alpha) = 0.85; trees of depth 4 evaluated on a train/test split.
N, V, k = 2000, 15, 10
alpha, nmin = 0.85, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4, X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.85, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7667,0.7667,0.7667,0.7417,0.7417,0.7083,0.8033,0.7083,0.5783,0.6817,0.7017
Precision,0.78,0.78,0.7813,0.7654,0.7639,0.7279,0.8168,0.7279,0.6131,0.6495,0.7143
Recall,0.7667,0.7667,0.7667,0.7417,0.7417,0.7083,0.8033,0.7083,0.5783,0.6817,0.7017
F1 score,0.7681,0.7681,0.7683,0.7456,0.7454,0.702,0.8061,0.702,0.5321,0.6335,0.6936
ARI,0.5534,0.5534,0.5537,0.5139,0.515,0.5258,0.6105,0.5258,0.3873,0.5401,0.5143


In [134]:
# Squeeze (alpha) = 0.75; trees of depth 6 evaluated on a train/test split.
N, V, k = 2000, 15, 10
alpha, nmin = 0.75, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=6, X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.75, 50, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9533,0.9533,0.9583,0.955,0.96,0.9517,0.97,0.935,0.95,0.9667,0.9517
Precision,0.9568,0.9568,0.9611,0.9584,0.9608,0.9556,0.9711,0.9396,0.9513,0.9671,0.9536
Recall,0.9533,0.9533,0.9583,0.955,0.96,0.9517,0.97,0.935,0.95,0.9667,0.9517
F1 score,0.9538,0.9538,0.9586,0.9553,0.96,0.9522,0.9702,0.9354,0.9502,0.9667,0.9519
ARI,0.8996,0.8996,0.9074,0.9014,0.9136,0.8951,0.936,0.8611,0.8962,0.9247,0.8959


In [135]:
# Squeeze (alpha) = 0.85; trees of depth 6 evaluated on a train/test split.
N, V, k = 2000, 15, 10
alpha, nmin = 0.85, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=6, X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 10, 0.85, 50, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8167,0.8167,0.8083,0.8317,0.8167,0.81,0.8383,0.7933,0.7783,0.8233,0.8033
Precision,0.8279,0.8279,0.8202,0.8386,0.8245,0.8141,0.8475,0.7964,0.8038,0.8331,0.8173
Recall,0.8167,0.8167,0.8083,0.8317,0.8167,0.81,0.8383,0.7933,0.7783,0.8233,0.8033
F1 score,0.8181,0.8181,0.8094,0.8316,0.8169,0.8108,0.8397,0.7934,0.7813,0.8247,0.8056
ARI,0.6334,0.6334,0.619,0.6603,0.6339,0.6266,0.6741,0.5963,0.5696,0.6485,0.6118


---
##### N, V, k = 2000, 15, 15

In [138]:
# Squeeze (alpha) = 0.75; trees of depth 4 evaluated on a train/test split.
N, V, k = 2000, 15, 15
alpha, nmin = 0.75, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4, X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.75, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.675,0.675,0.6767,0.7017,0.7,0.6867,0.5233,0.6867,0.6683,0.5617,0.6817
Precision,0.6483,0.6581,0.6599,0.668,0.6662,0.7005,0.4845,0.7005,0.6476,0.4588,0.7248
Recall,0.675,0.675,0.6767,0.7017,0.7,0.6867,0.5233,0.6867,0.6683,0.5617,0.6817
F1 score,0.6277,0.6244,0.626,0.6648,0.6631,0.6502,0.4652,0.6502,0.6256,0.4783,0.6398
ARI,0.5395,0.5397,0.5419,0.5834,0.5811,0.5455,0.3745,0.5455,0.4603,0.4729,0.5372


In [139]:
# Squeeze (alpha) = 0.85; trees of depth 4 evaluated on a train/test split.
N, V, k = 2000, 15, 15
alpha, nmin = 0.85, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=4, X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.85, 50, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.5283,0.5283,0.5317,0.5083,0.5117,0.5217,0.4217,0.5233,0.5267,0.4967,0.525
Precision,0.5169,0.5169,0.5182,0.5011,0.5001,0.5093,0.3617,0.5101,0.5212,0.4524,0.5708
Recall,0.5283,0.5283,0.5317,0.5083,0.5117,0.5217,0.4217,0.5233,0.5267,0.4967,0.525
F1 score,0.4952,0.4952,0.4975,0.4822,0.4848,0.4913,0.3562,0.4915,0.4949,0.45,0.5054
ARI,0.3174,0.3174,0.3212,0.301,0.3045,0.2863,0.2309,0.2857,0.2738,0.2806,0.2843


In [198]:
# Squeeze (alpha) = 0.75; trees of depth 6 evaluated on a train/test split.
N, V, k = 2000, 15, 15
alpha, nmin = 0.75, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=6, X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.75, 50, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8967,0.9,0.9133,0.885,0.8867,0.8833,0.78,0.8833,0.8417,0.7767,0.8967
Precision,0.9015,0.9034,0.9177,0.8895,0.8957,0.8891,0.7608,0.8894,0.881,0.802,0.9021
Recall,0.8967,0.9,0.9133,0.885,0.8867,0.8833,0.78,0.8833,0.8417,0.7767,0.8967
F1 score,0.8973,0.9003,0.9136,0.8846,0.8874,0.8835,0.755,0.8835,0.845,0.7441,0.8972
ARI,0.7873,0.7948,0.8219,0.7671,0.7672,0.7687,0.6665,0.768,0.6739,0.6936,0.7891


In [141]:
# Squeeze (alpha) = 0.85; trees of depth 6 evaluated on a train/test split.
N, V, k = 2000, 15, 15
alpha, nmin = 0.85, 50
Nk, R, y, X, cen = generdat(N, V, k, alpha, nmin, seed=101)

compare_metrics_train_test(
    max_depth=6, X=X, y=y,
    N=N, V=V, k=k, alpha=alpha, nmin=nmin,
)


N, V, k, alpha, nmin, max_depth = (2000, 15, 15, 0.85, 50, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.6733,0.6733,0.6733,0.6517,0.665,0.6667,0.6167,0.64,0.6583,0.6017,0.68
Precision,0.6943,0.6943,0.6954,0.6789,0.6835,0.6861,0.6661,0.6631,0.6954,0.643,0.6902
Recall,0.6733,0.6733,0.6733,0.6517,0.665,0.6667,0.6167,0.64,0.6583,0.6017,0.68
F1 score,0.6768,0.6768,0.6758,0.6575,0.6688,0.6684,0.5992,0.6423,0.664,0.5851,0.6817
ARI,0.4253,0.4253,0.4243,0.4022,0.4201,0.4214,0.3843,0.381,0.3953,0.3846,0.4428


---
### Sklearn datasets

In [150]:
# Iris data
from sklearn.datasets import load_iris

# Feature matrix and target vector taken from the loaded Iris bunch
iris = load_iris()
X = iris.data
y = iris.target

compare_metrics_train_test(X=X, y=y, max_depth=4)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9556,0.9556,1.0,0.9556,1.0,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
Precision,0.9615,0.9615,1.0,0.9615,1.0,0.9615,0.9615,0.9615,0.9615,0.9615,0.9615
Recall,0.9556,0.9556,1.0,0.9556,1.0,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
F1 score,0.9553,0.9553,1.0,0.9553,1.0,0.9553,0.9553,0.9553,0.9553,0.9553,0.9553
ARI,0.8907,0.8907,1.0,0.8907,1.0,0.8907,0.8907,0.8907,0.8907,0.8907,0.8907


In [151]:
# Iris data
from sklearn.datasets import load_iris

# Feature matrix and target vector taken from the loaded Iris bunch
iris = load_iris()
X = iris.data
y = iris.target

compare_metrics_train_test(X=X, y=y, max_depth=6)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9556,0.9556,1.0,0.9111,0.9778,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
Precision,0.9615,0.9615,1.0,0.9111,0.9794,0.9615,0.9615,0.9615,0.9615,0.9615,0.9615
Recall,0.9556,0.9556,1.0,0.9111,0.9778,0.9556,0.9556,0.9556,0.9556,0.9556,0.9556
F1 score,0.9553,0.9553,1.0,0.9111,0.9777,0.9553,0.9553,0.9553,0.9553,0.9553,0.9553
ARI,0.8907,0.8907,1.0,0.7991,0.943,0.8907,0.8907,0.8907,0.8907,0.8907,0.8907


In [None]:
# Wine data 
from sklearn.datasets import load_wine

# Feature matrix and target vector taken from the loaded Wine bunch
wine = load_wine()
X = wine.data
y = wine.target

compare_metrics_train_test(X=X, y=y, max_depth=4)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9259,0.9259,0.9074,0.9444
Precision,0.9463,0.9463,0.9662,0.8552,0.8552,0.9463,0.9463,0.9278,0.9278,0.9055,0.9463
Recall,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9259,0.9259,0.9074,0.9444
F1 score,0.9448,0.9448,0.9632,0.8507,0.8507,0.9448,0.9448,0.9263,0.9263,0.9059,0.9448
ARI,0.8298,0.8298,0.8838,0.5939,0.5939,0.8298,0.8298,0.7731,0.7731,0.7605,0.8298


In [155]:
# Wine data
from sklearn.datasets import load_wine

# Feature matrix and target vector taken from the loaded Wine bunch
wine = load_wine()
X = wine.data
y = wine.target

compare_metrics_train_test(X=X, y=y, max_depth=6)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9259,0.9259,0.9074,0.9444
Precision,0.9463,0.9463,0.9638,0.8552,0.8552,0.9463,0.9463,0.9278,0.9278,0.9055,0.9463
Recall,0.9444,0.9444,0.963,0.8519,0.8519,0.9444,0.9444,0.9259,0.9259,0.9074,0.9444
F1 score,0.9448,0.9448,0.9628,0.8507,0.8507,0.9448,0.9448,0.9263,0.9263,0.9059,0.9448
ARI,0.8298,0.8298,0.8898,0.5939,0.5939,0.8298,0.8298,0.7731,0.7731,0.7605,0.8298


In [196]:
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('../DATA/Obesity.csv')

# Replace the 'NObeyesdad' target column with its integer-encoded version,
# appended as the last column.
le = LabelEncoder()
df = (
    df
    .assign(NObeyesdad_LabelEncoded=le.fit_transform(df['NObeyesdad']))
    .drop(columns='NObeyesdad')
)

# 1. Collect ALL remaining categorical (object-dtype) columns
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# 2. Convert EACH categorical column to integer codes, one fresh encoder per column
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Split the frame into the feature matrix and the label vector
y = df['NObeyesdad_LabelEncoded'].to_numpy()
X = df.drop(columns='NObeyesdad_LabelEncoded').to_numpy()

compare_metrics_train_test(X=X, y=y, max_depth=4)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 4)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.7603,0.7603,0.7618,0.7445,0.7445,0.7603,0.6893,0.7713,0.7681,0.6735,0.7603
Precision,0.7987,0.7987,0.7994,0.7619,0.7619,0.7987,0.6196,0.8053,0.7998,0.6754,0.7987
Recall,0.7603,0.7603,0.7618,0.7445,0.7445,0.7603,0.6893,0.7713,0.7681,0.6735,0.7603
F1 score,0.7606,0.7606,0.7623,0.7433,0.7433,0.7606,0.641,0.7769,0.7734,0.6512,0.7606
ARI,0.5766,0.5766,0.5782,0.5942,0.5942,0.5766,0.5414,0.5912,0.5877,0.5214,0.5766


In [197]:
# Repeat the comparison with deeper trees (max_depth=6). X and y are not rebuilt here:
# this cell relies on the arrays created from Obesity.csv in the preceding cell.
compare_metrics_train_test(max_depth=6, X=X, y=y)


N, V, k, alpha, nmin, max_depth = (None, None, None, None, None, 6)


Unnamed: 0,b = 1,gini,gini_sklearn,entropy,entropy_sklearn,b = p_l ^ 0.5,b = (p_l*(1 - p_l)) ^ 0.5,b = p_l,b = p_l ^ 2,b = log(p_l),b = -p_l * log(p_l)
Accuracy,0.8565,0.8565,0.8644,0.8864,0.8849,0.8565,0.8533,0.8028,0.8013,0.8691,0.8596
Precision,0.862,0.862,0.8701,0.8985,0.8971,0.862,0.8678,0.8212,0.8182,0.8849,0.865
Recall,0.8565,0.8565,0.8644,0.8864,0.8849,0.8565,0.8533,0.8028,0.8013,0.8691,0.8596
F1 score,0.8575,0.8575,0.8653,0.8874,0.8858,0.8575,0.8528,0.8055,0.8037,0.8697,0.8606
ARI,0.7189,0.7189,0.7351,0.7743,0.7709,0.7189,0.7132,0.635,0.6345,0.741,0.7249
