In [1]:
!pip install treenode

Collecting treenode
  Downloading treenode-0.1.5.zip (6.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: treenode
  Building wheel for treenode (setup.py) ... [?25l[?25hdone
  Created wheel for treenode: filename=treenode-0.1.5-py3-none-any.whl size=3865 sha256=82ece1b965e7094969e2a7b21ce0019ada26030e0a3e22cee44caeabf9472c8e
  Stored in directory: /root/.cache/pip/wheels/8c/80/9d/5f041fe62b5136b1109f25f95e3e80dc1b3e6cba867930b576
Successfully built treenode
Installing collected packages: treenode
Successfully installed treenode-0.1.5


In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from treenode import TreeNode

In [3]:
class TreeNode:
    """One node of a binary decision tree.

    Attributes:
        feature_idx: Index of the feature this node tests.
        feature_val: Threshold the tested feature is compared against.
        prediction_probs: Class-probability vector stored for this node.
        information_gain: Information gain recorded for this node's split.
        left: Left child, ``None`` until one is attached.
        right: Right child, ``None`` until one is attached.
    """

    def __init__(self, feature_idx, feature_val, prediction_probs, information_gain) -> None:
        # Split rule stored on the node.
        self.feature_idx, self.feature_val = feature_idx, feature_val
        # Statistics stored alongside the split rule.
        self.prediction_probs, self.information_gain = prediction_probs, information_gain
        # A freshly created node has no children.
        self.left = self.right = None

In [4]:
class Decision_tree:
    """Binary decision-tree classifier that splits each node at a feature median.

    Internally the training set is a single 2-D array whose last column holds
    the class label and whose remaining columns are the features. For every
    node the median of each feature is tried as a threshold and the feature
    with the lowest weighted partition entropy is kept.

    Args:
        max_depth: Nodes are created for depths ``0 .. max_depth``; deeper
            calls to ``create_tree`` return ``None``.
        min_samples_leaf: Minimum number of rows each side of a split must
            contain for the node to get children. Values below 1 are treated
            as 1, because an empty side cannot be split further.
        min_information_gain: A node whose split gains less than this stays
            a leaf.
        numb_of_features_splitting: ``"sqrt"``, ``"log2"`` or anything else
            (default ``None``) meaning "all features". Only read by
            ``select_feature``.
    """

    def __init__(self, max_depth=4, min_samples_leaf=1, min_information_gain=0.0,
                 numb_of_features_splitting=None):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_information_gain = min_information_gain
        # Previously never initialised, which made select_feature raise
        # AttributeError on every call.
        self.numb_of_features_splitting = numb_of_features_splitting
        self.tree = None

    def entropy(self, class_probablities: list) -> float:
        """Return the base-2 Shannon entropy of a list of probabilities.

        Zero (and negative) entries are skipped so ``log2(0)`` is never taken.
        """
        return -sum(p * np.log2(p) for p in class_probablities if p > 0)

    def class_probablities(self, y: list) -> list:
        """Return the relative frequency of each distinct label in ``y``.

        ``y`` is the sequence of labels; an empty ``y`` yields an empty list.
        """
        total_count = len(y)
        return [count / total_count for count in Counter(y).values()]

    def data_entropy(self, y: list) -> float:
        """Return the entropy of the label sequence ``y``."""
        return self.entropy(self.class_probablities(y))

    def _partition_entropy(self, subsets: list) -> float:
        """Return the size-weighted mean entropy of the given label subsets.

        Raises:
            ZeroDivisionError: If every subset is empty.
        """
        total_count = sum(len(subset) for subset in subsets)
        return sum(self.data_entropy(subset) * (len(subset) / total_count) for subset in subsets)

    def _split_data(self, data: np.ndarray, split_column: int, split_value: float) -> tuple:
        """Split the rows of ``data`` on one column.

        Returns:
            ``(left, right)`` where ``left`` holds the rows whose value in
            ``split_column`` is ``<= split_value`` and ``right`` the rows
            whose value is ``> split_value``.
        """
        column = data[:, split_column]
        return data[column <= split_value], data[column > split_value]

    def select_feature(self, data: np.ndarray) -> int:
        """Pick one feature index at random.

        With ``numb_of_features_splitting`` set to ``"sqrt"`` or ``"log2"`` a
        random subset of that size (at least one feature) is drawn first and
        the result is chosen from it; otherwise the choice is made among all
        features. Uses NumPy's global random state. No other method of this
        class calls it.

        Raises:
            ValueError: If ``data`` has no feature columns.
        """
        feature_idx = list(range(data.shape[1] - 1))
        n_features = len(feature_idx)
        if n_features == 0:
            raise ValueError("data has no feature columns to choose from")

        if self.numb_of_features_splitting == "sqrt":
            subset_size = max(1, int(np.sqrt(n_features)))
            feature_idx_to_use = np.random.choice(feature_idx, size=subset_size)
        elif self.numb_of_features_splitting == "log2":
            # log2(1) == 0 would otherwise ask for an empty subset.
            subset_size = max(1, int(np.log2(n_features)))
            feature_idx_to_use = np.random.choice(feature_idx, size=subset_size)
        else:
            feature_idx_to_use = feature_idx
        return np.random.choice(feature_idx_to_use)

    def best_split(self, data: np.ndarray) -> tuple:
        """Find the median split with the lowest partition entropy.

        Each feature column is split at its median; ties keep the feature
        with the lowest index.

        Returns:
            ``(left_rows, right_rows, feature_idx, feature_value,
            partition_entropy)`` for the chosen split.

        Raises:
            ValueError: If ``data`` has no feature columns.
        """
        n_features = data.shape[1] - 1
        if n_features < 1:
            raise ValueError("data must contain at least one feature column plus the label column")

        min_part_entropy = np.inf
        min_entropy_feature_idx = None
        min_entropy_feature_value = None
        left_min = right_min = None
        for feature_idx in range(n_features):
            feature_value = np.median(data[:, feature_idx])
            left, right = self._split_data(data, feature_idx, feature_value)
            part_entropy = self._partition_entropy([left[:, -1], right[:, -1]])
            if part_entropy < min_part_entropy:
                min_part_entropy = part_entropy
                min_entropy_feature_idx = feature_idx
                min_entropy_feature_value = feature_value
                left_min, right_min = left, right
        return left_min, right_min, min_entropy_feature_idx, min_entropy_feature_value, min_part_entropy

    def label_probabilities(self, data: np.ndarray) -> np.ndarray:
        """Return the share of rows in ``data`` carrying each training label.

        The result is aligned with ``self.labels_in_train``; labels that do
        not occur in ``data`` get probability 0. Labels are compared as
        stored, without truncating them to integers.
        """
        labels = data[:, -1]
        total_labels = len(labels)
        label_probabilities = np.zeros(len(self.labels_in_train), dtype=float)
        if total_labels == 0:
            return label_probabilities
        for i, label in enumerate(self.labels_in_train):
            label_probabilities[i] = np.count_nonzero(labels == label) / total_labels
        return label_probabilities

    def create_tree(self, data: np.ndarray, current_depth=0) -> "TreeNode":
        """Recursively build the tree for ``data``.

        Returns:
            The node for ``data``, or ``None`` when ``current_depth`` exceeds
            ``max_depth``. A node stays without children when either side of
            its best split is smaller than ``min_samples_leaf`` (or empty),
            or when the split's information gain is below
            ``min_information_gain``.
        """
        if current_depth > self.max_depth:
            return None

        split_1_data, split_2_data, feature_idx, feature_value, split_entropy = self.best_split(data)

        node_probabilities = self.label_probabilities(data)
        information_gain = self.data_entropy(data[:, -1]) - split_entropy
        node = TreeNode(feature_idx, feature_value, node_probabilities, information_gain)

        # Never recurse into an empty partition, even if min_samples_leaf <= 0:
        # the median and the partition entropy are undefined for zero rows.
        required_rows = max(self.min_samples_leaf, 1)
        if min(split_1_data.shape[0], split_2_data.shape[0]) < required_rows:
            return node
        if information_gain < self.min_information_gain:
            return node

        node.left = self.create_tree(split_1_data, current_depth + 1)
        node.right = self.create_tree(split_2_data, current_depth + 1)
        return node

    def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        """Build the tree from features ``X_train`` and labels ``y_train``.

        Both arguments are converted with ``np.asarray``, so array-likes
        without a ``reshape`` method are accepted for ``y_train``. The
        distinct labels are stored in ``self.labels_in_train`` and the root
        node in ``self.tree``.

        Raises:
            ValueError: If ``y_train`` is empty.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if y_train.size == 0:
            raise ValueError("cannot fit on an empty training set")

        self.labels_in_train = np.unique(y_train)
        train_data = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
        self.tree = self.create_tree(train_data, current_depth=0)

    def predict_one_sample(self, X: np.ndarray) -> np.ndarray:
        """Return the class-probability vector of the leaf that ``X`` reaches.

        At each node the sample goes left when ``X[feature_idx] <=
        feature_val`` and right otherwise, until a node without children.
        """
        node = self.tree
        while node.left is not None or node.right is not None:
            if X[node.feature_idx] <= node.feature_val:
                node = node.left
            else:
                node = node.right
        return node.prediction_probs

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted class label for every row of ``X``.

        The most probable position of each leaf's probability vector is
        mapped back through ``self.labels_in_train``, so the result contains
        the original labels rather than their positions. Empty input yields
        an empty array.
        """
        samples = np.asarray(X)
        if len(samples) == 0:
            return np.empty(0, dtype=self.labels_in_train.dtype)
        pred_probs = np.array([self.predict_one_sample(sample) for sample in samples])
        return self.labels_in_train[np.argmax(pred_probs, axis=1)]



In [5]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

# Load the wine dataset and separate the feature matrix from the targets.
wine = load_wine()
X, y = wine.data, wine.target

# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
from sklearn.metrics import accuracy_score

In [7]:
# Fit the tree on the training split, predict the held-out split and show
# the resulting accuracy as the cell output.
model = Decision_tree(max_depth=5)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9444444444444444