# Node Class
<strong>X</strong> : Subset of the training data (feature rows) held by this Node<br>
<strong>Y</strong> : Output labels for the subset of training data held by this Node<br>
<strong>Features_Unused</strong> : Features that are available for this Node to use<br>
<strong>Feature_Selected</strong> : Feature that is used to split this node<br>
<strong>Level</strong> : Level of this Node in the decision tree<br>
<strong>Children</strong> : Array of all Children this Node has

In [2]:
from collections import Counter
import numpy as np

In [35]:
class Node:
    """A single node of a decision tree whose splits are scored by gain ratio.

    Attributes:
        x: Feature rows of the training data held by this node. It is indexed
            as ``x[:, feature]``, so a 2-D NumPy array is expected.
        y: Output labels aligned row-for-row with ``x``.
        features_unused: Column indices still available for splitting.
        feature_selected: Feature currently used to split this node, or
            ``None`` before any split has been attempted.
        level: Level of this node in the decision tree.
        children: Child nodes produced by the most recent split.
    """

    def __init__(self, x, y, features_unused, level):
        """Store the data held by this node; no split is performed yet."""
        self.x = x
        self.y = y
        self.features_unused = features_unused
        self.feature_selected = None
        self.level = level
        self.children = []

    def buildTree(self):
        """Build the subtree rooted at this node.

        Only the leaf check is implemented so far: the method returns
        ``None`` for leaf and non-leaf nodes alike.
        """
        if self.isLeaf():
            return
        # TODO: pick the best feature and recurse into the children; the
        # splitting step has not been written yet.

    def isLeaf(self):
        """Return True when this node cannot or need not be split further.

        A node is a leaf when no unused features remain or when every label
        in ``y`` belongs to the same class.
        """
        if not self.features_unused:
            return True
        return len(set(self.y)) == 1

    def getBestFeatureToSplit(self):
        """Return the unused feature whose split yields the highest gain ratio.

        Each candidate feature is tried in turn: the node is split into one
        child per distinct value of that feature and the resulting gain ratio
        is compared with the best seen so far. Ties keep the earlier feature.

        Side effects: ``feature_selected`` and ``children`` are overwritten
        for every candidate, so after the call they describe the *last*
        feature tried, not necessarily the best one.

        Returns:
            The best feature, or ``None`` when ``features_unused`` is empty.
        """
        best_feature = None
        best_gain_ratio = None

        for feature in self.features_unused:
            # getGainRatio() reads feature_selected and children, so both
            # must describe the candidate being evaluated.
            self.feature_selected = feature
            print('Trying feature : ', feature)

            # Children may use every remaining feature except this one.
            new_features_unused = self.features_unused.copy()
            new_features_unused.remove(feature)

            # One child per distinct value of the candidate feature.
            column = self.x[:, feature]
            self.children = [
                Node(
                    self.x[column == feature_class],
                    self.y[column == feature_class],
                    new_features_unused.copy(),
                    self.level + 1,
                )
                for feature_class in set(column)
            ]

            curr_gain_ratio = self.getGainRatio()

            if best_gain_ratio is None or curr_gain_ratio > best_gain_ratio:
                best_feature = feature
                best_gain_ratio = curr_gain_ratio

        # Return the winner of the comparison, not the last loop variable.
        return best_feature

    def getGainRatio(self):
        """Return information gain divided by split info for the current split.

        When the selected feature takes a single value in this node the split
        info is zero; such a split separates nothing, so 0.0 is returned
        instead of dividing by zero (which would produce NaN).
        """
        info_gain = self.getInformationGain()
        split_info = self.getSplitInfo()

        if split_info == 0:
            return 0.0
        return info_gain / split_info

    def getInformationGain(self):
        """Return this node's entropy minus the weighted entropy of its children.

        Each child's entropy is weighted by the fraction of this node's
        samples that the child holds.
        """
        initial_entropy = self.getEntropy()
        final_entropy = sum(
            (len(child.y) / len(self.y)) * child.getEntropy()
            for child in self.children
        )
        return initial_entropy - final_entropy

    def getSplitInfo(self):
        """Return the entropy of the value distribution of ``feature_selected``.

        Computed with the natural logarithm over the fraction of samples
        taking each distinct value of the selected feature.
        """
        count = Counter(self.x[:, self.feature_selected])
        D = np.array(list(count.values())) / len(self.y)
        return -((D * np.log(D)).sum())

    def getEntropy(self):
        """Return the entropy (natural log) of the class labels in ``y``."""
        count = Counter(self.y)  # occurrences of each output class
        p = np.array(list(count.values())) / len(self.y)  # class probabilities
        return -((p * np.log(p)).sum())

In [24]:
# Toy training set: the first two columns are features, the last column is
# the output label.
dataset = np.array([
    [1, 2, 0],
    [2, 3, 0],
    [1, 3, 1],
    [2, 2, 1],
    [1, 2, 0],
])

In [36]:
# Root node: every column but the last is a feature, the last is the label;
# both feature columns (0 and 1) are still available for splitting.
node = Node(
    x=dataset[:, :-1],
    y=dataset[:, -1],
    features_unused=[0, 1],
    level=0,
)
node.getBestFeatureToSplit()

Trying feature :  0
Curr Gain Ratio is :  0.020570659450693123
Trying feature :  1
Curr Gain Ratio is :  0.020570659450693123


In [None]:
[]