In [504]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from collections import Counter

In [505]:
class Node:
    def __init__(self,
                 left = None,
                 right = None,
                 prediction = None,
                 numSamples = None,
                 featureSplit = None,
                 threshold = None,
                 samples = None
                ):
        
        self.left = left
        self.right = right  
        self.prediction = prediction
        self.featureSplit = featureSplit 
        self.threshold = threshold
        self.samples = samples
        
class DecisionTree:
    
    def __init__(self, maxDepth = 10, minSamples = 2, minIG = None):
        self.maxDepth = maxDepth
        self.minSamples = minSamples
        self.minIG = minIG
        self.root = None
    
    def fit(self, X, y):
        #recursively grows tree
        self.root = self.__growTree(X, y)
        
    def predict(self, X):
        #traverses fit tree for every observation in prediction set
        return [self.__traverseTree(x, self.root) for x in X]
    
    def __growTree(self, X, y, depth=0):

        numSamples, numFeatures = X.shape
        
        #check stopping criteria
        if depth >= self.maxDepth or numSamples <= self.minSamples or len(set(y))==1:
            return Node(prediction = self.__mostCommonClass(y), samples = X)
        
        #find best split
        bestFeature, bestThreshold, IG = self.__bestSplit(X, y, numFeatures)
        
        #create child nodes based on best determined split
        leftSplit, rightSplit = self.__split(X[:,bestFeature], bestThreshold)
        
        #continue growing tree
        leftNode = self.__growTree(X[leftSplit], y[leftSplit], depth=depth+1)
        rightNode = self.__growTree(X[rightSplit], y[rightSplit], depth=depth+1)
    
        return Node(left= leftNode,
                    right=rightNode,
                    featureSplit = bestFeature,
                    threshold = bestThreshold,
                    samples = X
                   )
        
    
    def __mostCommonClass(self, y):
        #given a set of labeled data points, determines majority class. represents final prediction in leaf nodes
        return Counter(y).most_common()[0][0]
    
    def __bestSplit(self, X, y, numFeatures):
        
        maxGain = None
        
        #iteratively run through all possible features, thresholds and find best split (max information gain) 
        for feature in range(numFeatures):
            featureData = X[:, feature]
            for threshold in self.__getThresholds(featureData):
                #calculate information gain for current feature, threshold
                IG = self.__informationGain(y, featureData, threshold)
                
                #store feature, threshold that produces largest information gain
                if not maxGain or IG > maxGain:
                    maxGain = IG
                    bestFeature, bestThreshold = feature, threshold
        
        return  bestFeature, bestThreshold, maxGain  
    
    def __getThresholds(self, featureData):
        #given a set of continuous values, determines possible thresholds to split values on (using midpoint of adjacent values)
        allFeatureValues = np.sort(np.unique(featureData))
        thresholds = []
        for i in range(1, len(allFeatureValues)):
            thresholds.append((allFeatureValues[i]+allFeatureValues[i-1])/2)
        
        return thresholds
    
    
    def __informationGain(self, y, featureData, threshold):
        # Information Gain = entropy of parent - weighted avg. entropy of children
        parentEntropy = self.__entropy(y)
        
        #create children
        leftNode, rightNode = self.__split(featureData, threshold)
        
        #caculate weighted avg. entropy of children
        entropyLeft = self.__entropy(y[leftNode])
        entropyRight = self.__entropy(y[rightNode])
        weightedEntropyChildren = entropyLeft * len(leftNode)/len(y) + entropyRight * len(rightNode)/len(y)        
        
        #calculate final information gain
        IG = parentEntropy - weightedEntropyChildren
        
        return IG
    
    def __entropy(self, y):
        #calculates entropy of a set of labeled observations. entropy = 0 when all classes are homogeneous, 1 if class occurrence is 50/50
        pX = np.bincount(y)/len(y)
        return -1 *sum([p * np.log(p) for p in pX if p > 0])
    
    def __split(self, featureData, threshold):
        #splits observations based on a given feature and threshold
        leftNode, rightNode = np.where(featureData <= threshold), np.where(featureData > threshold)
        return leftNode, rightNode
    
    def __traverseTree(self, x, node):
        #given an observation X, traverses through tree to determine its class prediction
        
        if node.prediction is not None: #if node is a leaf node, return class prediction. else continue running through tree
            return node.prediction
        
        elif x[node.featureSplit] <= node.threshold:
            return self.__traverseTree(x, node.left)
        
        elif x[node.featureSplit] > node.threshold:
            return self.__traverseTree(x, node.right)
    

In [506]:
data = datasets.load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=123)

In [507]:
model = DecisionTree()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

In [508]:
accuracy_score(Y_test, predictions)

0.9122807017543859