In [1]:
import glob
from collections import Counter

import cv2 as cv
import numpy as np
import pandas as pd
from sklearn import metrics


In [2]:
class Node:
    """One node of the decision tree.

    A node built with only ``value`` acts as a leaf (prediction stops there);
    a node built with ``feature``/``threshold`` and two children is an
    internal split node.
    """

    def __init__(self, feature=None, threshold=None, left_child = None, right_child = None, info_gain=None, value=None):
        """Store the node's split description and/or its leaf prediction.

        Args:
            feature: Column index tested at this node (internal nodes).
            threshold: Split value; samples with ``x[feature] <= threshold``
                are routed to ``left_child``, the others to ``right_child``.
            left_child: Subtree for samples at or below the threshold.
            right_child: Subtree for samples above the threshold.
            info_gain: Information gain achieved by this split.
            value: Predicted label; left as ``None`` on internal nodes.
        """
        # Assign from the actual parameter names (the previous code read the
        # undefined names `left` / `right` and raised NameError).
        self.left_child = left_child
        self.right_child = right_child
        self.feature = feature
        self.threshold = threshold
        self.info_gain = info_gain
        self.value = value

In [3]:
class DecisionTreeClassifier:
    """Decision tree classifier grown greedily by maximising information gain.

    Every internal node tests a single feature against a threshold
    (``x[feature] <= threshold`` goes left, otherwise right). Labels must be
    non-negative integers (or floats holding such values) because the entropy
    is computed with ``np.bincount``.
    """

    def __init__(self, min_samples_split=7, max_depth=10):
        """Configure the stopping criteria.

        Args:
            min_samples_split: A node with fewer rows than this becomes a leaf.
            max_depth: A node may still be split while its depth (root = 0)
                is ``<= max_depth``.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def calc_entropy(self, y):
        """Return the Shannon entropy (in bits) of the label vector ``y``.

        An empty ``y`` has entropy 0.
        """
        if len(y) == 0:
            return 0

        # np.bincount calculates the number of elements in each class
        number_of_each_class = np.bincount(np.array(y, dtype=np.int64))

        # Classes that do not occur contribute nothing (and log2(0) is undefined)
        present = number_of_each_class[number_of_each_class > 0]
        probabilities = present / len(y)

        # entropy = -sum(p * log2(p))
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, left, right, parent):
        """Return the entropy reduction obtained by splitting ``parent`` labels
        into the ``left`` and ``right`` label vectors."""
        p = len(parent)

        # Fraction of the parent's rows that ends up in each child
        l_prob = len(left) / p
        r_prob = len(right) / p

        # Parent entropy minus the size-weighted entropy of the children
        return self.calc_entropy(parent) - r_prob * self.calc_entropy(right) - l_prob * self.calc_entropy(left)

    def find_split(self, x, y):
        """Search every (feature, threshold) pair for the highest information gain.

        Args:
            x: 2-D array of shape (rows, features).
            y: 1-D array with one label per row of ``x``.

        Returns:
            A dict with keys ``'feature_index'``, ``'threshold'``,
            ``'df_left'``, ``'df_right'`` and ``'gain'``. ``df_left`` /
            ``df_right`` hold the rows of ``x`` with the label appended as the
            last column. An empty dict is returned when no threshold separates
            the rows into two non-empty groups.
        """
        x = np.asarray(x)
        labels = np.asarray(y)

        gain_best = -1
        best_feature = None
        best_threshold = None

        for feature in range(x.shape[1]):
            selected_column = x[:, feature]

            for threshold in np.unique(selected_column):
                # Boolean masks replace the per-row Python filtering
                goes_left = selected_column <= threshold
                goes_right = selected_column > threshold

                # A split that leaves one side empty is not a split
                if not goes_left.any() or not goes_right.any():
                    continue

                gain = self.information_gain(labels[goes_left], labels[goes_right], labels)
                if gain > gain_best:
                    gain_best = gain
                    best_feature = feature
                    best_threshold = threshold

        if best_feature is None:
            return {}

        # Materialise the child datasets once, for the winning split only
        data = np.concatenate((x, labels.reshape(-1, 1)), axis=1)
        best_column = x[:, best_feature]
        return {
            'feature_index': best_feature,
            'threshold': best_threshold,
            'df_left': data[best_column <= best_threshold],
            'df_right': data[best_column > best_threshold],
            'gain': gain_best
        }

    def construct_tree(self, x, y, depth=0):
        """Recursively build the (sub)tree for the rows ``x`` / labels ``y``.

        Returns:
            A split ``Node`` when the stopping criteria allow it and a split
            with positive gain exists; otherwise a leaf ``Node`` whose value
            is the most common label in ``y``.
        """
        rows = x.shape[0]

        if rows >= self.min_samples_split and depth <= self.max_depth:

            optimal_split = self.find_split(x, y)
            # find_split returns {} when nothing can be separated
            if optimal_split and optimal_split['gain'] > 0:

                # The last column of df_left / df_right is the label column
                left = self.construct_tree(
                    x = optimal_split['df_left'][:, :-1],
                    y = optimal_split['df_left'][:, -1],
                    depth = depth + 1
                )
                right = self.construct_tree(
                    x = optimal_split['df_right'][:, :-1],
                    y = optimal_split['df_right'][:, -1],
                    depth = depth + 1
                )

                return Node(
                    feature = optimal_split['feature_index'],
                    threshold = optimal_split['threshold'],
                    left_child = left,
                    right_child = right,
                    info_gain = optimal_split['gain'])

        # Leaf: predict the majority label of the rows that reached this node
        return Node( value=Counter(y).most_common(1)[0][0] )

    def fit(self, x, y):
        """Grow the tree on training data and store it in ``self.root``.

        Args:
            x: 2-D array-like of shape (rows, features).
            y: 1-D array-like with one label per row.

        Raises:
            ValueError: If ``x`` is not 2-D, the data is empty, or ``x`` and
                ``y`` have a different number of rows.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if x.ndim != 2:
            raise ValueError("x must be a 2-D array of shape (rows, features)")
        if x.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty dataset")
        if x.shape[0] != y.shape[0]:
            raise ValueError("x and y must contain the same number of rows")

        self.root = self.construct_tree(x, y)

    def predict_helper(self, my_node, x):
        """Walk from ``my_node`` down to a leaf for one sample ``x`` and
        return that leaf's value."""
        # value is not None only for leaf nodes
        if my_node.value is not None:
            return my_node.value

        # take that particular feature of the entry
        compare_this = x[my_node.feature]

        # check on which side of the threshold it is
        if compare_this <= my_node.threshold:
            return self.predict_helper(my_node.left_child, x)
        else:
            return self.predict_helper(my_node.right_child, x)

    def predict(self, x):
        """Return a list with the predicted label for each row of ``x``."""
        y_prediction = [self.predict_helper(self.root, i) for i in x]
        return y_prediction

In [4]:
# Load the training images, flattening each one into a 1-D pixel vector.
# "person" is the positive class (label 1); car, dog and airplane are
# negatives (label 0). Labels are generated from the number of files actually
# read, so x_train and y_train always stay aligned (the previous code assumed
# exactly 500 person images followed by 1500 others).
train_sources = [
    ("data/train/person/*.png", 1),
    ("data/train/car/*.png", 0),
    ("data/train/dog/*.png", 0),
    ("data/train/airplane/*.png", 0),
]
cv_img = []
label_chunks = []
for pattern, label in train_sources:
    # glob returns files in arbitrary order; sort for a reproducible row order
    path = sorted(glob.glob(pattern))
    for img in path:
        pixels = cv.imread(img)
        # cv.imread signals an unreadable file by returning None, not by raising
        if pixels is None:
            raise OSError(f"Could not read image file: {img}")
        cv_img.append(pixels.flatten())
    label_chunks.append(np.full(len(path), label, dtype=np.float64))
if not cv_img:
    raise FileNotFoundError("No training images found under data/train/")
x_train = np.array(cv_img)
y_train = np.hstack(label_chunks)

In [5]:
# Load the validation images, flattening each one into a 1-D pixel vector.
# "person" is the positive class (label 1); car, dog and airplane are
# negatives (label 0). Labels are generated from the number of files actually
# read, so x_validation and y_validation always stay aligned (the previous
# code assumed exactly 100 person images followed by 300 others).
validation_sources = [
    ("data/validation/person/*.png", 1),
    ("data/validation/car/*.png", 0),
    ("data/validation/dog/*.png", 0),
    ("data/validation/airplane/*.png", 0),
]
cv_img = []
label_chunks = []
for pattern, label in validation_sources:
    # glob returns files in arbitrary order; sort for a reproducible row order
    path = sorted(glob.glob(pattern))
    for img in path:
        pixels = cv.imread(img)
        # cv.imread signals an unreadable file by returning None, not by raising
        if pixels is None:
            raise OSError(f"Could not read image file: {img}")
        cv_img.append(pixels.flatten())
    label_chunks.append(np.full(len(path), label, dtype=np.float64))
if not cv_img:
    raise FileNotFoundError("No validation images found under data/validation/")
x_validation = np.array(cv_img)
y_validation = np.hstack(label_chunks)

In [None]:
# Grow a tree with the default stopping criteria on the training set, then
# label every validation image with it.
model = DecisionTreeClassifier()
model.fit(x=x_train, y=y_train)
y_validation_predicted = model.predict(x=x_validation)

In [None]:
# Accuracy, precision and recall on the validation set, as percentages.
# Only the `metrics` submodule is imported (`from sklearn import metrics`), so
# it must be referenced as `metrics.*`; the bare name `sklearn` is not bound.
print(metrics.accuracy_score(y_validation, y_validation_predicted)*100)
print(metrics.precision_score(y_validation, y_validation_predicted)*100)
print(metrics.recall_score(y_validation, y_validation_predicted)*100)