In [9]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import cv2 as cv
import glob
from sklearn import tree
import sklearn
import time
from sklearn.feature_selection import SelectKBest
import pygraphviz
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import multilabel_confusion_matrix
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

In [2]:
class Node:
    '''
    Helper class which implements a single tree node.

    The same class represents both kinds of node: an internal node carries the
    split description and its two children, a leaf carries only ``value``.
    Every field defaults to None, so the unused ones simply stay None.

    :param feature: index of the feature column the node splits on
    :param threshold: split threshold for that feature
    :param data_left: child node reached on the left side of the split
    :param data_right: child node reached on the right side of the split
    :param gain: information gain achieved by the split
    :param value: predicted label; meant to be set for leaf nodes only
    '''
    def __init__(self, feature=None, threshold=None, data_left=None, data_right=None, gain=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.data_left = data_left
        self.data_right = data_right
        self.gain = gain
        self.value = value

    @property
    def left_child(self):
        '''Read-only alias for ``data_left``, for traversal code that uses the child naming.'''
        return self.data_left

    @property
    def right_child(self):
        '''Read-only alias for ``data_right``, for traversal code that uses the child naming.'''
        return self.data_right

In [32]:
class DecisionTree:
    '''
    Binary decision-tree classifier grown greedily by maximising information gain.

    Targets are expected to be the labels 0 and 1: leaves are labelled by
    find_maxcount, which only distinguishes "0" from "not 0".

    :param min_samples_split: a node holding fewer rows than this becomes a leaf
    :param max_depth: nodes deeper than this are never split; a node at exactly
                      max_depth may still be split, so leaves can sit one level below
    '''

    def __init__(self, min_samples_split=7, max_depth=10):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        # Root Node of the fitted tree; stays None until fit() has been called
        self.root = None

    def calc_entropy(self, y):
        '''
        Entropy (in bits) of a collection of class labels.

        :param y: np.array or list, non-negative integer-valued labels
        :return: entropy of the label distribution; 0 for an empty input
        '''
        n_labels = len(y)
        if n_labels == 0:
            return 0

        # np.bincount counts the number of elements in each class
        number_of_each_class = np.bincount(np.array(y, dtype=np.int64))

        # Relative frequency of each class
        probabilities = number_of_each_class / n_labels

        entropy = 0
        for prob in probabilities:
            # Classes that never occur contribute nothing (log2(0) is undefined)
            if prob > 0:
                entropy -= np.log2(prob) * prob

        return entropy

    def information_gain(self, left, right, parent):
        '''
        Reduction in entropy obtained by splitting ``parent`` into ``left`` and ``right``.

        :param left: labels of the rows sent to the left child
        :param right: labels of the rows sent to the right child
        :param parent: labels of all rows before the split
        :return: parent entropy minus the size-weighted child entropies; 0 if parent is empty
        '''
        p = len(parent)
        if p == 0:
            return 0

        # Fraction of the parent's rows that end up on each side
        l_prob = len(left) / p
        r_prob = len(right) / p

        gain = self.calc_entropy(parent) - r_prob * self.calc_entropy(right) - l_prob * self.calc_entropy(left)

        return gain

    def _best_split(self, X, y):
        '''
        Helper function, calculates the best split for given features and target

        Every unique value of every feature is tried as a threshold; rows with
        value <= threshold go left, rows with value > threshold go right.

        :param X: np.array, features
        :param y: np.array or list, target
        :return: dict with keys 'feature_index', 'threshold', 'df_left',
                 'df_right' and 'gain'; empty when no threshold leaves rows
                 on both sides
        '''
        X = np.asarray(X)
        y = np.asarray(y)
        best_split = {}
        best_info_gain = -1
        n_cols = X.shape[1]

        # Features and target side by side (target in the last column). Built
        # once: it does not depend on the feature or threshold being tried.
        df = np.concatenate((X, y.reshape(-1, 1)), axis=1)
        y_parent = df[:, -1]

        # For every dataset feature
        for f_idx in range(n_cols):
            X_curr = X[:, f_idx]
            # For every unique value of that feature
            for threshold in np.unique(X_curr):
                # Two explicit masks (rather than one and its negation) so a
                # row that compares False both ways lands in neither subset
                left_mask = X_curr <= threshold
                right_mask = X_curr > threshold

                # Do the calculation only if there's data in both subsets
                if not left_mask.any() or not right_mask.any():
                    continue

                df_left = df[left_mask]
                df_right = df[right_mask]

                # Keep the split if it beats the best one seen so far
                gain = self.information_gain(df_left[:, -1], df_right[:, -1], y_parent)
                if gain > best_info_gain:
                    best_split = {
                        'feature_index': f_idx,
                        'threshold': threshold,
                        'df_left': df_left,
                        'df_right': df_right,
                        'gain': gain
                    }
                    best_info_gain = gain
        return best_split

    def find_maxcount(self, y):
        '''
        Majority label of a binary target.

        :param y: np.array or list, target values
        :return: 0 if zeros strictly outnumber all other values, otherwise 1
                 (ties and empty input give 1)
        '''
        labels = np.asarray(y)
        zero_count = int(np.count_nonzero(labels == 0))
        one_count = labels.size - zero_count

        if zero_count > one_count:
            return 0
        return 1

    def _build(self, X, y, depth=0):
        '''
        Helper recursive function, used to build a decision tree from the input data.

        :param X: np.array, features
        :param y: np.array or list, target
        :param depth: current depth of a tree, used as a stopping criteria
        :return: Node
        '''
        n_rows = X.shape[0]

        # Only try to split nodes that are big enough and not too deep
        if n_rows >= self.min_samples_split and depth <= self.max_depth:
            best = self._best_split(X, y)
            # best is empty when no threshold separates the rows, and its gain
            # is 0 when the node is already pure; both cases become a leaf
            if best.get('gain', 0) > 0:
                left = self._build(
                    X=best['df_left'][:, :-1],
                    y=best['df_left'][:, -1],
                    depth=depth + 1
                )
                right = self._build(
                    X=best['df_right'][:, :-1],
                    y=best['df_right'][:, -1],
                    depth=depth + 1
                )
                return Node(
                    feature=best['feature_index'],
                    threshold=best['threshold'],
                    data_left=left,
                    data_right=right,
                    gain=best['gain']
                )
        # Leaf node - value is the most common target value
        return Node(
            value=self.find_maxcount(y)
        )

    def fit(self, X, y):
        '''
        Function used to train a decision tree classifier model.

        :param X: np.array, features
        :param y: np.array or list, target
        :return: None
        '''
        # Call a recursive function to build the tree
        self.root = self._build(np.asarray(X), np.asarray(y))

    def predict_helper(self, my_node, x):
        '''
        Walk the tree from ``my_node`` down to a leaf for a single observation.

        :param my_node: Node to start from
        :param x: one row of features
        :return: value stored in the leaf that is reached
        '''
        # value is set only on leaf nodes; identity test because 0 is a valid label
        if my_node.value is not None:
            return my_node.value

        # Take the feature this node splits on
        compare_this = x[my_node.feature]

        # Same rule as in training: <= threshold goes left, otherwise right
        if compare_this <= my_node.threshold:
            return self.predict_helper(my_node.data_left, x)
        return self.predict_helper(my_node.data_right, x)

    def predict(self, x):
        '''
        Predict a label for every row of ``x``.

        :param x: np.array or iterable of feature rows
        :return: list with one predicted label per row
        :raises RuntimeError: if fit() has not been called yet
        '''
        if self.root is None:
            raise RuntimeError("DecisionTree must be fitted before calling predict")
        y_prediction = [self.predict_helper(self.root, i) for i in x]
        return y_prediction

In [26]:
# Binary task: "person" images are the positive class (label 1) and every other
# class is negative (label 0). A label is appended for each image as it is
# loaded, so x_train and y_train always have the same number of rows instead of
# relying on hard-coded per-class image counts.
cv_img = []
train_labels = []
for class_name in ("person", "car", "dog", "airplane"):
    label = 1.0 if class_name == "person" else 0.0
    # sorted() keeps the row order reproducible; glob's own order is arbitrary
    for img in sorted(glob.glob("data/train/" + class_name + "/*.png")):
        pixels = cv.imread(img)
        if pixels is None:
            # cv.imread reports an unreadable file by returning None
            raise OSError("could not read image: " + img)
        # One flat pixel vector per image
        cv_img.append(pixels.flatten())
        train_labels.append(label)
x_train = np.array(cv_img)
y_train = np.array(train_labels, dtype=np.float64)

In [27]:
# Validation split, labelled the same way as training: "person" is the positive
# class (label 1), every other class is negative (label 0). Labels are appended
# per loaded image, so x_validation and y_validation always have matching
# lengths instead of relying on hard-coded per-class image counts.
cv_img = []
validation_labels = []
for class_name in ("person", "car", "dog", "airplane"):
    label = 1.0 if class_name == "person" else 0.0
    # sorted() keeps the row order reproducible; glob's own order is arbitrary
    for img in sorted(glob.glob("data/validation/" + class_name + "/*.png")):
        pixels = cv.imread(img)
        if pixels is None:
            # cv.imread reports an unreadable file by returning None
            raise OSError("could not read image: " + img)
        # One flat pixel vector per image
        cv_img.append(pixels.flatten())
        validation_labels.append(label)
x_validation = np.array(cv_img)
y_validation = np.array(validation_labels, dtype=np.float64)

In [28]:
# Fit the selector on the training split only, then keep the same 10 columns
# in both the training and the validation features.
selection = SelectKBest(k=10)
selection.fit(x_train, y_train)
x_train_new, x_val_new = (
    selection.transform(features) for features in (x_train, x_validation)
)

In [29]:
# Train the hand-written tree (default hyper-parameters) on the selected features
model = DecisionTree()
model.fit(X=x_train_new, y=y_train)


NameError: name 'x_validation_new' is not defined

In [33]:
# One predicted label per validation row, using the same selected features as training
y_validation_predicted = model.predict(x=x_val_new)

In [34]:
# Accuracy, precision and recall on the validation split, printed as percentages
for score_fn in (
    sklearn.metrics.accuracy_score,
    sklearn.metrics.precision_score,
    sklearn.metrics.recall_score,
):
    print(score_fn(y_validation, y_validation_predicted)*100)

83.75
64.95726495726495
76.0
