In [17]:
# we will code a descision tree
import numpy as np
import pandas as pd

dataset = pd.read_csv('Iris.csv')
Y= dataset['Species']
unique_labels = Y.unique()
label_to_int = {label: i for i, label in enumerate(unique_labels)}
Y_encoded = Y.replace(label_to_int)
X = dataset.drop('Species', axis=1)

class Node():
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class DecisionTreeClassifier():
    def __init__(self, max_depth=5, min_samples_split=2, min_samples_leaf=1, min_impurity_decrease=0.0):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_impurity_decrease = min_impurity_decrease
        self.root = None

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)
        
    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))
        #print(n_samples, n_features, n_labels)
        if (depth >= self.max_depth) or (n_labels == 1) or (n_samples < self.min_samples_split):
            return Node(value=np.bincount(y).argmax())
        else:
            best_feature, best_threshold = self._find_best_split(X, y)
            left_indices = X[:, best_feature] < best_threshold
            right_indices = X[:, best_feature] >= best_threshold
            left = self._grow_tree(X[left_indices], y[left_indices], depth+1)
            right = self._grow_tree(X[right_indices], y[right_indices], depth+1)
            return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)
    
    def _find_best_split(self, X, y):
        n_samples, n_features = X.shape
        best_feature = None
        best_threshold = None
        best_impurity = 1.0
        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                y_left = y[X[:, feature] < threshold]
                y_right = y[X[:, feature] >= threshold]
                impurity = self._gini_impurity(y_left, y_right)
                if impurity < best_impurity:
                    best_impurity = impurity
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold
    
    def _gini_impurity(self, y_left, y_right):
        n_left = len(y_left)
        n_right = len(y_right)
        n_total = n_left + n_right
        gini_left = 1.0 - (np.sum(np.square(np.bincount(y_left)/n_left)) if n_left != 0 else 0)
        gini_right = 1.0 - (np.sum(np.square(np.bincount(y_right)/n_right)) if n_right != 0 else 0)
        gini = (n_left/n_total)*gini_left + (n_right/n_total)*gini_right
        return gini
    
    def predict(self, X):
        return self._predict(X, self.root)
        
    def score(self, X, y):
        y_pred = self.predict(X)
        return np.sum(y_pred == y)/len(y)
    
    def _predict(self, X, node):
        if node.value is not None:
            return np.array([node.value]*len(X))
        else:
            left_indices = X[:, node.feature] < node.threshold
            right_indices = X[:, node.feature] >= node.threshold
            y_left = self._predict(X[left_indices], node.left)
            y_right = self._predict(X[right_indices], node.right)
            return np.concatenate([y_left, y_right])

    def print_tree(self):
        self._print_tree(self.root)

    def _print_tree(self, node, depth=0):
        if node.value is not None:
            print(" "*depth, "value: ", node.value)
        else:
            print(" "*depth, "feature: ", node.feature, "threshold: ", node.threshold)
            self._print_tree(node.left, depth+1)
            self._print_tree(node.right, depth+1)
    
    
DecisionTreeClassifier = DecisionTreeClassifier( max_depth=5, min_samples_split=2, min_samples_leaf=1, min_impurity_decrease=0.0)

DecisionTreeClassifier.fit(X.values, Y_encoded)
DecisionTreeClassifier.print_tree()

 feature:  0 threshold:  51.0
  value:  0
  feature:  0 threshold:  101.0
   value:  1
   value:  2


In [None]:
a = np.array([[1, 2, 3], [4, 5, 6]])
a

array([[1, 2, 3],
       [4, 5, 6]])

In [9]:
import pandas as pd
dataset = pd.read_csv('Iris.csv')
X

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object