In [15]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn import model_selection

In [16]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature  # la caractéristique utilisée pour la division
        self.threshold = threshold  # le seuil utilisé pour la division
        self.left = left  # le sous-arbre gauche
        self.right = right  # le sous-arbre droit
        self.value = value  # la prédiction de la feuille

In [33]:
class DecisionTreeClassifier:

    def __init__(self, max_depth=None):
        self.max_depth = max_depth  # la profondeur maximale de l'arbre
        self.root = None  # la racine de l'arbre

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        return [self._predict(x, self.root) for x in X]
    
    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(set(y))

        # Arrêter la construction de l'arbre si la profondeur maximale est atteinte
        if self.max_depth is not None and depth >= self.max_depth:
            value = self._most_common_label(y)
            return Node(value=value)

        # Arrêter la construction de l'arbre si toutes les étiquettes sont identiques
        if len(set(y)) == 1:
            return Node(value=y[0])

        # Trouver la meilleure caractéristique et le meilleur seuil pour la division
        best_feature, best_threshold = self._best_split(X, y, n_samples, n_features, n_classes)

        # Diviser les données en fonction de la meilleure caractéristique et du meilleur seuil
        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)

        # Construire le sous-arbre gauche et le sous-arbre droit
        left = self._build_tree(X[left_indices, :], y[left_indices], depth+1)
        right = self._build_tree(X[right_indices, :], y[right_indices], depth+1)

        # Retourner le nœud de décision
        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _gini_impurity(self, y, n_classes):
        # Calcul de la gini impurity
        return 1.0 - sum((np.sum(y == c) / len(y)) ** 2 for c in range(n_classes))

    def _best_split(self, X, y, n_samples, n_features, n_classes):
        best_gini = float('inf')
        best_feature = None
        best_threshold = None

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])

            for threshold in thresholds:
                left_indices, right_indices = self._split(X[:, feature], threshold)
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gini = (len(left_indices) / n_samples) * self._gini_impurity(y[left_indices],n_classes) + \
                     (len(right_indices) / n_samples) * self._gini_impurity(y[right_indices],n_classes)

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _split(self, feature, threshold):
        left_indices = np.where(feature <= threshold)[0]
        right_indices = np.where(feature > threshold)[0]
        return left_indices, right_indices

    def _most_common_label(self, y):
        # Trouver la classe 'pincipale' de la région
        return sorted([(c, np.sum(y == c)) for c in set(y)], key=lambda x: x[1])[-1][0]

    def _predict(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict(x, node.left)
        else:
            return self._predict(x, node.right)

In [36]:
iris = load_iris()
X_train, X_test, y_train, y_test = model_selection.train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
print('len of X_train: ',len(X_train),' ; len of X_test: ',len(X_test))

len of X_train:  120  ; len of X_test:  30


In [37]:
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X_train, y_train)

predictions = tree.predict(X_test)
print(predictions)
accuracy = np.mean(predictions == y_test)

print(f"Accuracy: {accuracy}")

[1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]
Accuracy: 0.9666666666666667


In [6]:
# Classification case
class DecisionTreeClassifier:
    
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
    
    def fit(self, X, y):
        self.n_classes_ = len(np.unique(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)
    
    def predict(self, X):
        return [self._predict(inputs) for inputs in X]
    
    def _grow_tree(self, X, y, depth=0):
        n_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(n_samples_per_class)
        
        node = {'predicted_class': predicted_class}
        if depth < self.max_depth:
            feature, threshold = self._best_split(X, y)
            if feature is not None:
                indices_left = X[:, feature] < threshold
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node['feature'] = feature
                node['threshold'] = threshold
                node['left'] = self._grow_tree(X_left, y_left, depth+1)
                node['right'] = self._grow_tree(X_right, y_right, depth+1)
        return node
    
    def _best_split(self, X, y):
        best_gini = 1.0 - np.sum([(np.sum(y == c) / y.size) ** 2 for c in range(self.n_classes_)])
        best_feature, best_threshold = None, None
        for feature in range(self.n_features_):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                y_left = y[X[:, feature] < threshold]
                y_right = y[X[:, feature] >= threshold]
                if y_left.size == 0 or y_right.size == 0:
                    continue
                gini_left = 1.0 - np.sum([(np.sum(y_left == c) / y_left.size) ** 2 for c in range(self.n_classes_)])
                gini_right = 1.0 - np.sum([(np.sum(y_right == c) / y_right.size) ** 2 for c in range(self.n_classes_)])
                gini = (y_left.size / y.size) * gini_left + (y_right.size / y.size) * gini_right
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold
    
    def _predict(self, inputs):
        node = self.tree_
        while 'predicted_class' not in node:
            if inputs[node['feature']] < node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['predicted_class']

In [11]:
clf = DecisionTreeClassifier(max_depth = 5)
clf.fit(X_train,y_train)
print(clf._predict(X_test))

2


In [None]:
class DecisionTreeRegressor:
    
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
    
    def fit(self, X, y):
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)
    
    def predict(self, X):
        return [self._predict(inputs) for inputs in X]
    
    def _grow_tree(self, X, y, depth=0):
        var_y = np.var(y)
        node = {'value': np.mean(y)}
        if depth < self.max_depth and var_y != 0:
            feature, threshold = self._best_split(X, y, var_y)
            if feature is not None:
                indices_left = X[:, feature] < threshold
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node['feature'] = feature
                node['threshold'] = threshold
                node['left'] = self._grow_tree(X_left, y_left, depth+1)
                node['right'] = self._grow_tree(X_right, y_right, depth+1)
        return node
    
    def _best_split(self, X, y, var_y):
        best_reduction = 0
        best_feature, best_threshold = None, None
        for feature in range(self.n_features_):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                y_left = y[X[:, feature] < threshold]
                y_right = y[X[:, feature] >= threshold]
                if y_left.size == 0 or y_right.size == 0:
                    continue
                var_left = np.var(y_left)
                var_right = np.var(y_right)
                reduction = var_y - ((y_left.size / y.size) * var_left + (y_right.size / y.size) * var_right)
                if reduction > best_reduction:
                    best_reduction = reduction
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold
    
    def _predict(self, inputs):
        node = self.tree_
        while 'value' not in node:
            if inputs[node['feature']] < node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['value']