In [59]:
import numpy as np
from sklearn.datasets import load_iris,load_boston
from sklearn import model_selection
from sklearn.metrics import mean_squared_error

In [50]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature  # la caractéristique utilisée pour la division
        self.threshold = threshold  # le seuil utilisé pour la division
        self.left = left  # le sous-arbre gauche
        self.right = right  # le sous-arbre droit
        self.value = value  # la prédiction de la feuille

In [51]:
class DecisionTreeClassifierFromScratch:

    def __init__(self, max_depth=None):
        self.max_depth = max_depth  # la profondeur maximale de l'arbre
        self.root = None  # la racine de l'arbre

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        return [self._predict(x, self.root) for x in X]
    
    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(set(y))

        # Arrêter la construction de l'arbre si la profondeur maximale est atteinte
        if self.max_depth is not None and depth >= self.max_depth:
            value = self._most_common_label(y)
            return Node(value=value)

        # Arrêter la construction de l'arbre si toutes les étiquettes sont identiques
        if len(set(y)) == 1:
            return Node(value=y[0])

        # Trouver la meilleure caractéristique et le meilleur seuil pour la division
        best_feature, best_threshold = self._best_split(X, y, n_samples, n_features, n_classes)

        # Diviser les données en fonction de la meilleure caractéristique et du meilleur seuil
        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)

        # Construire le sous-arbre gauche et le sous-arbre droit
        left = self._build_tree(X[left_indices, :], y[left_indices], depth+1)
        right = self._build_tree(X[right_indices, :], y[right_indices], depth+1)

        # Retourner le nœud de décision
        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _gini_impurity(self, y, n_classes):
        # Calcul de la gini impurity
        return 1.0 - sum((np.sum(y == c) / len(y)) ** 2 for c in range(n_classes))

    def _best_split(self, X, y, n_samples, n_features, n_classes):
        best_gini = float('inf')
        best_feature = None
        best_threshold = None

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])

            for threshold in thresholds:
                left_indices, right_indices = self._split(X[:, feature], threshold)
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gini = (len(left_indices) / n_samples) * self._gini_impurity(y[left_indices],n_classes) + \
                     (len(right_indices) / n_samples) * self._gini_impurity(y[right_indices],n_classes)

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _split(self, feature, threshold):
        left_indices = np.where(feature <= threshold)[0]
        right_indices = np.where(feature > threshold)[0]
        return left_indices, right_indices

    def _most_common_label(self, y):
        # Trouver la classe 'pincipale' de la région
        return sorted([(c, np.sum(y == c)) for c in set(y)], key=lambda x: x[1])[-1][0]

    def _predict(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict(x, node.left)
        else:
            return self._predict(x, node.right)

In [52]:
# In the case of classification we chose the IRIS dataset
iris = load_iris()
X_train, X_test, y_train, y_test = model_selection.train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
print('len of X_train: ',len(X_train),' ; len of X_test: ',len(X_test))

len of X_train:  120  ; len of X_test:  30


In [53]:
# From scratch predictions and accuracy
tree = DecisionTreeClassifierFromScratch(max_depth=3)
tree.fit(X_train, y_train)

predictions = tree.predict(X_test)
print(predictions)
print(list(y_test))
accuracy = np.mean(predictions == y_test)

print(f"Accuracy: {accuracy}")
# One mistake on the 5th element

[1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]
[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]
Accuracy: 0.9666666666666667


In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# With sklearn package

clf = DecisionTreeClassifier(random_state=42,max_depth=3)
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=42)

In [55]:
y_pred = clf.predict(X_test)
print(y_pred)
print(y_test)
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:', accuracy)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Accuracy: 1.0


In [57]:
class DecisionTreeRegressorFromScratch:

    def __init__(self, max_depth=None):
        self.max_depth = max_depth  # la profondeur maximale de l'arbre
        self.root = None  # la racine de l'arbre

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        return [self._predict(x, self.root) for x in X]
    
    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape

        # Arrêter la construction de l'arbre si la profondeur maximale est atteinte
        if self.max_depth is not None and depth >= self.max_depth:
            value = np.mean(y)
            return Node(value=value)

        # Arrêter la construction de l'arbre si toutes les étiquettes sont identiques
        if np.all(y == y[0]):
            return Node(value=y[0])

        # Trouver la meilleure caractéristique et le meilleur seuil pour la division
        best_feature, best_threshold = self._best_split(X, y, n_samples, n_features)

        # Diviser les données en fonction de la meilleure caractéristique et du meilleur seuil
        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)

        # Construire le sous-arbre gauche et le sous-arbre droit
        left = self._build_tree(X[left_indices, :], y[left_indices], depth+1)
        right = self._build_tree(X[right_indices, :], y[right_indices], depth+1)

        # Retourner le nœud de décision
        return Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _mse(self, y):
        # Calculer la mean squared error
        return np.mean((y - np.mean(y)) ** 2)

    def _best_split(self, X, y, n_samples, n_features):
        best_mse = float('inf')
        best_feature = None
        best_threshold = None

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])

            for threshold in thresholds:
                left_indices, right_indices = self._split(X[:, feature], threshold)
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                mse = (len(left_indices) / n_samples) * self._mse(y[left_indices]) + \
                      (len(right_indices) / n_samples) * self._mse(y[right_indices])

                if mse < best_mse:
                    best_mse = mse
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _split(self, feature, threshold):
        left_indices = np.where(feature <= threshold)[0]
        right_indices = np.where(feature > threshold)[0]
        return left_indices, right_indices
    
    def _predict(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict(x, node.left)
        else:
            return self._predict(x, node.right)

In [None]:
# In the case of regression we took the Boston Housing dataset
# Charger le dataset Boston Housing
X, y = load_boston(return_X_y=True)

# Diviser le dataset en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
# Entraîner un arbre de décision de régression
tree = DecisionTreeRegressorFromScratch(max_depth=5)
tree.fit(X_train, y_train)

# Prédire sur l'ensemble de test
y_pred = tree.predict(X_test)
print(y_pred)

# Calculer l'erreur quadratique moyenne
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error:", mse)

[22.726190476190474, 30.05, 19.733333333333334, 20.355555555555554, 15.910526315789474, 22.726190476190474, 15.910526315789474, 15.910526315789474, 22.726190476190474, 20.355555555555554, 19.733333333333334, 19.733333333333334, 9.203703703703704, 22.726190476190474, 20.355555555555554, 24.45, 19.733333333333334, 9.203703703703704, 44.65555555555556, 14.135, 22.726190476190474, 22.726190476190474, 15.910526315789474, 25.985185185185188, 14.135, 14.135, 20.355555555555554, 14.135, 15.910526315789474, 20.355555555555554, 19.733333333333334, 22.726190476190474, 10.4, 20.355555555555554, 15.910526315789474, 15.910526315789474, 24.45, 20.355555555555554, 22.299999999999997, 22.726190476190474, 19.6, 25.985185185185188, 44.65555555555556, 20.355555555555554, 22.726190476190474, 14.135, 15.910526315789474, 22.726190476190474, 15.910526315789474, 30.05, 20.355555555555554, 33.1, 15.910526315789474, 25.985185185185188, 44.65555555555556, 22.726190476190474, 15.910526315789474, 30.05, 22.72619047

In [62]:
from sklearn.tree import DecisionTreeRegressor

# With sklearn package

dt = DecisionTreeRegressor(max_depth=5, random_state=42)
dt.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = dt.predict(X_test)

# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 8.553906584646844
