<a href="https://colab.research.google.com/github/adamd1985/Lectures_On_MLAI/blob/main/4_7_DecisionTrees_Lecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decision Trees

##Initialization

In [1]:
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, prediction=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.prediction = prediction  # This is only used if the node is a leaf.

    def is_leaf(self):
        return self.prediction is not None


In [3]:
class DecisionTree:
    def __init__(self, max_depth=None, cost_function=None):
        self.max_depth = max_depth
        self.cost_function = cost_function
        self.root = None  # Root node of the tree.

    def fit(self, X, y):
        """Starts building the decision tree."""
        print("\nStarting to build the Decision Tree...\n")
        self.root = self._build_tree(X, y, depth=0)
        print("\nFinished building the Decision Tree!\n")

    def _build_tree(self, X, y, depth):
        """Recursively builds the decision tree."""
        num_samples, num_features = X.shape
        unique_classes = np.unique(y)

        # Check stopping conditions
        if len(unique_classes) == 1 or (self.max_depth and depth >= self.max_depth):
            leaf_class = Counter(y).most_common(1)[0][0]
            print(f"{'  ' * depth}Leaf node created at depth {depth}: Class = {leaf_class}")
            return Node(prediction=leaf_class)

        # Find the best split
        best_feature, best_threshold, feature_scores = self._best_split(X, y)
        if best_feature is None:
            leaf_class = Counter(y).most_common(1)[0][0]
            print(f"{'  ' * depth}Leaf node created at depth {depth}: Class = {leaf_class}")
            return Node(prediction=leaf_class)

        # Print feature scores before selecting the best one
        print(f"{'  ' * depth}Feature evaluation at depth {depth}:")
        for feat, (thresh, score) in feature_scores.items():
            print(f"{'  ' * depth}  Feature {feat} -> Best Threshold: {thresh:.4f}, Score: {score:.6f}")

        print(f"{'  ' * depth}Depth {depth}: Selected Feature {best_feature} at Threshold {best_threshold}")

        # Split dataset
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = ~left_indices

        # Recursively build left and right subtrees
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def _best_split(self, X, y):
        """Finds the best feature and threshold to split the dataset, while logging scores."""
        num_samples, num_features = X.shape
        best_gain = -1
        best_feature, best_threshold = None, None
        feature_scores = {}

        for feature in range(num_features):
            thresholds = np.unique(X[:, feature])
            best_feature_score = -1
            best_feature_threshold = None

            for threshold in thresholds:
                left_y = y[X[:, feature] <= threshold]
                right_y = y[X[:, feature] > threshold]

                if len(left_y) == 0 or len(right_y) == 0:
                    continue

                gain = self._information_gain(y, left_y, right_y)
                if gain > best_feature_score:
                    best_feature_score = gain
                    best_feature_threshold = threshold

                if gain > best_gain:
                    best_gain, best_feature, best_threshold = gain, feature, threshold

            # Store the best threshold and gain score for this feature
            feature_scores[feature] = (best_feature_threshold, best_feature_score)

        return best_feature, best_threshold, feature_scores

    def _information_gain(self, y, left_y, right_y):
        """Computes information gain based on the chosen cost function."""
        weight_left = len(left_y) / len(y)
        weight_right = len(right_y) / len(y)

        return self.cost_function(y) - (weight_left * self.cost_function(left_y) + weight_right * self.cost_function(right_y))

    def predict(self, X):
        """Predicts the class labels for a given dataset."""
        return np.array([self._traverse_tree(sample, self.root) for sample in X])

    def _traverse_tree(self, sample, node):
        """Recursively traverses the tree to make predictions."""
        if node.is_leaf():
            return node.prediction
        if sample[node.feature] <= node.threshold:
            return self._traverse_tree(sample, node.left)
        else:
            return self._traverse_tree(sample, node.right)


In [4]:
def gini_impurity(y):
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return 1 - np.sum(probabilities ** 2)

def entropy(y):
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities + 1e-10))

In [5]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
tree = DecisionTree(max_depth=3, cost_function=gini_impurity)
tree.fit(X_train, y_train)

predictions = tree.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, predictions))
print("Decision Tree Classification Report:\n", classification_report(y_test, predictions))


Starting to build the Decision Tree...

Feature evaluation at depth 0:
  Feature 0 -> Best Threshold: 5.4000, Score: 0.226456
  Feature 1 -> Best Threshold: 3.3000, Score: 0.135683
  Feature 2 -> Best Threshold: 1.9000, Score: 0.333403
  Feature 3 -> Best Threshold: 0.6000, Score: 0.333403
Depth 0: Selected Feature 2 at Threshold 1.9
  Leaf node created at depth 1: Class = 0
  Feature evaluation at depth 1:
    Feature 0 -> Best Threshold: 6.1000, Score: 0.091072
    Feature 1 -> Best Threshold: 2.4000, Score: 0.035916
    Feature 2 -> Best Threshold: 4.7000, Score: 0.364898
    Feature 3 -> Best Threshold: 1.7000, Score: 0.364291
  Depth 1: Selected Feature 2 at Threshold 4.7
    Feature evaluation at depth 2:
      Feature 0 -> Best Threshold: 4.9000, Score: 0.025566
      Feature 1 -> Best Threshold: 2.5000, Score: 0.003453
      Feature 2 -> Best Threshold: 4.4000, Score: 0.003453
      Feature 3 -> Best Threshold: 1.6000, Score: 0.052593
    Depth 2: Selected Feature 3 at Thresho

In [6]:
class PrunedDecisionTree(DecisionTree):
    def __init__(self, max_depth=None, cost_function=None, alpha=0.01):
        """
        Pruned Decision Tree constructor.

        :param max_depth: Maximum depth of the tree.
        :param cost_function: A function to compute impurity (e.g., Gini or Entropy).
        :param alpha: Regularization parameter for pruning.
        """
        super().__init__(max_depth, cost_function)
        self.alpha = alpha  # Regularization parameter for pruning

    def _evaluate_accuracy(self, X_val, y_val):
        """Computes accuracy of the tree on validation data."""
        predictions = self.predict(X_val)
        return np.mean(predictions == y_val)


    def prune(self, X_val, y_val):
        """
        Prunes the tree using cost-complexity pruning.

        :param X_val: Validation feature set for pruning evaluation.
        :param y_val: Validation labels.
        """
        print("\nStarting pruning process...\n")
        self._prune_tree(self.root, X_val, y_val, depth=0)
        print("\nPruning completed!\n")

    def _prune_tree(self, node, X_val, y_val, depth):
      """
      Recursively prunes the tree using validation data.

      :param node: The current node in the tree.
      :param X_val: Validation feature set.
      :param y_val: Validation labels.
      :param depth: Depth of the current node in the tree.
      """
      if node.is_leaf():
          return

      # Prune left and right children recursively
      self._prune_tree(node.left, X_val, y_val, depth + 1)
      self._prune_tree(node.right, X_val, y_val, depth + 1)

      # Check if the node can be pruned
      if node.left.is_leaf() and node.right.is_leaf():
          # Compute validation accuracy before pruning
          pre_prune_accuracy = self._evaluate_accuracy(X_val, y_val)

          # Save the original node structure
          left_node, right_node = node.left, node.right

          # Convert node into a leaf (majority class)
          node.prediction = Counter(y_val).most_common(1)[0][0]
          node.left = None
          node.right = None

          # Compute validation accuracy after pruning
          post_prune_accuracy = self._evaluate_accuracy(X_val, y_val)

          # Apply pruning only if accuracy does NOT drop significantly
          if post_prune_accuracy < pre_prune_accuracy - self.alpha:
              # Revert pruning
              node.prediction = None
              node.left = left_node
              node.right = right_node
              print(f"{'  ' * depth}Retained split at depth {depth}, feature {node.feature}, threshold {node.threshold}")
          else:
              print(f"{'  ' * depth}Pruned subtree at depth {depth}, feature {node.feature}, threshold {node.threshold}")


In [7]:
from sklearn.model_selection import train_test_split

# Load and split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train tree with Gini impurity
tree = PrunedDecisionTree(max_depth=3, cost_function=gini_impurity, alpha=0.01)
tree.fit(X_train, y_train)

# Prune the tree
tree.prune(X_val, y_val)

# Predict and evaluate
predictions = tree.predict(X_val)
accuracy = np.mean(predictions == y_val)
print(f"Final Accuracy after pruning: {accuracy:.4f}")



Starting to build the Decision Tree...

Feature evaluation at depth 0:
  Feature 0 -> Best Threshold: 5.4000, Score: 0.226456
  Feature 1 -> Best Threshold: 3.3000, Score: 0.135683
  Feature 2 -> Best Threshold: 1.9000, Score: 0.333403
  Feature 3 -> Best Threshold: 0.6000, Score: 0.333403
Depth 0: Selected Feature 2 at Threshold 1.9
  Leaf node created at depth 1: Class = 0
  Feature evaluation at depth 1:
    Feature 0 -> Best Threshold: 6.1000, Score: 0.091072
    Feature 1 -> Best Threshold: 2.4000, Score: 0.035916
    Feature 2 -> Best Threshold: 4.7000, Score: 0.364898
    Feature 3 -> Best Threshold: 1.7000, Score: 0.364291
  Depth 1: Selected Feature 2 at Threshold 4.7
    Feature evaluation at depth 2:
      Feature 0 -> Best Threshold: 4.9000, Score: 0.025566
      Feature 1 -> Best Threshold: 2.5000, Score: 0.003453
      Feature 2 -> Best Threshold: 4.4000, Score: 0.003453
      Feature 3 -> Best Threshold: 1.6000, Score: 0.052593
    Depth 2: Selected Feature 3 at Thresho

In [8]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error

class RegressionTree:
    def __init__(self, max_depth=5):
        """
        Regression Tree constructor.

        :param max_depth: Maximum depth of the tree.
        """
        self.max_depth = max_depth
        self.root = None

    def fit(self, X, y):
        """Starts building the regression tree."""
        print("\nStarting to build the Regression Tree...\n")
        self.root = self._build_tree(X, y, depth=0)
        print("\nFinished building the Regression Tree!\n")

    def _build_tree(self, X, y, depth):
        """Recursively builds the regression tree."""
        num_samples, num_features = X.shape

        # Stopping condition: If only one unique value left or max depth reached, create a leaf
        if len(np.unique(y)) == 1 or (self.max_depth and depth >= self.max_depth):
            leaf_value = np.mean(y)  # Leaf stores mean value for regression
            print(f"{'  ' * depth}Leaf node created at depth {depth}: Value = {leaf_value:.4f}")
            return Node(prediction=leaf_value)

        # Find the best split
        best_feature, best_threshold, best_score, feature_scores = self._best_split(X, y)
        if best_feature is None:
            leaf_value = np.mean(y)
            print(f"{'  ' * depth}Leaf node created at depth {depth}: Value = {leaf_value:.4f}")
            return Node(prediction=leaf_value)

        # Print feature scores before selecting the best one
        print(f"{'  ' * depth}Feature evaluation at depth {depth}:")
        for feat, (thresh, score) in feature_scores.items():
            print(f"{'  ' * depth}  Feature {feat} -> Best Threshold: {thresh:.4f}, Variance Reduction: {score:.6f}")

        print(f"{'  ' * depth}Depth {depth}: Selected Feature {best_feature} at Threshold {best_threshold}, Variance Reduction: {best_score:.6f}")

        # Split dataset
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = ~left_indices

        # Recursively build left and right subtrees
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def _best_split(self, X, y):
        """Finds the best feature and threshold to split the dataset, while logging scores."""
        num_samples, num_features = X.shape
        best_score = float("inf")  # Lower variance is better
        best_feature, best_threshold = None, None
        feature_scores = {}

        for feature in range(num_features):
            thresholds = np.unique(X[:, feature])
            best_feature_score = float("inf")
            best_feature_threshold = None

            for threshold in thresholds:
                left_y = y[X[:, feature] <= threshold]
                right_y = y[X[:, feature] > threshold]

                if len(left_y) == 0 or len(right_y) == 0:
                    continue

                score = self._variance_reduction(y, left_y, right_y)
                if score < best_feature_score:  # Lower variance is better
                    best_feature_score = score
                    best_feature_threshold = threshold

                if score < best_score:
                    best_score, best_feature, best_threshold = score, feature, threshold

            # Store the best threshold and variance reduction score for this feature
            feature_scores[feature] = (best_feature_threshold, best_feature_score)

        return best_feature, best_threshold, best_score, feature_scores

    def _variance_reduction(self, y, left_y, right_y):
        """Computes variance reduction (MSE reduction)."""
        weight_left = len(left_y) / len(y)
        weight_right = len(right_y) / len(y)

        return np.var(y) - (weight_left * np.var(left_y) + weight_right * np.var(right_y))

    def predict(self, X):
        """Predicts the target values for a given dataset."""
        return np.array([self._traverse_tree(sample, self.root) for sample in X])

    def _traverse_tree(self, sample, node):
        """Recursively traverses the tree to make predictions."""
        if node.is_leaf():
            return node.prediction
        if sample[node.feature] <= node.threshold:
            return self._traverse_tree(sample, node.left)
        else:
            return self._traverse_tree(sample, node.right)


In [9]:
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Load dataset
data = fetch_california_housing()
X, y = data.data, data.target

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Select first 100 for training, last 20 for testing
X_train, y_train = X[:200], y[:200]
X_test, y_test = X[-20:], y[-20:]

# Train regression tree
tree = RegressionTree(max_depth=5)
tree.fit(X_train, y_train)

# Make predictions
predictions = tree.predict(X_test)

# Evaluate using Mean Squared Error
mse = mean_squared_error(y_test, predictions)
print(f"\nMean Squared Error on Test Set: {mse:.4f}")



Starting to build the Regression Tree...

Feature evaluation at depth 0:
  Feature 0 -> Best Threshold: -1.7743, Variance Reduction: 0.008262
  Feature 1 -> Best Threshold: 1.6178, Variance Reduction: 0.000050
  Feature 2 -> Best Threshold: -1.3675, Variance Reduction: 0.005838
  Feature 3 -> Best Threshold: -0.7223, Variance Reduction: 0.000095
  Feature 4 -> Best Threshold: 1.0045, Variance Reduction: 0.000013
  Feature 5 -> Best Threshold: 0.0921, Variance Reduction: 0.000007
  Feature 6 -> Best Threshold: 1.0010, Variance Reduction: 0.017737
  Feature 7 -> Best Threshold: -1.3628, Variance Reduction: 0.034647
Depth 0: Selected Feature 5 at Threshold 0.09205300097808652, Variance Reduction: 0.000007
  Feature evaluation at depth 1:
    Feature 0 -> Best Threshold: -1.7743, Variance Reduction: 0.008480
    Feature 1 -> Best Threshold: 1.6973, Variance Reduction: 0.000047
    Feature 2 -> Best Threshold: -1.3675, Variance Reduction: 0.005996
    Feature 3 -> Best Threshold: -0.7223, 