In [1]:
import pandas as pd
import numpy as np

  from pandas.core import (


In [9]:
class DecisionTree:
    """Binary decision tree usable as a classifier or a regressor.

    Splits have the form ``x[feature] <= threshold``. Candidate thresholds are
    the unique values observed in each feature column, and the split with the
    lowest size-weighted child impurity is chosen (Gini impurity for
    ``mode='classifier'``, mean squared error for ``mode='regressor'``).

    Parameters
    ----------
    mode : {'classifier', 'regressor'}
        Kind of target to model. Any other value fails the constructor's
        assertion.
    max_depth : int or None
        Nodes at this depth (root is depth 0) or deeper become leaves.
        ``None`` disables the depth limit.
    """

    class Node:
        """A tree node: either an internal split or a leaf.

        Internal nodes carry ``feature``, ``threshold``, ``left`` and
        ``right``; leaves carry only ``value``.
        """

        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
            self.feature = feature      # column index tested at this node
            self.threshold = threshold  # rows with x[feature] <= threshold go left
            self.left = left            # subtree for rows satisfying the test
            self.right = right          # subtree for all remaining rows
            self.value = value          # prediction stored on a leaf; None on splits

    def __init__(self, mode='classifier', max_depth=5):
        assert mode in ['classifier', 'regressor'], "Mode must be 'classifier' or 'regressor'"
        self.mode = mode
        self.max_depth = max_depth
        self.root = None  # set by fit()

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return 1 - np.sum(probs ** 2)

    def _mse(self, y):
        """Return the mean squared deviation of ``y`` from its own mean."""
        return np.mean((y - np.mean(y)) ** 2)

    def _impurity(self, y):
        """Return the impurity of ``y`` under the metric selected by ``self.mode``."""
        return self._gini(y) if self.mode == 'classifier' else self._mse(y)

    def _leaf_value(self, y):
        """Return the prediction for a leaf holding targets ``y``.

        Classifier: the most frequent label (the smallest label wins ties,
        because ``np.unique`` returns sorted labels and ``argmax`` returns the
        first maximum). Regressor: the mean of ``y``.
        """
        if self.mode == 'classifier':
            # np.unique copes with string, negative and float-typed labels,
            # none of which np.bincount accepts.
            classes, counts = np.unique(y, return_counts=True)
            return classes[np.argmax(counts)]
        return np.mean(y)

    def _split_dataset(self, X, y, feature_index, threshold):
        """Partition ``X`` and ``y`` on ``X[:, feature_index] <= threshold``.

        Returns ``(X_left, X_right, y_left, y_right)``. Every row that fails
        the test goes right, so rows whose feature value is NaN are routed the
        same way ``_predict_one`` routes them instead of being dropped.
        """
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _best_split(self, X, y):
        """Search every feature/threshold pair for the lowest weighted impurity.

        Returns ``(feature_index, threshold)``, or ``(None, None)`` when no
        candidate leaves both children non-empty. On ties the first candidate
        encountered is kept (strict ``<`` comparison).
        """
        best_metric = float('inf')
        best_feature, best_thresh = None, None
        n_samples = len(y)

        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            for t in np.unique(column):
                # Only the targets are needed to score a split, so avoid
                # materialising the two X partitions for every candidate.
                left_mask = column <= t
                y_left, y_right = y[left_mask], y[~left_mask]
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                metric = (
                    len(y_left) * self._impurity(y_left)
                    + len(y_right) * self._impurity(y_right)
                ) / n_samples
                if metric < best_metric:
                    best_metric = metric
                    best_feature = feature_index
                    best_thresh = t

        return best_feature, best_thresh

    def _build_tree(self, X, y, depth):
        """Recursively grow the subtree for samples ``(X, y)`` located at ``depth``."""
        # ">=" rather than "==" so a negative or non-integer max_depth still
        # terminates; None means "no limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or np.unique(y).size == 1:
            return self.Node(value=self._leaf_value(y))

        feat, thresh = self._best_split(X, y)
        if feat is None:
            # No threshold separates the rows (e.g. all feature rows identical).
            return self.Node(value=self._leaf_value(y))

        X_left, X_right, y_left, y_right = self._split_dataset(X, y, feat, thresh)
        left = self._build_tree(X_left, y_left, depth + 1)
        right = self._build_tree(X_right, y_right, depth + 1)
        return self.Node(feat, thresh, left, right)

    def fit(self, X, y):
        """Grow the tree from training data and store it in ``self.root``.

        Parameters
        ----------
        X : array-like, 2-D
            One row per sample, one column per feature.
        y : array-like
            One target per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if there are no samples, or if ``X`` and
            ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if len(y) == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")
        if X.shape[0] != len(y):
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {len(y)}"
            )
        self.root = self._build_tree(X, y, 0)

    def _predict_one(self, x, tree):
        """Walk from node ``tree`` down to a leaf for sample ``x``; return its value."""
        node = tree
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        """Return an array with one prediction per row of ``X``.

        ``X`` is converted with ``np.asarray`` first, mirroring ``fit``, so
        that containers which do not iterate row-wise (such as a pandas
        DataFrame, which iterates over column labels) are handled correctly.
        """
        X = np.asarray(X)
        return np.array([self._predict_one(x, self.root) for x in X])
        

        

In [10]:
# Toy dataset: five two-feature samples with binary labels.
X_clf = np.array([
    [2, 3],
    [1, 1],
    [3, 2],
    [4, 5],
    [3, 4],
])
y_clf = np.array([0, 0, 1, 1, 1])

# Fit the custom tree in classifier mode, then predict on the same samples.
clf = DecisionTree(mode='classifier', max_depth=3)
clf.fit(X_clf, y_clf)
clf_predictions = clf.predict(X_clf)
print("Classifier predictions:", clf_predictions)


Classifier predictions: [0 0 1 1 1]


In [11]:
# Continuous targets for the same five samples held in X_clf.
y_reg = np.array([1.2, 1.0, 2.5, 3.7, 3.0])

# Fit the custom tree in regressor mode, then predict on the training samples.
reg = DecisionTree(mode='regressor', max_depth=3)
reg.fit(X_clf, y_reg)
reg_predictions = reg.predict(X_clf)
print("Regressor predictions:", reg_predictions)


Regressor predictions: [1.2 1.  2.5 3.7 3. ]


In [12]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fetch the iris data and pull out the feature matrix and target vector.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the custom DecisionTree classifier on the training portion.
tree = DecisionTree(mode='classifier', max_depth=3)
tree.fit(X_train, y_train)

# Score the held-out samples against their true labels.
y_pred = tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the results.
print("Predictions:", y_pred)
print("True Labels:", y_test)
print("Accuracy:", accuracy)


Predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
True Labels: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Accuracy: 1.0
