# Importing necessary libraries

In [1]:
import numpy as np
import plotly.graph_objects as go

# Creating Synthetic Data

In [5]:
np.random.seed(42)
X = np.vstack([
    np.random.normal([2, 2], 1, (50, 2)),  # Class 0 cluster
    np.random.normal([5, 5], 1, (50, 2))   # Class 1 cluster
])
y = np.array([0]*50 + [1]*50)

# Quick Check

In [3]:
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (100, 2)
y shape: (100,)


# DGini Impurity Function

In [10]:
import numpy as np

def gini_impurity(y):
    if len(y) == 0:
        return 0
    y = np.array(y, dtype=int)  # Force y to integer type
    p = np.bincount(y) / len(y)  # Proportion of each class
    return 1 - np.sum(p**2)      # Gini = 1 - sum(p^2)

# Test it
print("Gini (all 0s):", gini_impurity(np.zeros(5)))  # Should be 0.0 (pure)
print("Gini (mixed):", gini_impurity([0, 1, 0, 1]))  # Should be 0.5 (impure)

Gini (all 0s): 0.0
Gini (mixed): 0.5


# Decision Tree code

In [11]:
import numpy as np
import plotly.graph_objects as go

# Synthetic data
np.random.seed(42)
X = np.vstack([
    np.random.normal(2, 0.5, (10, 2)),  # Class 0
    np.random.normal(4, 0.5, (10, 2))   # Class 1
])
y = np.array([0]*10 + [1]*10)

# Fixed Gini function
def gini_impurity(y):
    if len(y) == 0:
        return 0
    y = np.array(y, dtype=int)  # Ensure integer type
    p = np.bincount(y) / len(y)
    return 1 - np.sum(p**2)

# Best split function
def best_split(X, y):
    best_gini = 1
    best_feature = None
    best_threshold = None

    for feature in range(X.shape[1]):
        thresholds = np.unique(X[:, feature])
        for thresh in thresholds:
            left_idx = X[:, feature] <= thresh
            right_idx = X[:, feature] > thresh
            left_y = y[left_idx]
            right_y = y[right_idx]

            if len(left_y) == 0 or len(right_y) == 0:
                continue

            gini = (len(left_y) * gini_impurity(left_y) +
                    len(right_y) * gini_impurity(right_y)) / len(y)

            if gini < best_gini:
                best_gini = gini
                best_feature = feature
                best_threshold = thresh

    return best_feature, best_threshold, best_gini

# Test the split
feature, thresh, gini = best_split(X, y)
print(f"Best split: Feature {feature} <= {thresh:.2f}, Gini: {gini:.2f}")

Best split: Feature 0 <= 2.79, Gini: 0.00


# Complete Tree Code

In [12]:
class DecisionTree:
    def __init__(self):
        self.feature = None
        self.threshold = None
        self.left_label = None
        self.right_label = None

    def fit(self, X, y):
        self.feature, self.threshold, _ = best_split(X, y)
        if self.feature is not None:
            left_idx = X[:, self.feature] <= self.threshold
            right_idx = X[:, self.feature] > self.threshold
            self.left_label = np.bincount(y[left_idx]).argmax()
            self.right_label = np.bincount(y[right_idx]).argmax()

    def predict(self, X):
        preds = []
        for x in X:
            if self.feature is None or x[self.feature] <= self.threshold:
                preds.append(self.left_label)
            else:
                preds.append(self.right_label)
        return np.array(preds)

# Train and predict
tree = DecisionTree()
tree.fit(X, y)
y_pred = tree.predict(X)
accuracy = np.mean(y_pred == y)
print(f"Accuracy: {accuracy:.2f}")

# 2D Visualization
fig = go.Figure()
fig.add_trace(go.Scatter(x=X[y==0, 0], y=X[y==0, 1], mode='markers', name='Class 0', marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=X[y==1, 0], y=X[y==1, 1], mode='markers', name='Class 1', marker=dict(color='red')))
if tree.feature == 0:
    fig.add_shape(type="line", x0=thresh, y0=min(X[:, 1]), x1=thresh, y1=max(X[:, 1]), line=dict(color="green"))
else:
    fig.add_shape(type="line", x0=min(X[:, 0]), y0=thresh, x1=max(X[:, 0]), y1=thresh, line=dict(color="green"))
fig.update_layout(title=f"Decision Tree Split (Feature {tree.feature} <= {thresh:.2f})",
                  xaxis_title="X1", yaxis_title="X2")
fig.show()

Accuracy: 1.00


# 3D version

In [13]:
X_3d = np.hstack((X, np.random.normal(3, 0.5, (20, 1))))
tree.fit(X_3d, y)
fig_3d = go.Figure()
fig_3d.add_trace(go.Scatter3d(x=X_3d[y==0, 0], y=X_3d[y==0, 1], z=X_3d[y==0, 2], mode='markers', name='Class 0', marker=dict(color='blue')))
fig_3d.add_trace(go.Scatter3d(x=X_3d[y==1, 0], y=X_3d[y==1, 1], z=X_3d[y==1, 2], mode='markers', name='Class 1', marker=dict(color='red')))
fig_3d.update_layout(title=f"3D Decision Tree (Feature {tree.feature} <= {thresh:.2f})",
                     scene=dict(xaxis_title='X1', yaxis_title='X2', zaxis_title='X3'))
fig_3d.show()