# Setup and Synthetic Data

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Reproducible toy dataset: 200 points drawn from a standard normal in two
# dimensions, labelled 1 when the two coordinates sum to a positive number
# and 0 otherwise (i.e. split by the line Feature1 + Feature2 = 0).
np.random.seed(42)
n_samples = 200
X = np.random.randn(n_samples, 2)
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# Tabular copy of the same data, used only for plotting.
data = pd.DataFrame({
    'Feature1': X[:, 0],
    'Feature2': X[:, 1],
    'Label': y,
})

# Scatter of the two features, coloured by class label.
fig = px.scatter(
    data,
    x='Feature1',
    y='Feature2',
    color='Label',
    title='Synthetic Data',
)
fig.show()

# Basic Decision Tree (Simplified)

In [2]:
class DecisionNode:
    """One node of a binary decision tree.

    A node is either an internal split (``feature_idx``/``threshold`` plus
    ``left``/``right`` children) or a leaf, in which case ``value`` holds the
    predicted class and is not ``None``.
    """

    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
        # Split rule: samples with x[feature_idx] <= threshold go left.
        self.feature_idx, self.threshold = feature_idx, threshold
        # Child subtrees (DecisionNode instances) for internal nodes.
        self.left, self.right = left, right
        # Predicted class for a leaf; None marks an internal node.
        self.value = value

def gini_impurity(y):
    """Return the Gini impurity of a 1-D collection of class labels.

    Computed as ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of
    samples carrying label ``k``. A pure set gives 0.0.

    An empty ``y`` is treated as pure and returns 0.0; without this guard the
    division by ``len(y) == 0`` on an empty count array would make the
    function report the maximum value 1.0 for a set with no samples.
    """
    n = len(y)
    if n == 0:
        return 0.0
    _, counts = np.unique(y, return_counts=True)
    probs = counts / n
    return 1 - np.sum(probs ** 2)

def best_split(X, y):
    """Exhaustively search for the split with the lowest weighted Gini impurity.

    Every distinct value of every column of ``X`` is tried as a threshold;
    rows with ``X[:, idx] <= thr`` form the left side and the rest the right
    side. Candidates that leave either side empty are ignored.

    Returns:
        ``(feature_index, threshold)`` of the best candidate, or
        ``(None, None)`` when no candidate separates the rows. On ties the
        first candidate encountered (lowest feature index, then lowest
        threshold) is kept.
    """
    total = len(y)
    lowest = float('inf')
    winner = (None, None)

    for idx in range(X.shape[1]):
        column = X[:, idx]
        for thr in np.unique(column):
            goes_left = column <= thr
            y_left, y_right = y[goes_left], y[~goes_left]
            if len(y_left) == 0 or len(y_right) == 0:
                continue
            # Impurity of each side, weighted by how many rows land there.
            weighted = (len(y_left) * gini_impurity(y_left) +
                        len(y_right) * gini_impurity(y_right)) / total
            if weighted < lowest:
                lowest = weighted
                winner = (idx, thr)
    return winner

def build_tree(X, y, max_depth=3, min_samples=2):
    """Recursively grow a decision tree and return its root ``DecisionNode``.

    Growth stops, producing a leaf labelled with the most frequent class in
    ``y`` (ties resolved toward the smaller label by ``argmax``), when:
      * fewer than ``min_samples`` rows remain,
      * ``max_depth`` has been counted down to exactly 0,
      * all remaining labels are identical, or
      * ``best_split`` finds no usable split.

    ``y`` is passed to ``np.bincount``, so labels must be non-negative ints.
    """
    def majority_leaf():
        return DecisionNode(value=np.bincount(y).argmax())

    # Stopping rules, checked in the same order as a short-circuit `or`.
    if len(y) < min_samples:
        return majority_leaf()
    if max_depth == 0:
        return majority_leaf()
    if len(np.unique(y)) == 1:
        return majority_leaf()

    feature, cut = best_split(X, y)
    if feature is None:
        return majority_leaf()

    goes_left = X[:, feature] <= cut
    remaining_depth = max_depth - 1
    # Left child is built before the right one.
    children = [
        build_tree(X[side], y[side], remaining_depth, min_samples)
        for side in (goes_left, ~goes_left)
    ]
    return DecisionNode(feature, cut, *children)

def predict_tree(node, X):
    """Return the class predicted for a single sample ``X`` by the tree at ``node``.

    ``X`` is one sample indexed by feature position. Starting from ``node``,
    follow the left child when ``X[feature_idx] <= threshold`` and the right
    child otherwise, until a node whose ``value`` is not ``None`` (a leaf) is
    reached; that value is returned.
    """
    current = node
    while current.value is None:
        take_left = X[current.feature_idx] <= current.threshold
        current = current.left if take_left else current.right
    return current.value

# Random Forest Implementation

In [4]:
import numpy as np

class RandomForest:
    """Bagged ensemble of the simplified decision trees built by ``build_tree``.

    Each tree is trained on a bootstrap resample of the rows and on a random
    subset of the columns that is drawn once per tree (not once per split).
    Predictions are a majority vote over all trees.

    Args:
        n_trees: Number of trees grown by ``fit``.
        max_depth: Depth limit forwarded to ``build_tree``.
        max_features: Columns given to each tree. ``'sqrt'`` means
            ``int(sqrt(n_features))``, ``None`` means all columns, and any
            other value is passed to ``np.random.choice`` as the subset size.

    Randomness comes from the global ``np.random`` state, so seed it with
    ``np.random.seed`` for reproducible forests.
    """

    def __init__(self, n_trees=10, max_depth=3, max_features='sqrt'):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.max_features = max_features
        # List of (root DecisionNode, column indices the tree was trained on).
        self.trees = []

    def bootstrap_sample(self, X, y):
        """Return ``(X, y)`` resampled row-wise with replacement, same length."""
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def _n_features_per_tree(self, n_features):
        """Translate ``self.max_features`` into a column count for this data."""
        if self.max_features is None:
            return n_features
        if self.max_features == 'sqrt':
            return int(np.sqrt(n_features))
        return self.max_features

    def fit(self, X, y):
        """Grow ``n_trees`` trees on ``X`` (2-D array) and labels ``y``.

        Labels are passed to ``np.bincount`` downstream, so they must be
        non-negative ints. Any trees from an earlier ``fit`` are discarded.
        """
        n_features = X.shape[1]
        # Resolve into a local so the user's setting (e.g. 'sqrt') survives
        # and is re-evaluated if the forest is refit on wider/narrower data.
        n_selected = self._n_features_per_tree(n_features)

        # Start from scratch: appending to old trees would leave more trees
        # than n_trees and mix models trained on different data.
        self.trees = []
        for _ in range(self.n_trees):
            X_sample, y_sample = self.bootstrap_sample(X, y)
            # Randomly select features for this tree
            feat_idxs = np.random.choice(n_features, n_selected, replace=False)
            X_subset = X_sample[:, feat_idxs]
            tree = build_tree(X_subset, y_sample, self.max_depth)
            self.trees.append((tree, feat_idxs))

    def predict(self, X):
        """Return the majority-vote class for every row of the 2-D array ``X``.

        Raises:
            RuntimeError: If the forest holds no trees (``fit`` not called,
                or called with ``n_trees == 0``).
        """
        if not self.trees:
            raise RuntimeError("RandomForest has no trees; call fit() before predict()")
        if X.shape[0] == 0:
            # np.apply_along_axis rejects zero-length iteration dimensions.
            return np.empty(0, dtype=int)

        # Size by the trees actually stored rather than by n_trees, which may
        # have been changed after fitting. Int dtype keeps np.bincount happy.
        tree_preds = np.zeros((X.shape[0], len(self.trees)), dtype=int)
        for i, (tree, feat_idxs) in enumerate(self.trees):
            # Each tree only knows the columns it was trained on.
            preds = np.array([predict_tree(tree, x[feat_idxs]) for x in X], dtype=int)
            tree_preds[:, i] = preds
        # Aggregate votes using np.bincount; ties go to the smaller label.
        return np.apply_along_axis(lambda votes: np.bincount(votes).argmax(), axis=1, arr=tree_preds)

# Train the model
# The dataset is regenerated here with the same seed as the first cell, so
# this cell can run on its own. The seed also governs the forest itself:
# RandomForest draws its bootstrap rows and feature subsets from the global
# np.random state, so the statement order below determines the printed result.
np.random.seed(42)
n_samples = 200
X = np.random.randn(n_samples, 2)  # 2 features
y = (X[:, 0] + X[:, 1] > 0).astype(int)  # Class 1 if sum > 0, else Class 0

X_np = X  # Already a NumPy array
rf = RandomForest(n_trees=10, max_depth=3)
rf.fit(X_np, y)
# Predict on the same rows used for fitting, so the figure below is
# training accuracy, not held-out accuracy.
y_pred = rf.predict(X_np)

# Check predictions
print("Predictions:", y_pred[:10])
print("Accuracy:", np.mean(y_pred == y))

Predictions: [1 1 0 1 1 0 0 0 0 0]
Accuracy: 0.865


# Visualization: 2D Decision Boundary

In [5]:
# Evaluate the forest on a 100 x 100 lattice spanning the observed range of
# each feature, then reshape the predictions back onto the lattice.
grid_x = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
grid_y = np.linspace(X[:, 1].min(), X[:, 1].max(), 100)
xx, yy = np.meshgrid(grid_x, grid_y)
X_grid = np.column_stack((xx.ravel(), yy.ravel()))
Z = rf.predict(X_grid).reshape(xx.shape)

# Predicted class as a translucent contour, with the training points on top
# coloured by their true label.
boundary = go.Contour(x=grid_x, y=grid_y, z=Z, colorscale='Viridis', opacity=0.5)
points = go.Scatter(
    x=X[:, 0],
    y=X[:, 1],
    mode='markers',
    marker=dict(color=y, size=10),
)
fig = go.Figure(data=[boundary, points])
fig.update_layout(
    title='Random Forest Decision Boundary',
    xaxis_title='Feature1',
    yaxis_title='Feature2',
)
fig.show()

# Feature Importance (Simplified)

In [7]:
# Simplified importance: for each feature, count how many trees received it
# in their per-tree feature subset. This is subset membership, not a count of
# the split nodes that actually test the feature.
n_features = X.shape[1]
feature_counts = np.zeros(n_features)
for tree, feat_idxs in rf.trees:
    feature_counts[feat_idxs] += 1

bars = go.Bar(x=['Feature1', 'Feature2'], y=feature_counts)
fig = go.Figure(data=[bars])
fig.update_layout(
    title='Feature Importance',
    xaxis_title='Features',
    yaxis_title='Times Used in Splits',
)
fig.show()