In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'Iris.csv'
iris_data = pd.read_csv(file_path)

# Prepare the data
X = iris_data.drop(columns=['Id', 'Species'])
y = iris_data['Species']

# Manually encode the class labels as integers
class_mapping = {label: idx for idx, label in enumerate(np.unique(y))}
y_encoded = np.array([class_mapping[label] for label in y])

# Split the data into training and testing sets
def train_test_split(X, y, test_size=0.2, random_state=42):
    np.random.seed(random_state)
    indices = np.random.permutation(len(X))
    test_size = int(len(X) * test_size)
    test_indices = indices[:test_size]
    train_indices = indices[test_size:]
    return X.iloc[train_indices], X.iloc[test_indices], y[train_indices], y[test_indices]

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded)

# Define the structure of the tree node
class TreeNode:
    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

# Function to calculate the Gini index
def gini_index(y):
    m = len(y)
    return 1.0 - sum((np.sum(y == c) / m) ** 2 for c in np.unique(y))

# Function to find the best split
def best_split(X, y):
    m, n = X.shape
    if m <= 1:
        return None, None

    num_classes = len(np.unique(y))
    num_parent = [np.sum(y == c) for c in range(num_classes)]
    best_gini = 1.0 - sum((num / m) ** 2 for num in num_parent)
    best_idx, best_thr = None, None

    for idx in range(n):
        sorted_indices = np.argsort(X[:, idx])
        thresholds, classes = X[sorted_indices, idx], y[sorted_indices]
        num_left = [0] * num_classes
        num_right = num_parent.copy()
        for i in range(1, m):
            c = classes[i - 1]
            num_left[c] += 1
            num_right[c] -= 1
            gini_left = 1.0 - sum((num_left[x] / i) ** 2 for x in range(num_classes))
            gini_right = 1.0 - sum((num_right[x] / (m - i)) ** 2 for x in range(num_classes))
            gini = (i * gini_left + (m - i) * gini_right) / m
            if thresholds[i] == thresholds[i - 1]:
                continue
            if gini < best_gini:
                best_gini = gini
                best_idx = idx
                best_thr = (thresholds[i] + thresholds[i - 1]) / 2
    return best_idx, best_thr

# Recursive function to build the tree
def build_tree(X, y, depth=0, max_depth=10):
    num_samples_per_class = [np.sum(y == i) for i in range(len(np.unique(y)))]
    predicted_class = np.argmax(num_samples_per_class)
    node = TreeNode(
        gini=gini_index(y),
        num_samples=len(y),
        num_samples_per_class=num_samples_per_class,
        predicted_class=predicted_class,
    )

    if depth < max_depth:
        idx, thr = best_split(X.values, y)
        if idx is not None:
            indices_left = X.values[:, idx] < thr
            X_left, y_left = X[indices_left], y[indices_left]
            X_right, y_right = X[~indices_left], y[~indices_left]
            node.feature_index = idx
            node.threshold = thr
            node.left = build_tree(X_left, y_left, depth + 1, max_depth)
            node.right = build_tree(X_right, y_right, depth + 1, max_depth)
    return node

# Train the decision tree
tree = build_tree(X_train, y_train)

# Function to print the tree
def print_tree(node, depth=0):
    if node is not None:
        print(f"{'|   ' * depth}Node(depth={depth}, gini={node.gini:.4f}, samples={node.num_samples}, value={node.num_samples_per_class}, class={node.predicted_class})")
        if node.left is not None or node.right is not None:
            print(f"{'|   ' * depth} feature_{node.feature_index} <= {node.threshold:.4f}")
            print_tree(node.left, depth + 1)
            print(f"{'|   ' * depth} feature_{node.feature_index} > {node.threshold:.4f}")
            print_tree(node.right, depth + 1)

# Function to make predictions
def predict_tree(node, X):
    if node.left is None and node.right is None:
        return node.predicted_class
    if X[node.feature_index] <= node.threshold:
        return predict_tree(node.left, X)
    else:
        return predict_tree(node.right, X)

# Print the tree structure
print_tree(tree)

# Make predictions on the test set
y_pred = [predict_tree(tree, x) for x in X_test.values]

# Evaluate the accuracy
accuracy = np.sum(y_pred == y_test) / len(y_test)
print(f"Accuracy: {accuracy:.4f}")

# Function to plot the tree
def plot_tree(node, depth=0, ax=None, pos=(0.5, 1), parent_pos=None, node_txt=None):
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 8))
        ax.axis('off')
        
    if node is not None:
        x, y = pos
        ax.text(x, y, f'Gini={node.gini:.4f}\nSamples={node.num_samples}\nClass={node.predicted_class}',
                bbox=dict(facecolor='white', edgecolor='black'), ha='center')
        if parent_pos is not None:
            ax.plot([parent_pos[0], x], [parent_pos[1], y], 'k-')

        if node.left is not None:
            plot_tree(node.left, depth + 1, ax, (x - 0.5 / (depth + 1), y - 0.1), pos, '<=' + str(node.threshold))
        if node.right is not None:
            plot_tree(node.right, depth + 1, ax, (x + 0.5 / (depth + 1), y - 0.1), pos, '>' + str(node.threshold))

    if ax is None:
        plt.show()

# Plot the decision tree
plot_tree(tree)


IndexError: list index out of range