<a href="https://colab.research.google.com/github/abinavharsath41-ctrl/FOML-exp/blob/main/decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
from itertools import combinations
import pandas as pd

# --- Configuration based on Exercise ---
# Fixed seed handed to every DecisionTreeClassifier built in this file.
RANDOM_STATE = 42
# One matplotlib single-letter colour code per class (red, yellow, blue);
# zipped against the class labels when the samples are scattered.
PLOT_COLORS = "ryb"
# Spacing between neighbouring points of the decision-boundary mesh grid.
PLOT_STEP = 0.02
# Display names for the classes, indexed by the integer class label; used for
# the scatter legend, the tree diagram, and the summary printed on load.
TARGET_NAMES = ['Setosa', 'Versicolor', 'Virginica']
# ----------------------------------------

def load_data():
    """Load the Iris dataset and print a short preview (Steps 2 and 3).

    Returns:
        A ``(X, y, feature_names)`` tuple: the feature DataFrame, the target
        Series, and the list of feature column names reported by
        ``load_iris``.
    """
    dataset = load_iris(as_frame=True)
    features, labels = dataset.data, dataset.target
    column_names = dataset.feature_names

    # Preview: the first rows of the features with the target appended as
    # the last column, followed by the feature and class names.
    print("--- Iris Dataset Head ---")
    preview = pd.concat([features, labels], axis=1).head()
    print(preview)
    print("\nFeature Names:", column_names)
    print("Target Names:", TARGET_NAMES)

    return features, labels, column_names

def plot_decision_boundary(X_pair, y, clf, title, feature_names):
    """Draw a fitted classifier's decision regions over two features (Step 5).

    A new 8x6 figure is created; the classifier is evaluated on a regular
    grid covering the data range (padded by 1 on every side), the predicted
    classes are drawn as a filled contour, and the samples are scattered on
    top, one colour per class. The figure is not shown here; the caller is
    expected to call ``plt.show()``.

    Args:
        X_pair: DataFrame whose first two columns are the plotted features
            (first column on the x axis, second on the y axis).
        y: Array-like of integer class labels, positionally aligned with the
            rows of ``X_pair``. Each label is used as an index into
            ``TARGET_NAMES``.
        clf: Fitted classifier exposing ``predict`` for two-feature input.
        title: Title of the figure.
        feature_names: Two-element sequence with the x and y axis labels.
    """
    feature1 = X_pair.iloc[:, 0]
    feature2 = X_pair.iloc[:, 1]

    # Pad the plotted area so no sample sits on the border of the figure.
    x_min, x_max = feature1.min() - 1, feature1.max() + 1
    y_min, y_max = feature2.min() - 1, feature2.max() + 1

    # Regular grid of (x, y) points covering the padded feature range.
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, PLOT_STEP),
        np.arange(y_min, y_max, PLOT_STEP),
    )
    grid = np.c_[xx.ravel(), yy.ravel()]

    # An estimator fitted on a DataFrame records the column names in
    # ``feature_names_in_`` and warns when later given a bare array. Feed the
    # grid back under the same names so prediction is warning-free; an
    # estimator fitted on plain arrays has no such attribute and keeps
    # receiving the ndarray.
    fitted_names = getattr(clf, "feature_names_in_", None)
    if fitted_names is not None:
        grid = pd.DataFrame(grid, columns=fitted_names)

    Z = clf.predict(grid).reshape(xx.shape)

    plt.figure(figsize=(8, 6))

    # Background: one filled region per predicted class.
    plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu, alpha=0.8)

    # Foreground: the real samples, coloured by their true class.
    labels = np.asarray(y)
    for class_id, color in zip(np.unique(labels), PLOT_COLORS):
        # Positional row numbers of this class; both feature Series are
        # indexed with .iloc so the DataFrame's own index does not matter.
        rows = np.flatnonzero(labels == class_id)
        plt.scatter(
            feature1.iloc[rows],
            feature2.iloc[rows],
            c=color,
            label=TARGET_NAMES[class_id],
            edgecolor='black',
            s=20,
        )

    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[1])
    plt.title(title)
    plt.legend(loc="lower right")
    plt.grid(True, linestyle='--', alpha=0.6)


def train_and_plot_2d_boundaries(X, y, feature_names):
    """Fit and plot one depth-limited tree per pair of features (Step 5).

    For every unordered pair of column positions in ``X`` a fresh
    ``DecisionTreeClassifier`` (``max_depth=5``, seeded with
    ``RANDOM_STATE``) is trained on just those two columns and handed to
    ``plot_decision_boundary``.

    Args:
        X: Feature DataFrame.
        y: Target Series aligned with ``X``.
        feature_names: Sequence of column names, indexed by column position.
    """
    column_count = X.shape[1]

    for first_col, second_col in combinations(range(column_count), 2):
        # Restrict the data to the two columns under consideration.
        pair_frame = X.iloc[:, [first_col, second_col]]

        model = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=5)
        model.fit(pair_frame, y)

        # The same two names serve as the title parts and the axis labels.
        x_name = feature_names[first_col]
        y_name = feature_names[second_col]

        plot_decision_boundary(
            pair_frame,
            y.values,
            model,
            f"Decision Boundary: {x_name} vs {y_name}",
            [x_name, y_name],
        )


def train_and_plot_full_tree(X, y, feature_names):
    """Fit a tree on every feature and draw its diagram (Step 6).

    Trains a ``DecisionTreeClassifier`` (``max_depth=4``, seeded with
    ``RANDOM_STATE``) on all columns of ``X``, prints the depth it reached,
    and renders the tree into a new 15x10 figure. The figure is not shown
    here.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        feature_names: Names used to label the split nodes of the diagram.
    """
    print("\n--- Training Full Decision Tree (All 4 Features) ---")

    # fit() returns the estimator itself, so construction and training chain.
    tree_model = DecisionTreeClassifier(
        random_state=RANDOM_STATE, max_depth=4
    ).fit(X, y)

    print(f"Full Tree Depth: {tree_model.get_depth()}")

    plt.figure(figsize=(15, 10))

    diagram_options = {
        "feature_names": feature_names,
        "class_names": TARGET_NAMES,
        "filled": True,       # colour each node by its majority class
        "rounded": True,      # rounded node boxes
        "proportion": True,   # report proportions instead of raw counts
        "fontsize": 8,
    }
    plot_tree(tree_model, **diagram_options)
    plt.title("Full Decision Tree Trained on All Features")


def main():
    """Run the whole exercise: load data, build every figure, then show them."""
    features, labels, names = load_data()

    # Step 5: one decision-boundary figure per pair of features.
    train_and_plot_2d_boundaries(features, labels, names)

    # Step 6: a single tree trained on all features, drawn as a diagram.
    train_and_plot_full_tree(features, labels, names)

    # Step 7: the figures created above are displayed together here.
    print("\nDisplaying all plots (decision boundaries and full tree)...")
    plt.show()

if __name__ == "__main__":
    # Best-effort attempt to select the interactive TkAgg backend before any
    # figure is created. If the switch raises ImportError, the error is
    # ignored and whichever backend is already active stays in use.
    # NOTE(review): the switch is attempted unconditionally, so it would also
    # replace an already-working backend (e.g. a notebook's inline one) when
    # Tk is available — confirm that is intended.
    try:
        plt.switch_backend('TkAgg') # Use a standard interactive backend
    except ImportError:
        pass

    main()

--- Iris Dataset Head ---
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  

Feature Names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target Names: ['Setosa', 'Versicolor', 'Virginica']





--- Training Full Decision Tree (All 4 Features) ---
Full Tree Depth: 4

Displaying all plots (decision boundaries and full tree)...
