In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# --- 3. Splitting the Data ---
print("--- 3. Data Splitting ---")

# This cell is the first one in the notebook, but the split needs the feature
# frame X and the target vector y, which are only assigned in the later
# "Load Data" cell. On a fresh kernel (Restart & Run All) that ordering raises
# NameError, so build X and y here exactly the way that later cell does.
# Re-running the later cell afterwards simply rebinds the same values.
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

# Hold out 20% of the rows for testing (80% train, 20% test).
# - random_state fixes the shuffle so the split is reproducible.
# - stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Cheap invariants: every row lands in exactly one partition, and features
# stay aligned with their labels.
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"
assert len(X_train) == len(y_train), "train features/labels misaligned"
assert len(X_test) == len(y_test), "test features/labels misaligned"

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
# np.bincount counts occurrences of each integer label (index = class id).
print(f"y_train distribution: {np.bincount(y_train)}")
print(f"y_test distribution: {np.bincount(y_test)}")

print("-" * 50)

--- 1. Data Loading ---
Features (X) shape: (150, 4)
Target (y) shape: (150,)
Target Species: ['setosa' 'versicolor' 'virginica']

First 5 rows of features:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
--------------------------------------------------
--- 2. Data Preprocessing ---
Number of missing values per feature:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64
Target variable is already encoded (0, 1, 2) and ready for training.
--------------------------------------------------
--- 3. Data Splitting ---
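X_train shape: (120, 4)
X_test shape: (30, 4)
y_train distribution: [40 40 40]
y_test distribution: [10 10 10]
--------------------------------------------------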

In [None]:
# --- 1. Load Data ---
print("--- 1. Data Loading ---")
# The Iris dataset is bundled with Scikit-learn, so nothing is downloaded here.
iris = load_iris()
target_names = iris.target_names
y = iris.target
# Wrap the raw feature matrix in a DataFrame so the columns carry their names.
X = pd.DataFrame(iris.data, columns=iris.feature_names)

# Summarise what was loaded: one line each for features, target and class names.
print(
    f"Features (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    f"Target Species: {target_names}",
    sep="\n",
)
# Preview the top of the feature table.
print("\nFirst 5 rows of features:")
print(X.head(5))

print("-" * 50)

In [None]:
# --- 2. Preprocessing the Data ---
print("--- 2. Data Preprocessing ---")

# a) Missing values
# Scikit-learn's copy of Iris ships without gaps, so this is only a check:
# report the per-column count of missing entries.
print(f"Number of missing values per feature:\n{X.isna().sum()}")
# Had any column contained gaps, mean imputation would be a typical remedy
# for numerical features:
# X = X.fillna(X.mean())

# b) Label encoding of the target
# 'y' already holds integer class ids (0, 1, 2), which Scikit-learn accepts
# directly. String labels such as 'setosa' would first go through
# LabelEncoder, for example:
# le = LabelEncoder()
# y_encoded = le.fit_transform(y_string_labels)  # then train on y_encoded

print("Target variable is already encoded (0, 1, 2) and ready for training.")

print("-" * 50)

In [None]:
# --- 4. Training the Decision Tree Classifier ---
print("--- 4. Model Training (Decision Tree) ---")
# Construct the tree and fit it on the training partition in one expression
# (fit returns the estimator itself). The split criterion is left at its
# default, 'gini'; 'entropy' is the alternative.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

print("Decision Tree Classifier trained successfully.")

print("-" * 50)

# --- 5. Prediction and Evaluation ---
print("--- 5. Prediction and Evaluation ---")
# Predicted class for every held-out sample.
y_pred = dt_classifier.predict(X_test)

# Multi-class precision/recall need an averaging strategy:
#   'macro'    - unweighted mean of the per-class scores.
#   'weighted' - per-class scores averaged by support (true instances per label).
#   'micro'    - computed globally from total TP / FN / FP counts.

# Accuracy: fraction of test samples classified correctly.
accuracy = accuracy_score(y_test, y_pred)

# Precision: how well the model avoids labelling a negative sample as positive.
# Recall: how well the model finds all the positive samples.
# Both use 'weighted' averaging to account for class imbalance (minor here);
# zero_division=0 is passed through to both scorers unchanged.
precision, recall = (
    metric(y_test, y_pred, average='weighted', zero_division=0)
    for metric in (precision_score, recall_score)
)

In [None]:
# --- Display Results ---
# Emit the whole report with a single call; sep="\n" puts each entry on its
# own line, matching one print per line.
print(
    "\n--- Model Evaluation Results ---",
    f"Accuracy: {accuracy:.4f} (Overall correctness)",
    f"Precision: {precision:.4f} (Weighted ability to avoid false positives)",
    f"Recall: {recall:.4f} (Weighted ability to find all positive samples)",
    "------------------------------",
    sep="\n",
)