# Iris Species Classification using Decision Tree Classifier
# A complete machine learning pipeline with scikit-learn

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn import tree

In [4]:
# Load and explore the data
print("=" * 60)
print("IRIS SPECIES CLASSIFICATION - DECISION TREE CLASSIFIER")
print("=" * 60)

IRIS SPECIES CLASSIFICATION - DECISION TREE CLASSIFIER


In [5]:
df = pd.read_csv('Iris.csv')

In [6]:
print("\n1. DATASET OVERVIEW")
print("-" * 30)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset info:")
print(df.info())

print("\nSpecies distribution:")
print(df['Species'].value_counts())


1. DATASET OVERVIEW
------------------------------
Dataset shape: (150, 6)
Columns: ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

First 5 rows:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengt

In [7]:
print("\n\n2. DATA PREPROCESSING")
print("-" * 30)

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Check for duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")

# Basic statistics
print("\nBasic statistics for numerical features:")
print(df.describe())

# Separate features and target
X = df.drop(['Id', 'Species'], axis=1)  # Remove ID column and target
y = df['Species']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Encode the target labels (Species)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"\nLabel encoding mapping:")
for i, species in enumerate(label_encoder.classes_):
    print(f"{species}: {i}")



2. DATA PREPROCESSING
------------------------------
Missing values per column:
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

Duplicate rows: 0

Basic statistics for numerical features:
               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000

Features shape: (150, 4)
Target shape: (150,)

In [8]:

# Step 3: Split the data
print("\n\n3. DATA SPLITTING")
print("-" * 30)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_encoded  # Ensure balanced split across all classes
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training set class distribution: {np.bincount(y_train)}")
print(f"Test set class distribution: {np.bincount(y_test)}")



3. DATA SPLITTING
------------------------------
Training set shape: (120, 4)
Test set shape: (30, 4)
Training set class distribution: [40 40 40]
Test set class distribution: [10 10 10]


In [9]:
# Step 4: Train the Decision Tree Classifier
print("\n\n4. MODEL TRAINING")
print("-" * 30)

# Initialize the Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(
    random_state=42,
    max_depth=5,  # Limit depth to prevent overfitting
    min_samples_split=2,
    min_samples_leaf=1
)

# Train the model
dt_classifier.fit(X_train, y_train)

print("Decision Tree Classifier trained successfully!")
print(f"Tree depth: {dt_classifier.get_depth()}")
print(f"Number of leaves: {dt_classifier.get_n_leaves()}")




4. MODEL TRAINING
------------------------------
Decision Tree Classifier trained successfully!
Tree depth: 5
Number of leaves: 8


In [10]:
# Step 5: Make predictions
print("\n\n5. PREDICTIONS")
print("-" * 30)

# Predictions on training set
y_train_pred = dt_classifier.predict(X_train)

# Predictions on test set
y_test_pred = dt_classifier.predict(X_test)

print("Predictions completed!")



5. PREDICTIONS
------------------------------
Predictions completed!


In [11]:
# Step 6: Model Evaluation
print("\n\n6. MODEL EVALUATION")
print("-" * 30)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Calculate precision and recall (using weighted average for multiclass)
train_precision = precision_score(y_train, y_train_pred, average='weighted')
train_recall = recall_score(y_train, y_train_pred, average='weighted')

test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')

print(f"\nTraining Metrics:")
print(f"  Precision: {train_precision:.4f}")
print(f"  Recall: {train_recall:.4f}")

print(f"\nTest Metrics:")
print(f"  Precision: {test_precision:.4f}")
print(f"  Recall: {test_recall:.4f}")

# Detailed classification report
print("\n\nDETAILED CLASSIFICATION REPORT (Test Set):")
print("-" * 50)
target_names = label_encoder.classes_
print(classification_report(y_test, y_test_pred, target_names=target_names))

# Confusion Matrix
print("\nCONFUSION MATRIX (Test Set):")
print("-" * 30)
cm = confusion_matrix(y_test, y_test_pred)
print(cm)

# Pretty print confusion matrix with labels
cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)
print("\nConfusion Matrix (with labels):")
print(cm_df)




6. MODEL EVALUATION
------------------------------
Training Accuracy: 1.0000 (100.00%)
Test Accuracy: 0.9333 (93.33%)

Training Metrics:
  Precision: 1.0000
  Recall: 1.0000

Test Metrics:
  Precision: 0.9333
  Recall: 0.9333


DETAILED CLASSIFICATION REPORT (Test Set):
--------------------------------------------------
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.90      0.90      0.90        10
 Iris-virginica       0.90      0.90      0.90        10

       accuracy                           0.93        30
      macro avg       0.93      0.93      0.93        30
   weighted avg       0.93      0.93      0.93        30


CONFUSION MATRIX (Test Set):
------------------------------
[[10  0  0]
 [ 0  9  1]
 [ 0  1  9]]

Confusion Matrix (with labels):
                 Iris-setosa  Iris-versicolor  Iris-virginica
Iris-setosa               10                0               0
Iris-versicolor    

In [12]:
# Step 7: Feature Importance
print("\n\n7. FEATURE IMPORTANCE")
print("-" * 30)

feature_importance = dt_classifier.feature_importances_
feature_names = X.columns

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
}).sort_values('Importance', ascending=False)

print("Feature Importance Ranking:")
for idx, row in importance_df.iterrows():
    print(f"{row['Feature']}: {row['Importance']:.4f}")



7. FEATURE IMPORTANCE
------------------------------
Feature Importance Ranking:
PetalLengthCm: 0.5586
PetalWidthCm: 0.4060
SepalWidthCm: 0.0292
SepalLengthCm: 0.0062


In [13]:
# Step 8: Model Insights and Analysis
print("\n\n8. MODEL INSIGHTS")
print("-" * 30)

# Per-class metrics
print("Per-class Performance Metrics:")
precision_per_class = precision_score(y_test, y_test_pred, average=None)
recall_per_class = recall_score(y_test, y_test_pred, average=None)

for i, species in enumerate(target_names):
    print(f"{species}:")
    print(f"  Precision: {precision_per_class[i]:.4f}")
    print(f"  Recall: {recall_per_class[i]:.4f}")

# Prediction examples
print(f"\nSample Predictions (First 10 test samples):")
print("Actual -> Predicted")
for i in range(min(10, len(y_test))):
    actual = label_encoder.inverse_transform([y_test[i]])[0]
    predicted = label_encoder.inverse_transform([y_test_pred[i]])[0]
    print(f"{actual} -> {predicted}")

# Model parameters
print(f"\n\nMODEL PARAMETERS:")
print("-" * 20)
print(f"Max Depth: {dt_classifier.max_depth}")
print(f"Min Samples Split: {dt_classifier.min_samples_split}")
print(f"Min Samples Leaf: {dt_classifier.min_samples_leaf}")
print(f"Criterion: {dt_classifier.criterion}")




8. MODEL INSIGHTS
------------------------------
Per-class Performance Metrics:
Iris-setosa:
  Precision: 1.0000
  Recall: 1.0000
Iris-versicolor:
  Precision: 0.9000
  Recall: 0.9000
Iris-virginica:
  Precision: 0.9000
  Recall: 0.9000

Sample Predictions (First 10 test samples):
Actual -> Predicted
Iris-setosa -> Iris-setosa
Iris-virginica -> Iris-virginica
Iris-versicolor -> Iris-versicolor
Iris-versicolor -> Iris-versicolor
Iris-setosa -> Iris-setosa
Iris-versicolor -> Iris-versicolor
Iris-setosa -> Iris-setosa
Iris-setosa -> Iris-setosa
Iris-virginica -> Iris-virginica
Iris-versicolor -> Iris-versicolor


MODEL PARAMETERS:
--------------------
Max Depth: 5
Min Samples Split: 2
Min Samples Leaf: 1
Criterion: gini


In [20]:
# Step 9: Data Visualization
print("\n\n9. DATA VISUALIZATION")
print("-" * 30)

# Set up plotting style
plt.style.use('default')

# Plot 1: Feature Importance
print("Creating Feature Importance plot...")
plt.figure(figsize=(10, 6))
colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold']
bars = plt.barh(importance_df['Feature'], importance_df['Importance'], color=colors)
plt.title('Feature Importance in Decision Tree Classifier', fontsize=16, fontweight='bold')
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Features', fontsize=12)

# Add value labels on bars
for bar, value in zip(bars, importance_df['Importance']):
    plt.text(value + 0.01, bar.get_y() + bar.get_height()/2, 
             f'{value:.3f}', ha# Iris Species Classification using Decision Tree Classifier
# A complete machine learning pipeline with scikit-learn

<class 'SyntaxError'>: incomplete input (<ipython-input-20-ee115d77d61e>, line 21)

In [15]:
# Step 10: Summary and Conclusions
print("\n\n10. SUMMARY AND CONCLUSIONS")
print("-" * 40)
print(f"✓ Dataset successfully loaded with {df.shape[0]} samples and {df.shape[1]-2} features")
print(f"✓ No missing values found - data is clean")
print(f"✓ Perfect class balance with 50 samples per species")
print(f"✓ Decision Tree achieved {test_accuracy*100:.2f}% accuracy on test set")
print(f"✓ Model shows {feature_names[np.argmax(feature_importance)]} as most important feature")
print(f"✓ All three iris species classified with high precision and recall")

# Additional model validation
if test_accuracy > 0.95:
    print("✓ Excellent model performance - very high accuracy achieved!")
elif test_accuracy > 0.85:
    print("✓ Good model performance - satisfactory accuracy achieved!")
else:
    print("! Model performance could be improved - consider hyperparameter tuning")

print("\n" + "=" * 60)
print("ANALYSIS COMPLETE")
print("=" * 60)



10. SUMMARY AND CONCLUSIONS
----------------------------------------
✓ Dataset successfully loaded with 150 samples and 4 features
✓ No missing values found - data is clean
✓ Perfect class balance with 50 samples per species
✓ Decision Tree achieved 93.33% accuracy on test set
✓ Model shows PetalLengthCm as most important feature
✓ All three iris species classified with high precision and recall
✓ Good model performance - satisfactory accuracy achieved!

ANALYSIS COMPLETE
