Lab 6: Decision Tree Classifier
This script demonstrates the Decision Tree classification algorithm.

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns


In [None]:
def basic_decision_tree():
    """Fit an unconstrained decision tree on Iris and report how it performs.

    The data is split 70/30 with a fixed seed. The function prints the
    accuracy, the per-class classification report and the feature
    importances, and writes a confusion-matrix heatmap to
    'lab6_confusion_matrix.png'. It returns nothing.
    """
    rule = "=" * 50
    print(rule)
    print("Basic Decision Tree Classification")
    print(rule)

    # Fetch the Iris samples together with their labels
    dataset = load_iris()
    features, labels = dataset.data, dataset.target
    class_names = dataset.target_names

    print("\nDataset: Iris")
    print(f"Shape: {features.shape}")
    print(f"Classes: {class_names}")

    # Hold out 30% of the rows for evaluation
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    # Fit the tree on the training partition
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(features_train, labels_train)

    # Predict the held-out rows and score the predictions
    predicted = tree.predict(features_test)
    print(f"\nAccuracy: {accuracy_score(labels_test, predicted):.4f}")

    print("\nClassification Report:")
    report = classification_report(
        labels_test, predicted, target_names=class_names
    )
    print(report)

    # Render the confusion matrix as an annotated heatmap
    output_file = 'lab6_confusion_matrix.png'
    matrix = confusion_matrix(labels_test, predicted)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('Confusion Matrix - Decision Tree')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    print(f"\nConfusion matrix saved as '{output_file}'")

    # List each feature next to the importance the fitted tree assigned it
    print("\nFeature Importances:")
    for feature_name, weight in zip(dataset.feature_names, tree.feature_importances_):
        print(f"{feature_name}: {weight:.4f}")


In [None]:
def visualize_decision_tree():
    """Draw a depth-limited decision tree fitted on the full Iris dataset.

    A tree capped at ``max_depth=3`` is trained on all Iris samples, its depth
    and leaf count are printed, and the rendered tree is saved to
    'lab6_tree_visualization.png'. Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Decision Tree Visualization")
    print(rule)

    # Fetch the Iris data; no split is made because the tree is only drawn
    dataset = load_iris()

    # A shallow tree keeps the diagram readable
    shallow_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
    shallow_tree.fit(dataset.data, dataset.target)

    print(f"\nTree depth: {shallow_tree.get_depth()}")
    print(f"Number of leaves: {shallow_tree.get_n_leaves()}")

    # Render the fitted tree with named features and classes
    output_file = 'lab6_tree_visualization.png'
    plt.figure(figsize=(20, 10))
    plot_tree(
        shallow_tree,
        feature_names=dataset.feature_names,
        class_names=dataset.target_names,
        filled=True,
        rounded=True,
        fontsize=10,
    )
    plt.title('Decision Tree Visualization')
    plt.tight_layout()
    plt.savefig(output_file, dpi=100)
    plt.close()
    print(f"\nTree visualization saved as '{output_file}'")


In [None]:
def tune_hyperparameters():
    """Sweep ``max_depth`` from 1 to 20 on the breast cancer data and plot it.

    For each depth a tree is fitted on a 70/30 split (fixed seed) and scored
    on both partitions. Every fifth depth is printed, the depth with the
    highest test-split accuracy is reported as "optimal", and both accuracy
    curves are saved to 'lab6_hyperparameter_tuning.png'. Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Hyperparameter Tuning")
    print(rule)

    # Fetch the breast cancer data and hold out 30% of it
    dataset = load_breast_cancer()
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    # Accuracy on each partition, one entry per candidate depth
    max_depths = range(1, 21)
    train_scores, test_scores = [], []

    print("\nTesting different max_depth values:")
    for depth in max_depths:
        model = DecisionTreeClassifier(max_depth=depth, random_state=42)
        model.fit(X_fit, y_fit)

        fit_accuracy = model.score(X_fit, y_fit)
        eval_accuracy = model.score(X_eval, y_eval)
        train_scores.append(fit_accuracy)
        test_scores.append(eval_accuracy)

        # Only every fifth depth is echoed to keep the log short
        if depth % 5 != 0:
            continue
        print(f"Depth {depth}: Train={fit_accuracy:.4f}, Test={eval_accuracy:.4f}")

    # The first depth reaching the highest test accuracy wins
    best_position = np.argmax(test_scores)
    optimal_depth = max_depths[best_position]
    print(f"\nOptimal max_depth: {optimal_depth}")
    print(f"Best test accuracy: {test_scores[best_position]:.4f}")

    # Plot both curves and mark the selected depth
    output_file = 'lab6_hyperparameter_tuning.png'
    plt.figure(figsize=(10, 6))
    plt.plot(max_depths, train_scores, 'o-', label='Training Accuracy')
    plt.plot(max_depths, test_scores, 's-', label='Test Accuracy')
    plt.axvline(
        x=optimal_depth,
        color='r',
        linestyle='--',
        label=f'Optimal Depth = {optimal_depth}',
    )
    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.title('Decision Tree: Accuracy vs Max Depth')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    print(f"\nHyperparameter tuning plot saved as '{output_file}'")


In [None]:
def compare_criterion():
    """Compare the 'gini' and 'entropy' splitting criteria on Iris.

    A tree is trained per criterion on a 70/30 split (fixed seed), its test
    accuracy is printed, and a bar chart of the accuracies is saved to
    'lab6_criterion_comparison.png'. Returns nothing.

    The y-axis limits are derived from the measured accuracies instead of
    being fixed to [0.9, 1.0]: with a fixed range the value labels (drawn
    slightly above each bar) land outside the axes when an accuracy is close
    to 1.0, and bars below 0.9 would not be visible at all.
    """
    print("\n" + "=" * 50)
    print("Comparing Splitting Criteria")
    print("=" * 50)

    # Load dataset
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Test different criteria
    criteria = ['gini', 'entropy']
    results = {}

    for criterion in criteria:
        dt = DecisionTreeClassifier(criterion=criterion, random_state=42)
        dt.fit(X_train, y_train)
        accuracy = dt.score(X_test, y_test)
        results[criterion] = accuracy
        print(f"\n{criterion.capitalize()}: {accuracy:.4f}")

    # Materialize the dict views so matplotlib receives plain sequences
    names = list(results)
    accuracies = list(results.values())

    # Vertical gap between the top of a bar and its value label
    label_offset = 0.005
    # Keep the familiar 0.9-1.0 window when it fits, but widen it so that
    # low bars stay visible and labels above a perfect score stay inside.
    y_lower = min(0.9, max(0.0, min(accuracies) - 0.05))
    y_upper = max(1.0, max(accuracies) + 0.02)

    # Visualize comparison
    plt.figure(figsize=(10, 6))
    plt.bar(names, accuracies, color=['blue', 'green'])
    plt.ylabel('Accuracy')
    plt.title('Decision Tree: Comparison of Splitting Criteria')
    plt.ylim([y_lower, y_upper])
    for position, acc in enumerate(accuracies):
        plt.text(position, acc + label_offset, f'{acc:.4f}', ha='center')
    plt.tight_layout()
    plt.savefig('lab6_criterion_comparison.png')
    plt.close()
    print("\nCriterion comparison saved as 'lab6_criterion_comparison.png'")


In [None]:
def feature_importance_analysis():
    """Rank breast cancer features by decision-tree importance.

    A tree with ``max_depth=5`` is fitted on the full dataset, the most
    important features (at most 10) are printed in descending order, and a
    bar chart of them is saved to 'lab6_feature_importance.png'. Returns
    nothing.

    The number of features shown is computed once and shared by the printout
    and the plot, so both stay consistent even if fewer than 10 features are
    available (the plot previously hard-coded 10 while the printout did not).
    """
    print("\n" + "=" * 50)
    print("Feature Importance Analysis")
    print("=" * 50)

    # Load dataset
    cancer = load_breast_cancer()
    X = cancer.data
    y = cancer.target

    # Train decision tree
    dt = DecisionTreeClassifier(max_depth=5, random_state=42)
    dt.fit(X, y)

    # Feature indices ordered from most to least important
    importances = dt.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Single source of truth for how many features are reported
    top_n = min(10, len(indices))
    top_indices = indices[:top_n]
    top_names = [cancer.feature_names[i] for i in top_indices]

    # Print the top features
    print(f"\nTop {top_n} Most Important Features:")
    for rank, (name, idx) in enumerate(zip(top_names, top_indices), start=1):
        print(f"{rank}. {name}: {importances[idx]:.4f}")

    # Plot feature importances
    positions = range(top_n)
    plt.figure(figsize=(12, 6))
    plt.bar(positions, importances[top_indices])
    plt.xticks(positions, top_names, rotation=45, ha='right')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title(f'Top {top_n} Feature Importances')
    plt.tight_layout()
    plt.savefig('lab6_feature_importance.png')
    plt.close()
    print("\nFeature importance plot saved as 'lab6_feature_importance.png'")


In [None]:
def pruning_demonstration():
    """Contrast an unconstrained tree with a size-constrained one on Iris.

    Both trees are trained on the same 70/30 split (fixed seed). The
    constrained tree uses ``min_samples_split=10`` and ``min_samples_leaf=5``.
    For each tree the depth, leaf count, and train/test accuracy are printed.
    Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Pruning Demonstration")
    print(rule)

    # Fetch Iris and hold out 30% of the rows
    dataset = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    # Baseline: the tree grows without any size limits
    full_tree = DecisionTreeClassifier(random_state=42)
    full_tree.fit(X_train, y_train)

    # Constrained: splits and leaves need a minimum number of samples
    limited_tree = DecisionTreeClassifier(
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
    )
    limited_tree.fit(X_train, y_train)

    # Same report layout for both models, baseline first
    for heading, model in (
        ("\nWithout Pruning:", full_tree),
        ("\nWith Pruning:", limited_tree),
    ):
        print(heading)
        print(f"  Tree depth: {model.get_depth()}")
        print(f"  Number of leaves: {model.get_n_leaves()}")
        print(f"  Train accuracy: {model.score(X_train, y_train):.4f}")
        print(f"  Test accuracy: {model.score(X_test, y_test):.4f}")


In [None]:
def cross_validation():
    """Run 5-fold cross-validation of a depth-5 decision tree on Iris.

    Prints the per-fold scores followed by their mean and standard deviation.
    Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Cross-Validation")
    print(rule)

    # Fetch the full Iris dataset; the folds are made by cross_val_score
    dataset = load_iris()

    # Unfitted estimator handed to cross_val_score
    estimator = DecisionTreeClassifier(max_depth=5, random_state=42)

    # One score per fold
    fold_scores = cross_val_score(estimator, dataset.data, dataset.target, cv=5)
    mean_accuracy = fold_scores.mean()
    spread = fold_scores.std()

    print(f"\nCross-validation scores: {fold_scores}")
    print(f"Mean accuracy: {mean_accuracy:.4f}")
    print(f"Standard deviation: {spread:.4f}")


In [None]:
def main():
    """Run every Lab 6 demonstration in order, framed by start/end banners."""
    rule = "=" * 50
    print("\n" + rule)
    print("Lab 6: Decision Tree Classifier")
    print(rule)

    # Demonstrations, executed in the order listed
    stages = (
        basic_decision_tree,          # basic decision tree
        visualize_decision_tree,      # tree visualization
        tune_hyperparameters,         # max_depth sweep
        compare_criterion,            # gini vs entropy
        feature_importance_analysis,  # feature importance
        pruning_demonstration,        # pruning
        cross_validation,             # cross-validation
    )
    for stage in stages:
        stage()

    print("\n" + rule)
    print("Lab 6 Complete!")
    print(rule)


In [None]:
# Run the full lab only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
