Lab 7: Random Forest Classifier
This script demonstrates the Random Forest classification algorithm.

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns


In [None]:
def basic_random_forest():
    """Fit a 100-tree Random Forest on the Iris data and report the results.

    Prints the dataset summary, the hold-out accuracy, the classification
    report and every feature importance, and saves the confusion-matrix
    heatmap to 'lab7_confusion_matrix.png'.
    """
    banner = "=" * 50
    print(banner)
    print("Basic Random Forest Classification")
    print(banner)

    dataset = load_iris()
    features, labels = dataset.data, dataset.target
    class_names = dataset.target_names

    print("\nDataset: Iris")
    print(f"Shape: {features.shape}")
    print(f"Classes: {class_names}")

    # Hold out 30% of the samples; the fixed seed makes the split repeatable.
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_x, train_y)
    predicted = forest.predict(test_x)

    print(f"\nAccuracy: {accuracy_score(test_y, predicted):.4f}")

    print("\nClassification Report:")
    print(classification_report(test_y, predicted, target_names=class_names))

    # Heatmap of the confusion matrix, labelled with the class names on
    # both axes (true label on y, predicted label on x).
    matrix = confusion_matrix(test_y, predicted)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title('Confusion Matrix - Random Forest')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    fig.savefig('lab7_confusion_matrix.png')
    plt.close(fig)
    print("\nConfusion matrix saved as 'lab7_confusion_matrix.png'")

    print("\nFeature Importances:")
    for feature_name, weight in zip(dataset.feature_names, forest.feature_importances_):
        print(f"{feature_name}: {weight:.4f}")


In [None]:
def optimize_n_estimators():
    """Sweep the number of trees and report the best-scoring forest size.

    Trains one forest per value in 10, 20, ..., 200 on a 70/30 split of the
    breast-cancer data, records training and test accuracy, prints every
    fifth result, and saves the accuracy curves to 'lab7_n_estimators.png'.
    The "optimal" size is the first one reaching the highest test accuracy.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Optimizing Number of Trees")
    print(banner)

    dataset = load_breast_cancer()
    train_x, test_x, train_y, test_y = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    tree_counts = range(10, 201, 10)
    train_acc = []
    test_acc = []

    print("\nTesting different numbers of trees:")
    for count in tree_counts:
        forest = RandomForestClassifier(n_estimators=count, random_state=42)
        forest.fit(train_x, train_y)

        acc_on_train = forest.score(train_x, train_y)
        acc_on_test = forest.score(test_x, test_y)
        train_acc.append(acc_on_train)
        test_acc.append(acc_on_test)

        # Only every fifth configuration is echoed to keep the log short.
        if count % 50 == 0:
            print(f"Trees {count}: Train={acc_on_train:.4f}, Test={acc_on_test:.4f}")

    # argmax returns the first maximum, so ties resolve to the smallest forest.
    best_index = int(np.argmax(test_acc))
    best_count = tree_counts[best_index]
    print(f"\nOptimal number of trees: {best_count}")
    print(f"Best test accuracy: {test_acc[best_index]:.4f}")

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(tree_counts, train_acc, 'o-', label='Training Accuracy')
    ax.plot(tree_counts, test_acc, 's-', label='Test Accuracy')
    ax.axvline(x=best_count, color='r', linestyle='--',
               label=f'Optimal n = {best_count}')
    ax.set_xlabel('Number of Trees')
    ax.set_ylabel('Accuracy')
    ax.set_title('Random Forest: Accuracy vs Number of Trees')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig('lab7_n_estimators.png')
    plt.close(fig)
    print("\nOptimal trees plot saved as 'lab7_n_estimators.png'")


In [None]:
def optimize_max_depth():
    """Compare forests with different max_depth limits on the Iris data.

    For each depth setting (None, 3, 5, 7, 10, 15, 20) a 100-tree forest is
    trained on a 70/30 split; training and test accuracy are printed and
    plotted to 'lab7_max_depth.png'.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Optimizing Maximum Depth")
    print(banner)

    dataset = load_iris()
    train_x, test_x, train_y, test_y = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    candidate_depths = [None, 3, 5, 7, 10, 15, 20]
    labels = []
    train_acc = []
    test_acc = []

    print("\nTesting different max_depth values:")
    for limit in candidate_depths:
        forest = RandomForestClassifier(
            n_estimators=100, max_depth=limit, random_state=42
        )
        forest.fit(train_x, train_y)

        acc_on_train = forest.score(train_x, train_y)
        acc_on_test = forest.score(test_x, test_y)
        train_acc.append(acc_on_train)
        test_acc.append(acc_on_test)

        # str(None) is already 'None', so one conversion covers every case.
        label = str(limit)
        labels.append(label)

        print(f"Depth {label}: Train={acc_on_train:.4f}, Test={acc_on_test:.4f}")

    # Depths are plotted at evenly spaced positions because None has no
    # numeric value to place on the axis.
    positions = range(len(labels))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(positions, train_acc, 'o-', label='Training Accuracy')
    ax.plot(positions, test_acc, 's-', label='Test Accuracy')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.set_xlabel('Max Depth')
    ax.set_ylabel('Accuracy')
    ax.set_title('Random Forest: Accuracy vs Max Depth')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig('lab7_max_depth.png')
    plt.close(fig)
    print("\nMax depth plot saved as 'lab7_max_depth.png'")


In [None]:
def feature_importance_analysis():
    """Rank the breast-cancer features by Random Forest importance.

    Fits a 100-tree forest on the full dataset, prints the ten most
    important features (fewer if the dataset has fewer) and saves a bar
    chart of the same features to 'lab7_feature_importance.png'.
    """
    print("\n" + "=" * 50)
    print("Feature Importance Analysis")
    print("=" * 50)

    # Load dataset
    cancer = load_breast_cancer()
    X = cancer.data
    y = cancer.target

    # Train random forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # Feature indices ordered from most to least important.
    importances = rf.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Take the top slice once so the printed list and the chart always show
    # exactly the same features; slicing clamps when fewer than 10 exist.
    top_indices = indices[:10]
    top_names = [cancer.feature_names[i] for i in top_indices]
    top_values = importances[top_indices]

    # Print top 10 features
    print("\nTop 10 Most Important Features:")
    for rank, (name, value) in enumerate(zip(top_names, top_values), start=1):
        print(f"{rank}. {name}: {value:.4f}")

    # Plot feature importances
    positions = range(len(top_indices))
    plt.figure(figsize=(12, 6))
    plt.bar(positions, top_values)
    plt.xticks(positions, top_names, rotation=45, ha='right')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title('Top 10 Feature Importances - Random Forest')
    plt.tight_layout()
    plt.savefig('lab7_feature_importance.png')
    plt.close()
    print("\nFeature importance plot saved as 'lab7_feature_importance.png'")


In [None]:
def compare_with_single_tree():
    """Contrast a single decision tree with a 100-tree forest.

    Both models are trained on the same 70/30 split of the breast-cancer
    data; their test accuracies and the difference are printed, and a bar
    chart of the two accuracies is saved to 'lab7_dt_vs_rf.png'.
    """
    from sklearn.tree import DecisionTreeClassifier

    banner = "=" * 50
    print("\n" + banner)
    print("Random Forest vs Single Decision Tree")
    print(banner)

    dataset = load_breast_cancer()
    train_x, test_x, train_y, test_y = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    # Baseline: one decision tree.
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(train_x, train_y)
    tree_acc = tree.score(test_x, test_y)

    # Ensemble of 100 trees.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_x, train_y)
    forest_acc = forest.score(test_x, test_y)

    print(f"\nDecision Tree Accuracy: {tree_acc:.4f}")
    print(f"Random Forest Accuracy: {forest_acc:.4f}")
    print(f"Improvement: {(forest_acc - tree_acc):.4f}")

    model_names = ['Decision Tree', 'Random Forest']
    scores = [tree_acc, forest_acc]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(model_names, scores, color=['blue', 'green'])
    ax.set_ylabel('Accuracy')
    ax.set_title('Decision Tree vs Random Forest')
    # The y-axis is restricted to the 0.9-1.0 band.
    ax.set_ylim([0.9, 1.0])
    # Write each accuracy value just above its bar.
    for position, score in enumerate(scores):
        ax.text(position, score + 0.005, f'{score:.4f}', ha='center')
    fig.tight_layout()
    fig.savefig('lab7_dt_vs_rf.png')
    plt.close(fig)
    print("\nComparison plot saved as 'lab7_dt_vs_rf.png'")


In [None]:
def out_of_bag_score():
    """Report the out-of-bag accuracy estimate of a forest fit on all of Iris.

    The forest is trained on the complete dataset with ``oob_score=True``
    and the resulting ``oob_score_`` is printed; no train/test split is made.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Out-of-Bag (OOB) Score")
    print(banner)

    dataset = load_iris()

    # oob_score=True asks the forest to compute oob_score_ during fit.
    forest = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
    forest.fit(dataset.data, dataset.target)

    print(f"\nOOB Score: {forest.oob_score_:.4f}")
    print("\nNote: OOB score is an estimate of accuracy without using a test set")


In [None]:
def cross_validation():
    """Evaluate a 100-tree forest on Iris with 5-fold cross-validation.

    Prints the per-fold scores together with their mean and standard
    deviation.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Cross-Validation")
    print(banner)

    dataset = load_iris()
    forest = RandomForestClassifier(n_estimators=100, random_state=42)

    # One score per fold (cv=5).
    fold_scores = cross_val_score(forest, dataset.data, dataset.target, cv=5)

    print(f"\nCross-validation scores: {fold_scores}")
    print(f"Mean accuracy: {fold_scores.mean():.4f}")
    print(f"Standard deviation: {fold_scores.std():.4f}")


In [None]:
def analyze_bootstrap_samples():
    """Compare test accuracy with bootstrap sampling switched on and off.

    Two 100-tree forests that differ only in their ``bootstrap`` flag are
    trained on the same 70/30 split of Iris, and each test accuracy is
    printed under its own heading.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Bootstrap Sampling Analysis")
    print(banner)

    dataset = load_iris()
    train_x, test_x, train_y, test_y = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )

    # (heading, bootstrap flag) pairs, evaluated in this order.
    scenarios = (
        ("\nWith Bootstrap (default):", True),
        ("\nWithout Bootstrap:", False),
    )
    for heading, use_bootstrap in scenarios:
        print(heading)
        forest = RandomForestClassifier(
            n_estimators=100,
            bootstrap=use_bootstrap,
            random_state=42,
        )
        forest.fit(train_x, train_y)
        print(f"Accuracy: {forest.score(test_x, test_y):.4f}")


In [None]:
def main():
    """Run every Lab 7 demonstration in sequence, framed by banner lines."""
    banner = "=" * 50
    print("\n" + banner)
    print("Lab 7: Random Forest Classifier")
    print(banner)

    # Demonstrations in the order they are presented in the lab.
    demonstrations = (
        basic_random_forest,
        optimize_n_estimators,
        optimize_max_depth,
        feature_importance_analysis,
        compare_with_single_tree,
        out_of_bag_score,
        cross_validation,
        analyze_bootstrap_samples,
    )
    for run_demo in demonstrations:
        run_demo()

    print("\n" + banner)
    print("Lab 7 Complete!")
    print(banner)


In [None]:
# Run the full lab only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
