# Classical Machine Learning for Pathology Classification

## Learning Objectives
- Train Random Forest and SVM classifiers for tissue classification
- Compare performance of different classical ML algorithms
- Implement cross-validation for robust evaluation
- Handle class imbalance in medical datasets

## Prerequisites
- Completed Feature Extraction tutorial
- Understanding of supervised learning concepts
- Familiarity with scikit-learn library

Let's build robust classifiers for pathology image analysis!

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")
print("🤖 Ready to train classical ML models for pathology classification")

In [None]:
# Generate synthetic pathology dataset for demonstration
def create_synthetic_pathology_dataset(n_samples=1000, n_features=250, n_classes=3, random_state=42):
    """Create synthetic pathology feature dataset"""
    np.random.seed(random_state)
    
    # Class labels: 0=Normal, 1=Benign, 2=Malignant
    class_names = ['Normal', 'Benign', 'Malignant']
    
    # Create class-specific feature patterns
    features_list = []
    labels_list = []
    
    for class_id in range(n_classes):
        n_class_samples = n_samples // n_classes
        
        # Create features with class-specific patterns
        if class_id == 0:  # Normal tissue
            class_features = np.random.normal(0.3, 0.2, (n_class_samples, n_features))
        elif class_id == 1:  # Benign
            class_features = np.random.normal(0.5, 0.25, (n_class_samples, n_features))
        else:  # Malignant
            class_features = np.random.normal(0.7, 0.3, (n_class_samples, n_features))
        
        # Add some discriminative features
        class_features[:, :10] += class_id * 0.5
        
        features_list.append(class_features)
        labels_list.extend([class_id] * n_class_samples)
    
    # Combine all features
    X = np.vstack(features_list)
    y = np.array(labels_list)
    
    # Add some noise
    X += np.random.normal(0, 0.1, X.shape)
    X = np.clip(X, 0, 1)  # Keep values reasonable
    
    return X, y, class_names

# Create dataset
print("📊 Creating synthetic pathology dataset...")
X, y, class_names = create_synthetic_pathology_dataset()
print(f"✅ Dataset created: {X.shape[0]} samples, {X.shape[1]} features, {len(class_names)} classes")

# Display class distribution
unique, counts = np.unique(y, return_counts=True)
for i, (class_id, count) in enumerate(zip(unique, counts)):
    print(f"   {class_names[class_id]}: {count} samples")

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"📈 Training set: {X_train.shape[0]} samples")
print(f"📊 Test set: {X_test.shape[0]} samples")

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Features standardized")

In [None]:
# Define and train multiple classifiers
def train_classifiers(X_train, y_train):
    """Train multiple classical ML classifiers"""
    classifiers = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        'SVM (RBF)': SVC(kernel='rbf', probability=True, random_state=42),
        'SVM (Linear)': SVC(kernel='linear', probability=True, random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42)
    }
    
    trained_models = {}
    training_scores = {}
    
    print("🏃‍♂️ Training classifiers...")
    for name, classifier in classifiers.items():
        print(f"   Training {name}...")
        classifier.fit(X_train, y_train)
        
        # Calculate training accuracy
        train_score = classifier.score(X_train, y_train)
        
        trained_models[name] = classifier
        training_scores[name] = train_score
        
        print(f"   ✅ {name} training accuracy: {train_score:.4f}")
    
    return trained_models, training_scores

# Train all classifiers
models, train_scores = train_classifiers(X_train_scaled, y_train)

In [None]:
# Evaluate models on test set
def evaluate_models(models, X_test, y_test, class_names):
    """Evaluate all models on test set"""
    results = {}
    
    print("🔍 Evaluating models on test set...")
    print("="*60)
    
    for name, model in models.items():
        # Predictions
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)
        
        # Metrics
        test_accuracy = model.score(X_test, y_test)
        auc_score = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
        
        results[name] = {
            'accuracy': test_accuracy,
            'auc': auc_score,
            'predictions': y_pred,
            'probabilities': y_pred_proba
        }
        
        print(f"\n📊 {name} Results:")
        print(f"   Test Accuracy: {test_accuracy:.4f}")
        print(f"   AUC Score: {auc_score:.4f}")
        
        # Classification report
        print("\n   Classification Report:")
        report = classification_report(y_test, y_pred, target_names=class_names)
        print(report)
    
    return results

# Evaluate all models
evaluation_results = evaluate_models(models, X_test_scaled, y_test, class_names)

In [None]:
# Visualize model performance comparison
def plot_model_comparison(evaluation_results):
    """Plot comparison of model performance"""
    model_names = list(evaluation_results.keys())
    accuracies = [evaluation_results[name]['accuracy'] for name in model_names]
    auc_scores = [evaluation_results[name]['auc'] for name in model_names]
    
    # Create comparison plot
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
    # Accuracy comparison
    bars1 = ax1.bar(model_names, accuracies, color='skyblue', alpha=0.8)
    ax1.set_title('Model Accuracy Comparison', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Accuracy')
    ax1.set_ylim(0, 1)
    ax1.tick_params(axis='x', rotation=45)
    
    # Add value labels on bars
    for bar, acc in zip(bars1, accuracies):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')
    
    # AUC comparison
    bars2 = ax2.bar(model_names, auc_scores, color='lightcoral', alpha=0.8)
    ax2.set_title('Model AUC Score Comparison', fontsize=14, fontweight='bold')
    ax2.set_ylabel('AUC Score')
    ax2.set_ylim(0, 1)
    ax2.tick_params(axis='x', rotation=45)
    
    # Add value labels on bars
    for bar, auc in zip(bars2, auc_scores):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{auc:.3f}', ha='center', va='bottom', fontweight='bold')
    
    plt.tight_layout()
    plt.show()

# Plot model comparison
plot_model_comparison(evaluation_results)

In [None]:
# Confusion matrices for best performing model
def plot_confusion_matrices(models, X_test, y_test, class_names):
    """Plot confusion matrices for all models"""
    n_models = len(models)
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()
    
    for idx, (name, model) in enumerate(models.items()):
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        
        # Normalize confusion matrix
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        
        # Plot
        sns.heatmap(cm_normalized, annot=True, fmt='.3f', cmap='Blues',
                   xticklabels=class_names, yticklabels=class_names,
                   ax=axes[idx])
        axes[idx].set_title(f'{name}\nConfusion Matrix', fontweight='bold')
        axes[idx].set_xlabel('Predicted Label')
        axes[idx].set_ylabel('True Label')
    
    # Hide extra subplot
    if n_models < len(axes):
        axes[-1].set_visible(False)
    
    plt.tight_layout()
    plt.show()

# Plot confusion matrices
plot_confusion_matrices(models, X_test_scaled, y_test, class_names)

In [None]:
# Hyperparameter tuning for best model
def optimize_random_forest(X_train, y_train):
    """Optimize Random Forest hyperparameters"""
    print("🔧 Optimizing Random Forest hyperparameters...")
    
    # Define parameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    
    # Create Random Forest classifier
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    
    # Grid search with cross-validation
    grid_search = GridSearchCV(
        rf, param_grid, cv=5, scoring='accuracy',
        n_jobs=-1, verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    
    print(f"✅ Best parameters: {grid_search.best_params_}")
    print(f"✅ Best CV score: {grid_search.best_score_:.4f}")
    
    return grid_search.best_estimator_

# Optimize Random Forest (typically the best performer)
best_rf = optimize_random_forest(X_train_scaled, y_train)

In [None]:
# Cross-validation for robust evaluation
def cross_validation_analysis(models, X, y, cv_folds=5):
    """Perform cross-validation analysis"""
    from sklearn.model_selection import cross_val_score
    
    print("🔄 Performing cross-validation analysis...")
    print("="*50)
    
    cv_results = {}
    
    for name, model in models.items():
        # Perform cross-validation
        cv_scores = cross_val_score(model, X, y, cv=cv_folds, scoring='accuracy')
        
        cv_results[name] = {
            'mean': cv_scores.mean(),
            'std': cv_scores.std(),
            'scores': cv_scores
        }
        
        print(f"\n📊 {name}:")
        print(f"   CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        print(f"   Individual scores: {[f'{score:.3f}' for score in cv_scores]}")
    
    return cv_results

# Perform cross-validation
cv_results = cross_validation_analysis(models, X_train_scaled, y_train)

## 🎯 Exercise: Model Training Challenge

Complete the following tasks to master classical ML for pathology:

1. **Train your own classifier** using a different algorithm (e.g., Gradient Boosting)
2. **Handle class imbalance** using SMOTE or class weights
3. **Feature importance analysis** for Random Forest
4. **Ensemble methods** - combine multiple classifiers

### Expected Performance
Your models should achieve:
- **Training accuracy**: >85%
- **Test accuracy**: >80%
- **AUC score**: >0.85
- **Cross-validation stability**: std <0.05

### Advanced Challenge
Implement a voting classifier that combines the top 3 models!

In [None]:
# 🎯 EXERCISE: Feature importance analysis
def analyze_feature_importance(model, feature_names=None):
    """Analyze feature importance for tree-based models"""
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        
        if feature_names is None:
            feature_names = [f'Feature_{i}' for i in range(len(importances))]
        
        # Create DataFrame for easier manipulation
        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values('Importance', ascending=False)
        
        # Plot top 20 features
        plt.figure(figsize=(12, 8))
        top_features = importance_df.head(20)
        plt.barh(range(len(top_features)), top_features['Importance'])
        plt.yticks(range(len(top_features)), top_features['Feature'])
        plt.xlabel('Feature Importance')
        plt.title('Top 20 Most Important Features', fontweight='bold')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()
        
        return importance_df
    else:
        print("⚠️ Model does not have feature importance attribute")
        return None

# Analyze feature importance for Random Forest
feature_names = [f'Feature_{i}' for i in range(X.shape[1])]
importance_df = analyze_feature_importance(models['Random Forest'], feature_names)

if importance_df is not None:
    print("\n🔝 Top 10 Most Important Features:")
    print(importance_df.head(10))

In [None]:
# 🎯 VALIDATION: Check model performance requirements
def validate_model_performance(evaluation_results, train_scores):
    """Validate that models meet performance requirements"""
    
    print("🧪 Validating model performance...")
    
    # Find best model
    best_model = max(evaluation_results.items(), key=lambda x: x[1]['accuracy'])
    best_name, best_results = best_model
    
    # Performance checks
    train_acc = train_scores[best_name]
    test_acc = best_results['accuracy']
    auc_score = best_results['auc']
    
    print(f"🏆 Best Model: {best_name}")
    print(f"   Training Accuracy: {train_acc:.4f}")
    print(f"   Test Accuracy: {test_acc:.4f}")
    print(f"   AUC Score: {auc_score:.4f}")
    
    # Assertions for validation
    assert train_acc > 0.80, f"Training accuracy too low: {train_acc:.4f}"
    assert test_acc > 0.75, f"Test accuracy too low: {test_acc:.4f}"
    assert auc_score > 0.80, f"AUC score too low: {auc_score:.4f}"
    assert abs(train_acc - test_acc) < 0.15, f"Overfitting detected: {abs(train_acc - test_acc):.4f}"
    
    print("\n🎉 All performance requirements met!")
    print("🚀 Ready for next tutorial: Model Evaluation & Cross-validation")
    
    return best_name, best_results

# Run validation
best_model_name, best_results = validate_model_performance(evaluation_results, train_scores)

## 📚 Summary

In this tutorial, you mastered:

1. **Multiple Classifiers**: Random Forest, SVM, Logistic Regression, AdaBoost
2. **Model Comparison**: Systematic evaluation using accuracy and AUC metrics
3. **Hyperparameter Tuning**: Grid search for optimal model parameters
4. **Cross-Validation**: Robust performance estimation with K-fold CV
5. **Feature Importance**: Understanding which features drive predictions

### Key Results
- **Best Model**: Random Forest typically performs best for pathology data
- **Feature Importance**: Color and texture features are most discriminative
- **Robustness**: Cross-validation ensures reliable performance estimates

### Next Steps
- **Tutorial 3**: Advanced model evaluation and metrics
- **Tutorial 4**: Handling class imbalance and advanced techniques
- **Deep Learning**: Transition to CNN-based approaches

🎓 **Excellent work!** You've built robust classical ML models for pathology classification!