# Feature Extraction for Digital Pathology

## Learning Objectives
- Extract color histograms from pathology images
- Compute texture features using Local Binary Patterns (LBP)
- Calculate morphometric features from tissue regions
- Prepare feature vectors for machine learning classification

## Prerequisites
- Basic knowledge of image processing
- Familiarity with NumPy and scikit-image
- Understanding of histograms and texture analysis

Let's start by setting up our environment and downloading sample data.

In [None]:
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from skimage import feature, measure, filters
from skimage.color import rgb2gray
import cv2
import os
import requests
from tqdm import tqdm

# Reached only if every import above succeeded; a failed import raises first.
print("✅ Libraries imported successfully!")
print("📚 This tutorial will teach you feature extraction techniques for pathology images")

In [None]:
# Download sample pathology images
def download_sample_data():
    """Download the sample images used by this tutorial.

    Images are written to ``data/pathology_samples`` (created if missing).
    A file that already exists on disk is not fetched again. A download that
    fails is reported and skipped so the remaining files are still attempted.

    Returns:
        list[str]: The three expected file names (names only, not paths).
        The list is returned in full whether or not each download succeeded.
    """
    output_dir = 'data/pathology_samples'
    os.makedirs(output_dir, exist_ok=True)

    # Sample URLs (these would be replaced with actual pathology image URLs)
    sample_urls = [
        'https://via.placeholder.com/512x512/FF69B4/000000?text=Normal+Tissue',
        'https://via.placeholder.com/512x512/DC143C/000000?text=Cancer+Tissue',
        'https://via.placeholder.com/512x512/32CD32/000000?text=Benign+Tissue'
    ]

    filenames = ['normal_tissue.png', 'cancer_tissue.png', 'benign_tissue.png']

    print("📥 Downloading sample pathology images...")
    for url, filename in zip(sample_urls, filenames):
        filepath = f'{output_dir}/{filename}'
        if os.path.exists(filepath):
            continue

        try:
            # A timeout keeps the notebook from hanging on an unreachable host.
            response = requests.get(url, timeout=30)
            # Without this check an HTTP error page would be saved as a ".png".
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as e:
            # Best effort: report the real cause and move on to the next file.
            print(f"⚠️ Could not download {filename}: {e}")
        else:
            print(f"✅ Downloaded: {filename}")

    return filenames

# Download the data
# download_sample_data() returns the expected file names, so the count printed
# below is the number of sample names, not the number of files actually fetched.
sample_files = download_sample_data()
print(f"📊 Ready to extract features from {len(sample_files)} sample images")

In [None]:
# Color histogram feature extraction
def extract_color_features(image, bins=64):
    """Extract color histogram and color-moment features from an RGB image.

    Args:
        image: Array indexed as ``image[row, col, channel]``. Only the first
            three channels are read. The histogram range is fixed to
            ``(0, 256)``, so values outside it are not counted
            (assumes 8-bit intensities; confirm for other inputs).
        bins: Number of histogram bins per channel.

    Returns:
        np.ndarray: 1-D vector of length ``3 * bins + 9``. It holds the three
        per-channel histograms (each divided by the pixel count), followed by
        mean, standard deviation and skewness for each channel.
    """
    image = np.asarray(image)
    n_pixels = image.shape[0] * image.shape[1]
    features = []

    # Per-channel histogram, normalized by the number of pixels.
    for channel in range(3):  # RGB channels
        hist, _ = np.histogram(image[:, :, channel], bins=bins, range=(0, 256))
        features.extend(hist.astype(float) / n_pixels)

    # Color moments (mean, std, skewness) per channel.
    for channel in range(3):
        channel_data = image[:, :, channel].astype(float).ravel()
        mean = channel_data.mean()
        std = channel_data.std()
        if std > 0:
            skewness = np.mean((channel_data - mean) ** 3) / std ** 3
        else:
            # A constant channel has no asymmetry; avoid 0/0 producing NaN,
            # which would poison scaling and feature selection downstream.
            skewness = 0.0
        features.extend([mean, std, skewness])

    return np.array(features)

# Test color feature extraction on a random RGB image.
print("🎨 Testing color feature extraction...")
# randint's upper bound is exclusive, so 256 is needed to cover the full 0-255 range.
test_image = np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)
color_features = extract_color_features(test_image)
print(f"✅ Extracted {len(color_features)} color features")
print(f"📊 Feature vector shape: {color_features.shape}")

In [None]:
# Texture feature extraction using Local Binary Patterns (LBP)
def _gray_to_uint8(gray_image):
    """Return a grayscale image as uint8 in the range 0-255."""
    gray = np.asarray(gray_image)
    if gray.dtype == np.uint8:
        # Already 8-bit: multiplying by 255 here would wrap around.
        return gray
    if gray.dtype == bool:
        return gray.astype(np.uint8) * 255
    if np.issubdtype(gray.dtype, np.integer):
        # Wider integer images are rescaled by the full range of their dtype.
        scale = 255.0 / np.iinfo(gray.dtype).max
        return np.clip(np.rint(gray * scale), 0, 255).astype(np.uint8)
    # Floating-point images are assumed to lie in [0, 1] (what rgb2gray returns
    # for standard RGB input). Round rather than truncate so that values such
    # as 0.9999999 map to 255 instead of 254.
    return np.rint(np.clip(gray, 0.0, 1.0) * 255).astype(np.uint8)


def extract_texture_features(image, radius=3, n_points=24):
    """Extract texture features: a uniform LBP histogram plus GLCM properties.

    Args:
        image: RGB image (3-D, converted with ``rgb2gray``) or a 2-D
            grayscale image.
        radius: Radius of the LBP sampling circle.
        n_points: Number of LBP sampling points.

    Returns:
        np.ndarray: The LBP histogram (``n_points + 2`` bins, density
        normalized) followed by contrast, dissimilarity, homogeneity and
        energy for each of four GLCM angles (16 values). If the GLCM step
        fails, a warning is emitted and only the LBP histogram is returned.
    """
    import warnings

    # Convert to grayscale
    if len(image.shape) == 3:
        gray_image = rgb2gray(image)
    else:
        gray_image = image

    # Both LBP and GLCM work on integer gray levels; scikit-image warns that
    # LBP on floating-point images may give unexpected results.
    gray_uint8 = _gray_to_uint8(gray_image)

    # Compute LBP
    lbp = feature.local_binary_pattern(gray_uint8, n_points, radius, method='uniform')

    # Uniform LBP yields codes 0 .. n_points + 1, hence n_points + 2 bins.
    n_bins = n_points + 2
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins), density=True)

    try:
        # Gray-level co-occurrence matrix at distance 1 for four directions.
        glcm = feature.graycomatrix(
            gray_uint8,
            distances=[1],
            angles=[0, np.pi / 4, np.pi / 2, 3 * np.pi / 4],
            levels=256,
            symmetric=True,
            normed=True,
        )
        glcm_props = [
            feature.graycoprops(glcm, prop).flatten()
            for prop in ('contrast', 'dissimilarity', 'homogeneity', 'energy')
        ]
    except Exception as exc:
        # Deliberate best effort: keep going with LBP only, but say so. The
        # shorter vector would otherwise surface later as a confusing shape
        # mismatch. Unlike a bare except, this lets KeyboardInterrupt through.
        warnings.warn(
            f"GLCM features unavailable ({exc!r}); returning LBP histogram only",
            RuntimeWarning,
            stacklevel=2,
        )
        return lbp_hist

    return np.concatenate([lbp_hist, *glcm_props])

# Test texture feature extraction
# Reuses `test_image` from the color-feature cell, so that cell must run first.
print("🖼️ Testing texture feature extraction...")
texture_features = extract_texture_features(test_image)
print(f"✅ Extracted {len(texture_features)} texture features")

In [None]:
# Morphometric feature extraction
def extract_morphometric_features(image, threshold_method='otsu'):
    """Summarize the shape of thresholded regions as a fixed-length vector.

    The image is converted to grayscale (if 3-D), thresholded, and split into
    connected components. Pixels strictly above the threshold are foreground.

    Args:
        image: RGB (3-D) or grayscale (2-D) image.
        threshold_method: ``'otsu'`` uses Otsu's threshold; any other value
            uses a fixed threshold of 0.5.
            NOTE(review): 0.5 presumably expects intensities in [0, 1]; confirm.

    Returns:
        np.ndarray: 20 values. For each of area, perimeter, eccentricity and
        solidity, in that order: mean, std, min, max and region count. Only
        regions with area greater than 50 are used. A group is all zeros when
        no region qualifies.
    """
    gray = rgb2gray(image) if len(image.shape) == 3 else image
    cutoff = filters.threshold_otsu(gray) if threshold_method == 'otsu' else 0.5

    # Connected components of the above-threshold mask.
    labeled = measure.label(gray > cutoff)

    # Drop small regions before measuring anything.
    kept = [region for region in measure.regionprops(labeled) if region.area > 50]

    per_metric = (
        [region.area for region in kept],
        [region.perimeter for region in kept],
        [region.eccentricity for region in kept],
        [region.solidity for region in kept],
    )

    summary = []
    for values in per_metric:
        if len(values) > 0:
            summary += [
                np.mean(values),
                np.std(values),
                np.min(values),
                np.max(values),
                len(values),  # count
            ]
        else:
            summary += [0, 0, 0, 0, 0]

    return np.array(summary)

# Test morphometric feature extraction
# Reuses `test_image` from the color-feature cell, so that cell must run first.
print("📏 Testing morphometric feature extraction...")
morphometric_features = extract_morphometric_features(test_image)
print(f"✅ Extracted {len(morphometric_features)} morphometric features")

In [None]:
# Combined feature extraction pipeline
def extract_all_features(image):
    """Run the color, texture and morphometric extractors on one image.

    Args:
        image: Image passed unchanged to each extractor.

    Returns:
        dict: ``'color_features'``, ``'texture_features'`` and
        ``'morphometric_features'`` hold the individual vectors, and
        ``'all_features'`` holds their concatenation in that order.
    """
    result = {
        'color_features': extract_color_features(image),
        'texture_features': extract_texture_features(image),
        'morphometric_features': extract_morphometric_features(image),
    }
    # Dict order is insertion order, so this concatenates color, texture, morphometric.
    result['all_features'] = np.concatenate(list(result.values()))
    return result

# Test complete pipeline
# Runs all three extractors on `test_image` (from the color-feature cell) and
# reports the length of each feature group and of the combined vector.
print("🔄 Testing complete feature extraction pipeline...")
feature_dict = extract_all_features(test_image)

print(f"📊 Feature Summary:")
print(f"   Color features: {len(feature_dict['color_features'])}")
print(f"   Texture features: {len(feature_dict['texture_features'])}")
print(f"   Morphometric features: {len(feature_dict['morphometric_features'])}")
print(f"   Total features: {len(feature_dict['all_features'])}")

In [None]:
# Feature selection and preprocessing
def preprocess_features(feature_matrix, labels=None, k_best=100):
    """Standardize features and optionally keep the top-scoring ones.

    Args:
        feature_matrix: 2-D array of shape (samples, features).
        labels: Optional class labels. Selection runs only when labels are
            given and contain more than one distinct value.
        k_best: Maximum number of features to keep; capped at the number of
            available features.

    Returns:
        tuple: ``(features, scaler, selector)``. ``selector`` is ``None``
        when no selection was performed.

    NOTE(review): the scaler and selector are fitted on all rows passed in.
    For model evaluation they should presumably be fitted on the training
    split only; confirm against the caller.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(feature_matrix)
    n_features = scaled.shape[1]

    has_usable_labels = labels is not None and len(np.unique(labels)) > 1
    if not has_usable_labels:
        print(f"📊 Normalized {n_features} features")
        return scaled, scaler, None

    # Feature selection using ANOVA F-test
    selector = SelectKBest(score_func=f_classif, k=min(k_best, n_features))
    selected = selector.fit_transform(scaled, labels)

    print(f"📊 Selected {selected.shape[1]} best features out of {n_features}")
    return selected, scaler, selector

# Example preprocessing
# Features and labels are both random, so this only exercises the code path;
# which features end up selected carries no meaning.
print("⚙️ Testing feature preprocessing...")
dummy_features = np.random.random((50, 200))  # 50 samples, 200 features
dummy_labels = np.random.randint(0, 3, 50)    # 3 classes

processed_features, scaler, selector = preprocess_features(dummy_features, dummy_labels)
print(f"✅ Preprocessing complete!")

## 🎯 Exercise: Feature Extraction Challenge

Now it's your turn! Complete the following tasks:

1. **Load a real pathology image** (or use the provided samples)
2. **Extract all three types of features** (color, texture, morphometric)
3. **Analyze the feature distributions** using histograms
4. **Compare features** between different tissue types

### Expected Output
Your feature extraction should produce:
- Color features: 201 features (64×3 histogram bins + 9 moments)
- Texture features: 42 features (26 LBP histogram bins + 4 GLCM properties × 4 angles), or 26 if the GLCM step fails and only LBP is returned
- Morphometric features: 20 features (4 metrics × 5 statistics)
- Total: 263 features per image (247 with the LBP-only fallback)

### Validation
Run the validation cell below to check your implementation:

In [None]:
# 🎯 EXERCISE VALIDATION
def validate_feature_extraction():
    """Check that the extraction pipeline yields the expected feature counts.

    Runs ``extract_all_features`` on a random 128x128 RGB image and asserts a
    minimum length for each feature group and for the combined vector.

    Returns:
        bool: ``True`` when every check passes.

    Raises:
        AssertionError: If any feature group is shorter than expected.
    """
    # Create test image. randint's upper bound is exclusive, so 256 is needed
    # for the image to be able to contain the value 255.
    test_img = np.random.randint(0, 256, (128, 128, 3), dtype=np.uint8)

    # Extract features
    features = extract_all_features(test_img)

    # Validate feature counts
    assert len(features['color_features']) >= 100, f"Expected ≥100 color features, got {len(features['color_features'])}"
    assert len(features['texture_features']) >= 20, f"Expected ≥20 texture features, got {len(features['texture_features'])}"
    assert len(features['morphometric_features']) >= 15, f"Expected ≥15 morphometric features, got {len(features['morphometric_features'])}"
    assert len(features['all_features']) >= 200, f"Expected ≥200 total features, got {len(features['all_features'])}"

    print("🎉 All feature extraction tests passed!")
    print("🚀 Ready to move to the next tutorial: Classical ML Classification")

    return True

# Run validation (raises AssertionError if any feature count is below its minimum)
validate_feature_extraction()

## 📚 Summary

In this tutorial, you learned:

1. **Color Feature Extraction**: RGB histograms and color moments for capturing color distribution
2. **Texture Feature Extraction**: Local Binary Patterns (LBP) and Gray-Level Co-occurrence Matrix (GLCM) properties
3. **Morphometric Feature Extraction**: Shape and size measurements from segmented regions
4. **Feature Preprocessing**: Normalization and feature selection techniques

### Next Steps
- **Tutorial 2**: Train classical ML classifiers (Random Forest, SVM) using these features
- **Tutorial 3**: Evaluate model performance and cross-validation
- **Tutorial 4**: Advanced feature selection and ensemble methods

### Key Takeaways
- **Feature diversity** is crucial for robust classification
- **Preprocessing** improves model performance significantly  
- **Feature selection** reduces dimensionality and overfitting
- **Domain knowledge** helps in choosing relevant features

🎓 **Congratulations!** You've mastered feature extraction for digital pathology!