In [None]:
# Automatic Dataset Preparation for UMAP and Dimensionality Reduction (persistent across notebooks)
import sys, os
from pathlib import Path
import pandas as pd, numpy as np, json

# Bootstrap shared utils (Colab-friendly)
try:
    from shared import utils as u
except ImportError:
    repo_url = "https://github.com/anand-indx/dp-t25.git"
    dest = "/content/dp-t25"
    if 'google.colab' in sys.modules:
        # Clone only when the checkout is missing, but always put it on the
        # import path: after a runtime restart the directory already exists
        # and must be reused rather than treated as a local (non-Colab) run.
        if not os.path.exists(dest):
            import subprocess
            clone = subprocess.run(['git', 'clone', '--depth', '1', repo_url, dest], check=False)
            if clone.returncode != 0:
                # Best effort: report the failure; the import below will then
                # raise ImportError with the real cause visible above it.
                print(f"Warning: git clone of {repo_url} failed (exit code {clone.returncode})")
        sys.path.insert(0, dest)
    else:
        # Local run: the package is looked up two directory levels above the
        # current working directory. Skip the insert when the working
        # directory is too shallow to have such an ancestor.
        cwd_parents = Path.cwd().parents
        if len(cwd_parents) > 1:
            sys.path.insert(0, str(cwd_parents[1]))
    from shared import utils as u

# All generated files for this notebook live in one sub-directory of the
# shared data directory.
DATA_DIR = u.get_data_dir()
UMAP_DIR = DATA_DIR / "dimensionality_data"
UMAP_DIR.mkdir(parents=True, exist_ok=True)

from sklearn.datasets import make_classification, make_blobs

def create_dimensionality_datasets():
    """Create high-dimensional datasets for UMAP and dimensionality reduction.

    Generates four synthetic datasets, writes each one as a CSV file inside
    ``UMAP_DIR`` and writes a JSON description of them
    (``dimensionality_metadata.json``) next to the CSV files:

    * ``high_dim_pathology_features.csv`` - ``make_classification`` features
      with tissue-class labels and a few metadata columns.
    * ``multimodal_features.csv`` - imaging, genomic and clinical feature
      blocks stacked side by side, plus a binary outcome.
    * ``clustering_validation.csv`` - ``make_blobs`` clusters with extra
      pure-noise columns.
    * ``timeseries_embedding.csv`` - one row per patient and timepoint with a
      biomarker vector that follows one of four trajectory shapes.

    Side effects: reseeds NumPy's global random generator with 789, prints
    progress messages and (over)writes the files listed above.

    Returns:
        tuple: ``(UMAP_DIR, datasets, metadata)`` where ``datasets`` maps a
        dataset name to the path (as ``str``) of its CSV file and
        ``metadata`` is the dictionary written to the JSON file.
    """
    np.random.seed(789)  # For reproducible embeddings

    datasets = {}

    # 1. High-dimensional pathology feature dataset
    print("Creating high-dimensional pathology features...")

    n_samples = 800
    n_features = 150
    n_classes = 4

    # Generate classification dataset with realistic pathology structure
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=int(n_features * 0.7),  # 70% informative features
        n_redundant=int(n_features * 0.2),    # 20% redundant features
        n_clusters_per_class=2,
        n_classes=n_classes,
        class_sep=1.2,
        flip_y=0.05,  # 5% label noise
        random_state=789
    )

    # Create meaningful feature names; the group sizes add up to n_features
    feature_groups = [('morphology', 50), ('texture', 30), ('intensity', 25),
                      ('shape', 20), ('color', 15), ('spatial', 10)]
    feature_names = [f'{group}_{i:03d}'
                     for group, group_size in feature_groups
                     for i in range(group_size)]

    # Create DataFrame
    pathology_df = pd.DataFrame(X, columns=feature_names)
    pathology_df['sample_id'] = [f'P{i:04d}' for i in range(n_samples)]

    # Add meaningful class labels
    class_labels = ['Normal_Tissue', 'Low_Grade_Tumor', 'High_Grade_Tumor', 'Metastatic']
    pathology_df['tissue_class'] = [class_labels[label] for label in y]
    pathology_df['numeric_class'] = y

    # Add additional metadata
    pathology_df['patient_age'] = np.random.normal(65, 12, n_samples).astype(int)
    # Class 0 is the normal-tissue class, so it gets a tumor size of 0
    pathology_df['tumor_size'] = np.where(y > 0, np.random.lognormal(3, 0.5, n_samples), 0)
    pathology_df['grade'] = np.where(y == 0, 0,
                                   np.where(y == 1, 1,
                                          np.where(y == 2, 2, 3)))

    pathology_file = UMAP_DIR / "high_dim_pathology_features.csv"
    pathology_df.to_csv(pathology_file, index=False)
    datasets['pathology_features'] = str(pathology_file)

    # 2. Multi-modal dataset (imaging + genomics + clinical)
    print("Building multi-modal dataset...")

    n_multimodal = 500
    n_imaging = 50
    n_genomic = 100
    n_clinical = 20

    # Imaging features (extracted from CNNs, etc.)
    imaging_features = np.random.normal(0, 1, (n_multimodal, n_imaging))

    # Genomic features (mutation status, expression levels)
    genomic_features = np.random.lognormal(0, 1, (n_multimodal, n_genomic))

    # Clinical features (normalized)
    clinical_features = np.random.beta(2, 3, (n_multimodal, n_clinical))

    # Combine all modalities
    multimodal_features = np.hstack([imaging_features, genomic_features, clinical_features])

    # Create structured outcomes: split at the median of the mean of the
    # first 30 columns, which gives two equally sized outcome groups
    outcome_probs = np.mean(multimodal_features[:, :30], axis=1)
    outcomes = (outcome_probs > np.median(outcome_probs)).astype(int)

    multimodal_df = pd.DataFrame(multimodal_features,
                               columns=([f'imaging_{i:02d}' for i in range(n_imaging)] +
                                       [f'genomic_{i:02d}' for i in range(n_genomic)] +
                                       [f'clinical_{i:02d}' for i in range(n_clinical)]))

    multimodal_df['patient_id'] = [f'M{i:04d}' for i in range(n_multimodal)]
    multimodal_df['outcome'] = outcomes
    multimodal_df['modality_imaging'] = 1
    multimodal_df['modality_genomic'] = np.random.choice([0, 1], n_multimodal, p=[0.3, 0.7])
    multimodal_df['modality_clinical'] = 1

    multimodal_file = UMAP_DIR / "multimodal_features.csv"
    multimodal_df.to_csv(multimodal_file, index=False)
    datasets['multimodal_features'] = str(multimodal_file)

    # 3. Clustering validation dataset
    print("Creating clustering validation dataset...")

    n_cluster_samples = 600
    n_blob_features = 75
    n_noise_features = 25
    cluster_centers = 8
    cluster_std = 2.0

    X_blobs, y_blobs = make_blobs(
        n_samples=n_cluster_samples,
        centers=cluster_centers,
        n_features=n_blob_features,
        cluster_std=cluster_std,
        random_state=789,
    )

    # One name per blob centre (len(cluster_types) == cluster_centers)
    cluster_types = ['Epithelial', 'Stromal', 'Immune_High', 'Immune_Low',
                    'Necrotic', 'Vascular', 'Neural', 'Adipose']

    clustering_df = pd.DataFrame(X_blobs,
                                 columns=[f'feature_{i:03d}' for i in range(n_blob_features)])
    clustering_df['sample_id'] = [f'C{i:04d}' for i in range(n_cluster_samples)]
    clustering_df['true_cluster'] = y_blobs
    clustering_df['cluster_type'] = [cluster_types[cluster] for cluster in y_blobs]

    # Columns of pure noise, appended after the label columns
    noise_features = np.random.normal(0, 0.5, (n_cluster_samples, n_noise_features))
    noise_df = pd.DataFrame(noise_features,
                            columns=[f'noise_{i:02d}' for i in range(n_noise_features)])
    clustering_df = pd.concat([clustering_df, noise_df], axis=1)

    clustering_file = UMAP_DIR / "clustering_validation.csv"
    clustering_df.to_csv(clustering_file, index=False)
    datasets['clustering_validation'] = str(clustering_file)

    # 4. Time-series embedding dataset
    print("Generating time-series embedding data...")

    n_timepoints = 50
    n_patients_ts = 100
    n_biomarkers = 30

    # trajectory name -> (trend as a function of the timepoint, noise std).
    # The key order is the order handed to np.random.choice below.
    trajectory_models = {
        'stable': (lambda t: 5, 0.5),
        'progressive': (lambda t: 5 + 0.1 * t, 0.8),
        'responsive': (lambda t: 5 + 3 * np.exp(-0.1 * t), 0.6),
        'resistant': (lambda t: 5 + 2 * (1 - np.exp(-0.05 * t)), 0.7),
    }
    trajectory_types = list(trajectory_models)
    biomarker_names = [f'biomarker_{i:02d}' for i in range(n_biomarkers)]

    timeseries_embedding_data = []

    for patient in range(n_patients_ts):
        trajectory_type = np.random.choice(trajectory_types)
        trend, noise_std = trajectory_models[trajectory_type]
        for t in range(n_timepoints):
            # One vectorised draw per timepoint replaces n_biomarkers scalar
            # draws; every biomarker shares the trend and gets its own noise
            biomarker_vector = trend(t) + np.random.normal(0, noise_std, n_biomarkers)
            timeseries_embedding_data.append({
                'patient_id': f'T{patient:03d}',
                'timepoint': t,
                'trajectory_type': trajectory_type,
                **dict(zip(biomarker_names, biomarker_vector))
            })

    timeseries_embedding_df = pd.DataFrame(timeseries_embedding_data)
    timeseries_embedding_file = UMAP_DIR / "timeseries_embedding.csv"
    timeseries_embedding_df.to_csv(timeseries_embedding_file, index=False)
    datasets['timeseries_embedding'] = str(timeseries_embedding_file)

    # Every size below is read from the variables used to build the data so
    # the description cannot drift away from the files
    metadata = {
        'creation_info': {
            'date': pd.Timestamp.now().isoformat(),
            'purpose': 'UMAP and dimensionality reduction demonstrations',
            'random_seed': 789,
            'sklearn_version': 'Compatible with scikit-learn API'
        },
        'datasets': {
            'pathology_features': {
                'description': 'High-dimensional pathology features for tissue classification',
                'dimensions': f"{n_samples} samples × {n_features} features",
                'classes': class_labels,
            },
            'multimodal_features': {
                'description': 'Multi-modal dataset combining imaging, genomics, and clinical data',
                'dimensions': f"{n_multimodal} samples × {multimodal_features.shape[1]} features",
            },
            'clustering_validation': {
                'description': 'Blob clusters for clustering algorithm validation',
                'n_clusters': cluster_centers,
                'cluster_types': cluster_types,
                'dimensions': (f"{n_cluster_samples} samples × "
                               f"{n_blob_features + n_noise_features} features "
                               f"({n_blob_features} informative + {n_noise_features} noise)"),
            },
            'timeseries_embedding': {
                'description': 'Time-series biomarker data for trajectory analysis',
                'n_patients': n_patients_ts,
                'n_timepoints': n_timepoints,
                'n_biomarkers': n_biomarkers,
                'trajectory_types': trajectory_types
            },
        },
        'umap_parameters': {
            'recommended_settings': {
                'n_neighbors': [5, 15, 50],
                'min_dist': [0.1, 0.3, 0.5],
                'n_components': [2, 3],
                'metric': ['euclidean', 'manhattan', 'cosine']
            }
        }
    }

    metadata_file = UMAP_DIR / "dimensionality_metadata.json"
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"Dimensionality reduction datasets ready! {len(datasets)} datasets created")
    print(f"Data location: {UMAP_DIR}")

    return UMAP_DIR, datasets, metadata

# Build every dataset once and keep the three results under separate names
dataset_outputs = create_dimensionality_datasets()
umap_data_dir, available_umap_datasets, umap_metadata = dataset_outputs

# List each dataset by name together with the file name of its CSV
print("Available dimensionality reduction datasets:")
for name, path in available_umap_datasets.items():
    csv_name = Path(path).name
    print(f"  - {name}: {csv_name}")

# Optional: tell the user whether the umap package can be imported
try:
    import umap
except ImportError:
    print("UMAP not installed. Run: pip install umap-learn")
else:
    print("UMAP is available for use")

# UMAP and Dimensionality Reduction for Digital Pathology

This notebook covers advanced dimensionality reduction techniques including UMAP, t-SNE, and PCA specifically for digital pathology data analysis. Learn to visualize high-dimensional pathology features and discover hidden patterns.

## Learning Objectives
1. Master UMAP for non-linear dimensionality reduction
2. Compare t-SNE, PCA, and UMAP techniques
3. Optimize hyperparameters for pathology data
4. Create interactive dimensionality reduction plots
5. Analyze clusters and subpopulations in pathology data
6. Validate dimensionality reduction results

## Prerequisites
- Completed correlation and statistical visualization notebooks
- Understanding of high-dimensional data concepts
- Basic knowledge of machine learning principles

## 1. Environment Setup and Data Preparation

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score
import umap.umap_ as umap
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: figure size and base font size, then the seaborn grid style
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})
sns.set_style("whitegrid")

# Plotly is optional; PLOTLY_AVAILABLE records whether its imports succeeded
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
except ImportError:
    PLOTLY_AVAILABLE = False
    print("⚠️ Plotly not available - using matplotlib for static plots")
else:
    PLOTLY_AVAILABLE = True
    print("✅ Plotly available for interactive visualizations")

print("✅ Libraries imported successfully!")

In [None]:
# Create comprehensive high-dimensional pathology dataset
np.random.seed(42)
n_samples = 800  # More samples for better dimensionality reduction

print("Creating comprehensive pathology dataset...")

# Define different tissue/cancer types for ground truth clusters
tissue_types = ['Normal', 'Benign', 'Low_Grade_Cancer', 'High_Grade_Cancer']
tissue_probs = [0.25, 0.25, 0.25, 0.25]
tissue_labels = np.random.choice(tissue_types, n_samples, p=tissue_probs)


def _simulate_feature_group(prefix, n_features, class_params, labels):
    """Simulate one family of features whose distribution depends on the label.

    Args:
        prefix: Family name; columns are called ``{prefix}_feature_01`` etc.
        n_features: Number of columns to generate.
        class_params: Mapping ``label -> (mean, std)`` of the normal
            distribution used for samples carrying that label.
        labels: Array with one label per sample.

    Returns:
        dict: Column name -> array of ``len(labels)`` values, where position
        ``k`` is drawn from the distribution of ``labels[k]``.
    """
    n = len(labels)
    group = {}
    for i in range(1, n_features + 1):
        values = np.full(n, np.nan)
        for tissue, (mean, std) in class_params.items():
            # Draw a full-length vector per class and keep only the positions
            # that carry this class. Writing through the mask keeps every
            # value on the row of its own label (concatenating the per-class
            # slices would regroup the values by class and detach them from
            # the randomly ordered labels).
            draws = np.random.normal(mean, std, n)
            mask = labels == tissue
            values[mask] = draws[mask]
        group[f'{prefix}_feature_{i:02d}'] = values
    return group


# Nuclear morphology features (20 features simulating different nuclear measurements)
print("• Nuclear morphology features...")
nuclear_features = _simulate_feature_group('nuclear', 20, {
    'Normal': (100, 15),
    'Benign': (110, 18),
    'Low_Grade_Cancer': (130, 25),
    'High_Grade_Cancer': (160, 35),
}, tissue_labels)

# Chromatin texture features (15 features)
print("• Chromatin texture features...")
chromatin_features = _simulate_feature_group('chromatin', 15, {
    'Normal': (0.5, 0.1),
    'Benign': (0.6, 0.12),
    'Low_Grade_Cancer': (0.75, 0.15),
    'High_Grade_Cancer': (0.9, 0.2),
}, tissue_labels)

# Cellular organization features (10 features)
print("• Cellular organization features...")
organization_features = _simulate_feature_group('organization', 10, {
    'Normal': (1.0, 0.2),
    'Benign': (0.8, 0.25),
    'Low_Grade_Cancer': (0.6, 0.3),
    'High_Grade_Cancer': (0.3, 0.35),
}, tissue_labels)

# Vascular and stromal features (7 features)
print("• Vascular and stromal features...")
vascular_features = _simulate_feature_group('vascular', 7, {
    'Normal': (20, 5),
    'Benign': (25, 6),
    'Low_Grade_Cancer': (35, 8),
    'High_Grade_Cancer': (50, 12),
}, tissue_labels)

# Immune infiltration features (10 features)
print("• Immune infiltration features...")
immune_features = _simulate_feature_group('immune', 10, {
    'Normal': (30, 8),
    'Benign': (45, 10),
    'Low_Grade_Cancer': (60, 15),
    'High_Grade_Cancer': (80, 20),
}, tissue_labels)

# Combine all features
all_features = {**nuclear_features, **chromatin_features, **organization_features, 
               **vascular_features, **immune_features}

# Create DataFrame
pathology_data = pd.DataFrame(all_features)
pathology_data['tissue_type'] = tissue_labels
pathology_data['patient_id'] = [f'P{i:04d}' for i in range(1, n_samples + 1)]

# Add correlations to make it more realistic: starting at every third feature,
# blend the next feature with it (70% of the first, 30% of the second)
simulated_feature_names = list(all_features)
for i in range(0, len(simulated_feature_names) - 1, 3):
    col1 = simulated_feature_names[i]
    col2 = simulated_feature_names[i + 1]
    pathology_data[col2] = pathology_data[col1] * 0.7 + pathology_data[col2] * 0.3

print(f"✅ High-dimensional pathology dataset created:")
print(f"   • {len(pathology_data)} samples")
print(f"   • {len(pathology_data.columns) - 2} features")
print(f"   • 4 tissue types: {tissue_types}")
print(f"   • Feature categories: Nuclear (20), Chromatin (15), Organization (10), Vascular (7), Immune (10)")

# Display class distribution
print(f"\nClass distribution:")
for tissue, count in pd.Series(tissue_labels).value_counts().items():
    print(f"   • {tissue}: {count} samples ({count/len(tissue_labels)*100:.1f}%)")

pathology_data.head(3)

## 2. Principal Component Analysis (PCA)

Start with linear dimensionality reduction using PCA.

In [None]:
# Prepare data for PCA: every column except the label and the identifier
feature_columns = [col for col in pathology_data.columns if col not in ['tissue_type', 'patient_id']]
X = pathology_data[feature_columns].values
y = pathology_data['tissue_type'].values

# Standardize features so that columns on large scales do not dominate
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Data prepared for PCA: {X_scaled.shape}")

# Perform PCA with all components to inspect the full variance spectrum
print("Performing PCA analysis...")
pca_full = PCA()
X_pca_full = pca_full.fit_transform(X_scaled)

# Calculate explained variance
explained_variance_ratio = pca_full.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

# Smallest number of components whose cumulative variance reaches the target
n_components_95 = np.where(cumulative_variance >= 0.95)[0][0] + 1
n_components_90 = np.where(cumulative_variance >= 0.90)[0][0] + 1

print(f"Components for 90% variance: {n_components_90}")
print(f"Components for 95% variance: {n_components_95}")

# Create PCA visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Scree plot
axes[0, 0].plot(range(1, min(21, len(explained_variance_ratio) + 1)), 
                explained_variance_ratio[:20], 'bo-', markersize=6)
axes[0, 0].set_xlabel('Principal Component')
axes[0, 0].set_ylabel('Explained Variance Ratio')
axes[0, 0].set_title('PCA Scree Plot (First 20 Components)')
axes[0, 0].grid(True, alpha=0.3)

# Add elbow point annotation: the last component that still explains more
# than 5% of the variance, or PC3 when no component does
if len(explained_variance_ratio) > 2:
    large_components = np.where(explained_variance_ratio > 0.05)[0]
    elbow_point = large_components[-1] + 1 if large_components.size else 3
    axes[0, 0].axvline(x=elbow_point, color='red', linestyle='--', alpha=0.7, label=f'Elbow at PC{elbow_point}')
    axes[0, 0].legend()

# 2. Cumulative variance explained
axes[0, 1].plot(range(1, min(31, len(cumulative_variance) + 1)), 
                cumulative_variance[:30], 'ro-', markersize=4)
axes[0, 1].axhline(y=0.90, color='orange', linestyle='--', alpha=0.7, label='90% variance')
axes[0, 1].axhline(y=0.95, color='green', linestyle='--', alpha=0.7, label='95% variance')
axes[0, 1].set_xlabel('Number of Components')
axes[0, 1].set_ylabel('Cumulative Explained Variance')
axes[0, 1].set_title('Cumulative Variance Explained')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. PCA 2D projection
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# One fixed colour per tissue type
colors = {'Normal': 'blue', 'Benign': 'green', 'Low_Grade_Cancer': 'orange', 'High_Grade_Cancer': 'red'}
for tissue in tissue_types:
    mask = y == tissue
    axes[1, 0].scatter(X_pca_2d[mask, 0], X_pca_2d[mask, 1], 
                      c=colors[tissue], label=tissue, alpha=0.7, s=30)

axes[1, 0].set_xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]:.1%} variance)')
axes[1, 0].set_ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]:.1%} variance)')
axes[1, 0].set_title('PCA 2D Projection')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Feature loadings for PC1 and PC2: the importance of a feature is the
# length of its (PC1, PC2) loading vector
feature_loadings = pca_2d.components_.T
feature_importance = np.sqrt(feature_loadings[:, 0]**2 + feature_loadings[:, 1]**2)

# Show top 10 most important features (argsort is ascending, so the strongest
# feature is last and ends up at the top of the horizontal bar chart)
top_features_idx = np.argsort(feature_importance)[-10:]
top_features = [feature_columns[i] for i in top_features_idx]
top_loadings = feature_importance[top_features_idx]

axes[1, 1].barh(range(len(top_features)), top_loadings, color='skyblue')
axes[1, 1].set_yticks(range(len(top_features)))
axes[1, 1].set_yticklabels([f.replace('_', ' ')[:15] + '...' if len(f) > 15 else f.replace('_', ' ') 
                           for f in top_features])
axes[1, 1].set_xlabel('Loading Magnitude')
axes[1, 1].set_title('Top 10 Feature Loadings (PC1 + PC2)')

plt.tight_layout()
plt.show()

# Print PCA summary
print("=== PCA ANALYSIS SUMMARY ===")
print(f"Total features: {X_scaled.shape[1]}")
print(f"First 2 components explain: {pca_2d.explained_variance_ratio_.sum():.1%} of variance")
print(f"First 5 components explain: {explained_variance_ratio[:5].sum():.1%} of variance")
print(f"Components needed for 95% variance: {n_components_95}")

# Walk the ascending ranking backwards so that rank 1 is the strongest feature
print(f"\nTop contributing features to PC1 & PC2:")
for rank, idx in enumerate(top_features_idx[::-1][:5], 1):
    print(f"{rank}. {feature_columns[idx]}: {feature_importance[idx]:.3f}")

## 3. t-SNE Analysis

Apply t-SNE for non-linear dimensionality reduction.

In [None]:
# t-SNE analysis with different perplexity values
from scipy.spatial.distance import cdist, pdist, squareform

print("Performing t-SNE analysis...")

# Test different perplexity values
perplexity_values = [5, 15, 30, 50]
tsne_results = {}

# Limit features for faster computation: use the first 30 features.
# The slice does not depend on the perplexity, so it is taken once.
X_sample = X_scaled[:, :30]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for i, perplexity in enumerate(perplexity_values):
    print(f"Computing t-SNE with perplexity = {perplexity}...")

    # The iteration count is left at the library default (1000). The keyword
    # for it was renamed from n_iter to max_iter in newer scikit-learn
    # releases, so not passing it keeps the cell working on both.
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42,
               learning_rate=200)
    X_tsne = tsne.fit_transform(X_sample)

    tsne_results[perplexity] = X_tsne

    # Plot results
    for tissue in tissue_types:
        mask = y == tissue
        axes[i].scatter(X_tsne[mask, 0], X_tsne[mask, 1], 
                       c=colors[tissue], label=tissue, alpha=0.7, s=20)

    axes[i].set_title(f't-SNE (perplexity = {perplexity})')
    axes[i].set_xlabel('t-SNE 1')
    axes[i].set_ylabel('t-SNE 2')
    if i == 0:
        axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Analyze t-SNE clustering quality
print("\n=== t-SNE CLUSTERING QUALITY ===")

# Encode the labels once; each score is computed once and reused below
le = LabelEncoder()
y_numeric = le.fit_transform(y)
tsne_silhouette_scores = {}

for perplexity, X_tsne in tsne_results.items():
    # Silhouette of the true tissue labels in the embedded space
    sil_score = silhouette_score(X_tsne, y_numeric)
    tsne_silhouette_scores[perplexity] = sil_score

    print(f"Perplexity {perplexity:2d}: Silhouette Score = {sil_score:.3f}")

# Find best perplexity (on ties the first value tested wins)
best_perplexity = max(tsne_silhouette_scores, key=tsne_silhouette_scores.get)
print(f"\nBest perplexity: {best_perplexity} (highest silhouette score)")

# Detailed analysis of best t-SNE result
X_tsne_best = tsne_results[best_perplexity]

print(f"\n=== DETAILED t-SNE ANALYSIS (Perplexity = {best_perplexity}) ===")

# Calculate within-class and between-class distances
within_class_distances = []
between_class_distances = []

points_by_tissue = {tissue: X_tsne_best[y == tissue] for tissue in tissue_types}

for tissue, tissue_points in points_by_tissue.items():
    if len(tissue_points) > 1:
        # Within-class distances: every unordered pair inside the class
        within_class_distances.extend(pdist(tissue_points))

        # Between-class distances to other tissues: the full pairwise
        # distance matrix, computed in one call per pair of classes
        for other_tissue, other_points in points_by_tissue.items():
            if other_tissue != tissue:
                between_class_distances.extend(cdist(tissue_points, other_points).ravel())

avg_within = np.mean(within_class_distances)
avg_between = np.mean(between_class_distances)
separation_ratio = avg_between / avg_within

print(f"Average within-class distance: {avg_within:.2f}")
print(f"Average between-class distance: {avg_between:.2f}")
print(f"Separation ratio: {separation_ratio:.2f}")

if separation_ratio > 2:
    print("✅ Excellent class separation!")
elif separation_ratio > 1.5:
    print("✅ Good class separation")
elif separation_ratio > 1.2:
    print("⚠️ Moderate class separation")
else:
    print("❌ Poor class separation")

## 4. UMAP Analysis

Apply UMAP for state-of-the-art non-linear dimensionality reduction.

In [None]:
# UMAP analysis with hyperparameter optimization
# Imported here so the cell does not rely on an import made by another cell
from scipy.spatial.distance import cdist, pdist

print("Performing UMAP analysis...")

# Define hyperparameter grid
n_neighbors_values = [5, 15, 30, 50]
min_dist_values = [0.01, 0.1, 0.3, 0.5]

# Test key hyperparameter combinations
hyperparameter_combinations = [
    (15, 0.1),  # Default-like
    (5, 0.01),   # Local structure focus
    (50, 0.3),   # Global structure focus  
    (30, 0.1)    # Balanced
]

# (n_neighbors, min_dist) -> 2-D embedding
umap_results = {}

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for i, (n_neighbors, min_dist) in enumerate(hyperparameter_combinations):
    print(f"Computing UMAP with n_neighbors = {n_neighbors}, min_dist = {min_dist}...")

    umap_reducer = umap.UMAP(n_neighbors=n_neighbors, 
                            min_dist=min_dist,
                            n_components=2, 
                            random_state=42,
                            metric='euclidean')

    X_umap = umap_reducer.fit_transform(X_scaled)
    umap_results[(n_neighbors, min_dist)] = X_umap

    # Plot results
    for tissue in tissue_types:
        mask = y == tissue
        axes[i].scatter(X_umap[mask, 0], X_umap[mask, 1], 
                       c=colors[tissue], label=tissue, alpha=0.7, s=20)

    axes[i].set_title(f'UMAP (neighbors={n_neighbors}, min_dist={min_dist})')
    axes[i].set_xlabel('UMAP 1')
    axes[i].set_ylabel('UMAP 2')
    if i == 0:
        axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Evaluate UMAP results
print("\n=== UMAP HYPERPARAMETER EVALUATION ===")

le = LabelEncoder()
y_numeric = le.fit_transform(y)

umap_evaluation = []

for (n_neighbors, min_dist), X_umap in umap_results.items():
    # Silhouette score of the true tissue labels in the embedded space
    sil_score = silhouette_score(X_umap, y_numeric)

    # Custom cluster separation metric
    within_distances = []
    between_distances = []

    for tissue_idx, tissue in enumerate(tissue_types):
        tissue_points = X_umap[y == tissue]

        if len(tissue_points) > 1:
            # Within-tissue distances
            within_distances.extend(pdist(tissue_points))

            # Between-tissue distances; only later tissues, to avoid duplicates
            for other_tissue in tissue_types[tissue_idx + 1:]:
                other_points = X_umap[y == other_tissue]

                # Sample at most 50 points per class to bound the work
                sample_size = min(50, len(tissue_points), len(other_points))
                if sample_size == 0:
                    continue
                tissue_sample = tissue_points[np.random.choice(len(tissue_points), sample_size, replace=False)]
                other_sample = other_points[np.random.choice(len(other_points), sample_size, replace=False)]

                between_distances.extend(cdist(tissue_sample, other_sample).ravel())

    avg_within = np.mean(within_distances) if within_distances else 0
    avg_between = np.mean(between_distances) if between_distances else 0
    separation_ratio = avg_between / avg_within if avg_within > 0 else 0

    umap_evaluation.append({
        'n_neighbors': n_neighbors,
        'min_dist': min_dist,
        'silhouette_score': sil_score,
        'separation_ratio': separation_ratio,
        'avg_within_distance': avg_within,
        'avg_between_distance': avg_between
    })

    print(f"n_neighbors={n_neighbors:2d}, min_dist={min_dist:.2f}: "
          f"Silhouette={sil_score:.3f}, Separation={separation_ratio:.2f}")

# Find best UMAP configuration. idxmax returns an index label, so the row is
# fetched with .loc rather than by position.
eval_df = pd.DataFrame(umap_evaluation)
best_config_idx = eval_df['silhouette_score'].idxmax()
best_config = eval_df.loc[best_config_idx]

# A row mixing int and float columns is returned as floats; convert back so
# n_neighbors prints as an integer and the key matches the one stored above
best_n_neighbors = int(best_config['n_neighbors'])
best_min_dist = float(best_config['min_dist'])

print(f"\n✅ Best UMAP configuration:")
print(f"   n_neighbors = {best_n_neighbors}")
print(f"   min_dist = {best_min_dist}")
print(f"   Silhouette score = {best_config['silhouette_score']:.3f}")
print(f"   Separation ratio = {best_config['separation_ratio']:.2f}")

# Store best UMAP result
best_umap_key = (best_n_neighbors, best_min_dist)
X_umap_best = umap_results[best_umap_key]

## 5. Method Comparison

Compare PCA, t-SNE, and UMAP side-by-side.

In [None]:
# Ensure DATA_DIR exists (from setup). Not directly used here but kept for consistency.
try:
    DATA_DIR
except NameError:
    from shared import utils as u
    DATA_DIR = u.get_data_dir()

# The 2-D embedding kept from each method, keyed by display name. The cluster
# analysis that follows looks embeddings up in `methods_data` and picks the
# best method from `metrics_df`, so both are defined here.
methods_data = {
    'PCA': X_pca_2d,
    't-SNE': X_tsne_best,
    'UMAP': X_umap_best,
}

# Score every embedding by the silhouette of the true tissue labels
comparison_labels = LabelEncoder().fit_transform(y)
metrics_df = pd.DataFrame([
    {
        'Method': method_name,
        'Silhouette_Score': silhouette_score(embedding, comparison_labels),
    }
    for method_name, embedding in methods_data.items()
])

# Side-by-side scatter plots, one panel per method
fig, axes = plt.subplots(1, len(methods_data), figsize=(18, 6))

for ax, (method_name, embedding) in zip(axes, methods_data.items()):
    for tissue in tissue_types:
        mask = y == tissue
        ax.scatter(embedding[mask, 0], embedding[mask, 1],
                   c=colors[tissue], label=tissue, alpha=0.7, s=20)
    ax.set_title(method_name)
    ax.set_xlabel(f'{method_name} 1')
    ax.set_ylabel(f'{method_name} 2')
    ax.grid(True, alpha=0.3)

axes[0].legend()
plt.tight_layout()
plt.show()

print("=== METHOD COMPARISON ===")
print(metrics_df.to_string(index=False))

## 6. Cluster Analysis and Discovery

Analyze clusters discovered by dimensionality reduction.

In [None]:
# Cluster analysis on embeddings
# Picks the embedding with the highest silhouette score, then prepares four
# clustering algorithms and a 2x2 grid of axes for the loop that follows.
print("=== CLUSTER DISCOVERY ANALYSIS ===")

# Use best embedding method for cluster analysis
# NOTE(review): relies on `metrics_df` (columns 'Method' and
# 'Silhouette_Score') and `methods_data` (method name -> embedding) being
# defined by the method-comparison step - confirm they exist before running.
best_embedding_method = metrics_df.loc[metrics_df['Silhouette_Score'].idxmax(), 'Method']
best_embedding = methods_data[best_embedding_method]

print(f"Using {best_embedding_method} embedding for cluster analysis")

# Try different clustering algorithms
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

# Display name -> unfitted estimator. Three of them are asked for 4 groups;
# DBSCAN takes a neighbourhood radius (eps) and a minimum neighbour count
# instead of a cluster count.
# NOTE(review): eps=1.5 is applied to whichever embedding was selected above;
# presumably tuned for one of them - confirm it suits the others.
clustering_methods = {
    'K-Means': KMeans(n_clusters=4, random_state=42),
    'DBSCAN': DBSCAN(eps=1.5, min_samples=5),
    'Hierarchical': AgglomerativeClustering(n_clusters=4),
    'Gaussian Mixture': GaussianMixture(n_components=4, random_state=42)
}

# Display name -> array of cluster labels, filled in by the loop below
cluster_results = {}

# One subplot per clustering method; flattened so it can be indexed 0..3
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for i, (method_name, clusterer) in enumerate(clustering_methods.items()):
    print(f"\nApplying {method_name}...")
    
    if method_name == 'Gaussian Mixture':
        cluster_labels = clusterer.fit_predict(best_embedding)
    else:
        cluster_labels = clusterer.fit_predict(best_embedding)
    
    cluster_results[method_name] = cluster_labels
    
    # Handle noise points in DBSCAN (labeled as -1)
    unique_labels = np.unique(cluster_labels)
    n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
    
    print(f"Found {n_clusters} clusters")
    
    # Plot results
    ax = axes[i]
    
    # Color map for clusters
    if -1 in unique_labels:  # DBSCAN with noise
        colors_cluster = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))
        colors_cluster = colors_cluster[unique_labels != -1]  # Remove noise color
        
        # Plot noise points
        noise_mask = cluster_labels == -1
        ax.scatter(best_embedding[noise_mask, 0], best_embedding[noise_mask, 1], 
                  c='black', marker='x', s=20, alpha=0.5, label='Noise')
        
        # Plot clusters
        for j, label in enumerate(unique_labels):
            if label != -1:
                mask = cluster_labels == label
                ax.scatter(best_embedding[mask, 0], best_embedding[mask, 1], 
                          c=[colors_cluster[j]], label=f'Cluster {label}', s=30, alpha=0.7)
    else:
        colors_cluster = plt.cm.Set1(np.linspace(0, 1, len(unique_labels)))
        for j, label in enumerate(unique_labels):
            mask = cluster_labels == label
            ax.scatter(best_embedding[mask, 0], best_embedding[mask, 1], 
                      c=[colors_cluster[j]], label=f'Cluster {label}', s=30, alpha=0.7)
    
    ax.set_title(f'{method_name} Clustering')
    ax.set_xlabel(f'{best_embedding_method} 1')
    ax.set_ylabel(f'{best_embedding_method} 2')
    ax.legend()
    ax.grid(True, alpha=0.3)
    
    # Calculate clustering metrics
    if n_clusters > 1:
        # Silhouette score
        valid_mask = cluster_labels != -1  # Exclude noise for DBSCAN
        if np.sum(valid_mask) > 1:
            sil_score = silhouette_score(best_embedding[valid_mask], cluster_labels[valid_mask])
            
            # Adjusted Rand Index with true labels
            ari_score = adjusted_rand_score(y_numeric[valid_mask], cluster_labels[valid_mask])
            
            print(f"  Silhouette Score: {sil_score:.3f}")
            print(f"  Adjusted Rand Index: {ari_score:.3f}")
        else:
            print("  Unable to calculate metrics (insufficient valid points)")
    else:
        print("  Unable to calculate metrics (insufficient clusters)")

plt.tight_layout()
plt.show()

# Detailed analysis of best clustering method: the method whose non-noise
# labels agree most with the true classes (highest Adjusted Rand Index).
best_clustering_method = None
best_ari = -1

for method_name, cluster_labels in cluster_results.items():
    valid_mask = cluster_labels != -1
    # Only methods with at least two non-noise points in at least two
    # clusters are candidates.
    if np.sum(valid_mask) > 1 and len(np.unique(cluster_labels[valid_mask])) > 1:
        ari_score = adjusted_rand_score(y_numeric[valid_mask], cluster_labels[valid_mask])
        if ari_score > best_ari:
            best_ari = ari_score
            best_clustering_method = method_name

if best_clustering_method is not None:
    print(f"\n🏆 Best clustering method: {best_clustering_method} (ARI = {best_ari:.3f})")

    # Analyze agreement between discovered clusters and true tissue types
    best_clusters = cluster_results[best_clustering_method]

    print(f"\n=== CLUSTER-TISSUE TYPE CORRESPONDENCE ===")

    valid_mask = best_clusters != -1
    if np.sum(valid_mask) > 0:
        # Tissue names per sample as a plain array so boolean masks index it
        # positionally whether y is an ndarray or a pandas Series.
        y_labels = np.asarray(y)

        # Get unique cluster labels for labeling
        unique_clusters = np.unique(best_clusters[valid_mask])

        # Tissue x cluster contingency table. sklearn's confusion_matrix is
        # not used because it builds a square matrix over the union of both
        # label sets, which no longer lines up with the tick labels once the
        # number of clusters differs from the number of tissue types.
        # NOTE(review): rows are ordered by tissue_types; this assumes every
        # value in y appears in tissue_types (other cells iterate it the same
        # way) - confirm.
        contingency = pd.crosstab(y_labels[valid_mask], best_clusters[valid_mask])
        contingency = contingency.reindex(index=list(tissue_types),
                                          columns=unique_clusters,
                                          fill_value=0)
        cm = contingency.to_numpy()

        fig, ax = plt.subplots(1, 1, figsize=(8, 6))

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=[f'Cluster {i}' for i in unique_clusters],
                    yticklabels=tissue_types, ax=ax)
        ax.set_title(f'Confusion Matrix: True Tissue Types vs {best_clustering_method} Clusters')
        ax.set_ylabel('True Tissue Type')
        ax.set_xlabel('Discovered Cluster')

        plt.tight_layout()
        plt.show()

        # Calculate purity scores: share of each cluster taken by its most
        # frequent tissue type. Every id in unique_clusters has at least one
        # member, so no emptiness check is needed.
        cluster_purities = []
        for cluster_id in unique_clusters:
            cluster_mask = (best_clusters == cluster_id) & valid_mask
            cluster_tissue_counts = pd.Series(y_labels[cluster_mask]).value_counts()
            cluster_purities.append({
                'Cluster': cluster_id,
                'Dominant_Tissue': cluster_tissue_counts.idxmax(),
                'Purity': cluster_tissue_counts.max() / cluster_tissue_counts.sum(),
                'Size': np.sum(cluster_mask)
            })

        purity_df = pd.DataFrame(cluster_purities)
        print(f"\nCluster Purity Analysis:")
        for _, row in purity_df.iterrows():
            print(f"  Cluster {int(row['Cluster'])}: {row['Dominant_Tissue']} "
                  f"({row['Purity']:.1%} purity, {int(row['Size'])} samples)")

        # Unweighted mean over clusters (cluster size is not taken into account)
        average_purity = purity_df['Purity'].mean()
        print(f"\nOverall average purity: {average_purity:.1%}")

        if average_purity > 0.8:
            print("✅ Excellent cluster-tissue correspondence!")
        elif average_purity > 0.6:
            print("✅ Good cluster-tissue correspondence")
        elif average_purity > 0.4:
            print("⚠️ Moderate cluster-tissue correspondence")
        else:
            print("❌ Poor cluster-tissue correspondence")

## 7. Interactive Visualization (if Plotly available)

Create interactive plots for better exploration of the embeddings. If Plotly is not available, a static 3D UMAP plot is drawn with matplotlib instead.

In [None]:
# Interactive visualization (Plotly), with a static matplotlib 3D fallback.

# 3-component UMAP that reuses the best 2D hyperparameters. It is only
# constructed here (shared by both branches) and fitted inside whichever
# branch runs, so the order of output is unchanged.
umap_3d = umap.UMAP(n_neighbors=int(best_config['n_neighbors']),
                    min_dist=best_config['min_dist'],
                    n_components=3,
                    random_state=42)

if PLOTLY_AVAILABLE:
    print("Creating interactive visualizations with Plotly...")

    # Create interactive comparison of all three methods
    fig = make_subplots(rows=1, cols=3,
                        subplot_titles=['PCA', 't-SNE', 'UMAP'],
                        specs=[[{'type': 'scatter'}, {'type': 'scatter'}, {'type': 'scatter'}]])

    # Color mapping
    color_map = {'Normal': 'blue', 'Benign': 'green',
                 'Low_Grade_Cancer': 'orange', 'High_Grade_Cancer': 'red'}

    # Per-tissue sample masks and hover labels, built once and reused by all
    # three 2D panels and the 3D plot. The patient_id column is read directly
    # instead of materialising a full DataFrame row for every point.
    patient_ids = pathology_data['patient_id'].to_numpy()
    tissue_masks = {tissue: np.asarray(y == tissue) for tissue in tissue_types}
    hover_text = {
        tissue: [f"Patient: {pid}<br>Tissue: {tissue}" for pid in patient_ids[tissue_mask]]
        for tissue, tissue_mask in tissue_masks.items()
    }

    methods_plot = [
        ('PCA', X_pca_2d, 1),
        ('t-SNE', tsne_results[best_perplexity], 2),
        ('UMAP', X_umap_best, 3)
    ]

    for method_name, embedding, col_idx in methods_plot:
        for tissue in tissue_types:
            mask = tissue_masks[tissue]
            fig.add_scatter(x=embedding[mask, 0],
                            y=embedding[mask, 1],
                            mode='markers',
                            name=tissue,
                            # Legend entries are shown for the first panel only;
                            # grouping by tissue makes one entry toggle that
                            # tissue in all three panels.
                            legendgroup=tissue,
                            marker=dict(color=color_map[tissue], size=8, opacity=0.7),
                            text=hover_text[tissue],
                            hovertemplate='<b>%{text}</b><br>%{x:.2f}, %{y:.2f}<extra></extra>',
                            showlegend=(col_idx == 1),  # Only show legend for first subplot
                            row=1, col=col_idx)

    fig.update_layout(title_text="Interactive Dimensionality Reduction Comparison",
                      title_x=0.5,
                      height=500,
                      width=1200)

    # Update axes labels (same generic titles on every panel)
    for col_idx in (1, 2, 3):
        fig.update_xaxes(title_text="Component 1", row=1, col=col_idx)
        fig.update_yaxes(title_text="Component 2", row=1, col=col_idx)

    fig.show()

    # Create 3D UMAP visualization
    print("Creating 3D UMAP visualization...")

    X_umap_3d = umap_3d.fit_transform(X_scaled)

    # Create 3D scatter plot, one trace per tissue type
    fig_3d = go.Figure()

    for tissue in tissue_types:
        mask = tissue_masks[tissue]
        fig_3d.add_scatter3d(x=X_umap_3d[mask, 0],
                             y=X_umap_3d[mask, 1],
                             z=X_umap_3d[mask, 2],
                             mode='markers',
                             name=tissue,
                             marker=dict(color=color_map[tissue], size=5, opacity=0.8),
                             text=hover_text[tissue],
                             hovertemplate='<b>%{text}</b><br>(%{x:.2f}, %{y:.2f}, %{z:.2f})<extra></extra>')

    fig_3d.update_layout(title='3D UMAP Visualization',
                         scene=dict(xaxis_title='UMAP 1',
                                    yaxis_title='UMAP 2',
                                    zaxis_title='UMAP 3'),
                         width=800,
                         height=600)

    fig_3d.show()

    print("✅ Interactive visualizations created!")

else:
    print("Creating static 3D visualization...")

    # Create 3D UMAP with matplotlib
    X_umap_3d = umap_3d.fit_transform(X_scaled)

    fig = plt.figure(figsize=(12, 9))
    ax = fig.add_subplot(111, projection='3d')

    # One scatter series per tissue type; `colors` is the tissue -> colour
    # mapping defined in an earlier cell.
    for tissue in tissue_types:
        mask = y == tissue
        ax.scatter(X_umap_3d[mask, 0], X_umap_3d[mask, 1], X_umap_3d[mask, 2],
                   c=colors[tissue], label=tissue, s=30, alpha=0.7)

    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_zlabel('UMAP 3')
    ax.set_title('3D UMAP Visualization')
    ax.legend()

    plt.tight_layout()
    plt.show()

    print("✅ 3D visualization created!")

## 8. Auto-Validation Tests

In [None]:
# Auto-validation tests for dimensionality reduction.
# Each test asserts a sanity property of objects produced by earlier cells;
# the first failing assertion stops the cell with its message.
print("=== AUTO-VALIDATION TESTS ===")

# Test 1: Embedding dimensions (one 2D point per sample)
assert X_pca_2d.shape == (n_samples, 2), "❌ PCA embedding wrong dimensions"
assert X_tsne_best.shape == (n_samples, 2), "❌ t-SNE embedding wrong dimensions" 
assert X_umap_best.shape == (n_samples, 2), "❌ UMAP embedding wrong dimensions"
print("✅ Test 1 passed: All embeddings have correct dimensions")

# Test 2: No NaN values in embeddings
assert not np.isnan(X_pca_2d).any(), "❌ NaN values in PCA embedding"
assert not np.isnan(X_tsne_best).any(), "❌ NaN values in t-SNE embedding"
assert not np.isnan(X_umap_best).any(), "❌ NaN values in UMAP embedding"
print("✅ Test 2 passed: No NaN values in embeddings")

# Test 3: PCA explained variance properties (each ratio strictly between 0
# and 1, and PC1 at least as large as PC2)
assert 0 < pca_2d.explained_variance_ratio_[0] < 1, "❌ Invalid PC1 explained variance"
assert 0 < pca_2d.explained_variance_ratio_[1] < 1, "❌ Invalid PC2 explained variance"
assert pca_2d.explained_variance_ratio_[0] >= pca_2d.explained_variance_ratio_[1], "❌ PC1 should explain more variance than PC2"
print("✅ Test 3 passed: PCA explained variance is valid")

# Test 4: Clustering validation (noise label -1 is not counted as a cluster)
for method_name, cluster_labels in cluster_results.items():
    n_unique_clusters = len(np.unique(cluster_labels[cluster_labels != -1]))
    assert n_unique_clusters >= 1, f"❌ {method_name} produced no valid clusters"
    assert n_unique_clusters <= n_samples, f"❌ {method_name} produced too many clusters"
print("✅ Test 4 passed: All clustering methods produced valid results")

# Test 5: Silhouette scores are in valid range
for method in ['PCA', 't-SNE', 'UMAP']:
    method_embedding = methods_data[method]
    sil_score = silhouette_score(method_embedding, y_numeric)
    assert -1 <= sil_score <= 1, f"❌ Invalid silhouette score for {method}"
print("✅ Test 5 passed: All silhouette scores in valid range")

# Test 6: Feature scaling verification (zero mean, unit std per feature)
assert np.allclose(X_scaled.mean(axis=0), 0, atol=1e-10), "❌ Features not properly centered"
assert np.allclose(X_scaled.std(axis=0), 1, atol=1e-10), "❌ Features not properly scaled"
print("✅ Test 6 passed: Data preprocessing is correct")

# Test 7: Hyperparameter optimization results
assert best_config['silhouette_score'] > -1, "❌ Best silhouette score too low"
assert 0 < best_config['n_neighbors'] <= 100, "❌ Invalid best n_neighbors"
assert 0 <= best_config['min_dist'] <= 1, "❌ Invalid best min_dist"
print("✅ Test 7 passed: Hyperparameter optimization successful")

# Test 8: Method comparison consistency
methods_comparison = metrics_df
assert len(methods_comparison) == 3, "❌ Should compare exactly 3 methods"
# Silhouette scores range over [-1, 1] (same bounds as Test 5); a negative
# score signals overlapping classes but is still a valid value, so it must
# not fail the range check.
assert all(-1 <= score <= 1 for score in methods_comparison['Silhouette_Score']), "❌ Invalid silhouette scores"
assert all(-1 <= score <= 1 for score in methods_comparison['Rank_Correlation']), "❌ Invalid rank correlations"
print("✅ Test 8 passed: Method comparison metrics are valid")

print(f"\n🎉 All validation tests passed! You've successfully mastered dimensionality reduction techniques!")
print(f"Summary of achievements:")
print(f"• ✅ Applied PCA, t-SNE, and UMAP to {n_samples} pathology samples")
print(f"• ✅ Optimized hyperparameters for each method")
print(f"• ✅ Compared methods using multiple metrics")
print(f"• ✅ Discovered {len(tissue_types)} tissue type clusters")
print(f"• ✅ Best method: {best_overall} (silhouette score: {metrics_df.set_index('Method').loc[best_overall, 'Silhouette_Score']:.3f})")

## 9. Next Steps and Advanced Applications

Congratulations! You've mastered advanced dimensionality reduction techniques for digital pathology data.

**Key Skills Acquired:**
✅ PCA for linear dimensionality reduction  
✅ t-SNE for local structure preservation  
✅ UMAP for balanced local/global structure  
✅ Hyperparameter optimization and validation  
✅ Method comparison and selection  
✅ Cluster discovery and analysis  
✅ Interactive visualization techniques  
✅ Quality metrics and validation  

**In the upcoming advanced notebooks, you'll learn:**
- Foundation models (UNI, CONCH, CLAM) integration
- Whole slide image analysis workflows
- Computational pathology pipelines
- Spatial transcriptomics analysis
- Multi-modal data integration

**For Further Practice:**
- Apply to real TCGA datasets from different cancer types
- Experiment with other distance metrics (cosine, manhattan)
- Try ensemble dimensionality reduction approaches
- Create interactive dashboards for pathology exploration
- Integrate with deep learning feature extractors

**Clinical Applications:**
- Patient stratification and subtyping
- Biomarker discovery workflows
- Treatment response prediction
- Pathology image quality assessment
- Multi-omics data integration
- Precision medicine applications

**Advanced Techniques to Explore:**
- Parametric UMAP for new sample projection
- Supervised dimensionality reduction
- Time-series embedding for longitudinal studies
- Graph-based dimensionality reduction
- Deep autoencoders for non-linear embedding