# Feature Visualization and Analysis

This notebook visualizes and analyzes quantitative features extracted from histopathology images.

## Setup

In [None]:
import sys
from pathlib import Path

# Put the parent directory on the import path
sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

%matplotlib inline

## Load Features

Load the quantitative features extracted from the trained model.

In [None]:
# Load the extracted feature table; every later cell reads it through `df`.
features_path = Path('../experiments/features/features.parquet')

if not features_path.exists():
    # Fail fast: continuing without `df` would only surface later as a
    # confusing NameError in the next cell.
    raise FileNotFoundError(
        f"Features file not found at {features_path}. "
        "Please run: python scripts/extract_quant_features.py first"
    )

df = pd.read_parquet(features_path)
print(f"Loaded {len(df)} samples with {len(df.columns)} features")
# Only the first 20 column names are shown to keep the output short.
print(f"\nColumns: {list(df.columns[:20])}...")

In [None]:
# Attach labels from the training split when the feature table has none.
if 'label' not in df.columns:
    train_df = pd.read_csv('../data/splits/train.csv')
    # Keep a single row per image so the left join cannot duplicate feature
    # rows if train.csv lists an image more than once (first occurrence wins).
    label_lookup = train_df[['image_path', 'label']].drop_duplicates(subset='image_path')
    df = df.merge(label_lookup, on='image_path', how='left')

# A left join leaves NaN labels for images that are absent from train.csv.
# Report them, because value_counts() below silently ignores NaN.
n_unlabeled = int(df['label'].isna().sum())
if n_unlabeled > 0:
    print(f"Warning: {n_unlabeled} of {len(df)} samples have no label")

print("\nClass distribution:")
print(df['label'].value_counts())

## Feature Types

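Group the feature columns by type using their column-name prefixes (`color_`, `glcm_`/`lbp_`, `morph_`, `freq_`, `deep_feature_`).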

In [None]:
# Group feature columns by the prefix of their column name
def _columns_with_prefix(*prefixes):
    """Return the columns of `df` whose name starts with any of `prefixes`."""
    return [name for name in df.columns if name.startswith(prefixes)]


color_features = _columns_with_prefix('color_')
texture_features = _columns_with_prefix('glcm_', 'lbp_')
morph_features = _columns_with_prefix('morph_')
freq_features = _columns_with_prefix('freq_')
deep_features = _columns_with_prefix('deep_feature_')

# Report how many columns landed in each group
print(f"Color features: {len(color_features)}")
print(f"Texture features: {len(texture_features)}")
print(f"Morphological features: {len(morph_features)}")
print(f"Frequency features: {len(freq_features)}")
print(f"Deep features: {len(deep_features)}")

## Feature Correlation with Label

In [None]:
# Correlation of every classical (non-deep) feature with the label.
feature_cols = color_features + texture_features + morph_features + freq_features

# corrwith() correlates each column with the label directly instead of
# building the full feature-by-feature matrix just to read one column of it.
# NaN results (e.g. constant features) are kept in `correlations`, sorted last,
# so later cells can still look up every feature by name.
correlations = df[feature_cols].corrwith(df['label']).sort_values(ascending=False)

# Undefined correlations are excluded from the plots only; otherwise they
# would occupy the tail and show up as empty bars in the "negative" panel.
valid_correlations = correlations.dropna()

top_positive = valid_correlations.head(15)
# Reverse the tail so the strongest negative correlation is drawn first,
# matching the ordering of the positive panel.
top_negative = valid_correlations.tail(15).iloc[::-1]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

panels = [
    (axes[0], top_positive, 'Top 15 Positive Correlations'),
    (axes[1], top_negative, 'Top 15 Negative Correlations'),
]
for ax, series, title in panels:
    positions = range(len(series))
    ax.barh(positions, series.values)
    ax.set_yticks(positions)
    ax.set_yticklabels(series.index, fontsize=8)
    ax.set_xlabel('Correlation with Label')
    ax.set_title(title)
    # First entry of the series ends up at the top of the chart.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

## Feature Distributions by Class

In [None]:
# Per-class histograms of the features most strongly correlated with the label.
# Features whose correlation is undefined (NaN) are not candidates.
top_features = (
    correlations.dropna().abs().sort_values(ascending=False).head(9).index.tolist()
)

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.flatten()

for idx, feature in enumerate(top_features):
    for class_id in [0, 1]:
        class_data = df.loc[df['label'] == class_id, feature].to_numpy(dtype=float)
        # The histogram range cannot be computed from NaN/inf values,
        # so keep finite entries only.
        class_data = class_data[np.isfinite(class_data)]
        if class_data.size == 0:
            # Nothing to draw for this class (and density=True needs data).
            continue
        axes[idx].hist(class_data, bins=30, alpha=0.5,
                       label=f"Class {class_id}", density=True)

    axes[idx].set_title(feature, fontsize=10)
    axes[idx].set_xlabel('Value')
    axes[idx].set_ylabel('Density')
    axes[idx].legend()

# Hide panels that received no feature (fewer than 9 usable features).
for unused_ax in axes[len(top_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## Correlation Heatmap

In [None]:
# Pairwise correlation heatmap of the features most related to the label
top_n = 20
abs_corr_ranked = correlations.abs().sort_values(ascending=False)
top_features_for_heatmap = abs_corr_ranked.index[:top_n].tolist()

# Feature-vs-feature correlations, restricted to the selected columns
corr_matrix = df[top_features_for_heatmap].corr()

heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title(f'Feature Correlation Heatmap (Top {top_n} Features)')
# Tilt the x labels so long feature names stay readable
plt.xticks(rotation=45, ha='right', fontsize=8)
plt.yticks(rotation=0, fontsize=8)
plt.tight_layout()
plt.show()

## Dimensionality Reduction

### PCA

In [None]:
# PCA on classical features
# Missing values are filled with 0. Infinite values are treated as missing too,
# because StandardScaler rejects inputs that contain infinity.
X = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
y = df['label']

# Standardize so every feature contributes on the same scale
# (StandardScaler is imported in the notebook's import cell).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Two-component projection for the scatter plot
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: samples in the PC1/PC2 plane, one colour per class
for class_id in [0, 1]:
    # Plain boolean array: X_pca is a NumPy array, not an indexed pandas object.
    mask = (y == class_id).to_numpy()
    axes[0].scatter(X_pca[mask, 0], X_pca[mask, 1],
                    label=f"Class {class_id}", alpha=0.6, s=20)
axes[0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
axes[0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
axes[0].set_title('PCA of Classical Features')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right panel: cumulative explained variance over all components
pca_full = PCA()
pca_full.fit(X_scaled)
cumsum = np.cumsum(pca_full.explained_variance_ratio_)
axes[1].plot(range(1, len(cumsum) + 1), cumsum)
axes[1].set_xlabel('Number of Components')
axes[1].set_ylabel('Cumulative Explained Variance')
axes[1].set_title('PCA Explained Variance')
axes[1].grid(True, alpha=0.3)
# Reference line marking 95% cumulative variance
axes[1].axhline(y=0.95, color='r', linestyle='--', label='95% variance')
axes[1].legend()

plt.tight_layout()
plt.show()

### t-SNE

In [None]:
# t-SNE (on a subset of at most 1000 samples for speed)
n_samples = min(1000, len(X_scaled))

# Draw the subset from a seeded generator so the plot is reproducible;
# seeding only TSNE itself would still leave the chosen rows random.
subset_rng = np.random.default_rng(42)
indices = subset_rng.choice(len(X_scaled), n_samples, replace=False)

# Perplexity must be smaller than the number of samples; this keeps the
# default of 30 whenever the subset is large enough.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_samples - 1))
X_tsne = tsne.fit_transform(X_scaled[indices])
y_subset = y.iloc[indices]

plt.figure(figsize=(10, 8))
for class_id in [0, 1]:
    # Plain boolean array, positionally aligned with the rows of X_tsne.
    mask = (y_subset == class_id).to_numpy()
    plt.scatter(X_tsne[mask, 0], X_tsne[mask, 1],
                label=f"Class {class_id}", alpha=0.6, s=20)

plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE of Classical Features')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

### UMAP (if deep features available)

In [None]:
# 2-D UMAP embedding of the deep features, skipped when none are present
if not deep_features:
    print("No deep features found. Extract features with trained model first.")
else:
    # Missing values are replaced by 0 before fitting
    X_deep = df[deep_features].fillna(0)

    reducer = umap.UMAP(n_components=2, random_state=42)
    X_umap = reducer.fit_transform(X_deep)

    fig, ax = plt.subplots(figsize=(10, 8))
    for class_id in (0, 1):
        in_class = y == class_id
        ax.scatter(X_umap[in_class, 0], X_umap[in_class, 1],
                   label=f"Class {class_id}", alpha=0.6, s=20)

    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title('UMAP of Deep Features')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

## Feature Importance

In [None]:
# Train a simple Random Forest to get feature importance
# (RandomForestClassifier is imported in the notebook's import cell).

# Restrict the model to the 30 features most correlated with the label.
# Features with an undefined (NaN) correlation are not candidates.
selected_features = (
    correlations.dropna().abs().sort_values(ascending=False).head(30).index.tolist()
)

# The classifier cannot be fitted on NaN targets, which the left merge with
# train.csv can produce, so train on labelled rows only.
labeled = y.notna()
# Infinite values are treated like missing ones before filling with 0.
X_selected = (
    df.loc[labeled, selected_features]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
rf.fit(X_selected, y[labeled])

# Importances, largest first
importances = pd.Series(rf.feature_importances_, index=selected_features).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
importances.head(20).plot(kind='barh', ax=ax)
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Most Important Features (Random Forest)')
# Most important feature at the top of the chart
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## Summary Statistics by Feature Type

In [None]:
# Summarise how strongly each family of features correlates with the label
feature_types = {
    'Color': color_features,
    'Texture': texture_features,
    'Morphology': morph_features,
    'Frequency': freq_features,
}

# One record per non-empty family: size plus mean and max absolute correlation
type_stats = [
    {
        'Feature Type': ftype,
        'Count': len(features),
        'Avg |Correlation|': correlations[features].abs().mean(),
        'Max |Correlation|': correlations[features].abs().max(),
    }
    for ftype, features in feature_types.items()
    if features
]

stats_df = pd.DataFrame(type_stats)
print("\nFeature Type Statistics:")
print(stats_df.to_string(index=False))

## Conclusions

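- Quantitative features were loaded and analyzed by type, by correlation with the label, and by per-class distribution
- The PCA, t-SNE, and UMAP projections show how well the classes separate in reduced dimensions
- The per-type statistics show how much each feature type (color, texture, morphology, frequency) relates to the label
- The feature table is ready for more advanced modeling and interpretation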