# Body Shape Diversity Metrics
## PCA Explained Variance Analysis

This notebook measures body shape diversity by deriving body-proportion ratios from pose keypoints and analyzing them with principal component analysis (PCA).

## 1. Setup and Installation

In [None]:
# Install dependencies
# Notebook shell command: installs the third-party packages imported below
# (pillow provides PIL, scikit-learn provides sklearn).
!pip install -q tensorflow tensorflow-hub numpy scipy matplotlib pillow tqdm scikit-learn

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import json

print(f"TensorFlow version: {tf.__version__}")

## 2. Load MoveNet Lightning Model

In [None]:
# Load MoveNet Lightning
# The single-pose "lightning" variant (version 4) is fetched from TensorFlow Hub.
model_url = "https://tfhub.dev/google/movenet/singlepose/lightning/4"
model = hub.load(model_url)
# The 'serving_default' signature is the callable used for inference by
# extract_keypoints() below.
movenet = model.signatures['serving_default']

print("MoveNet Lightning loaded!")

# Keypoint names, in the row order used when indexing keypoint arrays
# (e.g. row 0 = nose, rows 5/6 = left/right shoulder, rows 11/12 = left/right hip).
# After the nose, every body part appears as a left/right pair.
KEYPOINT_NAMES = ['nose'] + [
    f'{side}_{part}'
    for part in ('eye', 'ear', 'shoulder', 'elbow', 'wrist', 'hip', 'knee', 'ankle')
    for side in ('left', 'right')
]

## 3. Pose Extraction

In [None]:
def load_and_preprocess(image_path, input_size=192):
    """Read an image file and turn it into a square MoveNet input tensor.

    Args:
        image_path: Location of the image file; converted with ``str`` before
            being read.
        input_size: Side length, in pixels, of the square output.

    Returns:
        An int32 tensor holding the 3-channel image, resized with padding to
        ``input_size`` x ``input_size``.
    """
    raw_bytes = tf.io.read_file(str(image_path))
    decoded = tf.image.decode_image(raw_bytes, channels=3)
    # resize_with_pad keeps the aspect ratio and pads up to the square target.
    squared = tf.image.resize_with_pad(decoded, input_size, input_size)
    return tf.cast(squared, dtype=tf.int32)

def extract_keypoints(image_path):
    """Run MoveNet on a single image and return its pose keypoints.

    Args:
        image_path: Path of the image to process.

    Returns:
        The ``[0, 0]`` slice of the model's ``'output_0'`` array (one row per
        keypoint; downstream code reads columns 0-1 as ``[y, x]``), or ``None``
        when the image could not be read, decoded or run through the model.
    """
    try:
        img = load_and_preprocess(image_path)
        batch = tf.expand_dims(img, axis=0)  # the model is called on a batch of one
        outputs = movenet(batch)
        return outputs['output_0'].numpy()[0, 0]
    except Exception as e:
        # Deliberately best-effort: one unreadable image must not abort a whole
        # dataset run. Report the failure instead of hiding it, so systematic
        # problems (wrong path, unsupported format) are visible to the user.
        print(f"Skipping {image_path}: {type(e).__name__}: {e}")
        return None

def extract_all_keypoints(image_dir, max_images=None):
    """Extract keypoints from every supported image under a directory.

    Images are discovered recursively. Paths are sorted before the optional
    ``max_images`` cut so that the same subset is selected on every run, and
    extensions are matched case-insensitively (``.JPG`` is found as well as
    ``.jpg``).

    Args:
        image_dir: Root directory to search.
        max_images: Upper bound on the number of images to process. ``None``
            (or ``0``) means no limit.

    Returns:
        A tuple ``(keypoints, valid_paths)``: an array stacking the keypoints
        of every image for which extraction succeeded, and the list of the
        corresponding paths in the same order.
    """
    image_dir = Path(image_dir)
    extensions = {'.jpg', '.jpeg', '.png', '.webp'}

    # Directory traversal order is not guaranteed; sorting makes the truncated
    # subset reproducible.
    paths = sorted(
        p for p in image_dir.rglob('*')
        if p.is_file() and p.suffix.lower() in extensions
    )

    if max_images:
        paths = paths[:max_images]

    print(f"Processing {len(paths)} images...")

    keypoints = []
    valid_paths = []

    for path in tqdm(paths):
        kp = extract_keypoints(path)
        if kp is not None:  # failed images are skipped
            keypoints.append(kp)
            valid_paths.append(path)

    return np.array(keypoints), valid_paths

## 4. Body Proportion Features

In [None]:
def compute_distance(kp, idx1, idx2):
    """Return the straight-line distance between two keypoints.

    Args:
        kp: 2-D keypoint array; only the first two columns ([y, x]) are used.
        idx1: Row index of the first keypoint.
        idx2: Row index of the second keypoint.

    Returns:
        The Euclidean norm of the difference of the two positions.
    """
    first, second = kp[idx1, :2], kp[idx2, :2]
    return np.linalg.norm(first - second)

def extract_body_proportions(keypoints, min_confidence=0.0):
    """
    Extract body proportion features from pose keypoints.

    Segment lengths are divided by the torso length (or by a paired segment),
    so the returned ratios do not depend on how large the person appears.

    Args:
        keypoints: Array with one row per keypoint in KEYPOINT_NAMES order.
            Columns 0-1 are read as the [y, x] position. When a third column
            is present it is treated as the per-keypoint confidence score
            (MoveNet's output layout).
        min_confidence: Minimum score required for every keypoint that feeds a
            feature (nose, shoulders, elbows, wrists, hips, knees, ankles).
            The default of 0.0 disables the check, so low-confidence poses are
            kept exactly as before.

    Returns:
        A 12-element array ordered as in PROPORTION_NAMES, or None when the
        torso is shorter than 0.01 or a required keypoint scores below
        ``min_confidence``.

    Note:
        ``total_arm_ratio`` is algebraically the mean of ``upper_arm_ratio``
        and ``forearm_ratio`` (likewise for the leg ratios), so those columns
        are linearly dependent on the two that precede them.
    """
    # Optional quality gate: poorly detected joints produce meaningless lengths.
    if min_confidence > 0 and keypoints.shape[1] > 2:
        used_rows = [0] + list(range(5, 17))
        if np.any(keypoints[used_rows, 2] < min_confidence):
            return None

    # Torso: shoulder center to hip center
    shoulder_center = (keypoints[5, :2] + keypoints[6, :2]) / 2
    hip_center = (keypoints[11, :2] + keypoints[12, :2]) / 2
    torso_length = np.linalg.norm(shoulder_center - hip_center)

    # A near-zero torso would blow up every ratio below; treat it as invalid.
    if torso_length < 0.01:
        return None

    # Limb lengths
    left_upper_arm = compute_distance(keypoints, 5, 7)  # shoulder to elbow
    left_forearm = compute_distance(keypoints, 7, 9)    # elbow to wrist
    right_upper_arm = compute_distance(keypoints, 6, 8)
    right_forearm = compute_distance(keypoints, 8, 10)

    left_thigh = compute_distance(keypoints, 11, 13)    # hip to knee
    left_shin = compute_distance(keypoints, 13, 15)     # knee to ankle
    right_thigh = compute_distance(keypoints, 12, 14)
    right_shin = compute_distance(keypoints, 14, 16)

    # Shoulder and hip width
    shoulder_width = compute_distance(keypoints, 5, 6)
    hip_width = compute_distance(keypoints, 11, 12)

    # Head-related: nose to left shoulder, used as a rough head-size proxy
    head_size = compute_distance(keypoints, 0, 5)

    # Normalized ratios; the 1e-6 terms guard divisions by a zero-length segment.
    proportions = [
        # Arm ratios
        (left_upper_arm + right_upper_arm) / (2 * torso_length),  # avg upper arm ratio
        (left_forearm + right_forearm) / (2 * torso_length),      # avg forearm ratio
        (left_upper_arm + left_forearm + right_upper_arm + right_forearm) / (4 * torso_length),  # total arm ratio

        # Leg ratios
        (left_thigh + right_thigh) / (2 * torso_length),          # avg thigh ratio
        (left_shin + right_shin) / (2 * torso_length),            # avg shin ratio
        (left_thigh + left_shin + right_thigh + right_shin) / (4 * torso_length),  # total leg ratio

        # Width ratios
        shoulder_width / torso_length,                            # shoulder-torso ratio
        hip_width / torso_length,                                 # hip-torso ratio
        shoulder_width / (hip_width + 1e-6),                      # shoulder-hip ratio

        # Head ratio
        head_size / torso_length,                                 # head-torso ratio

        # Symmetry ratios
        left_upper_arm / (right_upper_arm + 1e-6),                # arm symmetry
        left_thigh / (right_thigh + 1e-6),                        # leg symmetry
    ]

    return np.array(proportions)

# Feature names, in the same order as the array returned by
# extract_body_proportions(); other cells look up columns by position here.
PROPORTION_NAMES = [
    # arms
    'upper_arm_ratio',
    'forearm_ratio',
    'total_arm_ratio',
    # legs
    'thigh_ratio',
    'shin_ratio',
    'total_leg_ratio',
    # widths
    'shoulder_torso_ratio',
    'hip_torso_ratio',
    'shoulder_hip_ratio',
    # head
    'head_torso_ratio',
    # left/right symmetry
    'arm_symmetry',
    'leg_symmetry',
]

## 5. Body Shape Diversity Metrics

In [None]:
def compute_pca_variance(proportions):
    """
    Fit a PCA on standardized body proportions and summarize its variance.

    The more components are needed to reach a variance threshold, the more
    independent directions of variation the body shapes have.

    Args:
        proportions: 2-D array, one row per sample and one column per feature.

    Returns:
        A tuple ``(summary, pca, proportions_scaled)`` where ``summary`` holds
        the explained variance ratios, the number of components needed to
        reach 90% and 95% cumulative variance, and the singular values;
        ``pca`` is the fitted model and ``proportions_scaled`` the
        standardized input it was fitted on.
    """
    # Standardize so that no feature dominates purely because of its scale.
    proportions_scaled = StandardScaler().fit_transform(proportions)

    pca = PCA()
    pca.fit(proportions_scaled)

    ratios = pca.explained_variance_ratio_
    cumulative = np.cumsum(ratios)

    def components_needed(threshold):
        # argmax over a boolean array gives the first True position (0-based).
        return int(np.argmax(cumulative >= threshold)) + 1

    summary = {
        'explained_variance_ratio': ratios.tolist(),
        'cumulative_variance_90': components_needed(0.9),
        'cumulative_variance_95': components_needed(0.95),
        'singular_values': pca.singular_values_.tolist(),
    }
    return summary, pca, proportions_scaled

def compute_proportion_entropy(proportions, num_bins=10, feature_names=None):
    """Compute the Shannon entropy of each body-proportion distribution.

    Each feature is trimmed to its 1st-99th percentile range, binned into
    ``num_bins`` equal-width bins, and the entropy (in nats) of the resulting
    bin probabilities is computed. The result lies between 0 (all samples in
    one bin) and ``log(num_bins)`` (samples spread evenly), and does not
    change when a feature is rescaled.

    Args:
        proportions: 2-D array, one row per sample and one column per feature.
        num_bins: Number of histogram bins per feature.
        feature_names: Names for the columns, in column order. Defaults to
            PROPORTION_NAMES.

    Returns:
        A dict with ``'per_feature_entropy'`` (name -> entropy) and
        ``'avg_entropy'`` (mean over features). A feature with fewer than 10
        usable samples gets an entropy of 0.
    """
    if feature_names is None:
        feature_names = PROPORTION_NAMES

    proportions = np.asarray(proportions, dtype=float)
    entropies = []

    for column_index in range(len(feature_names)):
        values = proportions[:, column_index]
        # Non-finite entries cannot be binned; drop them up front.
        values = values[np.isfinite(values)]

        # Filter outliers
        if values.size:
            low, high = np.percentile(values, [1, 99])
            values = values[(values >= low) & (values <= high)]

        if values.size < 10:
            entropies.append(0.0)
            continue

        # Use bin probabilities (counts / total). Density values depend on the
        # bin width, so feeding them into the entropy formula gives a number
        # that changes with the feature's scale and can even be negative.
        counts, _ = np.histogram(values, bins=num_bins)
        probs = counts[counts > 0] / counts.sum()

        # Shannon entropy: sum of p * log(1/p) over the non-empty bins.
        entropies.append(float(np.sum(probs * np.log(1.0 / probs))))

    return {
        'per_feature_entropy': dict(zip(feature_names, entropies)),
        'avg_entropy': float(np.mean(entropies)),
    }

def compute_proportion_diversity(proportions, feature_names=None):
    """Compute per-feature spread statistics and an overall diversity score.

    Args:
        proportions: 2-D array, one row per sample and one column per feature.
        feature_names: Names for the columns, in column order. Defaults to
            PROPORTION_NAMES.

    Returns:
        A dict with ``'per_feature_stats'`` (name -> mean, std, min, max and
        range as plain floats) and ``'avg_coefficient_of_variation'``, the
        mean over features of ``std / (|mean| + 1e-6)``.
    """
    if feature_names is None:
        feature_names = PROPORTION_NAMES

    # Named to avoid shadowing the `stats` module imported from scipy.
    feature_stats = {}
    cv_scores = []

    for column_index, name in enumerate(feature_names):
        values = proportions[:, column_index]
        lowest, highest = np.min(values), np.max(values)
        mean = float(np.mean(values))
        std = float(np.std(values))

        feature_stats[name] = {
            'mean': mean,
            'std': std,
            'min': float(lowest),
            'max': float(highest),
            'range': float(highest - lowest),
        }
        # Coefficient of variation; the epsilon keeps a zero mean from dividing by zero.
        cv_scores.append(std / (abs(mean) + 1e-6))

    return {
        'per_feature_stats': feature_stats,
        'avg_coefficient_of_variation': float(np.mean(cv_scores)),
    }

## 6. Load Dataset Configuration

In [None]:
# Dataset locations come from a JSON file when it exists; otherwise the
# built-in default paths are used.
config_path = Path('/content/datasets/dataset_config.json')

if not config_path.exists():
    config = {
        'vitonhd': '/content/datasets/vitonhd',
        'deepfashion1': '/content/datasets/deepfashion1',
        'dresscode': '/content/datasets/dresscode',
    }
    print("Using default paths")
else:
    with config_path.open() as config_file:
        config = json.load(config_file)
    print("Loaded dataset configuration")

## 7. Evaluate Body Shape Diversity

In [None]:
def evaluate_body_shape_diversity(dataset_name, dataset_path, max_images=500):
    """Run the full body-shape diversity evaluation for one dataset.

    Steps: extract pose keypoints from the images, convert them to body
    proportions, then compute PCA variance, entropy and spread statistics.
    Progress and a short summary are printed along the way.

    Args:
        dataset_name: Label used in printed output and stored in the results.
        dataset_path: Directory containing the dataset's images.
        max_images: Maximum number of images to process.

    Returns:
        ``(results, (proportions, proportions_scaled, pca_model))`` on
        success, or ``(None, None)`` when the path does not exist, no pose
        could be extracted, or fewer than 10 valid proportion vectors remain.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Evaluating: {dataset_name}")
    print(f"{banner}")

    dataset_path = Path(dataset_path)
    if not dataset_path.exists():
        print(f"Dataset path not found: {dataset_path}")
        return None, None

    # Stage 1: pose keypoints for every readable image
    keypoints, _ = extract_all_keypoints(dataset_path, max_images)
    if len(keypoints) == 0:
        print("No valid poses extracted")
        return None, None
    print(f"Extracted {len(keypoints)} poses")

    # Stage 2: body proportions, dropping poses rejected as invalid
    print("\nExtracting body proportions...")
    candidates = (extract_body_proportions(kp) for kp in tqdm(keypoints))
    proportions = np.array([prop for prop in candidates if prop is not None])
    print(f"Valid proportions: {len(proportions)}")

    if len(proportions) < 10:
        print("Not enough valid proportions")
        return None, None

    # Stage 3: diversity metrics
    print("\nComputing PCA variance...")
    pca_results, pca_model, proportions_scaled = compute_pca_variance(proportions)

    print("Computing entropy...")
    entropy_results = compute_proportion_entropy(proportions)

    print("Computing diversity stats...")
    diversity_results = compute_proportion_diversity(proportions)

    results = {
        'dataset': dataset_name,
        'num_samples': len(proportions),
        'pca': pca_results,
        'entropy': entropy_results,
        'diversity': diversity_results,
    }

    top3_variance = sum(pca_results['explained_variance_ratio'][:3])
    print(f"\nResults for {dataset_name}:")
    print(f"  - Components for 90% variance: {pca_results['cumulative_variance_90']}")
    print(f"  - Components for 95% variance: {pca_results['cumulative_variance_95']}")
    print(f"  - Top-3 variance ratio: {top3_variance:.4f}")
    print(f"  - Avg Entropy: {entropy_results['avg_entropy']:.4f}")
    print(f"  - Avg CV: {diversity_results['avg_coefficient_of_variation']:.4f}")

    return results, (proportions, proportions_scaled, pca_model)

In [None]:
# Evaluate all datasets
# Only the three known dataset keys are evaluated; datasets whose evaluation
# yields no results are left out of both dictionaries.
all_results = {}
all_data = {}

for name, path in config.items():
    if name not in ('vitonhd', 'deepfashion1', 'dresscode'):
        continue

    results, data = evaluate_body_shape_diversity(name.upper(), path, max_images=500)
    if not results:
        continue

    all_results[name] = results
    all_data[name] = data

## 8. Visualization

In [None]:
# PCA Explained Variance Comparison
# Left panel: variance explained by each of the first 10 components.
# Right panel: the running total of those ratios, with 90% / 95% guide lines.
if all_results:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    colors = {'vitonhd': '#3498db', 'deepfashion1': '#e74c3c', 'dresscode': '#2ecc71'}

    for name, results in all_results.items():
        var_ratio = results['pca']['explained_variance_ratio'][:10]
        cumsum = np.cumsum(var_ratio)
        component_numbers = range(1, len(var_ratio) + 1)
        line_style = dict(label=name.upper(), color=colors.get(name, '#333'), marker='o')

        axes[0].plot(component_numbers, var_ratio, **line_style)   # individual
        axes[1].plot(component_numbers, cumsum, **line_style)      # cumulative

    # Threshold guides are drawn after the dataset curves so they come last in the legend.
    axes[1].axhline(y=0.9, color='gray', linestyle='--', label='90% threshold')
    axes[1].axhline(y=0.95, color='gray', linestyle=':', label='95% threshold')

    panel_text = [
        ('Principal Component', 'Explained Variance Ratio', 'PCA Explained Variance per Component'),
        ('Number of Components', 'Cumulative Variance Ratio', 'Cumulative PCA Explained Variance'),
    ]
    for panel, (x_label, y_label, title) in zip(axes, panel_text):
        panel.set_xlabel(x_label)
        panel.set_ylabel(y_label)
        panel.set_title(title)
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Body Shape Feature Distributions
# One panel per selected feature, with the datasets' histograms overlaid.
if all_data:
    # Select key features to visualize
    key_features = ['shoulder_hip_ratio', 'total_arm_ratio', 'total_leg_ratio', 'shoulder_torso_ratio']

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.flatten()

    for panel, feature in zip(axes, key_features):
        feat_idx = PROPORTION_NAMES.index(feature)
        display_name = feature.replace('_', ' ').title()

        for name, (proportions, _, _) in all_data.items():
            panel.hist(proportions[:, feat_idx], bins=20, alpha=0.5,
                       label=name.upper(), density=True)

        panel.set_xlabel(display_name)
        panel.set_ylabel('Density')
        panel.set_title(f'{display_name} Distribution')
        panel.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# PCA 2D visualization
# All datasets are projected with ONE scaler and ONE 2-component PCA fitted on
# the pooled raw proportions. Fitting a separate PCA per dataset would give
# each dataset its own PC1/PC2 directions, so point clouds drawn on the same
# axes would not be comparable.
if all_data:
    fig, ax = plt.subplots(figsize=(10, 8))

    colors = {'vitonhd': '#3498db', 'deepfashion1': '#e74c3c', 'dresscode': '#2ecc71'}

    pooled = np.vstack([proportions for proportions, _, _ in all_data.values()])
    shared_scaler = StandardScaler().fit(pooled)
    shared_pca = PCA(n_components=2).fit(shared_scaler.transform(pooled))

    for name, (proportions, _, _) in all_data.items():
        # Project to 2D; only the first 200 samples are drawn, for clarity
        coords = shared_pca.transform(shared_scaler.transform(proportions[:200]))

        ax.scatter(coords[:, 0], coords[:, 1],
                   c=colors.get(name, '#333'), alpha=0.5, label=name.upper(), s=30)

    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_title('Body Shape PCA Projection')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 9. Save Results

In [None]:
results_path = Path('/content/datasets/body_shape_diversity_results.json')

# Keep only the headline numbers per dataset; all of them are plain Python
# values and therefore JSON-serializable.
save_results = {
    name: {
        'dataset': results['dataset'],
        'num_samples': results['num_samples'],
        'pca_components_90': results['pca']['cumulative_variance_90'],
        'pca_components_95': results['pca']['cumulative_variance_95'],
        'explained_variance_top3': sum(results['pca']['explained_variance_ratio'][:3]),
        'avg_entropy': results['entropy']['avg_entropy'],
        'avg_cv': results['diversity']['avg_coefficient_of_variation'],
    }
    for name, results in all_results.items()
}

# The output directory may not exist yet (e.g. when no dataset has been
# downloaded); create it so that saving does not fail with FileNotFoundError.
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(save_results, f, indent=2)

print(f"Results saved to: {results_path}")

# Summary table
print("\n" + "="*75)
print("BODY SHAPE DIVERSITY METRICS SUMMARY")
print("="*75)
print(f"{'Dataset':<15} {'PCA-90%':<10} {'PCA-95%':<10} {'Top3 Var':<12} {'Entropy':<10} {'CV':<10}")
print("-"*67)
for name, r in save_results.items():
    print(f"{name:<15} {r['pca_components_90']:<10} {r['pca_components_95']:<10} {r['explained_variance_top3']:<12.4f} {r['avg_entropy']:<10.4f} {r['avg_cv']:<10.4f}")
print("="*75)