# Cloth Diversity Metrics
## Feature Distribution Analysis with MobileNetV3

This notebook measures clothing appearance diversity using pretrained MobileNetV3-Small features (~5MB).

## 1. Setup and Installation

In [None]:
# Install dependencies
!pip install -q torch torchvision numpy scipy matplotlib pillow tqdm scikit-learn

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np
from scipy.spatial.distance import pdist, cdist
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import json

# Pick the compute device: CUDA when a GPU is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Report the runtime environment.
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

## 2. Load MobileNetV3-Small Feature Extractor

In [None]:
class ClothFeatureExtractor(nn.Module):
    """MobileNetV3-Small backbone that maps images to pooled feature vectors.

    The network is loaded with its ``IMAGENET1K_V1`` weights and only its
    ``features`` and ``avgpool`` sub-modules are retained; the classifier
    is not kept.

    Attributes:
        features: Feature layers taken from the pretrained model.
        avgpool: Pooling layer taken from the pretrained model.
        feature_dim: Length of each output vector (576 for MobileNetV3-Small).
    """

    def __init__(self):
        super().__init__()
        pretrained_weights = models.MobileNet_V3_Small_Weights.IMAGENET1K_V1
        backbone = models.mobilenet_v3_small(weights=pretrained_weights)

        # Hold on to the trunk and the pooling layer only.
        self.features, self.avgpool = backbone.features, backbone.avgpool

        # Feature dimension: 576 for MobileNetV3-Small
        self.feature_dim = 576

    def forward(self, x):
        """Return one flattened feature vector per item in the batch ``x``."""
        pooled = self.avgpool(self.features(x))
        return pooled.flatten(1)

# Build the extractor, move it to the selected device, and switch it to
# evaluation mode in one chain (``eval()`` returns the module itself).
model = ClothFeatureExtractor().to(device).eval()

print(f"MobileNetV3-Small loaded (feature dim: {model.feature_dim})")

# Preprocessing pipeline: fixed 224x224 resize, tensor conversion, then
# per-channel normalization with the mean/std constants listed below.
preprocess = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

## 3. Feature Extraction Functions

In [None]:
@torch.no_grad()
def extract_features(image_path):
    """Extract a feature vector from a single image.

    Args:
        image_path: Location of the image, passed to ``PIL.Image.open``.

    Returns:
        A 1-D NumPy array holding the flattened model output for the image,
        or ``None`` if loading or processing raised an exception (the error
        is printed rather than propagated).
    """
    try:
        # The context manager closes the underlying file promptly instead of
        # leaving it to the garbage collector; ``convert`` returns a new,
        # fully loaded image that stays valid after the file is closed.
        with Image.open(image_path) as raw:
            img = raw.convert('RGB')
        img_tensor = preprocess(img).unsqueeze(0).to(device)
        features = model(img_tensor)
        return features.cpu().numpy().flatten()
    except Exception as e:
        # Deliberately best-effort: one bad image is reported, not fatal.
        print(f"Error processing {image_path}: {e}")
        return None

@torch.no_grad()
def extract_features_batch(image_paths, batch_size=32):
    """Extract features from multiple images in batches.

    Images that cannot be opened or preprocessed are skipped; each skip is
    reported and a total is printed at the end, so dropped files are visible
    instead of silently shrinking the dataset.

    Args:
        image_paths: Sequence of image locations (must support ``len`` and
            slicing).
        batch_size: Number of images passed to the model per forward call.

    Returns:
        A tuple ``(features, valid_paths)`` where ``features`` is a NumPy
        array built from the per-image model outputs and ``valid_paths``
        lists, in the same order, the paths that were processed
        successfully. When nothing could be processed ``features`` is an
        empty array.
    """
    all_features = []
    valid_paths = []
    skipped = 0

    for i in tqdm(range(0, len(image_paths), batch_size), desc="Extracting features"):
        batch_tensors = []
        batch_valid_paths = []

        for path in image_paths[i:i + batch_size]:
            try:
                # Close the file handle as soon as the image is converted.
                with Image.open(path) as raw:
                    tensor = preprocess(raw.convert('RGB'))
            except Exception as e:
                # Best-effort: report and move on rather than abort the run.
                # tqdm.write keeps the message from garbling the progress bar.
                skipped += 1
                tqdm.write(f"Error processing {path}: {e}")
                continue
            batch_tensors.append(tensor)
            batch_valid_paths.append(path)

        # A batch may be empty if every image in it failed to load.
        if batch_tensors:
            batch = torch.stack(batch_tensors).to(device)
            features = model(batch).cpu().numpy()
            all_features.extend(features)
            valid_paths.extend(batch_valid_paths)

    if skipped:
        print(f"Skipped {skipped} image(s) that could not be processed")

    return np.array(all_features), valid_paths

def get_image_paths(directory, max_images=None):
    """Get all image paths from a directory tree.

    Files are matched by suffix, case-insensitively (``.jpg``, ``.jpeg``,
    ``.png``, ``.webp``), and returned in sorted order so that truncating
    with ``max_images`` selects the same files on every run.

    Args:
        directory: Root directory to search recursively (str or ``Path``).
        max_images: Maximum number of paths to return. ``None`` or ``0``
            means no limit.

    Returns:
        A sorted list of ``Path`` objects for the matching files.
    """
    directory = Path(directory)
    extensions = {'.jpg', '.jpeg', '.png', '.webp'}

    # Compare lower-cased suffixes so files such as ``IMG_001.JPG`` are found,
    # and sort because the traversal order of ``rglob`` is not specified.
    image_paths = sorted(
        candidate
        for candidate in directory.rglob('*')
        if candidate.is_file() and candidate.suffix.lower() in extensions
    )

    if max_images:
        image_paths = image_paths[:max_images]

    return image_paths

## 4. Cloth Diversity Metrics

In [None]:
def compute_feature_statistics(features):
    """Summarize the spread of a feature matrix along its first axis.

    Mean, standard deviation and variance are each computed with ``axis=0``
    and then reduced to scalars.

    Returns:
        A dict of Python floats with the keys ``mean_feature_magnitude``
        (norm of the mean vector), ``avg_feature_std``, ``total_variance``
        and ``avg_variance_per_dim``.
    """
    per_dim_mean = np.mean(features, axis=0)
    per_dim_std = np.std(features, axis=0)
    per_dim_var = np.var(features, axis=0)

    stats = {}
    stats['mean_feature_magnitude'] = float(np.linalg.norm(per_dim_mean))
    stats['avg_feature_std'] = float(np.mean(per_dim_std))
    stats['total_variance'] = float(np.sum(per_dim_var))
    stats['avg_variance_per_dim'] = float(np.mean(per_dim_var))
    return stats

def compute_pairwise_cosine_diversity(features, max_pairs=10000):
    """Compute summary statistics of pairwise cosine distances.

    Args:
        features: Array with one feature vector per row.
        max_pairs: Upper bound on the number of pairs evaluated. When the
            input has more pairs than this, ``int(sqrt(2 * max_pairs))`` rows
            are drawn without replacement using NumPy's global random state.

    Returns:
        A dict of Python floats with the keys ``avg_cosine_distance``,
        ``std_cosine_distance``, ``min_cosine_distance`` and
        ``max_cosine_distance``. If fewer than two vectors are available
        there are no pairs to compare and every value is ``0.0``.
    """
    n = len(features)

    # Sample if too many features
    if n * (n - 1) / 2 > max_pairs:
        indices = np.random.choice(n, size=int(np.sqrt(2 * max_pairs)), replace=False)
        features = features[indices]

    # Without at least one pair ``pdist`` returns an empty array, on which
    # ``np.min``/``np.max`` raise and ``np.mean`` yields NaN.
    if len(features) < 2:
        return {
            'avg_cosine_distance': 0.0,
            'std_cosine_distance': 0.0,
            'min_cosine_distance': 0.0,
            'max_cosine_distance': 0.0,
        }

    # Normalize features for cosine distance (epsilon avoids division by zero)
    normalized = features / (np.linalg.norm(features, axis=1, keepdims=True) + 1e-10)

    # Compute pairwise cosine distances
    distances = pdist(normalized, metric='cosine')

    return {
        'avg_cosine_distance': float(np.mean(distances)),
        'std_cosine_distance': float(np.std(distances)),
        'min_cosine_distance': float(np.min(distances)),
        'max_cosine_distance': float(np.max(distances)),
    }

def compute_pairwise_euclidean_diversity(features, max_pairs=10000):
    """Compute summary statistics of pairwise Euclidean distances.

    Args:
        features: Array with one feature vector per row.
        max_pairs: Upper bound on the number of pairs evaluated. When the
            input has more pairs than this, ``int(sqrt(2 * max_pairs))`` rows
            are drawn without replacement using NumPy's global random state.

    Returns:
        A dict of Python floats with the keys ``avg_euclidean_distance`` and
        ``std_euclidean_distance``. If fewer than two vectors are available
        there are no pairs to compare and both values are ``0.0``.
    """
    n = len(features)

    # Sample if too many features
    if n * (n - 1) / 2 > max_pairs:
        indices = np.random.choice(n, size=int(np.sqrt(2 * max_pairs)), replace=False)
        features = features[indices]

    # With no pairs, ``np.mean``/``np.std`` of the empty distance array would
    # produce NaN (and a RuntimeWarning), which is not valid JSON downstream.
    if len(features) < 2:
        return {
            'avg_euclidean_distance': 0.0,
            'std_euclidean_distance': 0.0,
        }

    # Compute pairwise Euclidean distances
    distances = pdist(features, metric='euclidean')

    return {
        'avg_euclidean_distance': float(np.mean(distances)),
        'std_euclidean_distance': float(np.std(distances)),
    }

def compute_feature_entropy(features, num_clusters=20):
    """Compute feature entropy using PCA + clustering.

    The features are projected with PCA, clustered with ``MiniBatchKMeans``
    (``random_state=42``), and the Shannon entropy (natural log) of the
    cluster-size distribution is returned.

    Args:
        features: 2-D array with one feature vector per row.
        num_clusters: Requested number of clusters. The number actually used
            is capped at the number of samples, because k-means cannot form
            more clusters than there are points.

    Returns:
        A dict with ``feature_entropy``, ``normalized_entropy`` (entropy
        divided by ``log(num_clusters)``, i.e. relative to the *requested*
        cluster count, or ``0.0`` when that maximum is zero) and
        ``pca_explained_variance_ratio`` (at most the first 10 ratios).
        With fewer than two samples both entropies are ``0.0`` and the
        ratio list is empty.
    """
    from sklearn.cluster import MiniBatchKMeans

    n_samples = features.shape[0]
    n_dims = features.shape[1]

    # A single sample has no distribution to measure.
    if n_samples < 2:
        return {
            'feature_entropy': 0.0,
            'normalized_entropy': 0.0,
            'pca_explained_variance_ratio': [],
        }

    # Reduce dimensionality with PCA; n_components may not exceed
    # min(n_samples, n_features), so small datasets need the extra clamp.
    pca = PCA(n_components=min(50, n_samples, n_dims))
    features_pca = pca.fit_transform(features)

    # Cluster features, never asking for more clusters than samples
    effective_clusters = max(1, min(num_clusters, n_samples))
    kmeans = MiniBatchKMeans(n_clusters=effective_clusters, random_state=42, n_init=3)
    labels = kmeans.fit_predict(features_pca)

    # Compute entropy of cluster distribution
    counts = np.bincount(labels, minlength=effective_clusters)
    probs = counts / len(labels)
    probs = probs[probs > 0]  # Remove zero probabilities

    entropy = -np.sum(probs * np.log(probs))

    # log(1) == 0 would make the normalization divide by zero.
    max_entropy = np.log(num_clusters) if num_clusters > 1 else 0.0
    normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0.0

    return {
        'feature_entropy': float(entropy),
        'normalized_entropy': float(normalized_entropy),
        'pca_explained_variance_ratio': pca.explained_variance_ratio_.tolist()[:10],
    }

## 5. Load Dataset Configuration

In [None]:
# Dataset locations: prefer the JSON file at the path below when it exists,
# otherwise fall back to the hard-coded default directories.
config_path = Path('/content/datasets/dataset_config.json')

if not config_path.exists():
    config = {
        'vitonhd': '/content/datasets/vitonhd',
        'deepfashion1': '/content/datasets/deepfashion1',
        'dresscode': '/content/datasets/dresscode',
    }
    print("Using default paths")
else:
    config = json.loads(config_path.read_text())
    print("Loaded dataset configuration")

print(f"Dataset paths: {config}")

## 6. Evaluate Cloth Diversity

In [None]:
def evaluate_cloth_diversity(dataset_name, dataset_path, max_images=500):
    """Evaluate cloth diversity metrics for a dataset.

    Images are collected from the ``cloth``, ``clothes``, ``garment`` and
    ``garments`` sub-directories of ``dataset_path`` when any of them exist;
    otherwise every image under ``dataset_path`` is used. Features are then
    extracted and the statistics, pairwise-distance and entropy metrics are
    merged into one results dict. Progress and a short summary are printed.

    Args:
        dataset_name: Label used in printed output and stored under the
            ``dataset`` key of the results.
        dataset_path: Root directory of the dataset (str or ``Path``).
        max_images: Maximum total number of images to evaluate. ``None`` or
            ``0`` means no limit.

    Returns:
        ``(results, features)`` where ``results`` is a dict of metrics and
        ``features`` is the extracted feature array, or ``(None, None)`` if
        the path does not exist, no images are found, or no features could
        be extracted.
    """
    print(f"\n{'='*60}")
    print(f"Evaluating: {dataset_name}")
    print(f"{'='*60}")

    dataset_path = Path(dataset_path)
    if not dataset_path.exists():
        print(f"Dataset path not found: {dataset_path}")
        return None, None

    # Get image paths (look for cloth-specific directories)
    cloth_dirs = ['cloth', 'clothes', 'garment', 'garments']
    image_paths = []

    for cloth_dir in cloth_dirs:
        cloth_path = dataset_path / cloth_dir
        if cloth_path.exists():
            image_paths.extend(get_image_paths(cloth_path, max_images))

    # Each cloth directory can contribute up to max_images on its own, so
    # enforce the limit on the combined list as well.
    if max_images:
        image_paths = image_paths[:max_images]

    # If no cloth-specific directory, use all images
    if not image_paths:
        image_paths = get_image_paths(dataset_path, max_images)

    if not image_paths:
        print("No images found")
        return None, None

    print(f"Found {len(image_paths)} images")

    # Extract features
    features, valid_paths = extract_features_batch(image_paths)
    print(f"Extracted features for {len(features)} images")

    if len(features) == 0:
        print("No valid features extracted")
        return None, None

    # Compute metrics
    print("\nComputing metrics...")

    results = {
        'dataset': dataset_name,
        'num_images': len(features),
        'feature_dim': features.shape[1],
    }

    results.update(compute_feature_statistics(features))
    results.update(compute_pairwise_cosine_diversity(features))
    results.update(compute_pairwise_euclidean_diversity(features))
    results.update(compute_feature_entropy(features))

    print(f"\nResults for {dataset_name}:")
    print(f"  - Avg Cosine Distance: {results['avg_cosine_distance']:.4f}")
    print(f"  - Avg Euclidean Distance: {results['avg_euclidean_distance']:.4f}")
    print(f"  - Normalized Entropy: {results['normalized_entropy']:.4f}")
    print(f"  - Total Variance: {results['total_variance']:.4f}")

    return results, features

In [None]:
# Run the diversity evaluation for each supported dataset listed in the config
all_results, all_features = {}, {}

for name, path in config.items():
    # Ignore config entries that are not one of the known datasets
    if name not in ['vitonhd', 'deepfashion1', 'dresscode']:
        continue

    results, features = evaluate_cloth_diversity(name.upper(), path, max_images=500)
    if not results:
        # Evaluation reported a failure for this dataset; nothing to record
        continue

    all_results[name] = results
    all_features[name] = features

## 7. Visualization

In [None]:
def visualize_feature_space(features_dict, title="Feature Space Visualization", max_points=200):
    """Visualize feature space using PCA.

    All datasets are projected together onto the first two principal
    components and drawn as one scatter plot, one color per dataset.

    Args:
        features_dict: Mapping of dataset name to its feature array (one
            vector per row). Nothing is drawn when it is empty.
        title: Title of the figure.
        max_points: Maximum number of leading rows taken from each dataset,
            to keep the plot readable.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if not features_dict:
        return

    # Slice each dataset once and reuse the result for both the PCA fit and
    # the per-dataset index ranges below.
    subsets = {name: features[:max_points] for name, features in features_dict.items()}
    combined = np.vstack(list(subsets.values()))

    # PCA to 2D
    pca = PCA(n_components=2)
    features_2d = pca.fit_transform(combined)

    # Plot
    fig, ax = plt.subplots(figsize=(10, 8))

    colors = {'vitonhd': '#3498db', 'deepfashion1': '#e74c3c', 'dresscode': '#2ecc71'}

    # Rows of ``features_2d`` follow the stacking order, so each dataset
    # occupies a contiguous [start_idx, end_idx) range.
    start_idx = 0
    for name, subset in subsets.items():
        end_idx = start_idx + len(subset)
        ax.scatter(
            features_2d[start_idx:end_idx, 0],
            features_2d[start_idx:end_idx, 1],
            label=name.upper(),
            c=colors.get(name, '#333'),
            alpha=0.6,
            s=30
        )
        start_idx = end_idx

    ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
    ax.set_title(title)
    ax.legend()
    plt.tight_layout()
    plt.show()

# Plot the per-dataset features collected in `all_features` in one shared PCA projection
visualize_feature_space(all_features, "Cloth Feature Space (PCA)")

In [None]:
# Compare metrics across datasets: one bar chart per metric in a 2x2 grid
if all_results:
    datasets = list(all_results.keys())

    # Color bars by dataset name (same palette and fallback as the PCA
    # scatter plot) so a dataset keeps its color even when another one is
    # missing from the results.
    dataset_colors = {'vitonhd': '#3498db', 'deepfashion1': '#e74c3c', 'dresscode': '#2ecc71'}
    bar_colors = [dataset_colors.get(d, '#333') for d in datasets]

    # (results key, y-axis label, panel title), in row-major panel order
    panels = [
        ('avg_cosine_distance', 'Avg Cosine Distance', 'Cloth Diversity (Cosine Distance)'),
        ('avg_euclidean_distance', 'Avg Euclidean Distance', 'Cloth Diversity (Euclidean Distance)'),
        ('normalized_entropy', 'Normalized Entropy', 'Cloth Feature Entropy'),
        ('total_variance', 'Total Variance', 'Cloth Feature Variance'),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    for ax, (metric_key, ylabel, panel_title) in zip(axes.flat, panels):
        values = [all_results[d][metric_key] for d in datasets]
        ax.bar(datasets, values, color=bar_colors)
        ax.set_ylabel(ylabel)
        ax.set_title(panel_title)
        if metric_key == 'normalized_entropy':
            # Normalized entropy is a ratio, so pin the axis to [0, 1]
            ax.set_ylim(0, 1)

    plt.tight_layout()
    plt.show()

## 8. Save Results

In [None]:
# Save results
results_path = Path('/content/datasets/cloth_diversity_results.json')

# The output directory may not exist yet (e.g. when no dataset was
# downloaded), which would make open() fail.
results_path.parent.mkdir(parents=True, exist_ok=True)

# Shallow-copy each per-dataset metrics dict for serialization
save_results = {name: dict(results) for name, results in all_results.items()}

with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(save_results, f, indent=2)

print(f"Results saved to: {results_path}")

# Print summary table: one row per evaluated dataset
print("\n" + "="*70)
print("CLOTH DIVERSITY METRICS SUMMARY")
print("="*70)
print(f"{'Dataset':<15} {'Cosine Dist':<12} {'Eucl. Dist':<12} {'Entropy':<12} {'Variance':<12}")
print("-"*63)
for name, r in all_results.items():
    print(f"{name:<15} {r['avg_cosine_distance']:<12.4f} {r['avg_euclidean_distance']:<12.2f} {r['normalized_entropy']:<12.4f} {r['total_variance']:<12.2f}")
print("="*70)