In [None]:
# Clone Repository
!git clone https://github.com/YoussifKhaled/dinov2.git
%cd dinov2

In [None]:
# Uninstall Kaggle's broken PyTorch and install correct versions
!pip uninstall -y torch torchvision torchaudio
!pip install -r dinov2/fl/requirements.txt

In [None]:
# ‚ö†Ô∏è RESTART KERNEL NOW
# After restart, skip cells 1-2 and run from here

%cd /kaggle/working/dinov2

import torch
import torchvision
import os

print("=" * 60)
print("ENVIRONMENT VERIFICATION")
print("=" * 60)
print(f"PyTorch: {torch.__version__}")
print(f"Torchvision: {torchvision.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Verify dataset
BASE_PATH = "/kaggle/input/cityscapes-fine-dataset"  # ‚Üê UPDATE IF DIFFERENT
print(f"\nDataset: {BASE_PATH}")
print(f"Exists: {os.path.exists(BASE_PATH)}")
if os.path.exists(BASE_PATH):
    print(f"Contents: {os.listdir(BASE_PATH)}")
else:
    print("‚ùå ERROR: Dataset not found!")
    print("Go to: + Add Data ‚Üí Search 'cityscapes'")
    print("Then update BASE_PATH above")
print("=" * 60)

---

## Section 2: Extract DINOv2 Embeddings

**Time:** ~15-20 minutes on T4 GPU

Extracts 1024-dim CLS token embeddings from all Cityscapes training images.

In [None]:
# Extract embeddings using DINOv2 ViT-L/14
OUTPUT_DIR = "/kaggle/working/fl_settings"

!python -m dinov2.fl.scripts.run_extraction \
    --dataset_list_file train_fine.txt \
    --base_path {BASE_PATH} \
    --output_dir {OUTPUT_DIR} \
    --model_name dinov2_vitl14 \
    --batch_size 16

print("\n‚úì Embedding extraction complete")
print(f"Output: {OUTPUT_DIR}/embeddings.pth")

---

## Section 3: Generate All Three Settings

**Time:** ~2-3 minutes

Runs the complete pipeline to generate:
- Setting 1: IID (α=100, 10 clients)
- Setting 2: Non-IID (α=0.1, 10 clients)
- Setting 3: City-Based (α=0.5, 5 clients per city)

In [None]:
# Generate all three settings
!python -m dinov2.fl.scripts.generate_settings \
    --embeddings_path {OUTPUT_DIR}/embeddings.pth \
    --output_dir {OUTPUT_DIR} \
    --n_clusters 16 \
    --n_clients 10 \
    --n_clients_per_city 5 \
    --alpha_iid 100.0 \
    --alpha_noniid 0.1 \
    --alpha_city 0.5 \
    --base_path {BASE_PATH} \
    --seed 42

---

## Section 4: Verify Output Files

Check that all settings were generated correctly.

In [None]:
import json
import os

print("=" * 70)
print("GENERATED FILES")
print("=" * 70)

# List all files
files = sorted(os.listdir(OUTPUT_DIR))
pth_files = [f for f in files if f.endswith('.pth')]
json_files = [f for f in files if f.endswith('.json')]

# Both groups are reported the same way: heading, then one line per file
# with its size in MiB.
file_groups = [
    ("\n📁 PTH FILES:", pth_files),
    ("\n📄 JSON FILES (FL Training Settings):", json_files),
]
for heading, group in file_groups:
    print(heading)
    for f in group:
        fpath = os.path.join(OUTPUT_DIR, f)
        size_mb = os.path.getsize(fpath) / (1024 * 1024)
        print(f"  • {f:<45} {size_mb:>6.2f} MB")

print("\n" + "=" * 70)

---

## Section 5: Inspect Settings

Load and display statistics for each setting.

In [None]:
import json

def print_setting_stats(json_path, title):
    """Print summary statistics for one partition setting.

    Args:
        json_path: Path to a JSON file mapping client ids to objects that
            carry ``client_name``, ``num_samples`` and ``data`` entries.
        title: Heading printed above the statistics.

    Client ids are read from the file's own keys (ordered numerically when
    they are all integers) instead of being assumed to be exactly
    ``"0".."N-1"``. A file with no clients prints only the totals, and the
    sample-format preview uses the first client that actually has data.
    """
    with open(json_path, 'r') as f:
        data = json.load(f)

    client_ids = list(data)
    if all(cid.isdigit() for cid in client_ids):
        # Numeric order, so "10" follows "9" rather than "1".
        client_ids.sort(key=int)

    n_clients = len(client_ids)
    samples_per_client = [data[cid]['num_samples'] for cid in client_ids]
    total_samples = sum(samples_per_client)

    print("=" * 70)
    print(title)
    print("=" * 70)
    print(f"Total clients: {n_clients}")
    print(f"Total samples: {total_samples}")
    if n_clients == 0:
        # min()/max()/mean are undefined without clients.
        print()
        return

    mean_samples = total_samples / n_clients
    # Population standard deviation (divides by N, not N-1).
    std_samples = (
        sum((x - mean_samples) ** 2 for x in samples_per_client) / n_clients
    ) ** 0.5
    print("Samples per client:")
    print(f"  Min:  {min(samples_per_client)}")
    print(f"  Max:  {max(samples_per_client)}")
    print(f"  Mean: {mean_samples:.1f}")
    print(f"  Std:  {std_samples:.1f}")

    # Show first few clients
    print("\nFirst 5 clients:")
    for cid in client_ids[:5]:
        client = data[cid]
        print(f"  Client {cid} ({client['client_name']}): {client['num_samples']} samples")

    # Show sample data format, taken from the first client that has any data
    # (a client may legitimately be empty).
    print("\nSample data format (first entry):")
    first_sample = next(
        (data[cid]['data'][0] for cid in client_ids if data[cid]['data']),
        None,
    )
    if first_sample is None:
        print("  (no samples)")
    else:
        print(f"  Image: {first_sample[0]}")
        print(f"  Label: {first_sample[1]}")
    print()

# Display statistics for all settings
print_setting_stats(f"{OUTPUT_DIR}/setting1_iid.json", "SETTING 1 - IID (α=100)")
print_setting_stats(f"{OUTPUT_DIR}/setting2_noniid.json", "SETTING 2 - Non-IID (α=0.1)")
print_setting_stats(f"{OUTPUT_DIR}/setting3_city_based.json", "SETTING 3 - City-Based Non-IID")

---

## Section 6: Visualize Heterogeneity

Compare sample distributions across the three settings.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import json

# Load all settings: (panel title, partition file, bar colour)
settings = [
    ("Setting 1: IID", f"{OUTPUT_DIR}/setting1_iid.json", "steelblue"),
    ("Setting 2: Non-IID", f"{OUTPUT_DIR}/setting2_noniid.json", "coral"),
    ("Setting 3: City-Based", f"{OUTPUT_DIR}/setting3_city_based.json", "mediumseagreen"),
]

MAX_CITY_CLIENTS_SHOWN = 30  # x-axis cut-off for the city-based panel

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, (title, path, color) in enumerate(settings):
    with open(path, 'r') as f:
        data = json.load(f)

    n_clients = len(data)
    samples = [data[str(i)]['num_samples'] for i in range(n_clients)]
    mean_samples = np.mean(samples)  # computed once, used for line and label

    # Bar plot with the mean drawn as a dashed reference line
    ax = axes[idx]
    ax.bar(range(n_clients), samples, color=color, alpha=0.7, edgecolor='black', linewidth=0.5)
    ax.axhline(y=mean_samples, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_samples:.0f}')

    ax.set_xlabel("Client ID", fontsize=12)
    ax.set_ylabel("Number of Samples", fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

    # Limit x-axis for city-based (too many clients)
    if idx == 2:
        ax.set_xlim(-1, min(MAX_CITY_CLIENTS_SHOWN, n_clients))
        # Only claim truncation in the label when bars are actually hidden.
        if n_clients > MAX_CITY_CLIENTS_SHOWN:
            ax.set_xlabel("Client ID (showing first 30)", fontsize=12)

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/settings_comparison.png", dpi=150, bbox_inches='tight')
plt.show()

print("\n📊 INTERPRETATION:")
print("  • Setting 1 (IID): Uniform bars = balanced distribution")
print("  • Setting 2 (Non-IID): Uneven bars = imbalanced clients")
print("  • Setting 3 (City-Based): 90 clients (5 per city) with geographic isolation")

---

## Section 7: Export for Download

Zip all files for easy download.

In [None]:
# Create zip archive
!cd /kaggle/working && zip -r fl_settings_complete.zip fl_settings/

print("\n" + "=" * 70)
print("‚úÖ PIPELINE COMPLETE")
print("=" * 70)
print("\nüì¶ Download: /kaggle/working/fl_settings_complete.zip")
print("\nContains:")
print("  ‚Ä¢ 3 JSON files (for FL training)")
print("    - setting1_iid.json")
print("    - setting2_noniid.json")
print("    - setting3_city_based.json")
print("  ‚Ä¢ 3 PTH files (for analysis)")
print("  ‚Ä¢ Embeddings and clusters")
print("  ‚Ä¢ Visualization")
print("=" * 70)

---

## Summary

### Output Files

**JSON Files (Ready for FL Training):**
- `setting1_iid.json` - Near-IID baseline (α=100, 10 clients)
- `setting2_noniid.json` - Extreme heterogeneity (α=0.1, 10 clients)
- `setting3_city_based.json` - Geographic partitioning (90 clients, 5 per city)

### JSON Format

Each file follows the `city_partitions.json` format:

```json
{
    "0": {
        "client_name": "client_0",
        "num_samples": 297,
        "data": [
            ["leftImg8bit/train/aachen/...", "gtFine/train/aachen/..."],
            ...
        ]
    },
    ...
}
```

### Next Steps

1. Download `fl_settings_complete.zip`
2. Use JSON files as input to your FL training framework
3. Compare model performance across the three settings
4. Analyze how data heterogeneity affects convergence and accuracy