# RadioMapSeer Dataset Exploration

This notebook explores the RadioMapSeer dataset to understand:
1. Data format and structure
2. Value distributions (pathloss ranges)
3. City map characteristics
4. Transmitter locations
5. Preparation for trajectory sampling

In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import json
from collections import defaultdict

# Add src to path
# The notebook is assumed to be started from a sub-directory of the repository
# (e.g. `notebooks/`), so `src/` and `data/` are siblings of the working dir.
_src_dir = str(Path.cwd().parent / 'src')
if _src_dir not in sys.path:  # re-running this cell must not stack duplicates
    sys.path.insert(0, _src_dir)

# Set up plotting
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'; older
# releases only know the un-prefixed name. Pick whichever this install ships
# and silently keep the default style when neither is available.
for _style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style in plt.style.available:
        plt.style.use(_style)
        break
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Dataset path
DATA_DIR = Path.cwd().parent / 'data' / 'raw' / 'RadioMapSeer'
print(f"Looking for data in: {DATA_DIR}")
print(f"Data directory exists: {DATA_DIR.exists()}")

## 1. Dataset Structure Discovery

First, let's explore what files and folders exist in the dataset.

In [None]:
def explore_directory(path: Path, max_depth: int = 3, current_depth: int = 0) -> dict:
    """Build a nested description of the directory tree rooted at ``path``.

    Args:
        path: Directory to inspect.
        max_depth: Number of directory levels to descend into.
        current_depth: Depth of ``path`` itself; used by the recursion.

    Returns:
        ``{'files': [names...], 'dirs': {name: <same structure>}}`` built from
        the first 20 entries (in sorted order) of each directory. An empty
        dict is returned for a missing path or once ``max_depth`` is reached.
    """
    if not (path.exists() and current_depth < max_depth):
        return {}

    # Only the first 20 sorted entries (files and directories together) are kept.
    entries = sorted(path.iterdir())[:20]

    file_names = [entry.name for entry in entries if entry.is_file()]
    sub_trees = {
        entry.name: explore_directory(entry, max_depth, current_depth + 1)
        for entry in entries
        if not entry.is_file() and entry.is_dir()
    }
    return {'files': file_names, 'dirs': sub_trees}

# Dump the directory tree as JSON, or tell the user how to obtain the data.
if not DATA_DIR.exists():
    print("⚠️  Dataset not found! Please download first:")
    print("   python scripts/download_data.py --method manual")
else:
    structure = explore_directory(DATA_DIR)
    print("Dataset structure (first 20 items per level):")
    print(json.dumps(structure, indent=2))

In [None]:
# Count all files by type
def count_files(path: Path) -> tuple:
    """Count files below ``path`` by extension and add up their sizes.

    Args:
        path: Root directory; it is walked recursively.

    Returns:
        A ``(counts, size_gb)`` tuple. ``counts`` is a plain ``dict`` mapping
        each lower-cased suffix (``''`` for files without one) to the number
        of files carrying it; ``size_gb`` is the combined size in units of
        1024**3 bytes. A missing ``path`` yields ``({}, 0.0)``.
    """
    if not path.exists():
        return {}, 0.0

    counts = defaultdict(int)
    total_bytes = 0

    for root, _dirs, files in os.walk(path):
        for name in files:
            counts[Path(name).suffix.lower()] += 1
            try:
                total_bytes += (Path(root) / name).stat().st_size
            except OSError:
                # Dangling symlinks (or files removed mid-walk) are still
                # counted above but cannot contribute a size.
                continue

    return dict(counts), total_bytes / (1024**3)  # Size in GB

# Report the per-extension file counts (most frequent first) and the total size.
if DATA_DIR.exists():
    file_counts, total_gb = count_files(DATA_DIR)
    print("File counts by extension:")
    # Sorting on the negated count gives descending order by frequency.
    for ext, count in sorted(file_counts.items(), key=lambda x: -x[1]):
        # An empty extension string is displayed as "(no ext)".
        print(f"  {ext or '(no ext)'}: {count:,}")
    print(f"\nTotal size: {total_gb:.2f} GB")

## 2. Load and Visualize Sample Data

Let's load a few samples to understand the data format.

In [None]:
def find_sample_files(data_dir: Path, pattern: str = "*.png", n: int = 5) -> list:
    """Return up to ``n`` paths below ``data_dir`` whose name matches ``pattern``.

    The recursive glob is consumed lazily and stops after ``n`` hits, so a
    handful of samples can be taken from a very large tree without listing
    every file first.

    Args:
        data_dir: Directory to search recursively.
        pattern: Glob pattern handed to ``Path.rglob``.
        n: Maximum number of paths to return; values below 1 give ``[]``.

    Returns:
        A list with at most ``n`` matching paths, in the order the glob
        produces them (filesystem dependent, not sorted).
    """
    from itertools import islice  # local import keeps the cell self-contained

    return list(islice(data_dir.rglob(pattern), max(n, 0)))

# Find PNG files
# Collects up to 10 PNG paths into `png_files`, which the following cells use
# as their sample set. `png_files` is only defined when DATA_DIR exists.
if DATA_DIR.exists():
    png_files = find_sample_files(DATA_DIR, "*.png", 10)
    print(f"Found {len(png_files)} sample PNG files:")
    for f in png_files:
        # Show paths relative to the dataset root to keep the listing short.
        print(f"  {f.relative_to(DATA_DIR)}")

In [None]:
def load_and_analyze_png(filepath: Path) -> dict:
    """Load an image file and collect basic statistics about its pixels.

    Args:
        filepath: Path of the image to open with PIL.

    Returns:
        A dict with the file name (``path``), the array ``shape`` and
        ``dtype``, the PIL ``mode``, the ``min``/``max``/``mean``/``std`` over
        all array elements, the number of distinct element values
        (``unique_values``) and the pixel data itself (``array``).
    """
    # The context manager releases the underlying file handle right away
    # instead of leaving it to the garbage collector.
    with Image.open(filepath) as img:
        mode = img.mode
        arr = np.array(img)

    return {
        'path': filepath.name,
        'shape': arr.shape,
        'dtype': str(arr.dtype),
        'mode': mode,
        'min': arr.min(),
        'max': arr.max(),
        'mean': arr.mean(),
        'std': arr.std(),
        'unique_values': len(np.unique(arr)),
        'array': arr
    }

# Analyze sample files
# Prints shape, mode, dtype and value statistics for the first five samples.
# The `png_files` check is only reached when DATA_DIR exists (short-circuit).
if DATA_DIR.exists() and png_files:
    print("Analyzing sample PNG files:\n")
    for f in png_files[:5]:
        info = load_and_analyze_png(f)
        print(f"File: {info['path']}")
        print(f"  Shape: {info['shape']}, Mode: {info['mode']}, Dtype: {info['dtype']}")
        print(f"  Values: min={info['min']}, max={info['max']}, mean={info['mean']:.2f}")
        print(f"  Unique values: {info['unique_values']}")
        print()

In [None]:
# Visualize sample images
# Shows up to six samples on a grid with two rows and at most three columns.
if DATA_DIR.exists() and png_files:
    shown_files = png_files[:6]
    # squeeze=False always returns a 2-D array of Axes, so flattening works
    # for any column count, including the single-column grid produced when
    # only one PNG was found.
    fig, axes = plt.subplots(
        2, min(3, len(png_files)), figsize=(15, 10), squeeze=False
    )
    axes = axes.flatten()

    for ax, fpath in zip(axes, shown_files):
        # Close the file handle as soon as the pixels are copied out.
        with Image.open(fpath) as pil_img:
            img = np.array(pil_img)
        if img.ndim == 2:  # Grayscale
            im = ax.imshow(img, cmap='viridis')
            plt.colorbar(im, ax=ax, fraction=0.046)
        else:  # RGB
            ax.imshow(img)
        ax.set_title(fpath.name[:30], fontsize=10)
        ax.axis('off')

    # Grid cells without an image would otherwise render as empty frames.
    for ax in axes[len(shown_files):]:
        ax.axis('off')

    plt.tight_layout()
    plt.suptitle('Sample Images from Dataset', y=1.02)
    plt.show()

## 3. Understand City Map / Floor Plan Encoding

The city maps should show buildings vs streets. Let's understand the color encoding.

In [None]:
def analyze_city_map_colors(img: np.ndarray) -> dict:
    """Summarise which values or colors occur in an image and how often.

    Args:
        img: 2-D array (grayscale) or array whose last axis holds the color
            channels.

    Returns:
        For 2-D input: ``{'type': 'grayscale', 'unique_values': int,
        'top_values': [(value, count), ...]}``.
        Otherwise: ``{'type': 'rgb', 'unique_colors': int,
        'top_colors': [(color_tuple, count), ...]}``.
        Both lists hold at most the 10 most frequent entries, most frequent
        first; entries with equal counts keep ascending value order.
    """
    if img.ndim == 2:
        # Grayscale
        values, counts = np.unique(img, return_counts=True)
        # Rank by frequency, like the RGB branch below; a stable sort keeps
        # the result deterministic when counts tie.
        order = np.argsort(-counts, kind='stable')[:10]
        return {
            'type': 'grayscale',
            'unique_values': len(values),
            'top_values': [(values[i], counts[i]) for i in order],
        }

    # RGB: treat every pixel as one row of channel values
    pixels = img.reshape(-1, img.shape[-1])
    colors, counts = np.unique(pixels, axis=0, return_counts=True)

    # Sort by frequency
    order = np.argsort(-counts, kind='stable')[:10]
    top_colors = [(tuple(colors[i]), counts[i]) for i in order]

    return {
        'type': 'rgb',
        'unique_colors': len(colors),
        'top_colors': top_colors
    }

# Look for city map files
if DATA_DIR.exists():
    from itertools import islice

    # Try to find city/building map files
    city_patterns = ['*city*.png', '*map*.png', '*building*.png', '*floor*.png']
    city_files = []
    _seen = set()
    for pattern in city_patterns:
        # Take at most five hits per pattern without listing every match.
        for candidate in islice(DATA_DIR.rglob(pattern), 5):
            # A name such as "city_map.png" matches several patterns; report
            # and count each file only once.
            if candidate not in _seen:
                _seen.add(candidate)
                city_files.append(candidate)

    print(f"Found {len(city_files)} potential city map files")
    for f in city_files[:3]:
        print(f"  {f.relative_to(DATA_DIR)}")

## 4. Pathloss Value Distribution

According to documentation:
- Min pathloss: -186 dB
- Max pathloss: -47 dB
- PNG encoding: linear mapping onto gray levels 0-255 (0 ↔ -186 dB, 255 ↔ -47 dB)

In [None]:
# Define conversion functions based on documentation
PL_MIN = -186  # dB
PL_MAX = -47   # dB
PL_RANGE = PL_MAX - PL_MIN  # 139 dB

def png_to_db(png_value: np.ndarray) -> np.ndarray:
    """Convert PNG grayscale (0-255) to pathloss in dB.

    Gray level 0 maps to ``PL_MIN`` and 255 to ``PL_MAX``, linearly in between.

    Args:
        png_value: Gray levels in the range 0-255.

    Returns:
        Floating-point pathloss values in dB.
    """
    return (png_value / 255.0) * PL_RANGE + PL_MIN

def db_to_png(db_value: np.ndarray) -> np.ndarray:
    """Convert pathloss in dB to PNG grayscale (0-255).

    Inverse of ``png_to_db``. Values are rounded to the nearest gray level, so
    converting a gray level to dB and back returns the same level, and inputs
    outside ``[PL_MIN, PL_MAX]`` are clipped to 0 or 255.

    Args:
        db_value: Pathloss values in dB.

    Returns:
        ``uint8`` array of gray levels.
    """
    scaled = (db_value - PL_MIN) / PL_RANGE * 255
    # Round before the integer cast: a plain astype() truncates, which turns
    # e.g. 126.9999 (float error on the way back from dB) into 126.
    return np.clip(np.rint(scaled), 0, 255).astype(np.uint8)

# Test conversion
# Converts the lowest, a middle and the highest gray level and prints the
# result for a visual sanity check against PL_MIN / PL_MAX.
test_png = np.array([0, 127, 255])
test_db = png_to_db(test_png)
print("PNG to dB conversion test:")
print(f"  PNG values: {test_png}")
print(f"  dB values:  {test_db}")

In [None]:
# Analyze pathloss distribution across multiple maps
# Pools the pixels of up to 20 sample images and plots their histogram twice:
# as raw gray levels and converted to dB with png_to_db.
if DATA_DIR.exists() and png_files:
    # Find files that look like radio/pathloss maps (grayscale)
    # NOTE(review): only the first 50 PNGs returned by the glob are examined;
    # whether those contain radio maps depends on the directory layout.
    radio_maps = []
    for f in find_sample_files(DATA_DIR, "*.png", 50):
        img = np.array(Image.open(f))
        # Keep only 2-D arrays of exactly 256x256 pixels.
        if len(img.shape) == 2 and img.shape == (256, 256):  # Grayscale, correct size
            radio_maps.append(img)
            # Stop once 20 maps have been collected.
            if len(radio_maps) >= 20:
                break
    
    if radio_maps:
        # One flat vector holding every pixel of every collected map.
        all_values = np.concatenate([m.flatten() for m in radio_maps])
        all_db = png_to_db(all_values)
        
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        
        # PNG value histogram
        axes[0].hist(all_values, bins=50, edgecolor='black', alpha=0.7)
        axes[0].set_xlabel('PNG Value (0-255)')
        axes[0].set_ylabel('Frequency')
        axes[0].set_title('Distribution of PNG Values')
        
        # dB value histogram
        axes[1].hist(all_db, bins=50, edgecolor='black', alpha=0.7, color='orange')
        axes[1].set_xlabel('Pathloss (dB)')
        axes[1].set_ylabel('Frequency')
        axes[1].set_title('Distribution of Pathloss Values')
        
        plt.tight_layout()
        plt.show()
        
        # Observed ranges, to compare against the documented PL_MIN / PL_MAX.
        print(f"Analyzed {len(radio_maps)} radio maps")
        print(f"PNG range: [{all_values.min()}, {all_values.max()}]")
        print(f"dB range: [{all_db.min():.1f}, {all_db.max():.1f}]")

## 5. Walkable Area Extraction (For Trajectory Sampling)

To generate trajectories, we need to identify walkable areas (streets) vs obstacles (buildings).

In [None]:
def extract_walkable_mask(city_map: np.ndarray, method: str = 'threshold') -> np.ndarray:
    """Derive a binary walkable/obstacle mask from a city map.

    Args:
        city_map: Grayscale (2-D) map, or a 3-D map whose last axis is
            averaged into a single gray channel first.
        method: ``'threshold'`` marks every pixel brighter than the median
            gray level as walkable. Any other value (including ``'color'``,
            which is not implemented yet) marks everything as walkable.

    Returns:
        ``uint8`` array shaped like the gray image: 1 = walkable, 0 = obstacle.
    """
    has_channels = city_map.ndim == 3
    gray = city_map.mean(axis=-1) if has_channels else city_map

    if method != 'threshold':
        # TODO: Color-based extraction for RGB maps
        return np.ones_like(gray, dtype=np.uint8)

    # Placeholder rule: assumes streets are the lighter half of the image.
    # The real cut-off depends on how the dataset encodes buildings.
    median_level = np.percentile(gray, 50)
    walkable = gray > median_level
    return walkable.astype(np.uint8)

# Test on a sample
# Shows the first 3-D (multi-channel) sample next to its preliminary mask.
# Nothing is plotted when none of the samples has three dimensions.
if DATA_DIR.exists() and png_files:
    # Try with first RGB image or city map
    for f in png_files:
        img = np.array(Image.open(f))
        if len(img.shape) == 3:  # RGB
            mask = extract_walkable_mask(img)
            
            fig, axes = plt.subplots(1, 2, figsize=(12, 5))
            axes[0].imshow(img)
            axes[0].set_title('Original City Map')
            axes[1].imshow(mask, cmap='gray')
            axes[1].set_title('Walkable Mask (preliminary)')
            plt.show()
            # Only the first matching sample is visualised.
            break

## 6. Summary Statistics

Compile key statistics about the dataset.

In [None]:
def compute_dataset_summary(data_dir: Path) -> dict:
    """Compute comprehensive dataset summary.

    Args:
        data_dir: Dataset root; it is walked recursively.

    Returns:
        A dict with ``exists``, ``total_files``, ``png_files``,
        ``json_files`` (extensions matched case-insensitively),
        ``total_size_gb`` (bytes / 1024**3), ``image_shapes`` (sorted list of
        the PIL ``size`` tuples of the first 10 PNGs encountered) and
        ``cities`` (sorted list of directory names that match a known city).
        ``image_shapes`` and ``cities`` are lists even when ``data_dir`` is
        missing.
    """
    known_cities = {'ankara', 'berlin', 'glasgow', 'ljubljana', 'london', 'telaviv', 'tel_aviv'}
    summary = {
        'exists': data_dir.exists(),
        'total_files': 0,
        'png_files': 0,
        'json_files': 0,
        'total_size_gb': 0,
        'image_shapes': [],
        'cities': [],
    }

    if not summary['exists']:
        return summary

    shapes = set()
    cities = set()
    total_bytes = 0

    for root, dirs, files in os.walk(data_dir):
        # Try to identify cities from directory names
        cities.update(d for d in dirs if d.lower() in known_cities)

        for f in files:
            fpath = Path(root) / f
            summary['total_files'] += 1
            try:
                total_bytes += fpath.stat().st_size
            except OSError:
                # Dangling symlink or file removed mid-walk: it is counted
                # above but has no size and cannot be opened.
                continue

            lowered = f.lower()  # match '.PNG' / '.JSON' as well
            if lowered.endswith('.png'):
                summary['png_files'] += 1
                # Sample some to get shapes
                if summary['png_files'] <= 10:
                    # Only the header is needed for `.size`; the context
                    # manager closes the handle instead of leaking it.
                    with Image.open(fpath) as img:
                        shapes.add(img.size)
            elif lowered.endswith('.json'):
                summary['json_files'] += 1

    summary['total_size_gb'] = total_bytes / (1024**3)
    # Sorted so that repeated runs print the same thing.
    summary['cities'] = sorted(cities)
    summary['image_shapes'] = sorted(shapes)

    return summary

# Print the summary table, or a hint when the dataset has not been downloaded.
if not DATA_DIR.exists():
    print("Dataset not found. Please download first.")
else:
    summary = compute_dataset_summary(DATA_DIR)
    print("=" * 50)
    print("DATASET SUMMARY")
    print("=" * 50)
    print(f"Total files: {summary['total_files']:,}")
    print(f"PNG files: {summary['png_files']:,}")
    print(f"JSON files: {summary['json_files']:,}")
    print(f"Total size: {summary['total_size_gb']:.2f} GB")
    print(f"Image shapes: {summary['image_shapes']}")
    print(f"Cities found: {summary['cities']}")

## 7. Next Steps

Based on the exploration above:

1. **Floor Plan Processing**: Implement proper walkable area extraction based on observed color encoding
2. **Trajectory Generation**: Generate realistic pedestrian trajectories on streets
3. **Data Pipeline**: Create PyTorch Dataset that loads maps and generates trajectories
4. **Visualization**: Create overlay visualizations of trajectories on city maps

See notebook `02_trajectory_visualization.ipynb` for trajectory examples.

In [None]:
# Closing message plus the checklist of findings still to be written down.
print(
    "Notebook complete!",
    "\nKey findings to document:",
    "- [ ] Actual file structure and naming convention",
    "- [ ] Color encoding for city maps (buildings vs streets)",
    "- [ ] Pathloss value range in actual data",
    "- [ ] Transmitter location format",
    "- [ ] Any preprocessing needed",
    sep="\n",
)