# EDA: Apartment JSONs Dataset

**Goal**: Understand the structure and quality of apartment data

## Questions to answer:
1. How many apartments have furniture?
2. What room types exist?
3. What furniture IDs are used?
4. Are there semantic errors (e.g., WC in living room)?
5. What are the distributions of room sizes?
6. How many furniture items per room on average?

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter, defaultdict

# Add utils to path
import sys
sys.path.append('../utils')

from json_processing import (
    load_apartment_json,
    extract_furniture_ids,
    extract_room_features
)

sns.set_theme(style='whitegrid')

## 1. Load Sample Data

**TODO**: Update `data_dir` to point to the directory that contains your apartment JSON files.

In [None]:
# TODO: Update this path!
data_dir = Path("path/to/apartment_jsons")

# Load first N apartments for exploration (start with 1000).
# The matches are sorted before truncating: glob yields entries in a
# filesystem-dependent order, so an unsorted slice would select a different
# sample on different machines/runs and make the statistics below
# non-reproducible.
json_files = sorted(data_dir.glob("*.json"))[:1000]

print(f"Found {len(json_files)} JSON files")
if not json_files:
    # The placeholder path above matches nothing; say so explicitly instead of
    # letting later cells run on an empty sample.
    print(f"⚠️ No JSON files found in {data_dir} - update `data_dir` before continuing")

In [None]:
# Peek at the raw structure of the first apartment file (skipped when the
# sample is empty). The pretty-printed dump is cut to its first 2000
# characters so a large file does not flood the cell output.
if json_files:
    example = load_apartment_json(json_files[0])
    preview = json.dumps(example, indent=2)
    print(preview[:2000])

## 2. Furniture Coverage Analysis

In [None]:
# Count apartments with / without furniture and the total number of items.
apartments_with_furniture = 0
apartments_without_furniture = 0
total_furniture_items = 0

for json_file in json_files:
    apt_data = load_apartment_json(json_file)
    # Used here as a mapping whose values are sized collections of furniture
    # IDs (one entry per room).
    furniture_by_room = extract_furniture_ids(apt_data)

    total_items = sum(len(ids) for ids in furniture_by_room.values())

    if total_items > 0:
        apartments_with_furniture += 1
        total_furniture_items += total_items
    else:
        apartments_without_furniture += 1

# The average is taken over apartments that have at least one item. Guard the
# division: with an empty sample, or a sample where nothing is furnished, the
# denominator is 0 and the unguarded expression raised ZeroDivisionError.
# This mirrors the guard used when the stats are exported.
if apartments_with_furniture > 0:
    avg_items_per_furnished_apartment = total_furniture_items / apartments_with_furniture
else:
    avg_items_per_furnished_apartment = 0

print(f"Apartments WITH furniture: {apartments_with_furniture}")
print(f"Apartments WITHOUT furniture: {apartments_without_furniture}")
print(f"Avg furniture items per apartment: {avg_items_per_furnished_apartment:.1f}")

## 3. Room Type Distribution

In [None]:
# Tally how often each room type occurs and collect the areas per type
# (the areas feed the size-distribution plots later in the notebook).
room_types = Counter()
room_areas = defaultdict(list)

for json_file in json_files:
    apt_data = load_apartment_json(json_file)

    # Flatten apartments -> compartments; missing keys count as "no entries".
    compartments = (
        compartment
        for apartment in apt_data.get("apartments", [])
        for compartment in apartment.get("compartments", [])
    )

    for comp in compartments:
        # Compartments without a CAD name are grouped under "unknown";
        # a missing area is recorded as 0.0.
        room_type = comp.get("name_cad", "unknown")
        room_types[room_type] += 1
        room_areas[room_type].append(comp.get("area", 0.0))

# Horizontal bar chart of the 20 most frequent room types
top_room_rows = room_types.most_common(20)
room_df = pd.DataFrame(top_room_rows, columns=['Room Type', 'Count'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=room_df, x='Count', y='Room Type', ax=ax)
ax.set_title('Top 20 Room Types')
plt.tight_layout()
plt.show()

## 4. Furniture ID Distribution

In [None]:
# Frequency of every furniture ID, plus frequency of each
# (room_type, furniture_id) combination for the semantic checks below.
furniture_ids = Counter()
room_furniture_pairs = Counter()

for json_file in json_files:
    apt_data = load_apartment_json(json_file)
    furniture_by_room = extract_furniture_ids(apt_data)

    for room_type, ids in furniture_by_room.items():
        # Generators are passed on purpose: Counter.update counts each yielded
        # element once, exactly like incrementing by 1 per item in a loop.
        furniture_ids.update(fid for fid in ids)
        room_furniture_pairs.update((room_type, fid) for fid in ids)

# Horizontal bar chart of the 30 most frequent furniture IDs
top_furniture_rows = furniture_ids.most_common(30)
furn_df = pd.DataFrame(top_furniture_rows, columns=['Furniture ID', 'Count'])

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=furn_df, x='Count', y='Furniture ID', ax=ax)
ax.set_title('Top 30 Furniture IDs')
plt.tight_layout()
plt.show()

print(f"\nTotal unique furniture IDs: {len(furniture_ids)}")

## 5. Semantic Validation

Check for suspicious room-furniture pairs (e.g., bathroom fixtures listed in a bedroom).

In [None]:
# Placeholder patterns for the semantic check - results need manual review.
# TODO: Update after seeing actual IDs
bathroom_furniture = ["sh", "wc", "sink", "bathtub"]  # Example IDs
bedroom_rooms = ["idBedroom", "idSypialnia"]  # Example room names

# A pair is flagged when the room type contains one of the bedroom name
# fragments (substring match) and the furniture ID is exactly one of the
# bathroom IDs. Example: bathroom furniture in bedroom.
suspicious_pairs = [
    (room_type, furn_id, count)
    for (room_type, furn_id), count in room_furniture_pairs.items()
    if any(br in room_type for br in bedroom_rooms) and furn_id in bathroom_furniture
]

if not suspicious_pairs:
    print("✅ No obvious semantic errors detected (or patterns not defined yet)")
else:
    print("⚠️ Suspicious room-furniture pairs:")
    # Only the first 10 flagged pairs are listed to keep the output short.
    for room, furn, cnt in suspicious_pairs[:10]:
        print(f"  {room} + {furn}: {cnt} occurrences")

## 6. Room Size Distribution by Type

In [None]:
# Plot area distributions for the (up to) five most common room types
top_rooms = [rt for rt, _ in room_types.most_common(5)]

if not top_rooms:
    # plt.subplots cannot build a grid with zero rows, so with no room data
    # (e.g. `data_dir` not configured yet) skip the figure instead of failing.
    print("No room data available - skipping area distribution plots")
else:
    # squeeze=False makes subplots return a 2-D array of Axes even for a
    # single row, so the one-room case needs no special handling.
    fig, axes = plt.subplots(
        len(top_rooms), 1, figsize=(10, len(top_rooms) * 3), squeeze=False
    )
    axes = axes[:, 0]  # single column -> 1-D sequence, one Axes per room type

    for ax, room_type in zip(axes, top_rooms):
        areas = room_areas[room_type]
        median_area = np.median(areas)  # computed once, used for line and label

        ax.hist(areas, bins=30, edgecolor='black')
        ax.set_title(f'{room_type} - Area Distribution')
        ax.set_xlabel('Area (m²)')
        ax.set_ylabel('Count')
        ax.axvline(median_area, color='red', linestyle='--',
                   label=f'Median: {median_area:.1f}m²')
        ax.legend()

    plt.tight_layout()
    plt.show()

## 7. Export Statistics for Pipeline Config

In [None]:
# Average item count over apartments that have furniture; 0 when there are none.
if apartments_with_furniture > 0:
    avg_furniture = total_furniture_items / apartments_with_furniture
else:
    avg_furniture = 0

# Summary of the exploration, written out for the pipeline configuration.
stats = {
    "total_apartments_scanned": len(json_files),
    "apartments_with_furniture": apartments_with_furniture,
    "apartments_without_furniture": apartments_without_furniture,
    "unique_room_types": len(room_types),
    "unique_furniture_ids": len(furniture_ids),
    "top_room_types": dict(room_types.most_common(10)),
    "top_furniture_ids": dict(furniture_ids.most_common(20)),
    "avg_furniture_per_apartment": avg_furniture,
}

# Save to configs (the target directory is created if it does not exist yet)
output_path = Path("../../datasets_pipelines/apartment_jsons_pipe/configs/eda_stats.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w') as f:
    stats_json = json.dumps(stats, indent=2)
    f.write(stats_json)

print(f"\n✅ Stats saved to {output_path}")
print(stats_json)

## Next Steps

Based on EDA results:
1. ✅ Identify all room types and furniture IDs
2. ✅ Check data quality (coverage, missing values)
3. 🔄 Create furniture catalog from SVG definitions
4. 🔄 Build data pipeline (1_collect → 2_validate → 3_process → 4_split)
5. 🔄 Start with Stage 1: Semantic model (room → furniture IDs)