# 🏙️ Smart City Computer Vision - Data Analysis

This notebook provides data analysis and visualization tools for the Smart City Computer Vision project.

## 📋 Contents
1. Dataset Statistics
2. Class Distribution Analysis
3. Model Performance Visualization
4. Sample Data Visualization
5. Summary Report

In [None]:
# Import required libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
import glob

# Add parent directory to path
# The project-local `utils` package is imported right after this, so the
# append must stay ahead of the import below.
# NOTE(review): '..' is resolved against the kernel's working directory —
# confirm the notebook is always run from its own folder.
sys.path.append('..')
from utils.common import count_dataset_files, get_class_names

# Set style for plots: matplotlib's default style with seaborn's 'husl' palette
plt.style.use('default')
sns.set_palette('husl')
# IPython magic (not plain Python): render figures inline in the notebook output
%matplotlib inline

## 📊 Dataset Statistics

In [None]:
# Analyze dataset statistics for all models
# `models` and `dataset_stats` stay module-level on purpose: the later cells
# (class distribution, training results, samples, summary) read them.
models = ['garbage', 'helmet', 'traffic']
dataset_stats = {}

for model in models:
    dataset_path = f'../{model}-detection/data'
    # Models whose data directory is absent are left out of `dataset_stats`;
    # the summary cell then reports their dataset as not found.
    if not os.path.exists(dataset_path):
        continue

    # The loops below rely on count_dataset_files returning a mapping of
    # split name -> image count.
    stats = count_dataset_files(dataset_path)
    dataset_stats[model] = stats

    print(f"📁 {model.title()} Detection Dataset:")
    for split, count in stats.items():
        print(f"  {split}: {count} images")
    print(f"  Total: {sum(stats.values())} images\n")

# Create visualization
if dataset_stats:
    # Transpose so each model is a row (one bar group) and each split a column
    df = pd.DataFrame(dataset_stats).T
    df.plot(kind='bar', figsize=(10, 6), title='Dataset Distribution by Model Type')
    plt.ylabel('Number of Images')
    plt.xlabel('Model Type')
    plt.legend(title='Split')
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()
else:
    # Say so explicitly instead of silently producing no output at all
    print("No dataset directories found; nothing to plot.")

## 🎯 Class Distribution Analysis

In [None]:
# Analyze class distributions
def analyze_labels(model_name):
    """Count labelled objects per class in a model's training label files.

    Reads every ``*.txt`` file in
    ``../<model_name>-detection/data/train/labels``. Each non-blank line
    counts as one object whose class id is the first whitespace-separated
    token, parsed as an integer.

    Args:
        model_name: Dataset prefix, e.g. ``'garbage'``.

    Returns:
        ``(class_counts, total_objects)`` where ``class_counts`` is a dict
        mapping class id to object count, ordered by ascending class id, and
        ``total_objects`` is the sum of those counts. Returns ``None`` (after
        printing a message) when the labels directory does not exist.

    Raises:
        ValueError: If the first token of a non-blank line is not an integer.
    """
    from collections import Counter  # function-scope import: no new file-level dependency

    labels_dir = f'../{model_name}-detection/data/train/labels'

    if not os.path.exists(labels_dir):
        print(f"Labels directory not found: {labels_dir}")
        return None

    tally = Counter()
    for label_file in glob.glob(os.path.join(labels_dir, '*.txt')):
        # Pin the encoding so parsing does not depend on the platform default
        with open(label_file, 'r', encoding='utf-8') as f:
            # Iterate the file lazily rather than loading it with readlines()
            for line in f:
                fields = line.split()
                if fields:  # skip blank / whitespace-only lines
                    tally[int(fields[0])] += 1

    # glob order is filesystem dependent, so sort by class id to keep the
    # returned dict (and any plot built from it) stable between runs.
    class_counts = dict(sorted(tally.items()))
    return class_counts, sum(class_counts.values())

# Analyze each model
# One panel per entry in `models` (5 inches wide each, i.e. 15x5 for three
# models). squeeze=False always yields a 2-D array of axes, so indexing also
# works when `models` holds a single entry.
n_panels = max(len(models), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
axes = axes.ravel()

for idx, model in enumerate(models):
    ax = axes[idx]
    result = analyze_labels(model)
    if result is None:
        # No labels directory: label the panel instead of leaving blank axes
        ax.set_title(f'{model.title()} Detection\nNo labels found')
        ax.axis('off')
        continue

    class_counts, total = result
    # `.get` below relies on get_class_names returning a mapping keyed by
    # class id; ids without a name fall back to 'Class_<id>'.
    class_names = get_class_names(model)

    # Convert to readable names, in ascending class-id order so the bar
    # order is stable between runs
    readable_counts = {
        class_names.get(class_id, f'Class_{class_id}'): count
        for class_id, count in sorted(class_counts.items())
    }

    # Create bar plot
    classes = list(readable_counts.keys())
    counts = list(readable_counts.values())

    ax.bar(classes, counts)
    ax.set_title(f'{model.title()} Detection\nTotal Objects: {total}')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 📈 Model Performance Visualization

In [None]:
# Load and visualize training results
def plot_training_results(model_name):
    """Display the saved training-results image for a model, if present.

    Looks for ``../models/<model_name>/<model_name>_training_results.png``
    and shows it in a matplotlib figure. Prints a hint instead when the file
    is missing or cannot be decoded as an image.

    Args:
        model_name: Model prefix, e.g. ``'helmet'``.

    Returns:
        None.
    """
    results_path = f'../models/{model_name}/{model_name}_training_results.png'

    if not os.path.exists(results_path):
        print(f"Training results not found for {model_name}")
        print(f"Expected path: {results_path}")
        print("Train the model first to generate results.")
        return

    # cv2.imread reports failure by returning None rather than raising, so
    # guard here to avoid an opaque cv2.error inside cvtColor.
    img = cv2.imread(results_path)
    if img is None:
        print(f"Could not read training results image: {results_path}")
        return

    # OpenCV loads images as BGR; matplotlib expects RGB
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(12, 8))
    plt.imshow(img_rgb)
    plt.title(f'{model_name.title()} Detection - Training Results')
    plt.axis('off')
    plt.show()

# Display training results for each model
# (a model without a saved results image only prints a hint and is skipped)
for model in models:
    plot_training_results(model)

## 🖼️ Sample Data Visualization

In [None]:
# Display sample images from each dataset
def show_sample_images(model_name, num_samples=4):
    """Show a random selection of training images for one model.

    Picks up to ``num_samples`` distinct ``.jpg`` / ``.jpeg`` / ``.png``
    files from ``../<model_name>-detection/data/train/images`` and displays
    them side by side, each titled with its file name. Prints a message and
    returns when the directory is missing or holds no matching images.

    Args:
        model_name: Dataset prefix, e.g. ``'traffic'``.
        num_samples: Maximum number of images to display.

    Returns:
        None.
    """
    images_dir = f'../{model_name}-detection/data/train/images'

    if not os.path.exists(images_dir):
        print(f"Images directory not found: {images_dir}")
        return

    image_files = []
    for pattern in ('*.jpg', '*.jpeg', '*.png'):
        image_files.extend(glob.glob(os.path.join(images_dir, pattern)))

    if not image_files:
        print(f"No images found in {images_dir}")
        return

    # Select random samples (without replacement, so no image repeats)
    sample_files = np.random.choice(image_files, min(num_samples, len(image_files)), replace=False)

    # squeeze=False always returns an array of axes, even for a single image
    fig, axes = plt.subplots(1, len(sample_files), figsize=(15, 4), squeeze=False)

    for ax, img_path in zip(axes.ravel(), sample_files):
        # np.random.choice yields numpy strings; hand OpenCV a plain str
        img = cv2.imread(str(img_path))
        if img is None:
            # cv2.imread returns None for unreadable files: mark the panel
            # instead of crashing in cvtColor
            ax.text(0.5, 0.5, 'unreadable image', ha='center', va='center',
                    transform=ax.transAxes)
        else:
            # OpenCV loads images as BGR; matplotlib expects RGB
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        ax.set_title(os.path.basename(img_path))
        ax.axis('off')

    plt.suptitle(f'{model_name.title()} Detection - Sample Images')
    plt.tight_layout()
    plt.show()

# Show samples for each model
# Uses the default of up to 4 random training images per model; a model with
# a missing or empty images directory only prints a message.
for model in models:
    show_sample_images(model)

## 📋 Summary Report

In [None]:
# Generate a comprehensive summary report
# Reads `models` and `dataset_stats` from the dataset statistics cell, so
# that cell must have been run first.
print("🏙️ SMART CITY COMPUTER VISION PROJECT SUMMARY")
print("=" * 60)

for model in models:
    print(f"\n{model.upper()} DETECTION:")
    print("-" * 40)

    # Dataset info: `dataset_stats` only holds models whose data directory
    # existed when the statistics cell ran
    if model in dataset_stats:
        stats = dataset_stats[model]
        total_images = sum(stats.values())
        print(f"📊 Dataset: {total_images} total images")
        # Splits missing from the stats are reported as 0
        print(f"  - Train: {stats.get('train', 0)}")
        print(f"  - Valid: {stats.get('valid', 0)}")
        print(f"  - Test: {stats.get('test', 0)}")
    else:
        print("📊 Dataset: Not found")

    # Model info: a model counts as trained when its weights file exists
    model_path = f'../models/{model}/{model}_best.pt'
    if os.path.exists(model_path):
        print("🤖 Model: Trained ✅")
        file_size = os.path.getsize(model_path) / (1024 * 1024)  # bytes -> MB
        print(f"📦 Size: {file_size:.1f} MB")
    else:
        print("🤖 Model: Not trained ❌")

    # Classes
    class_names = get_class_names(model)
    print(f"🏷️ Classes: {list(class_names.values())}")

print("\n" + "=" * 60)
print("🚀 Ready for inference with demo.py!")
print("📖 See README.md for usage instructions.")