# 📊 AV Simulation Data Analysis

Analyze simulation data collected from AV simulation runs.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aanshshah/av-simulation/blob/main/examples/notebooks/03_data_analysis.ipynb)

## Load Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Dynamic path setup: use /content when the working directory is under it or
# the COLAB_GPU environment variable is set; otherwise use the current directory.
running_in_colab = '/content' in os.getcwd() or 'COLAB_GPU' in os.environ
if running_in_colab:
    base_path = '/content'
else:
    base_path = os.getcwd()
repo_path = os.path.join(base_path, 'av-simulation')
sim_path = os.path.join(repo_path, 'src')

# Prepend the repository root and its src/ directory to sys.path, skipping
# entries that are already registered so re-running the cell adds nothing.
for path_entry in (repo_path, sim_path):
    if path_entry in sys.path:
        continue
    sys.path.insert(0, path_entry)
    print(f'📁 Added to Python path: {path_entry}')

# Import the project's data modules; a failure is reported but does not
# abort the cell.
try:
    from av_simulation.data.repository import DataRepository, SimulationData
    from av_simulation.data.exporters import CSVExporter
except ImportError as import_error:
    print(f'❌ Cannot import data modules: {import_error}')
    print('💡 Make sure to run 01_colab_setup.ipynb first')
else:
    print('✅ Data modules imported successfully')

print('✅ Libraries loaded')


## Load Data

In [None]:
# Locate the simulation data directory and list the recorded runs, newest
# first. Any failure leaves `available_runs` empty and `data_helper` as None.
if '/content' in os.getcwd() or 'COLAB_GPU' in os.environ:
    base_path = '/content'
else:
    base_path = os.getcwd()
data_path = os.path.join(base_path, 'simulation_data')

available_runs = []
data_helper = None

print(f'📁 Looking for data in: {data_path}')

try:
    os.makedirs(data_path, exist_ok=True)

    repo = DataRepository(data_path)
    data_helper = SimulationData(repo)
    runs = repo.get_all_runs()

    def run_key(run):
        """Return the sort key for a run: its end time, else its start time, else 0."""
        reference_time = run.end_time or run.start_time
        return reference_time.timestamp() if reference_time else 0

    # Sort a copy in place, most recent run first.
    runs_sorted = list(runs)
    runs_sorted.sort(key=run_key, reverse=True)
    available_runs = [run.run_id for run in runs_sorted]

    print('✅ Data repository initialized')
    print(f'📊 Found {len(available_runs)} simulation runs')

    if not available_runs:
        print('💡 No simulation runs found. Try running 02_simulation_runner.ipynb first')
    else:
        # Show at most the five newest runs, then summarize the remainder.
        for i, run_id in enumerate(available_runs[:5], start=1):
            print(f'  {i}. {run_id}')
        hidden_count = len(available_runs) - 5
        if hidden_count > 0:
            print(f'  ... and {hidden_count} more')

except Exception as err:
    print(f'❌ Error accessing data: {err}')
    print('💡 Solutions:')
    print('  1. Run 02_simulation_runner.ipynb to create data')
    print('  2. Check that the data directory exists')
    print('  3. Verify simulation completed successfully')
    # Reset anything that was partially populated before the failure.
    available_runs = []
    data_helper = None


## Analyze Performance

In [None]:
import os
# Analyze vehicle performance with enhanced error handling
import numpy as np
import pandas as pd

# Build `analysis_df` from the most recent repository run, if one exists.
# Both names stay None when no run is available or loading fails, which lets
# later code fall back to other data sources.
analysis_summary = None
analysis_df = None

if available_runs:
    try:
        latest_run = available_runs[0]
        print(f'📊 Analyzing run: {latest_run}')

        # Reuse the helper created by the data-loading cell when there is one;
        # only open a fresh repository when no helper is available, so an
        # unnecessary second repository cannot fail or duplicate work.
        if data_helper is None:
            repo = DataRepository(os.path.join(base_path, 'simulation_data'))
            helper = SimulationData(repo)
        else:
            helper = data_helper
        ego_snapshots = helper.get_ego_vehicle_data(latest_run)

        if ego_snapshots:
            # Order snapshots chronologically before deriving per-step values.
            ego_snapshots = sorted(ego_snapshots, key=lambda snap: snap.timestamp)
            analysis_df = pd.DataFrame({
                'timestamp': [snap.timestamp for snap in ego_snapshots],
                'speed': [snap.speed for snap in ego_snapshots],
                'acceleration': [snap.acceleration for snap in ego_snapshots],
                'pos_x': [snap.position_x for snap in ego_snapshots],
                'pos_y': [snap.position_y for snap in ego_snapshots],
            })
            # Euclidean distance between consecutive positions. `diff()` yields
            # NaN for the first row, so a single-snapshot run gets an all-NaN
            # column without needing a separate branch.
            analysis_df['step_distance'] = np.hypot(
                analysis_df['pos_x'].diff(),
                analysis_df['pos_y'].diff(),
            )
            analysis_summary = f'Loaded {len(analysis_df)} ego snapshots from repository run {latest_run[:8]}'
        else:
            print('❌ No data found for the selected run')
    except Exception as e:
        # Report the problem and carry on; the cell should not abort here.
        print(f'❌ Analysis error: {e}')
        print('💡 The data format might be different than expected')

# Fall back to the bundled sample CSV when no repository data was loaded.
if analysis_df is None:
    # Candidate locations, checked in order; the first existing file wins.
    sample_candidates = [
        os.path.join(base_path, 'av-simulation', 'examples', 'notebooks', 'data', 'sample_ego_run.csv'),
        os.path.join(base_path, 'examples', 'notebooks', 'data', 'sample_ego_run.csv'),
        'examples/notebooks/data/sample_ego_run.csv'
    ]
    sample_path = None
    for candidate in sample_candidates:
        if os.path.exists(candidate):
            sample_path = candidate
            break

    if sample_path is None:
        print('❌ No data available for analysis')
        print('💡 Run 02_simulation_runner.ipynb to generate fresh data')
    else:
        analysis_df = pd.read_csv(sample_path)
        analysis_summary = f'Using bundled sample data from {sample_path}'
        print(f'ℹ️ {analysis_summary}')

def _show_placeholder(message):
    """Write a centered grey notice into the current axes for a panel without data."""
    plt.text(0.5, 0.5, message,
             ha='center', va='center', transform=plt.gca().transAxes,
             fontsize=12, bbox=dict(boxstyle='round', facecolor='lightgray'))


# Print summary metrics and draw a 2x2 dashboard (speed over time, speed
# histogram, acceleration, trajectory) for whatever columns are present.
if analysis_df is not None:
    has_time = 'timestamp' in analysis_df
    has_speed = 'speed' in analysis_df

    print('📈 Performance Metrics:')
    if has_speed:
        print(f"  Average Speed: {analysis_df['speed'].mean():.2f} m/s")
        print(f"  Max Speed: {analysis_df['speed'].max():.2f} m/s")
        print(f"  Min Speed: {analysis_df['speed'].min():.2f} m/s")
        print(f"  Speed Std Dev: {analysis_df['speed'].std():.2f} m/s")

    if 'step_distance' in analysis_df and analysis_df['step_distance'].notna().any():
        total_distance = analysis_df['step_distance'].fillna(0).sum()
        print(f'  Total Distance: {total_distance:.2f} m')
        valid_steps = analysis_df['step_distance'].dropna()
        if not valid_steps.empty:
            print(f'  Average Distance per Step: {valid_steps.mean():.2f} m')

    if has_time and len(analysis_df['timestamp']) > 1:
        duration = analysis_df['timestamp'].iloc[-1] - analysis_df['timestamp'].iloc[0]
        print(f'  Simulation Duration: {duration:.2f} s')
        print(f'  Data Points: {len(analysis_df)}')
        if duration > 0:
            # N samples span N - 1 intervals between the first and last
            # timestamp, so the rate is (N - 1) / duration, not N / duration.
            avg_fps = (len(analysis_df) - 1) / duration
            print(f'  Average FPS: {avg_fps:.1f}')

    plt.figure(figsize=(12, 8))

    # Panel 1: speed over time.
    plt.subplot(2, 2, 1)
    if has_time and has_speed:
        plt.plot(analysis_df['timestamp'], analysis_df['speed'], 'b-', linewidth=2)
        plt.title('Speed Over Time')
        plt.xlabel('Time (s)')
        plt.ylabel('Speed (m/s)')
        plt.grid(True, alpha=0.3)
    else:
        _show_placeholder('Speed data not available')

    # Panel 2: speed histogram.
    plt.subplot(2, 2, 2)
    if has_speed:
        plt.hist(analysis_df['speed'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        plt.title('Speed Distribution')
        plt.xlabel('Speed (m/s)')
        plt.ylabel('Frequency')
        plt.grid(True, alpha=0.3)
    else:
        _show_placeholder('Speed data not available')

    # Panel 3: recorded acceleration, else acceleration derived from speed.
    # Both variants plot against 'timestamp', so both require that column.
    plt.subplot(2, 2, 3)
    has_acceleration = ('acceleration' in analysis_df
                        and analysis_df['acceleration'].notna().any())
    if has_acceleration and has_time:
        plt.plot(analysis_df['timestamp'], analysis_df['acceleration'], 'g-', linewidth=2)
        plt.title('Acceleration Over Time')
        plt.xlabel('Time (s)')
        plt.ylabel('Acceleration (m/s²)')
        plt.grid(True, alpha=0.3)
    elif has_speed and has_time and len(analysis_df) > 1:
        speed_delta = np.diff(analysis_df['speed'].to_numpy(dtype=float))
        time_delta = np.diff(analysis_df['timestamp'].to_numpy(dtype=float))
        # Leave NaN where consecutive timestamps are equal instead of
        # dividing by zero; NaN points simply appear as gaps in the line.
        derived_acc = np.full(speed_delta.shape, np.nan)
        np.divide(speed_delta, time_delta, out=derived_acc, where=time_delta != 0)
        plt.plot(analysis_df['timestamp'].iloc[1:], derived_acc, 'g-', linewidth=2)
        plt.title('Derived Acceleration')
        plt.xlabel('Time (s)')
        plt.ylabel('Acceleration (m/s²)')
        plt.grid(True, alpha=0.3)
    else:
        _show_placeholder('Acceleration data not available')

    # Panel 4: x/y trajectory.
    plt.subplot(2, 2, 4)
    if {'pos_x', 'pos_y'} <= set(analysis_df.columns):
        plt.plot(analysis_df['pos_x'], analysis_df['pos_y'], 'r-', linewidth=2)
        plt.title('Vehicle Trajectory')
        plt.xlabel('X Position (m)')
        plt.ylabel('Y Position (m)')
        plt.grid(True, alpha=0.3)
        plt.axis('equal')
    else:
        _show_placeholder('Position data\nnot available')
        plt.title('Vehicle Trajectory')

    plt.tight_layout()
    plt.show()

    print('✅ Analysis plots generated successfully')

# Explicitly publish the results in the global namespace.
globals()['analysis_df'] = analysis_df
globals()['analysis_summary'] = analysis_summary
