# 🔬 Advanced AV Simulation Analysis

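This notebook applies advanced analytics and machine learning to simulation data: summary and trend statistics on a time series, followed by k-means clustering of vehicle behavior.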

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/aanshshah/av-simulation/blob/main/examples/notebooks/05_advanced_analysis.ipynb)

## Setup Analysis Environment

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy import stats
import matplotlib.pyplot as plt
import os
import sys

# Use /content as the base directory when the working directory is under it
# or the COLAB_GPU environment variable is set; otherwise use the current
# working directory.
base_path = '/content' if ('/content' in os.getcwd() or 'COLAB_GPU' in os.environ) else os.getcwd()
repo_path = os.path.join(base_path, 'av-simulation')
sim_path = os.path.join(repo_path, 'src')

# Put the repository root and its src/ directory at the front of sys.path
# (only when they exist and are not already listed) so the project package
# can be imported below.
for path_entry in (repo_path, sim_path):
    if os.path.exists(path_entry) and path_entry not in sys.path:
        sys.path.insert(0, path_entry)

# The project data modules are optional for this notebook: a failed import
# is reported but does not stop the remaining cells from running.
try:
    from av_simulation.data.repository import DataRepository, SimulationData
    print('✅ Data modules imported successfully')
except ImportError as e:
    print(f'⚠️ Cannot import data modules: {e}')
    print('💡 Make sure to run 01_colab_setup.ipynb first')

# Candidate locations of the bundled sample run, checked in order; the first
# existing path wins.
sample_candidates = [
    os.path.join(repo_path, 'examples', 'notebooks', 'data', 'sample_ego_run.csv'),
    os.path.join(base_path, 'examples', 'notebooks', 'data', 'sample_ego_run.csv'),
    'examples/notebooks/data/sample_ego_run.csv'
]
sample_path = next((p for p in sample_candidates if os.path.exists(p)), None)

if sample_path:
    analysis_base_df = pd.read_csv(sample_path)
    print(f'✅ Loaded base dataset from {sample_path}')
else:
    print('⚠️ Sample data not found; generating synthetic dataset for demonstrations')
    # A locally seeded generator keeps the fallback dataset identical across
    # runs (matching the fixed random_state used for clustering later) and
    # leaves NumPy's global random state untouched.
    rng = np.random.default_rng(42)
    t = np.linspace(0, 10, 300)
    # Noise-free speed profile; acceleration is its numerical derivative.
    clean_speed = 20 + 5 * np.sin(t)
    analysis_base_df = pd.DataFrame({
        'timestamp': t,
        'speed': clean_speed + rng.normal(0, 1.5, len(t)),
        'acceleration': np.gradient(clean_speed, t),
        'pos_x': np.linspace(0, 150, len(t)),
        'pos_y': 4 * np.sin(t / 2)
    })

# analysis_base_df is a module-level name, so later cells can read it
# directly or through globals().get('analysis_base_df').
print('✅ Advanced analysis libraries loaded')


## Time Series Analysis

In [None]:
# Perform time series analysis
def analyze_time_series(data):
    """Summarize a one-dimensional series with mean, spread and linear trend.

    Args:
        data: Sized sequence of numeric samples (list, NumPy array or
            pandas Series), taken in order.

    Returns:
        dict with keys:
            'mean'  - ``np.mean`` of the samples,
            'std'   - ``np.std`` of the samples,
            'trend' - slope of a least-squares line fitted against the
                      sample index 0..n-1 (change per sample, not per
                      unit of time).
        For empty input all three values are NaN. For a single sample the
        trend is NaN because a slope needs at least two points.
    """
    n_samples = len(data)

    # Nothing to summarize: report NaN everywhere instead of letting the
    # regression fail on empty input.
    if n_samples == 0:
        return {'mean': np.nan, 'std': np.nan, 'trend': np.nan}

    # A regression line is only defined for two or more points.
    if n_samples >= 2:
        trend = stats.linregress(np.arange(n_samples), data).slope
    else:
        trend = np.nan

    results = {
        'mean': np.mean(data),
        'std': np.std(data),
        'trend': trend
    }
    return results

# Look the dataset up through globals() so this cell degrades gracefully
# when the setup cell has not been run.
base_df = globals().get('analysis_base_df')
if base_df is None or 'speed' not in base_df:
    print('❌ No base dataset available for analysis')
else:
    # Summarize the speed column of the base dataset.
    analysis = analyze_time_series(base_df['speed'])
    print(f'Analysis results: {analysis}')


## Machine Learning

In [None]:
# Apply clustering to simulation data
def cluster_behaviors(features, n_clusters=3):
    """Group feature rows into behavior clusters with k-means.

    Args:
        features: Feature data accepted by ``StandardScaler.fit_transform``;
            the caller in this notebook passes a 2-D array with one row
            per sample.
        n_clusters: Number of clusters to fit (default 3).

    Returns:
        Tuple ``(labels, model)``: the cluster label assigned to each row
        and the fitted ``KMeans`` estimator.
    """
    # Standardize the features before clustering.
    standardized = StandardScaler().fit_transform(features)

    # A fixed random_state keeps the cluster assignment reproducible.
    model = KMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(standardized)

    return labels, model

# Look the dataset up through globals() so this cell degrades gracefully
# when the setup cell has not been run.
base_df = globals().get('analysis_base_df')
if base_df is None or not {'speed', 'acceleration'}.issubset(base_df.columns):
    print('❌ Insufficient data for clustering')
else:
    # Cluster on speed and acceleration, then attach the label to each row.
    feature_matrix = base_df[['speed', 'acceleration']].to_numpy()
    clusters, model = cluster_behaviors(feature_matrix, n_clusters=3)
    base_df = base_df.assign(cluster=clusters)
    print('🤖 Clustering complete. Cluster counts:')
    print(base_df['cluster'].value_counts())

# Store the (possibly cluster-annotated) frame back under the shared name.
analysis_base_df = base_df
print('🤖 ML functions ready')
