# 01 - Data Exploration
Drive&Act Dataset - Kinect IR View

**Runtime:** CPU is sufficient for this notebook.

In [None]:
# Colab Setup
import os
IN_COLAB = 'COLAB_GPU' in os.environ or os.path.exists('/content')

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content
    !git clone https://github.com/batuhne/Driver-Activity-Recognition.git 2>/dev/null || true
    %cd Driver-Activity-Recognition
    !pip install -q -r requirements.txt
    DATA_ROOT = '/content/drive/MyDrive/DriveAndAct'
else:
    DATA_ROOT = './data'

In [None]:
import sys
sys.path.insert(0, '.' if not IN_COLAB else '/content/Driver-Activity-Recognition')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from src.utils import load_config, get_activity_labels, build_file_id_to_video_path
from src.dataset import parse_annotations

# Load the project configuration (load_config is called without arguments).
config = load_config()
if IN_COLAB:
    # On Colab the data root is overridden with DATA_ROOT from the setup cell;
    # otherwise the root stored in the loaded config is kept as-is.
    config['data']['root'] = DATA_ROOT

# Plot style used by the figures below; figures are rendered inline.
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

## 1. Annotation Analysis

In [None]:
# Read the main annotation CSV; its location is assembled from the data config.
data_cfg = config['data']
ann_path = os.path.join(
    data_cfg['root'],
    data_cfg['annotation_dir'],
    data_cfg['annotation_file'],
)
df = pd.read_csv(ann_path)

# Headline numbers first, then a preview of the first ten rows.
summary_lines = [
    f'Total segments: {len(df)}',
    f'Unique activities: {df["activity"].nunique()}',
    f'Participants: {sorted(df["participant_id"].unique())}',
    f'\nColumns: {list(df.columns)}',
]
print('\n'.join(summary_lines))
df.head(10)

## 2. Class Distribution

In [None]:
# Segments per activity; value_counts() orders them from most to least frequent.
class_counts = df['activity'].value_counts()
top_count, bottom_count = class_counts.iloc[0], class_counts.iloc[-1]
print(f'Most common: {class_counts.index[0]} ({top_count})')
print(f'Least common: {class_counts.index[-1]} ({bottom_count})')
print(f'Imbalance ratio: {top_count / bottom_count:.1f}x')

# Horizontal bar chart of the per-class segment counts.
fig, ax = plt.subplots(figsize=(12, 8))
class_counts.plot.barh(ax=ax, color='steelblue')
ax.set(xlabel='Number of Segments', title='Activity Class Distribution')
fig.tight_layout()
plt.show()

## 3. Segment Duration Analysis

In [None]:
# Segment durations: overall distribution and per-activity mean.
NATIVE_FPS = 30.0  # frame rate used to convert annotation frame counts to seconds
CLIP_FRAMES = 16   # clip length referenced by the marker line
CLIP_FPS = 5       # sampling rate referenced by the marker line
# Derive the marker position from the same numbers its label quotes
# (16 frames @ 5 fps span 3.2 s), so line and label cannot disagree.
clip_sec = CLIP_FRAMES / CLIP_FPS

df['duration_frames'] = df['frame_end'] - df['frame_start']
df['duration_sec'] = df['duration_frames'] / NATIVE_FPS

print('Duration stats (seconds):')
print(df['duration_sec'].describe())
# NOTE(review): these thresholds count annotation frames, not frames after
# sub-sampling to CLIP_FPS - confirm that this is the intended comparison.
print(f'\nSegments < 8 frames: {(df["duration_frames"] < 8).sum()}')
print(f'Segments < 16 frames: {(df["duration_frames"] < 16).sum()}')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of segment durations with the clip-length marker.
axes[0].hist(df['duration_sec'], bins=50, color='steelblue', edgecolor='white')
axes[0].set_xlabel('Duration (seconds)')
axes[0].set_ylabel('Count')
axes[0].set_title('Segment Duration Distribution')
axes[0].axvline(
    x=clip_sec,
    color='red',
    linestyle='--',
    label=f'{clip_sec:g}s ({CLIP_FRAMES} frames @ {CLIP_FPS}fps)',
)
axes[0].legend()

# Right panel: mean duration per activity, sorted ascending.
mean_dur = df.groupby('activity')['duration_sec'].mean().sort_values()
mean_dur.plot(kind='barh', ax=axes[1], color='coral')
axes[1].set_xlabel('Mean Duration (seconds)')
axes[1].set_title('Mean Segment Duration per Activity')
plt.tight_layout()
plt.show()

## 4. Split Statistics

In [None]:
# Build the dataset splits and label mappings from the annotations, then report
# how many segments and which participants each split contains.
splits, label_to_idx, idx_to_label = parse_annotations(config)
print(f'Number of classes: {len(label_to_idx)}')
for name, segs in splits.items():
    # Distinct participants contributing segments to this split.
    participants = {s["participant_id"] for s in segs}
    print(f'{name}: {len(segs)} segments, participants: {sorted(participants)}')

## 5. Sample Frame Visualization

In [None]:
# Show one mid-segment frame for up to six randomly chosen activities.
SAMPLE_SEED = 42  # fixed so the same activities are drawn on every run
file_id_to_video = build_file_id_to_video_path(config['data']['root'], config['data']['video_dir'])
print(f'Found {len(file_id_to_video)} video files')

# Draw up to 6 distinct activities; for each one the first matching training
# segment is displayed.
rng = np.random.default_rng(SAMPLE_SEED)
sample_activities = rng.choice(list(label_to_idx.keys()), size=min(6, len(label_to_idx)), replace=False)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Hide every axis up front so panels that are skipped or unused stay blank
# instead of showing empty ticks.
for ax in axes.flat:
    ax.axis('off')

for ax, activity in zip(axes.flat, sample_activities):
    # Title first, so a panel without a usable frame still names its activity.
    ax.set_title(activity.replace('_', ' '), fontsize=10)

    # First training segment annotated with this activity, if any.
    seg = next((s for s in splits['train'] if s['activity'] == activity), None)
    if seg is None:
        continue
    video_path = file_id_to_video.get(seg['file_id'])
    if video_path is None:
        continue

    # Seek to the middle of the segment and grab a single frame.
    mid_frame = (seg['frame_start'] + seg['frame_end']) // 2
    cap = cv2.VideoCapture(video_path)
    try:
        cap.set(cv2.CAP_PROP_POS_FRAMES, mid_frame)
        ret, frame = cap.read()
    finally:
        # Always free the decoder, even if seeking or reading raises.
        cap.release()

    if not ret:
        continue
    if frame.ndim == 2:
        ax.imshow(frame, cmap='gray')
    else:
        # OpenCV decodes to BGR channel order; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

plt.suptitle('Sample Frames from Different Activities', fontsize=14)
plt.tight_layout()
plt.show()