# Dataset Analysis for Video LoRA Training

---

## ⚠️ CRITICAL INSTRUCTIONS

**THE AI ASSISTANT MUST NOT DIRECTLY EVALUATE, INTERPRET, OR VIEW THE CONTENT OF ANY VIDEO/IMAGE DATA.**

All processing must be:
- Fully automated through scripts
- Based on statistical outputs and metadata only
- Without human or AI review of actual visual content

---

## Notebook Sections

1. **VLM Captioning** - Generate structured captions via vLLM
2. **Emotion Analysis** - Visualize pain/pleasure metrics from preprocessing
3. **Category Extraction** - Parse VLM output for character traits and actions
4. **Vectorization** - Create sentence embeddings
5. **Clustering** - HDBSCAN analysis and visualization
6. **Prioritization** - Composite scoring based on primary targets
7. **Curation** - Final dataset selection and export

## Setup & Imports

In [None]:
# Standard library
import json
import sys
import re
from pathlib import Path
from collections import Counter
from typing import Dict, List, Optional, Tuple

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# ML / NLP
from sentence_transformers import SentenceTransformer
import hdbscan
import umap
from sklearn.preprocessing import StandardScaler

# Progress
from tqdm.notebook import tqdm

# Configure plotting
# Select the matplotlib style sheet used by every figure drawn later in the notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require matplotlib >= 3.6 - confirm
# against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Set seaborn's default colour palette to 'husl'.
sns.set_palette('husl')
# IPython magic: only valid inside a notebook/IPython kernel, not in a plain Python script.
%matplotlib inline

# Reaching this line means all imports and plotting settings above ran without raising.
print("Imports complete!")

In [None]:
# Path configuration
# The project root is the parent of the current working directory, made absolute.
BASE_DIR = Path("..").resolve()

# Locations used by the rest of the notebook, all directly under the project root.
ANALYSIS_DIR = BASE_DIR.joinpath("analysis")
CURATED_DIR = BASE_DIR.joinpath("curated")
VLM_DIR = BASE_DIR.joinpath("vlm_copies")
SCENES_DIR = BASE_DIR.joinpath("scenes")

# Only the two output directories are created here; VLM_DIR and SCENES_DIR are
# left untouched.
for _output_dir in (ANALYSIS_DIR, CURATED_DIR):
    _output_dir.mkdir(exist_ok=True)

# Put the directory above the project root at the front of the import search path.
sys.path.insert(0, str(BASE_DIR.parent))

print(f"Base directory: {BASE_DIR}")
print(f"Analysis directory: {ANALYSIS_DIR}")

## Load Preprocessing Results

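Load the detection and emotion analysis results written by the preprocessing scripts (`person_detector.py` and `emotion_detector.py`) to `analysis/detections.json` and `analysis/emotions.json`.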

In [None]:
# Load detection results
# Reads analysis/detections.json into `detections_data`. The code below only
# relies on the payload being a dict with optional 'analyses', 'config' and
# 'summary' keys. When the file is missing, an empty placeholder is used so
# that later cells can still run.
detections_file = ANALYSIS_DIR / "detections.json"

if detections_file.exists():
    # JSON is UTF-8 by specification; pinning the encoding stops the platform
    # locale (e.g. cp1252 on Windows) from mis-decoding non-ASCII paths or text.
    with open(detections_file, encoding="utf-8") as f:
        detections_data = json.load(f)
    print(f"Loaded {len(detections_data.get('analyses', []))} scene analyses")
    print(f"Config: {detections_data.get('config', {})}")
    print(f"Summary: {detections_data.get('summary', {})}")
else:
    print(f"WARNING: Detection file not found: {detections_file}")
    print("Run person_detector.py first!")
    # Same shape as a real payload, just with no analyses.
    detections_data = {'analyses': []}

In [None]:
# Load emotion results
# Reads analysis/emotions.json into `emotions_data`. The code below only relies
# on the payload being a dict with optional 'analyses' and 'summary' keys. When
# the file is missing, an empty placeholder is used so that later cells can
# still run.
emotions_file = ANALYSIS_DIR / "emotions.json"

if emotions_file.exists():
    # JSON is UTF-8 by specification; pinning the encoding stops the platform
    # locale (e.g. cp1252 on Windows) from mis-decoding non-ASCII paths or text.
    with open(emotions_file, encoding="utf-8") as f:
        emotions_data = json.load(f)
    print(f"Loaded {len(emotions_data.get('analyses', []))} emotion analyses")
    print(f"Summary: {emotions_data.get('summary', {})}")
else:
    print(f"WARNING: Emotions file not found: {emotions_file}")
    print("Run emotion_detector.py first!")
    # Same shape as a real payload, just with no analyses.
    emotions_data = {'analyses': []}

In [None]:
# Build the working DataFrame: one row per detection analysis, enriched with the
# emotion analysis of the same scene when emotion results were loaded.
df_detections = pd.DataFrame(detections_data.get('analyses', []))
df_emotions = pd.DataFrame(emotions_data.get('analyses', []))

if df_detections.empty:
    # Without detections there is nothing to work with, whatever the emotion data holds.
    df = pd.DataFrame()
    print("No data loaded!")
elif df_emotions.empty:
    df = df_detections
    print(f"Using detection data only: {len(df)} rows")
else:
    # Left join keeps every detection row; columns present on both sides get
    # the '_det' / '_emo' suffixes.
    df = df_detections.merge(
        df_emotions,
        on='scene_path',
        how='left',
        suffixes=('_det', '_emo'),
    )
    print(f"Combined DataFrame: {len(df)} rows")

# Restrict to scenes flagged as containing a person, when that flag is available.
if df.empty or 'person_present' not in df.columns:
    df_persons = df.copy()
else:
    # Element-wise comparison on purpose: missing values compare unequal to True.
    _person_mask = df['person_present'] == True
    df_persons = df.loc[_person_mask].copy()
    print(f"Scenes with persons: {len(df_persons)}")

## Section 2: Emotion Analysis

Visualize pain/pleasure metrics from emotion detection.

In [None]:
# Valence/Arousal Distribution
# Draws three panels from df_persons (valence histogram, arousal histogram and a
# valence-vs-arousal scatter) and saves the figure under ANALYSIS_DIR.
# Both axis columns must be present before anything is drawn; the colour column
# 'pain_pleasure_score' is optional, so a partial emotions file no longer raises
# KeyError halfway through plotting.
if {'mean_valence', 'mean_arousal'}.issubset(df_persons.columns):
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # Valence histogram, with a reference line at 0
    axes[0].hist(df_persons['mean_valence'].dropna(), bins=30, edgecolor='black', alpha=0.7)
    axes[0].axvline(0, color='red', linestyle='--', label='Neutral')
    axes[0].set_xlabel('Valence (Pain ← → Pleasure)')
    axes[0].set_ylabel('Count')
    axes[0].set_title('Valence Distribution')
    axes[0].legend()

    # Arousal histogram
    axes[1].hist(df_persons['mean_arousal'].dropna(), bins=30, edgecolor='black', alpha=0.7, color='orange')
    axes[1].set_xlabel('Arousal (Low ← → High)')
    axes[1].set_ylabel('Count')
    axes[1].set_title('Arousal Distribution')

    # Valence x Arousal scatter, coloured by pain/pleasure score when available.
    # cmap is passed only together with c, since matplotlib warns about a
    # colormap that has no data to map.
    has_pp_score = 'pain_pleasure_score' in df_persons.columns
    scatter_kwargs = {'alpha': 0.6, 's': 50}
    if has_pp_score:
        scatter_kwargs.update(c=df_persons['pain_pleasure_score'], cmap='RdYlGn')
    scatter = axes[2].scatter(
        df_persons['mean_valence'],
        df_persons['mean_arousal'],
        **scatter_kwargs,
    )
    # Reference lines at arousal 0.5 and valence 0
    axes[2].axhline(0.5, color='gray', linestyle=':', alpha=0.5)
    axes[2].axvline(0, color='gray', linestyle=':', alpha=0.5)
    axes[2].set_xlabel('Valence')
    axes[2].set_ylabel('Arousal')
    axes[2].set_title('Valence × Arousal Space')
    if has_pp_score:
        plt.colorbar(scatter, ax=axes[2], label='Pain/Pleasure Score')

    plt.tight_layout()
    plt.savefig(ANALYSIS_DIR / 'valence_arousal_distribution.png', dpi=150)
    plt.show()
else:
    print("Emotion data not available. Run emotion_detector.py first.")

## Section 3: VLM Captioning

Generate structured captions via vLLM (Qwen2.5-VL). This section requires the vLLM server to be running.

In [None]:
# VLM Configuration
# Address of the vLLM server. This cell only stores and prints these values;
# it does not open a connection.
VLLM_URL = "http://localhost"
VLLM_PORT = 8000

# Structured prompt for human interaction analysis
# The prompt asks for five sections (INDIVIDUALS, INTERACTIONS, ACTIONS, SETTING,
# MOOD/TONE). The trailing .strip() drops the newline that follows the opening
# and precedes the closing triple quotes.
VLM_PROMPT = """
Analyze this video sequence. Provide a structured description:

INDIVIDUALS:
- Count and describe each person visible (age_category, gender_presentation, distinguishing_features)
- Body positions and poses
- Clothing/attire

INTERACTIONS:
- Spatial relationships between individuals
- Physical contact type and location (if any)
- Eye contact and facial expressions (if visible)
- Gesture types

ACTIONS:
- Primary activity occurring
- Secondary/background activities
- Motion direction and intensity
- Temporal progression (what changes frame to frame)

SETTING:
- Environment type (indoor/outdoor, room type)
- Lighting conditions
- Notable objects

MOOD/TONE:
- Overall emotional atmosphere
- Intensity level (calm, active, intense)

Output as structured text with clear section headers.
""".strip()

# Echo the configuration so the cell output records which server is targeted.
print("VLM prompt configured")
print(f"VLM server: {VLLM_URL}:{VLLM_PORT}")

In [None]:
# NOTE: Run this cell only when vLLM server is available
# VLM captioning code will be added when server is ready
#
# Until then this cell only restores previously saved captions from
# analysis/captions.json. Later cells use `captions` as a mapping (they call
# .values(), .keys() and .get() on it); it stays an empty dict when no file exists.

captions = {}
captions_file = ANALYSIS_DIR / "captions.json"

# Load existing captions if available
if captions_file.exists():
    # Captions are free text and may contain non-ASCII characters; JSON is UTF-8
    # by specification, so do not let the platform locale choose the decoder.
    with open(captions_file, encoding="utf-8") as f:
        captions = json.load(f)
    print(f"Loaded {len(captions)} existing captions")
else:
    print("No existing captions found")
    print("Run the VLM captioning cells when vLLM server is available")

## Section 4: Clustering & Vectorization

Create sentence embeddings and perform HDBSCAN clustering.

In [None]:
# Embed the caption texts with a sentence-transformer model.
# The model is loaded unconditionally; encoding only happens when captions exist.
print("Loading sentence transformer model...")
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
print(f"Model loaded: {embed_model.get_sentence_embedding_dimension()} dimensions")

if not captions:
    print("No captions to embed - run VLM captioning first")
    embeddings = None
else:
    # Names and texts are kept in the same (dict) order, so row i of
    # `embeddings` belongs to caption_names[i].
    caption_names = list(captions)
    caption_texts = [captions[name] for name in caption_names]

    print(f"Generating embeddings for {len(caption_texts)} captions...")
    embeddings = embed_model.encode(caption_texts, show_progress_bar=True)
    print(f"Embeddings shape: {embeddings.shape}")

    # Persist the matrix for reuse outside the notebook
    np.save(ANALYSIS_DIR / 'embeddings.npy', embeddings)

In [None]:
# HDBSCAN Clustering
# Clusters the caption embeddings, prints a summary, and saves a 2-D UMAP
# scatter plot under ANALYSIS_DIR. `cluster_labels` stays None when there are
# five or fewer embeddings (or none at all).
cluster_labels = None

if embeddings is not None and len(embeddings) > 5:
    print("Running HDBSCAN clustering...")

    clusterer = hdbscan.HDBSCAN(
        # One tenth of the number of embeddings, but never below 3
        min_cluster_size=max(3, len(embeddings) // 10),
        min_samples=2,
        metric='euclidean',
        cluster_selection_method='eom'
    )

    cluster_labels = clusterer.fit_predict(embeddings)

    # Tally on plain Python ints: counting the NumPy array directly keys the
    # Counter by NumPy scalars, which print as `np.int64(...)` under NumPy >= 2
    # and make the size summary hard to read. Label -1 is treated as noise.
    label_counts = Counter(cluster_labels.tolist())
    n_noise = label_counts.get(-1, 0)
    n_clusters = len(label_counts) - (1 if -1 in label_counts else 0)

    print(f"Found {n_clusters} clusters")
    print(f"Noise points: {n_noise}")
    print(f"Cluster sizes: {label_counts}")

    # UMAP for visualization
    print("\nRunning UMAP for visualization...")
    # n_neighbors is capped so it stays below the number of points
    reducer = umap.UMAP(n_components=2, n_neighbors=min(15, len(embeddings)-1),
                        min_dist=0.1, metric='cosine', random_state=42)
    embeddings_2d = reducer.fit_transform(embeddings)

    # Plot the 2-D projection, coloured by cluster label
    fig, ax = plt.subplots(figsize=(12, 8))
    scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                        c=cluster_labels, cmap='tab20', s=100, alpha=0.7,
                        edgecolors='black', linewidth=0.5)
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title('Scene Clusters (HDBSCAN on Caption Embeddings)')
    plt.colorbar(scatter, ax=ax, label='Cluster ID')
    plt.tight_layout()
    plt.savefig(ANALYSIS_DIR / 'cluster_visualization.png', dpi=150)
    plt.show()
else:
    print("Not enough data for clustering")

## Section 5: Prioritization & Curation

Calculate a weighted priority score for each scene, then select the top-scoring scenes and export the final dataset.

In [None]:
# Prioritization weights
# Pain/Pleasure (35%), Character clarity (30%), Action clarity (25%), Technical quality (10%)
WEIGHTS = {'pain_pleasure': 0.35, 'character_clarity': 0.30,
           'action_clarity': 0.25, 'technical_quality': 0.10}


def _metric_or_default(frame: pd.DataFrame, column: str, default: float) -> pd.Series:
    """Return ``frame[column]`` as floats, with missing entries replaced by ``default``.

    An entry counts as missing when the column does not exist or the value is
    None, NaN or non-numeric. NaN is what a left merge leaves behind for scenes
    without an emotion row, and a plain ``value or default`` does not catch it
    because NaN is truthy. A measured 0.0 is kept as 0.0 rather than being
    replaced by the default.
    """
    if column not in frame.columns:
        return pd.Series(default, index=frame.index, dtype=float)
    return pd.to_numeric(frame[column], errors='coerce').fillna(default).astype(float)


# Calculate priority scores (based on emotion data primarily)

# Pain/Pleasure intensity - high absolute valence * arousal
pp_score = (_metric_or_default(df_persons, 'mean_valence', 0.0).abs()
            * _metric_or_default(df_persons, 'mean_arousal', 0.5))

# Character clarity - detection confidence
char_score = _metric_or_default(df_persons, 'avg_confidence', 0.5)

# Action clarity - proxy: share of the frame covered by the person's bounding
# box, scaled by 3 and capped at 1.0
action_score = (_metric_or_default(df_persons, 'avg_bbox_area_ratio', 0.2) * 3).clip(upper=1.0)

# Technical quality - detection coverage
tech_score = _metric_or_default(df_persons, 'detection_coverage', 0.5)

# Weighted sum, one value per row of df_persons
total = (pp_score * WEIGHTS['pain_pleasure'] +
         char_score * WEIGHTS['character_clarity'] +
         action_score * WEIGHTS['action_clarity'] +
         tech_score * WEIGHTS['technical_quality'])
priority_scores = [round(float(score), 4) for score in total]

df_persons['priority_score'] = priority_scores

print(f"Priority scores calculated")
if priority_scores:
    print(f"Score range: {min(priority_scores):.4f} - {max(priority_scores):.4f}")
    print(f"Mean score: {np.mean(priority_scores):.4f}")
else:
    # min()/max() raise ValueError on an empty list; report the situation instead
    print("No scenes available - nothing to score")

In [None]:
# Final dataset curation and export
# Keeps the TARGET_SIZE highest-scoring scenes and writes them to
# curated/dataset.json as a list of {'caption', 'media_path'} records.
TARGET_SIZE = 150  # Adjust based on available data
CORE_RATIO = 0.70  # 70% core, 30% fringe
# NOTE(review): CORE_RATIO is not used anywhere in this cell - confirm whether
# the core/fringe split is still planned.

# Select top priority scenes
df_sorted = df_persons.sort_values('priority_score', ascending=False)
target_size = min(TARGET_SIZE, len(df_sorted))
df_curated = df_sorted.head(target_size).copy()

print(f"Curated dataset: {len(df_curated)} scenes")

# Export to LTX-2 format
dataset = []
skipped = 0
for idx, row in df_curated.iterrows():
    scene_path = row.get('scene_path', '')
    # A row without a usable path cannot be trained on: an empty path would be
    # exported as the bare directory "scenes/", and a NaN path would make
    # Path() raise. Skip such rows and report them below.
    if not isinstance(scene_path, str) or not scene_path:
        skipped += 1
        continue

    scene_file = Path(scene_path)
    scene_name = scene_file.stem
    # Captions are looked up by file stem; fall back to a placeholder caption
    caption = captions.get(scene_name, f"Scene {scene_name}")

    dataset.append({
        'caption': caption,
        'media_path': f"scenes/{scene_file.name}"
    })

if skipped:
    print(f"Skipped {skipped} scenes without a valid scene_path")

# Save dataset
dataset_path = CURATED_DIR / 'dataset.json'
with open(dataset_path, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, indent=2)

print(f"Exported {len(dataset)} entries to {dataset_path}")

## Summary

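Pipeline complete! Output files:
- `analysis/detections.json` - Person detection results
- `analysis/emotions.json` - Emotion analysis results
- `analysis/captions.json` - VLM captions (when generated)
- `analysis/embeddings.npy` - Sentence embeddings
- `analysis/valence_arousal_distribution.png` - Valence/arousal plots (when emotion data is available)
- `analysis/cluster_visualization.png` - UMAP cluster plot (when enough captions exist for clustering)
- `curated/dataset.json` - LTX-2 compatible dataset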