# Seamless Interaction Dataset Analysis
**Author:** bloggerwang1217  
**Date:** 2025-08-12  
**Goal:** Descriptive statistics for Sample Set + Session Groups


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import cv2
import librosa
from seamless_interaction.fs import SeamlessInteractionFS, DatasetConfig
import warnings
# Ignore all warnings from this point on so cell output stays readable.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by
# the libraries imported above are hidden as well.
warnings.filterwarnings('ignore')

# Set up plotting style: the matplotlib style sheet is applied first and the
# seaborn palette second.
# NOTE(review): presumably the order matters because both calls affect the
# colour cycle -- confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Only reached when every import above succeeded.
print("✅ Libraries imported successfully!")

In [None]:
# Initialize dataset configuration
# The dataset root lives under the current user's home directory.
dataset_root = Path.home() / 'datasets/seamless_interaction'

# Select the label/split to work with and point the config at dataset_root.
config = DatasetConfig(
    label='improvised',
    split='dev',
    preferred_vendors_only=True,
    local_dir=dataset_root,
)

# File-system helper used by the download cells of this notebook.
fs = SeamlessInteractionFS(config=config)

# Echo the effective settings so the run is self-describing.
print(f"📁 Dataset location: {config.local_dir}")
print(f"🏷️  Dataset label: {config.label}")
print(f"📊 Dataset split: {config.split}")

## Step 1: Download Sample Set (~1GB)
Run the cell below to download the first dataset (the Sample Set).

In [None]:
# Download Sample Set (you can run this when ready)
# Which batch and which archives inside it to request.
sample_batch_idx = 0
sample_archives = [0, 1, 2]

print('📂 Starting Sample Set download...')
try:
    fs.download_batch_from_hf(
        batch_idx=sample_batch_idx,
        archive_list=sample_archives,
    )
    print('✅ Sample Set download complete!')
except Exception as err:
    # Report the failure instead of raising so the notebook keeps running.
    print(f'❌ Download failed: {err}')

## Step 2: Download Session Groups (~400MB)
Run the cell below only after Step 1 has completed.

In [None]:
# Download Session Groups (run this after Sample Set is done)
# Number of random file IDs to request. Keeping it in one variable means the
# progress counter below can never drift from the requested sample size.
num_samples = 10  # Start smaller

print('🎯 Starting Session Groups download...')
try:
    # Materialise the IDs so the progress total reflects how many were
    # actually returned, which may differ from num_samples.
    file_ids = list(fs.sample_random_file_ids(num_samples=num_samples))
    total = len(file_ids)
    for position, file_id in enumerate(file_ids, start=1):
        print(f'Downloading {position}/{total}: {file_id}')
        fs.gather_file_id_data_from_s3(file_id)
    print('✅ Session Groups download complete!')
except Exception as e:
    # Report the failure instead of raising so the notebook keeps running.
    # The first failing file aborts the remaining downloads, as before.
    print(f'❌ Download failed: {e}')

## Step 3: Basic File Structure Analysis
Let's inspect what was downloaded: the total number of files and a breakdown by file extension.

In [None]:
# Analyze file structure
from collections import Counter  # local import: only this cell needs it

# Collect every entry below the dataset directory, then keep regular files
# only. is_file() is evaluated once per path and reused for both the total
# and the per-extension counts.
local_files = list(config.local_dir.rglob("*"))
data_files = [path for path in local_files if path.is_file()]
print(f"Total files found: {len(data_files)}")

# Count by file type, keyed by the lower-cased suffix. Files without an
# extension are counted under the empty string ''.
file_types = dict(Counter(path.suffix.lower() for path in data_files))

# Display as DataFrame. The columns are named explicitly so the frame has
# 'File_Type' and 'Count' even when no files were found; without them an
# empty frame has no columns and later df_files['Count'] lookups raise
# KeyError.
df_files = pd.DataFrame(
    list(file_types.items()),
    columns=['File_Type', 'Count'],
)

print("\n📁 File Type Distribution:")
display(df_files)

In [None]:
# Visualize file distribution
# One figure with two side-by-side panels: a pie chart of the share per file
# type on the left, a bar chart of the raw counts on the right.
fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(10, 6))

extensions = df_files['File_Type']
counts = df_files['Count']

# Left panel: relative share of each file type.
pie_ax.pie(counts, labels=extensions, autopct='%1.1f%%')
pie_ax.set_title('File Type Distribution')

# Right panel: absolute counts, with tilted tick labels.
bar_ax.bar(extensions, counts)
bar_ax.set_title('File Counts by Type')
plt.setp(bar_ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()