# Seamless Interaction Dataset Analysis
**Author:** bloggerwang1217  
**Date:** 2025-08-12  
**Goal:** Descriptive statistics for Sample Set + Session Groups


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import cv2
import librosa
from seamless_interaction.fs import SeamlessInteractionFS, DatasetConfig
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style.
# The 'seaborn-v0_8' style name only exists in newer matplotlib releases; older
# releases expose the same style sheet as 'seaborn'. Pick whichever is available
# so the setup cell does not fail on an older environment.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette("husl")

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [None]:
# Build the dataset configuration and the filesystem helper that later cells use.
config = DatasetConfig(
    label='improvised',
    split='dev',
    preferred_vendors_only=True,
    local_dir=Path('data/seamless_interaction'),
)
fs = SeamlessInteractionFS(config=config)

# Echo the effective settings so the reader can confirm what will be downloaded.
print(
    f"📁 Dataset location: {config.local_dir}",
    f"🏷️  Dataset label: {config.label}",
    f"📊 Dataset split: {config.split}",
    sep="\n",
)

2025-08-12 20:43:07,290 - INFO - Loaded filelist with 129572 entries


📁 Dataset location: /Users/bloggerwang/datasets/seamless_interaction
🏷️  Dataset label: improvised
📊 Dataset split: dev


## Step 1: Download Sample Set (~1GB)
Run this cell to download the Sample Set (archives 0–2 of batch 0).

In [4]:
# Download Sample Set: archives 0, 1 and 2 of batch 0 (run this when ready).
print('📂 Starting Sample Set download...')
try:
    fs.download_batch_from_hf(batch_idx=0, archive_list=list(range(3)))
except Exception as err:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print(f'❌ Download failed: {err}')
else:
    print('✅ Sample Set download complete!')

2025-08-12 20:47:39,984 - INFO - Downloading 3 archives for batch 0


📂 Starting Sample Set download...


2025-08-12 20:47:41,154 - INFO - Downloading improvised/dev/0000/0002.tar to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0002.tar
2025-08-12 20:47:41,154 - INFO - Downloading improvised/dev/0000/0000.tar to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0000.tar
2025-08-12 20:47:41,154 - INFO - Downloading improvised/dev/0000/0001.tar to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0001.tar
2025-08-12 20:48:30,817 - INFO - Extracting /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0001.tar to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0001
2025-08-12 20:48:32,109 - INFO - Extracting /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0000.tar to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0000
2025-08-12 20:48:33,648 - INFO - Extracting /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0002.tar to /Users/blogg

✅ Sample Set download complete!


## Step 2: Download Session Groups (~400MB)
Run this after Step 1 completes. It downloads the data for 10 randomly sampled file IDs.

In [5]:
# Download Session Groups (run this after Sample Set is done).
# Samples a handful of random file IDs and fetches the data for each one in turn.
# NOTE(review): the sample is random, so each run fetches different IDs; if
# sample_random_file_ids accepts a seed, pass one for reproducibility — confirm.
print('🎯 Starting Session Groups download...')
try:
    # Materialise the IDs so the progress counter reflects how many were actually
    # returned rather than a hard-coded total.
    file_ids = list(fs.sample_random_file_ids(num_samples=10))  # Start smaller
    total = len(file_ids)
    for i, file_id in enumerate(file_ids):
        print(f'Downloading {i+1}/{total}: {file_id}')
        fs.gather_file_id_data_from_s3(file_id)
    print('✅ Session Groups download complete!')
except Exception as e:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print(f'❌ Download failed: {e}')

2025-08-12 20:55:16,770 - INFO - Found 33 files for V00_S0925_I00000520_P0816


🎯 Starting Session Groups download...
Downloading 1/10: V00_S0925_I00000520_P0816


2025-08-12 20:55:23,901 - INFO - Skipping optional file annotations/1P-IS/V00_S0925_I00000520_P0816.json
2025-08-12 20:55:23,953 - INFO - Skipping optional file annotations/3P-IS/V00_S0925_I00000520_P0816.json
2025-08-12 20:55:24,489 - INFO - Skipping optional file annotations/1P-R/V00_S0925_I00000520_P0816.json
2025-08-12 20:55:24,535 - INFO - Skipping optional file annotations/3P-R/V00_S0925_I00000520_P0816.json
2025-08-12 20:55:24,571 - INFO - Skipping optional file annotations/3P-V/V00_S0925_I00000520_P0816.json
2025-08-12 20:55:25,404 - INFO - Saved 24 numpy arrays to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0022/V00_S0925_I00000520_P0816.npz
2025-08-12 20:55:25,405 - INFO - Successfully processed file V00_S0925_I00000520_P0816 to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0022
2025-08-12 20:55:25,414 - INFO - Found 33 files for V00_S0925_I00000481_P0383


Downloading 2/10: V00_S0925_I00000481_P0383


2025-08-12 20:55:31,452 - INFO - Skipping optional file annotations/1P-IS/V00_S0925_I00000481_P0383.json
2025-08-12 20:55:32,079 - INFO - Skipping optional file annotations/1P-R/V00_S0925_I00000481_P0383.json
2025-08-12 20:55:32,521 - INFO - Skipping optional file annotations/3P-IS/V00_S0925_I00000481_P0383.json
2025-08-12 20:55:32,617 - INFO - Skipping optional file annotations/3P-V/V00_S0925_I00000481_P0383.json
2025-08-12 20:55:33,059 - INFO - Skipping optional file annotations/3P-R/V00_S0925_I00000481_P0383.json
2025-08-12 20:55:33,494 - INFO - Saved 24 numpy arrays to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0015/V00_S0925_I00000481_P0383.npz
2025-08-12 20:55:33,495 - INFO - Successfully processed file V00_S0925_I00000481_P0383 to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0015
2025-08-12 20:55:33,506 - INFO - Found 33 files for V00_S0925_I00000484_P0816


Downloading 3/10: V00_S0925_I00000484_P0816


2025-08-12 20:55:39,867 - INFO - Skipping optional file annotations/1P-IS/V00_S0925_I00000484_P0816.json
2025-08-12 20:55:40,077 - INFO - Skipping optional file annotations/3P-IS/V00_S0925_I00000484_P0816.json
2025-08-12 20:55:40,084 - INFO - Skipping optional file annotations/3P-V/V00_S0925_I00000484_P0816.json
2025-08-12 20:55:40,528 - INFO - Skipping optional file annotations/1P-R/V00_S0925_I00000484_P0816.json
2025-08-12 20:55:40,644 - INFO - Skipping optional file annotations/3P-R/V00_S0925_I00000484_P0816.json
2025-08-12 20:55:42,388 - INFO - Saved 24 numpy arrays to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0048/V00_S0925_I00000484_P0816.npz
2025-08-12 20:55:42,389 - INFO - Successfully processed file V00_S0925_I00000484_P0816 to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0048
2025-08-12 20:55:42,399 - INFO - Found 33 files for V00_S0644_I00000138_P0299A


Downloading 4/10: V00_S0644_I00000138_P0299A


2025-08-12 20:55:48,123 - INFO - Skipping optional file annotations/1P-IS/V00_S0644_I00000138_P0299A.json
2025-08-12 20:55:48,231 - INFO - Skipping optional file annotations/3P-IS/V00_S0644_I00000138_P0299A.json
2025-08-12 20:55:48,722 - INFO - Skipping optional file annotations/1P-R/V00_S0644_I00000138_P0299A.json
2025-08-12 20:55:48,777 - INFO - Skipping optional file annotations/3P-R/V00_S0644_I00000138_P0299A.json
2025-08-12 20:55:48,888 - INFO - Skipping optional file annotations/3P-V/V00_S0644_I00000138_P0299A.json
2025-08-12 20:55:49,024 - INFO - Saved 24 numpy arrays to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0035/V00_S0644_I00000138_P0299A.npz
2025-08-12 20:55:49,024 - INFO - Successfully processed file V00_S0644_I00000138_P0299A to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0035
2025-08-12 20:55:49,032 - INFO - Found 33 files for V00_S2091_I00001095_P1282A


Downloading 5/10: V00_S2091_I00001095_P1282A


2025-08-12 20:55:54,752 - INFO - Skipping optional file annotations/1P-IS/V00_S2091_I00001095_P1282A.json
2025-08-12 20:55:54,967 - INFO - Skipping optional file annotations/3P-IS/V00_S2091_I00001095_P1282A.json
2025-08-12 20:55:55,063 - INFO - Skipping optional file annotations/3P-V/V00_S2091_I00001095_P1282A.json
2025-08-12 20:55:55,317 - INFO - Skipping optional file annotations/1P-R/V00_S2091_I00001095_P1282A.json
2025-08-12 20:55:55,521 - INFO - Skipping optional file annotations/3P-R/V00_S2091_I00001095_P1282A.json
2025-08-12 20:55:56,963 - INFO - Saved 24 numpy arrays to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0040/V00_S2091_I00001095_P1282A.npz
2025-08-12 20:55:56,964 - INFO - Successfully processed file V00_S2091_I00001095_P1282A to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0040
2025-08-12 20:55:56,973 - INFO - Found 33 files for V00_S0925_I00000479_P0383


Downloading 6/10: V00_S0925_I00000479_P0383


2025-08-12 20:56:03,025 - INFO - Skipping optional file annotations/1P-IS/V00_S0925_I00000479_P0383.json
2025-08-12 20:56:03,174 - INFO - Skipping optional file annotations/3P-IS/V00_S0925_I00000479_P0383.json
2025-08-12 20:56:03,651 - INFO - Skipping optional file annotations/1P-R/V00_S0925_I00000479_P0383.json
2025-08-12 20:56:03,683 - INFO - Skipping optional file annotations/3P-V/V00_S0925_I00000479_P0383.json
2025-08-12 20:56:03,734 - INFO - Skipping optional file annotations/3P-R/V00_S0925_I00000479_P0383.json
2025-08-12 20:56:15,807 - INFO - Saved 24 numpy arrays to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0037/V00_S0925_I00000479_P0383.npz
2025-08-12 20:56:15,808 - INFO - Successfully processed file V00_S0925_I00000479_P0383 to /Users/bloggerwang/datasets/seamless_interaction/improvised/dev/0000/0037
2025-08-12 20:56:15,817 - INFO - Found 33 files for V01_S0172_I00001294_P1316


Downloading 7/10: V01_S0172_I00001294_P1316


2025-08-12 20:56:18,676 - INFO - Skipping optional file movement/emotion_valence/V01_S0172_I00001294_P1316.npy
2025-08-12 20:56:18,924 - INFO - Skipping optional file movement/EmotionArousalToken/V01_S0172_I00001294_P1316.npy
2025-08-12 20:56:19,192 - INFO - Skipping optional file movement/expression/V01_S0172_I00001294_P1316.npy
2025-08-12 20:56:19,264 - INFO - Skipping optional file movement/EmotionValenceToken/V01_S0172_I00001294_P1316.npy
2025-08-12 20:56:19,366 - INFO - Skipping optional file movement/gaze_encodings/V01_S0172_I00001294_P1316.npy
2025-08-12 20:56:19,424 - INFO - Skipping optional file movement/frame_latent/V01_S0172_I00001294_P1316.npy
2025-08-12 20:56:19,508 - INFO - Skipping optional file movement/head_encodings/V01_S0172_I00001294_P1316.npy
2025-08-12 20:56:19,660 - INFO - Skipping optional file movement/FAUToken/V01_S0172_I00001294_P1316.npy
2025-08-12 20:56:19,859 - INFO - Skipping optional file movement/alignment_head_rotation/V01_S0172_I00001294_P1316.npy
20

Downloading 8/10: V01_S0308_I00001228_P1618


2025-08-12 20:56:25,201 - INFO - Skipping optional file movement/emotion_valence/V01_S0308_I00001228_P1618.npy
2025-08-12 20:56:25,838 - INFO - Skipping optional file movement/expression/V01_S0308_I00001228_P1618.npy
2025-08-12 20:56:25,878 - INFO - Skipping optional file movement/frame_latent/V01_S0308_I00001228_P1618.npy
2025-08-12 20:56:25,880 - INFO - Skipping optional file movement/EmotionValenceToken/V01_S0308_I00001228_P1618.npy
2025-08-12 20:56:26,013 - INFO - Skipping optional file movement/gaze_encodings/V01_S0308_I00001228_P1618.npy
2025-08-12 20:56:26,313 - INFO - Skipping optional file movement/EmotionArousalToken/V01_S0308_I00001228_P1618.npy
2025-08-12 20:56:26,336 - INFO - Skipping optional file movement/FAUToken/V01_S0308_I00001228_P1618.npy
2025-08-12 20:56:26,416 - INFO - Skipping optional file movement/FAUValue/V01_S0308_I00001228_P1618.npy
2025-08-12 20:56:26,466 - INFO - Skipping optional file movement/head_encodings/V01_S0308_I00001228_P1618.npy
2025-08-12 20:56:

Downloading 9/10: V01_S0346_I00000724_P1694


2025-08-12 20:56:32,218 - INFO - Skipping optional file movement/emotion_valence/V01_S0346_I00000724_P1694.npy
2025-08-12 20:56:32,393 - INFO - Skipping optional file movement/EmotionArousalToken/V01_S0346_I00000724_P1694.npy
2025-08-12 20:56:32,733 - INFO - Skipping optional file movement/expression/V01_S0346_I00000724_P1694.npy
2025-08-12 20:56:32,794 - INFO - Skipping optional file movement/gaze_encodings/V01_S0346_I00000724_P1694.npy
2025-08-12 20:56:32,812 - INFO - Skipping optional file movement/frame_latent/V01_S0346_I00000724_P1694.npy
2025-08-12 20:56:32,830 - INFO - Skipping optional file movement/EmotionValenceToken/V01_S0346_I00000724_P1694.npy
2025-08-12 20:56:32,895 - INFO - Skipping optional file movement/head_encodings/V01_S0346_I00000724_P1694.npy
2025-08-12 20:56:33,316 - INFO - Skipping optional file movement/FAUToken/V01_S0346_I00000724_P1694.npy
2025-08-12 20:56:33,336 - INFO - Skipping optional file movement/alignment_head_rotation/V01_S0346_I00000724_P1694.npy
20

Downloading 10/10: V01_S0304_I00001162_P1627


2025-08-12 20:56:37,636 - INFO - Skipping optional file movement/emotion_valence/V01_S0304_I00001162_P1627.npy
2025-08-12 20:56:38,040 - INFO - Skipping optional file movement/expression/V01_S0304_I00001162_P1627.npy
2025-08-12 20:56:38,277 - INFO - Skipping optional file movement/EmotionValenceToken/V01_S0304_I00001162_P1627.npy
2025-08-12 20:56:38,585 - INFO - Skipping optional file movement/gaze_encodings/V01_S0304_I00001162_P1627.npy
2025-08-12 20:56:38,590 - INFO - Skipping optional file movement/frame_latent/V01_S0304_I00001162_P1627.npy
2025-08-12 20:56:38,620 - INFO - Skipping optional file movement/FAUToken/V01_S0304_I00001162_P1627.npy
2025-08-12 20:56:38,892 - INFO - Skipping optional file movement/head_encodings/V01_S0304_I00001162_P1627.npy
2025-08-12 20:56:39,117 - INFO - Skipping optional file movement/EmotionArousalToken/V01_S0304_I00001162_P1627.npy
2025-08-12 20:56:39,173 - INFO - Skipping optional file movement/FAUValue/V01_S0304_I00001162_P1627.npy
2025-08-12 20:56:

✅ Session Groups download complete!


## Step 3: Basic File Structure Analysis
Let's see what we downloaded: count the files under the local dataset directory and break them down by file extension.

In [None]:
# Analyze file structure: walk the local dataset directory once, keep only the
# regular files, and tally them by lower-cased extension.
local_files = list(config.local_dir.rglob("*"))
data_files = [f for f in local_files if f.is_file()]
print(f"Total files found: {len(data_files)}")

# Count by file type (files without a suffix are grouped under the empty string)
file_types = {}
for file_path in data_files:
    ext = file_path.suffix.lower()
    file_types[ext] = file_types.get(ext, 0) + 1

# Display as DataFrame. The columns are given explicitly so the frame still has
# 'File_Type' and 'Count' when nothing was downloaded; otherwise the plotting
# cell would fail with a KeyError on an empty, column-less frame.
df_files = pd.DataFrame(
    [{'File_Type': ext, 'Count': count} for ext, count in file_types.items()],
    columns=['File_Type', 'Count'],
)

print("\n📁 File Type Distribution:")
display(df_files)

In [None]:
# Visualize file distribution: share of each file type as a pie (left) and the
# raw counts per type as a bar chart (right).
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(10, 6))

ax_pie.pie(df_files['Count'], labels=df_files['File_Type'], autopct='%1.1f%%')
ax_pie.set_title('File Type Distribution')

ax_bar.bar(df_files['File_Type'], df_files['Count'])
ax_bar.set_title('File Counts by Type')
# Tilt the extension labels so they do not overlap.
for tick_label in ax_bar.get_xticklabels():
    tick_label.set_rotation(45)

fig.tight_layout()
plt.show()