In [1]:
import json
import os
from pathlib import Path
import math
import matplotlib.pyplot as plt
import tiktoken

# Define the range of folders (00 to 10) and exclude 08
folders = [f"{i:02d}" for i in range(22)]

# Base directory for input and output
base_dir = Path.cwd()
condensed_dir = base_dir / "condensed"

# Ensure condensed directory exists
condensed_dir.mkdir(exist_ok=True)

# Data storage for visualizations
file_sizes_before = []
file_sizes_after = []
token_counts_before = []
token_counts_after = []
fps_before = []
fps_after = []
sequence_labels = []

# Original FPS
ORIGINAL_FPS = 10.0

# Token counter using tiktoken
def count_tokens(text, encoding_name="cl100k_base"):
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(text))

for folder in folders:
    # Input and output paths for frame_summaries.json
    input_frame_path = base_dir / folder / "frame_summaries.json"
    output_folder = condensed_dir / folder
    output_frame_path = output_folder / "frame_summaries.json"
    
    # Input and output paths for sequence_summary.json
    input_sequence_path = base_dir / folder / "sequence_summary.json"
    output_sequence_path = output_folder / "sequence_summary.json"
    
    # Check if frame_summaries.json exists
    if not input_frame_path.exists():
        print(f"Warning: {input_frame_path} does not exist, skipping.")
        continue
    
    # Create output folder
    output_folder.mkdir(exist_ok=True)
    
    # Calculate file size before for frame_summaries.json
    frame_size_before = os.path.getsize(input_frame_path) / 1024  # Size in KB
    sequence_size_before = 0
    if input_sequence_path.exists():
        sequence_size_before = os.path.getsize(input_sequence_path) / 1024
    
    # Read frame_summaries.json
    with open(input_frame_path, 'r') as f:
        frame_data = json.load(f)
    
    # Count tokens before for frame_summaries.json
    frame_tokens_before = count_tokens(json.dumps(frame_data))
    sequence_tokens_before = 0
    if input_sequence_path.exists():
        with open(input_sequence_path, 'r') as f:
            sequence_data = json.load(f)
        sequence_tokens_before = count_tokens(json.dumps(sequence_data))

    print(f"Processing {folder}")
    
    # Determine condensation rule based on number of frames
    num_frames = len(frame_data)
    if num_frames <= 800:
        step = 1
        condensed_frame_data = frame_data
        print(f"{folder}: {num_frames} frames, keeping all (no condensation).")
    elif 801 <= num_frames <= 1600:
        # Use step = 2 if it results in >= 600 frames, else step = 1
        if math.ceil(num_frames / 2) >= 600:
            step = 2
            condensed_frame_data = frame_data[::2]
            print(f"{folder}: {num_frames} frames, keeping every 2nd frame.")
        else:
            step = 1
            condensed_frame_data = frame_data
            print(f"{folder}: {num_frames} frames, keeping all (step=2 yields < 600 frames).")
    else:  # num_frames > 1600
        step = math.ceil(num_frames / 800)
        condensed_frame_data = frame_data[::step]
        print(f"{folder}: {num_frames} frames, keeping every {step}th frame.")
    
    # Write condensed frame_summaries.json
    with open(output_frame_path, 'w') as f:
        json.dump(condensed_frame_data, f, indent=4)
    
    # Calculate file size and tokens after for frame_summaries.json
    frame_size_after = os.path.getsize(output_frame_path) / 1024
    frame_tokens_after = count_tokens(json.dumps(condensed_frame_data))
    
    print(f"frame_summaries.json {num_frames} to {len(condensed_frame_data)} frames.")
    
    sequence_size_after = 0
    sequence_tokens_after = 0
    condensed_time_series = []
    
    # Process sequence_summary.json
    if input_sequence_path.exists():
        # Read sequence_summary.json
        with open(input_sequence_path, 'r') as f:
            sequence_data = json.load(f)
        
        # Apply the same condensation rule to time_series
        if 'time_series' in sequence_data:
            time_series = sequence_data['time_series']
            condensed_time_series = time_series[::step]
            sequence_data['time_series'] = condensed_time_series
            
            # Write updated sequence_summary.json
            with open(output_sequence_path, 'w') as f:
                json.dump(sequence_data, f, indent=4)
            
            # Calculate file size and tokens after for sequence_summary.json
            sequence_size_after = os.path.getsize(output_sequence_path) / 1024
            sequence_tokens_after = count_tokens(json.dumps(sequence_data))
            
            print(f"sequence_summary.json time_series -> {len(time_series)} to {len(condensed_time_series)} entries")
        else:
            print(f"Warning: 'time_series' not found in {input_sequence_path}, skipping condensation for sequence_summary.json.")
    else:
        print(f"Warning: {input_sequence_path} does not exist, skipping.")

    print(f"File size before: {frame_size_before + sequence_size_before:.2f} KB, after: {frame_size_after + sequence_size_after:.2f} KB")
    print(f"Token count before: {frame_tokens_before + sequence_tokens_before}, after: {frame_tokens_after + sequence_tokens_after}")
    print(f"FPS before: {ORIGINAL_FPS}, after: {ORIGINAL_FPS / step}")
    print(f"FPS reduction: {ORIGINAL_FPS / step:.2f} FPS")
    print(f"Token reduction: {((frame_tokens_before + sequence_tokens_before) - (frame_tokens_after + sequence_tokens_after)) / (frame_tokens_before + sequence_tokens_before) * 100:.2f}%")
    print("-" * 80)
    
    # Store data for visualizations
    sequence_labels.append(folder)
    file_sizes_before.append(frame_size_before + sequence_size_before)
    file_sizes_after.append(frame_size_after + sequence_size_after)
    token_counts_before.append(frame_tokens_before + sequence_tokens_before)
    token_counts_after.append(frame_tokens_after + sequence_tokens_after)
    fps_before.append(ORIGINAL_FPS)
    fps_after.append(ORIGINAL_FPS / step)

# Visualization 1: File Size Reduction
plt.figure(figsize=(10, 6))
x = range(len(sequence_labels))
plt.bar([i - 0.2 for i in x], file_sizes_before, width=0.4, label='Before', color='skyblue')
plt.bar([i + 0.2 for i in x], file_sizes_after, width=0.4, label='After', color='salmon')
plt.xlabel('Sequence')
plt.ylabel('Total File Size (KB)')
plt.title('File Size Reduction Before and After Condensation')
plt.xticks(x, sequence_labels)
plt.legend()
plt.tight_layout()
plt.savefig(base_dir / 'file_size_reduction.png')
plt.close()

# Visualization 2: Percentage Token Reduction
token_reduction_percent = [((before - after) / before * 100) if before > 0 else 0 
                           for before, after in zip(token_counts_before, token_counts_after)]
plt.figure(figsize=(10, 6))
plt.bar(sequence_labels, token_reduction_percent, color='lightgreen')
plt.xlabel('Sequence')
plt.ylabel('Token Reduction (%)')
plt.title('Percentage Token Reduction After Condensation')
plt.tight_layout()
plt.savefig(base_dir / 'token_reduction.png')
plt.close()

# Visualization 3: FPS Reduction
plt.figure(figsize=(10, 6))
plt.bar([i - 0.2 for i in x], fps_before, width=0.4, label='Before', color='lightblue')
plt.bar([i + 0.2 for i in x], fps_after, width=0.4, label='After', color='coral')
plt.xlabel('Sequence')
plt.ylabel('Frames Per Second (FPS)')
plt.title('FPS Before and After Condensation')
plt.xticks(x, sequence_labels)
plt.legend()
plt.tight_layout()
plt.savefig(base_dir / 'fps_reduction.png')
plt.close()

Processing 00
00: 4541 frames, keeping every 6th frame.
frame_summaries.json 4541 to 757 frames.
sequence_summary.json time_series -> 4541 to 757 entries
File size before: 14823.30 KB, after: 2464.62 KB
Token count before: 3308003, after: 550069
FPS before: 10.0, after: 1.6666666666666667
FPS reduction: 1.67 FPS
Token reduction: 83.37%
--------------------------------------------------------------------------------
Processing 01
01: 1101 frames, keeping all (step=2 yields < 600 frames).
frame_summaries.json 1101 to 1101 frames.
sequence_summary.json time_series -> 1101 to 1101 entries
File size before: 1654.68 KB, after: 1654.68 KB
Token count before: 402920, after: 402920
FPS before: 10.0, after: 10.0
FPS reduction: 10.00 FPS
Token reduction: 0.00%
--------------------------------------------------------------------------------
Processing 02
02: 4661 frames, keeping every 6th frame.
frame_summaries.json 4661 to 777 frames.
sequence_summary.json time_series -> 4661 to 777 entries
File 