### Import Libraries

In [3]:
import importlib
import os
import json
import sys
import pandas as pd

# Make the directory two levels above the current working directory importable
# so that the `models` package resolves (presumably the project root — confirm).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))

from models import gemini

# Reload `models.gemini` so edits to it are picked up without restarting the
# kernel; the class is imported after the reload so the name binds to the
# freshly loaded definition.
importlib.reload(gemini)
from models.gemini import GeminiAsyncRequester

In [4]:
# Number of videos handed to the requester per batch
BATCH_SIZE = 20
# Passed as `concurrency_limit` to the requester's run_async calls
CONCURRENCY_LIMIT = 5
# Results file passed to run_async; read back later as JSON to check progress
GEMINI_OUTPUT_JSON_PATH = "../../data/results/gemini_transcripts.json"
# Second output file passed to run_async (presumably per-request metadata — confirm)
GEMINI_METADATA_OUTPUT_JSON_PATH = "../../data/results/gemini_metadata.json"

### Get video paths

In [5]:
# Get video paths from dataset metadata.
# Fail fast with a clear message when the environment variable is missing:
# os.getenv would return None and pd.read_csv(None) raises an opaque error.
metadata_csv_path = os.getenv("VIDEO_METADATA_PATH")
if not metadata_csv_path:
    raise RuntimeError("VIDEO_METADATA_PATH environment variable is not set")

video_metadata = pd.read_csv(metadata_csv_path)

# Keep every video that is not flagged as a size outlier
video_paths = video_metadata.loc[~video_metadata['is_size_outlier'], 'video_path'].tolist()
print(f"Total videos to process: {len(video_paths)}")

Total videos to process: 3556


Gemini's API supports two approaches for video processing:

1. **Direct inline content** (< 20MB): Smaller videos can be embedded directly in the `generateContent` request without using the File API.
2. **File API upload** (≥ 20MB): Larger videos must first be uploaded via the File API before processing.

Since the dataset contains videos exceeding 20MB, the videos are split by file size into two groups (inline content vs. File API upload), and each group is processed separately.

In [6]:
# Split videos by size threshold (20MB): videos below it are sent inline,
# videos at or above it go through the File API upload path.
INLINE_SIZE_LIMIT_MB = 20

# Outlier filter shared by both groups
not_outlier = ~video_metadata['is_size_outlier']
size_mb = video_metadata['size_mb']

# Both comparisons are written out explicitly (rather than negating one) so
# rows with a missing size still end up in neither list.
small_videos = video_metadata.loc[
    not_outlier & (size_mb < INLINE_SIZE_LIMIT_MB), 'video_path'
].tolist()

large_videos = video_metadata.loc[
    not_outlier & (size_mb >= INLINE_SIZE_LIMIT_MB), 'video_path'
].tolist()

print(f"Small videos (< {INLINE_SIZE_LIMIT_MB}MB): {len(small_videos)}")
print(f"Large videos (≥ {INLINE_SIZE_LIMIT_MB}MB): {len(large_videos)}")
print(f"Total videos to process: {len(small_videos) + len(large_videos)}")

Small videos (< 20MB): 2985
Large videos (≥ 20MB): 571
Total videos to process: 3556


### Run Gemini

In [7]:
# Read the Gemini video-understanding prompt text from disk
prompt_path = "../../data/prompts/video_understanding_gemini.txt"
with open(prompt_path, encoding="utf-8") as prompt_file:
    prompt = prompt_file.read()

In [8]:
# Initialize the requester instance shared by the processing cells.
# NOTE(review): os.getenv returns None when GEMINI_API_KEY is unset; how
# GeminiAsyncRequester reacts to a missing key is not visible here — confirm.
gemini_model = GeminiAsyncRequester(
    api_key=os.getenv("GEMINI_API_KEY"),
    model="models/gemini-2.5-pro",
    prompt=prompt,
)

In [9]:
async def call_gemini_small_videos(video_paths_list, batch_size, concurrency_limit, use_url=False):
    """
    Send videos to Gemini in fixed-size batches, one batch at a time.

    Each batch is passed to ``gemini_model.run_async`` together with
    ``GEMINI_OUTPUT_JSON_PATH`` and ``GEMINI_METADATA_OUTPUT_JSON_PATH``,
    using ``overwrite=False`` and ``append=True``.

    Args:
        video_paths_list: List of video file paths to process
        batch_size: Number of videos to process per batch (must be >= 1)
        concurrency_limit: Forwarded to ``run_async`` as ``concurrency_limit``
        use_url: If True, use File API for large videos; if False, use inline content

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # Without this guard batch_size=0 dies with ZeroDivisionError and a negative
    # value silently processes nothing while still reporting success.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_videos = len(video_paths_list)
    total_batches = (total_videos + batch_size - 1) // batch_size  # ceiling division

    batch_starts = range(0, total_videos, batch_size)
    for batch_num, start in enumerate(batch_starts, start=1):
        batch = video_paths_list[start : start + batch_size]

        print(f"\nProcessing batch {batch_num}/{total_batches} ({len(batch)} videos)...")
        print(f"Video paths in this batch: {batch}")

        # Batches are awaited sequentially; concurrency_limit is handed to
        # run_async for whatever parallelism it applies within a batch.
        await gemini_model.run_async(
            batch,
            GEMINI_OUTPUT_JSON_PATH,
            GEMINI_METADATA_OUTPUT_JSON_PATH,
            concurrency_limit=concurrency_limit,
            overwrite=False,
            append=True,
            use_url=use_url,
        )

        print(f"Batch {batch_num} completed and results appended.")

    print(f"\nAll batches processed. Results saved in: {GEMINI_OUTPUT_JSON_PATH}")

In [None]:
# Run small videos inline in batches
await call_gemini_small_videos(small_videos, batch_size=BATCH_SIZE, concurrency_limit= CONCURRENCY_LIMIT, use_url=False)

In [20]:
async def call_gemini_large_videos(video_paths_list, batch_size, concurrency_limit):
    """
    Upload large videos batch by batch and process each uploaded batch with Gemini.

    For every batch, each video is uploaded with ``gemini_model.upload_video``;
    uploads that raise are reported and skipped. The successfully uploaded
    files (a dict mapping the local path to the upload result) are then passed
    to ``gemini_model.run_async`` with ``use_url=True``, ``overwrite=False``
    and ``append=True``.

    NOTE(review): no delete/cleanup call is made in this function even though
    the progress messages mention three steps and a cleanup. Presumably
    ``run_async`` removes the uploaded files when ``use_url=True`` — confirm.

    Args:
        video_paths_list: List of video file paths to process
        batch_size: Number of videos to upload and process per batch (must be >= 1)
        concurrency_limit: Max concurrent API requests during processing

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # Without this guard batch_size=0 dies with ZeroDivisionError and a negative
    # value silently processes nothing while still reporting success.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    separator = "=" * 60
    total_videos = len(video_paths_list)
    total_batches = (total_videos + batch_size - 1) // batch_size  # ceiling division

    for batch_num, start in enumerate(range(0, total_videos, batch_size), start=1):
        batch = video_paths_list[start : start + batch_size]

        print(f"\n{separator}")
        print(f"Batch {batch_num}/{total_batches}: Processing {len(batch)} large videos")
        print(separator)

        # Step 1: Upload all videos in batch
        print(f"\n[Step 1/3] Uploading {len(batch)} videos...")
        uploaded_files = {}
        for video_path in batch:
            print(str(video_path))
            try:
                uploaded_files[video_path] = gemini_model.upload_video(video_path)
                print(f"Uploaded {os.path.basename(video_path)}")
            except Exception as e:
                # Best effort: one failed upload must not abort the whole batch
                print(f"Failed to upload {os.path.basename(video_path)}: {e}")

        print(f"Uploaded {len(uploaded_files)}/{len(batch)} videos")

        # Step 2: Process uploaded videos (skipped when every upload failed)
        if uploaded_files:
            print(f"\n[Step 2/3] Processing {len(uploaded_files)} uploaded videos...")

            await gemini_model.run_async(
                uploaded_files,
                GEMINI_OUTPUT_JSON_PATH,
                GEMINI_METADATA_OUTPUT_JSON_PATH,
                concurrency_limit=concurrency_limit,
                overwrite=False,
                append=True,
                use_url=True,
            )
            print("Processing complete")

        print(f"Batch {batch_num} completed and cleaned up")

    print(f"\n{separator}")
    print("All large video batches processed!")
    print(separator)

In [None]:
# Process large videos (upload -> process -> delete)
await call_gemini_large_videos(large_videos, batch_size=BATCH_SIZE, concurrency_limit=CONCURRENCY_LIMIT)

In [102]:
# Load existing results
with open(GEMINI_OUTPUT_JSON_PATH, 'r', encoding='utf-8') as f:
    existing_results = json.load(f)

# Get the set of processed video file names.
# Both sides of the comparison are reduced to the base file name, so the check
# gives the same answer whether the results store bare names or full paths.
processed_videos = {os.path.basename(result['video_path']) for result in existing_results}
print(f"Total processed videos: {len(processed_videos)}")

# Find large videos with no entry in the results yet
missing_large_videos = [
    video_path
    for video_path in large_videos
    if os.path.basename(video_path) not in processed_videos
]

print(f"\nMissing large videos: {len(missing_large_videos)}/{len(large_videos)}")
print(f"Successfully processed: {len(large_videos) - len(missing_large_videos)}/{len(large_videos)}")



Total processed videos: 309

Missing large videos: 262/571
Successfully processed: 309/571
