# Batch Process All Audio Files: Whisper + GPT-4o-mini Pipeline

Process all 42 audio files through the validated Whisper + GPT-4o-mini pipeline.

**Pipeline:**
1. OpenAI Whisper API - Transcription (Kannada script)
2. GPT-4o-mini - Transliteration (Roman script)
3. Save individual JSON files with both versions
4. Generate summary report

**Features:**
- Graceful error handling (continues on failures)
- Progress tracking with time estimates
- Incremental saves (no data loss on crashes)
- Resume support (skip already-processed files)
- Detailed failure reporting

**Expected results:**
- Individual JSON files in `files/transcriptions/batch_whisper_gpt4o/`
- Summary CSV with costs, metrics, and status
- Total cost: ~$3.27 for 9 hours of audio

In [None]:
import json
import time
from pathlib import Path
from datetime import datetime
import pandas as pd
from dotenv import load_dotenv
from src.voice_eval.config import load_config
from src.voice_eval.storage import list_files, write_file
from src.voice_eval.whisper_api import transcribe_audio, transcription_to_dict, estimate_cost
from src.voice_eval.gpt_transliteration import transliterate_text, transliterate_segments

In [None]:
# Pull secrets from .env first, then read the settings this notebook needs.
load_dotenv()
language = load_config('whisper', 'language')
audio_dir = load_config('input', 'audio_dir')
dataset_name = load_config('dataset', 'name')

# Echo the resolved settings so the run is self-describing.
for label, value in (
    ("Dataset", dataset_name),
    ("Language", language),
    ("Audio directory", audio_dir),
):
    print(f"{label}: {value}")

## Configuration

In [None]:
# Batch-run settings
SUPPORTED_EXTENSIONS = [".mp3", ".wav"]  # Known working formats
SKIP_EXISTING = True  # Set to False to reprocess all files
OUTPUT_DIR = "files/transcriptions/batch_whisper_gpt4o"

# Report the effective settings
supported_label = ", ".join(SUPPORTED_EXTENSIONS)
print(f"Output directory: {OUTPUT_DIR}")
print(f"Skip existing: {SKIP_EXISTING}")
print(f"Supported formats: {supported_label}")

## Discovery: Find All Audio Files

In [None]:
# List everything under the audio directory
all_files = list_files(base_dir=audio_dir, pattern="*")

# Keep only entries whose (lower-cased) extension is a known audio type;
# this drops stray files such as .DS_Store.
audio_extensions = [".mp3", ".mp4", ".wav", ".m4a", ".aac", ".amr", ".webm", ".mpeg", ".mpga"]
audio_files = []
for candidate in all_files:
    if Path(candidate).suffix.lower() in audio_extensions:
        audio_files.append(candidate)

n_total = len(all_files)
n_audio = len(audio_files)
print(f"\nFound {n_audio} audio files to process:")
print(f"  Total files in directory: {n_total}")
print(f"  Audio files: {n_audio}")
print(f"  Other files: {n_total - n_audio}")

# Tally audio files per extension (keys keep first-seen order)
suffixes = [Path(path).suffix.lower() for path in audio_files]
format_counts = dict.fromkeys(suffixes, 0)
for ext in suffixes:
    format_counts[ext] += 1

print("\nFile format distribution:")
for ext in sorted(format_counts):
    count = format_counts[ext]
    # Flag formats outside the known-working list with a warning marker
    supported = "✓" if ext in SUPPORTED_EXTENSIONS else "⚠️ "
    print(f"  {supported} {ext}: {count} files")

## Processing Functions

In [None]:
def process_single_file(audio_path: str, output_dir: str) -> dict:
    """
    Process a single audio file through the full pipeline.

    Steps, in order: transcribe with the Whisper API, transliterate the full
    transcript text, transliterate each segment, then write one combined JSON
    document via ``write_file``. Any exception raised by a step is caught and
    reported in the returned dict, so a batch loop can continue with the next
    file.

    Reads the module-level ``language`` setting for the transcription call.

    Args:
        audio_path: Path of the audio file to process.
        output_dir: Passed to ``write_file`` as ``base_dir``.

    Returns:
        dict with:
        - file: Name of the audio file (final path component)
        - status: 'success' or 'failed'
        - output_path: Value returned by ``write_file`` (None if failed)
        - error: Error message (None if successful)
        - processing_time_seconds: Wall-clock time spent in this call
        - metrics: On success: duration, segments, whisper_cost, gpt_cost,
          total_cost, total_tokens. Empty dict on failure.
    """
    start_time = time.time()
    filename = Path(audio_path).name

    # Start pessimistic; only the success path below overwrites these fields.
    result = {
        "file": filename,
        "status": "failed",
        "output_path": None,
        "error": None,
        "processing_time_seconds": 0,
        "metrics": {}
    }

    try:
        # Step 1: Transcribe with Whisper
        print(f"  [1/4] Transcribing with Whisper API...")
        transcript = transcribe_audio(
            audio_path=audio_path,
            language=language,
            response_format="verbose_json",
            timestamp_granularities=["segment"]
        )

        whisper_cost = estimate_cost(transcript.duration)
        print(f"    ✓ Duration: {transcript.duration:.1f}s, Segments: {len(transcript.segments)}, Cost: ${whisper_cost:.4f}")

        # Step 2: Transliterate full text
        print(f"  [2/4] Transliterating full text...")
        romanized_text, full_text_metrics = transliterate_text(
            text=transcript.text,
            source_language="Kannada"
        )
        print(f"    ✓ Cost: ${full_text_metrics['cost_usd']:.6f}")

        # Step 3: Transliterate segments
        print(f"  [3/4] Transliterating {len(transcript.segments)} segments...")
        segments_dict = transcription_to_dict(transcript)["segments"]
        romanized_segments, segment_metrics = transliterate_segments(
            segments=segments_dict,
            text_field="text",
            source_language="Kannada",
            verbose=False  # Suppress per-segment progress
        )
        print(f"    ✓ Cost: ${segment_metrics['total_cost_usd']:.6f}")

        # Step 4: Save results
        print(f"  [4/4] Saving results...")
        # Both transliteration passes (full text AND segments) are billed, so
        # both must be counted; leaving out the full-text pass under-reports
        # the pipeline total.
        full_text_cost = full_text_metrics['cost_usd']
        segments_cost = segment_metrics['total_cost_usd']
        gpt_cost = full_text_cost + segments_cost
        total_cost = whisper_cost + gpt_cost

        combined_response = {
            "metadata": {
                "file": filename,
                "file_path": audio_path,
                "language": transcript.language,
                "duration": transcript.duration,
                "whisper_model": "whisper-1",
                "transliteration_provider": "openai-gpt-4o-mini",
                "processed_at": datetime.now().isoformat(),
                "costs_usd": {
                    "whisper_transcription": whisper_cost,
                    "gpt_transliteration_full_text": full_text_cost,
                    "gpt_transliteration_segments": segments_cost,
                    "total_pipeline": total_cost
                },
                # NOTE(review): token/segment/latency figures come from the
                # segment pass only; the full-text pass is not included here.
                "metrics": {
                    "total_tokens": segment_metrics['total_tokens'],
                    "total_segments": segment_metrics['total_segments'],
                    "transliteration_time_seconds": segment_metrics['total_latency_ms'] / 1000
                }
            },
            "transcription": {
                "text_kannada": transcript.text,
                "text_romanized": romanized_text
            },
            "segments": romanized_segments
        }

        # Generate output filename (sanitize original filename).
        # The name is derived from the stem only, so two inputs that differ
        # only by extension map to the same JSON file.
        safe_filename = Path(filename).stem.replace(" ", "_").replace("+", "plus")
        output_filename = f"{safe_filename}.json"

        # ensure_ascii=False keeps the Kannada text readable in the JSON file.
        output_path = write_file(
            output_filename,
            json.dumps(combined_response, indent=2, ensure_ascii=False),
            base_dir=output_dir
        )

        processing_time = time.time() - start_time
        print(f"    ✓ Saved to: {output_path}")
        print(f"    ✓ Total processing time: {processing_time:.1f}s")

        result.update({
            "status": "success",
            "output_path": output_path,
            "processing_time_seconds": processing_time,
            "metrics": {
                "duration": transcript.duration,
                "segments": len(transcript.segments),
                "whisper_cost": whisper_cost,
                # Sum of both transliteration passes, so that
                # whisper_cost + gpt_cost == total_cost.
                "gpt_cost": gpt_cost,
                "total_cost": total_cost,
                "total_tokens": segment_metrics['total_tokens']
            }
        })

    except Exception as e:
        # Deliberately broad: one bad file must not abort the whole batch.
        processing_time = time.time() - start_time
        error_msg = str(e)
        print(f"    ✗ Error: {error_msg}")

        result.update({
            "status": "failed",
            "error": error_msg,
            "processing_time_seconds": processing_time
        })

    return result


def is_already_processed(audio_path: str, output_dir: str) -> bool:
    """Check if a file has already been processed."""
    filename = Path(audio_path).name
    safe_filename = Path(filename).stem.replace(" ", "_").replace("+", "plus")
    output_filename = f"{safe_filename}.json"
    output_path = Path(output_dir) / output_filename
    return output_path.exists()

print("✓ Processing functions defined")

## Batch Processing: Main Loop

In [None]:
print("="*80)
print("STARTING BATCH PROCESSING")
print("="*80)
print(f"Total files: {len(audio_files)}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Skip existing: {SKIP_EXISTING}")
print()

# Track results
results = []
start_time = time.time()
# Files actually sent through the pipeline in this run (skips excluded).
processed_count = 0

for i, audio_file in enumerate(audio_files, 1):
    filename = Path(audio_file).name
    file_ext = Path(audio_file).suffix.lower()
    
    print(f"\n[{i}/{len(audio_files)}] Processing: {filename}")
    print("-" * 80)
    
    # Check if already processed
    if SKIP_EXISTING and is_already_processed(audio_file, OUTPUT_DIR):
        print(f"  ⏭️  Already processed (skipping)")
        results.append({
            "file": filename,
            "status": "skipped",
            "reason": "Already processed"
        })
        continue
    
    # Warn if unsupported format
    if file_ext not in SUPPORTED_EXTENSIONS:
        print(f"  ⚠️  Warning: {file_ext} format may not be supported by Whisper API")
        print(f"     (Supported: {', '.join(SUPPORTED_EXTENSIONS)})")
        print(f"     Attempting anyway...")
    
    # Process file (errors are captured in the result, not raised)
    result = process_single_file(audio_file, OUTPUT_DIR)
    results.append(result)
    processed_count += 1
    
    # Print progress summary
    successful = sum(1 for r in results if r.get("status") == "success")
    failed = sum(1 for r in results if r.get("status") == "failed")
    skipped = sum(1 for r in results if r.get("status") == "skipped")
    
    elapsed_time = time.time() - start_time
    # Average over files that actually ran. Dividing by the loop index would
    # count skipped files (which take almost no time) and make the estimate
    # far too low when resuming a partially completed batch.
    avg_time_per_file = elapsed_time / processed_count
    # Upper bound: some of the remaining files may turn out to be skipped.
    remaining_files = len(audio_files) - i
    estimated_remaining = avg_time_per_file * remaining_files
    
    print(f"\n  Progress: {i}/{len(audio_files)} ({i/len(audio_files)*100:.1f}%)")
    print(f"  Status: ✓ {successful} success, ✗ {failed} failed, ⏭️  {skipped} skipped")
    print(f"  Time: {elapsed_time/60:.1f}m elapsed, ~{estimated_remaining/60:.1f}m remaining")

print("\n" + "="*80)
print("BATCH PROCESSING COMPLETE")
print("="*80)

## Summary Report

In [None]:
# Calculate summary statistics
total_files = len(results)
successful = [r for r in results if r.get("status") == "success"]
failed = [r for r in results if r.get("status") == "failed"]
skipped = [r for r in results if r.get("status") == "skipped"]

total_duration = sum(r.get("metrics", {}).get("duration", 0) for r in successful)
total_cost = sum(r.get("metrics", {}).get("total_cost", 0) for r in successful)
# Measured from the start of the batch loop to the moment this cell runs.
total_processing_time = time.time() - start_time


def _percent(part: int, whole: int) -> float:
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return part / whole * 100 if whole else 0.0


print(f"\n📊 SUMMARY STATISTICS")
print("="*80)
print(f"\nFiles Processed:")
# _percent guards the empty-batch case (no audio files found), which would
# otherwise raise ZeroDivisionError here.
print(f"  ✓ Successful: {len(successful)} ({_percent(len(successful), total_files):.1f}%)")
print(f"  ✗ Failed:     {len(failed)} ({_percent(len(failed), total_files):.1f}%)")
print(f"  ⏭️  Skipped:    {len(skipped)} ({_percent(len(skipped), total_files):.1f}%)")
print(f"  📁 Total:      {total_files}")

if successful:
    print(f"\nAudio Duration:")
    print(f"  Total: {total_duration/3600:.2f} hours ({total_duration/60:.1f} minutes)")
    print(f"  Average per file: {total_duration/len(successful)/60:.1f} minutes")
    
    print(f"\nCosts:")
    print(f"  Total: ${total_cost:.2f}")
    print(f"  Average per file: ${total_cost/len(successful):.4f}")
    if total_duration > 0:
        print(f"  Per minute of audio: ${total_cost/(total_duration/60):.4f}")
    else:
        # Avoid dividing by zero when no duration was recorded.
        print(f"  Per minute of audio: n/a (no audio duration recorded)")
    
    print(f"\nProcessing Time:")
    print(f"  Total: {total_processing_time/60:.1f} minutes ({total_processing_time/3600:.2f} hours)")
    print(f"  Average per file: {total_processing_time/len(successful):.1f} seconds")
    print(f"  Throughput: {total_duration/total_processing_time:.2f}x realtime")

if failed:
    print(f"\n❌ FAILED FILES ({len(failed)}):")
    print("-" * 80)
    for r in failed:
        print(f"  • {r['file']}")
        print(f"    Error: {r.get('error', 'Unknown error')}")
        
print(f"\n💾 Output Location: {OUTPUT_DIR}/")
print(f"   Individual JSON files saved for each successful transcription")

## Save Summary CSV

In [None]:
# Create detailed results DataFrame
summary_data = []
for r in results:
    row = {
        "file": r["file"],
        "status": r["status"],
        "duration_seconds": r.get("metrics", {}).get("duration", None),
        "segments": r.get("metrics", {}).get("segments", None),
        "whisper_cost_usd": r.get("metrics", {}).get("whisper_cost", None),
        "gpt_cost_usd": r.get("metrics", {}).get("gpt_cost", None),
        "total_cost_usd": r.get("metrics", {}).get("total_cost", None),
        "total_tokens": r.get("metrics", {}).get("total_tokens", None),
        "processing_time_seconds": r.get("processing_time_seconds", None),
        "output_path": r.get("output_path", None),
        "error": r.get("error", None),
        "reason": r.get("reason", None)
    }
    summary_data.append(row)

df = pd.DataFrame(summary_data)

# Save to CSV
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
reports_dir = load_config('output', 'reports_dir')
csv_path_timestamped = f"{reports_dir}/batch_whisper_gpt4o_summary_{timestamp}.csv"
csv_path_latest = f"{reports_dir}/batch_whisper_gpt4o_summary_latest.csv"

df.to_csv(csv_path_timestamped, index=False)
df.to_csv(csv_path_latest, index=False)

print(f"\n💾 Summary CSV saved:")
print(f"  • {csv_path_timestamped}")
print(f"  • {csv_path_latest}")

# Display summary table
print(f"\n📋 Results Table (first 10 rows):")
print(df.head(10).to_string())

## Next Steps

✅ **Batch processing complete!**

**If you had failed files due to format issues:**
1. Review the failed files list above
2. Convert unsupported formats (`.aac`, `.amr`, `.mp4`) to `.mp3` or `.wav`
3. Re-run this notebook with `SKIP_EXISTING = True`. Files that already have a JSON output are skipped, so only the previously failed files are processed

**Ready for next phase:**
- Compare with Sarvam AI outputs
- Compare with Azure Speech Services
- Compare with AssemblyAI
- Wait for professional transcriptions for WER/CER evaluation