# Test Whisper API + GPT-4o-mini Transliteration Pipeline

This notebook tests the full pipeline using OpenAI only:
1. Transcribe audio with OpenAI Whisper API (Kannada script)
2. Transliterate to Roman script using OpenAI GPT-4o-mini
3. Save both versions (Kannada script and romanized) to a combined JSON file

**Benefits of GPT-4o-mini approach:**
- Single API provider (OpenAI) for both transcription + transliteration
- Very low cost (~$3.27 for the full 9-hour dataset)
- Flexible prompting for custom romanization styles
- Consistent phonetic transliteration

**Refactored to use:**
- `src/voice_eval/whisper_api.py`
- `src/voice_eval/gpt_transliteration.py`
- `src/voice_eval/storage.py`

In [None]:
import json
from dotenv import load_dotenv
from src.voice_eval.config import load_config
from src.voice_eval.whisper_api import transcribe_audio, transcription_to_dict, estimate_cost
from src.voice_eval.gpt_transliteration import transliterate_text, transliterate_segments
from src.voice_eval.storage import write_file

In [None]:
# Load environment variables from a .env file, if one is present.
# load_dotenv() returns a boolean, so report what actually happened instead
# of announcing success unconditionally.
if load_dotenv():
    print("✓ API keys loaded")
else:
    # Not fatal: the variables may already be exported in the shell/kernel.
    print("⚠ No variables loaded from .env - relying on the existing environment")

In [None]:
# Read the two settings this notebook needs from the project configuration:
# the language passed to Whisper and the folder holding the input audio.
language, audio_dir = (
    load_config(section, key)
    for section, key in (('whisper', 'language'), ('input', 'audio_dir'))
)

# Echo the values so a misconfigured run is obvious before any API call.
print(f"Language: {language}")
print(f"Audio directory: {audio_dir}")

In [None]:
import os

# Single recording used to exercise the pipeline end to end.
test_file = f"{audio_dir}/GHPS.  Bammanakatti.mp3"
print(f"Test file: {test_file}")

# Report the on-disk size in mebibytes (bytes / 1024 / 1024).
bytes_per_mb = 1024 * 1024
size_in_bytes = os.path.getsize(test_file)
file_size_mb = size_in_bytes / bytes_per_mb
print(f"File size: {file_size_mb:.2f} MB")

## Step 1: Transcribe with Whisper API

In [None]:
# Stage banner.
step1_rule = "=" * 60
print("\n" + step1_rule)
print("STEP 1: Transcribing with OpenAI Whisper API...")
print(step1_rule)

# Request a verbose JSON response with segment-level timestamps so the
# result exposes language, duration, segments and the full text.
whisper_request = {
    "audio_path": test_file,
    "language": language,
    "response_format": "verbose_json",
    "timestamp_granularities": ["segment"],
}
transcript = transcribe_audio(**whisper_request)

# Cost estimate derived from the audio duration reported by the API.
whisper_cost = estimate_cost(transcript.duration)

# Summary of the transcription result.
print("✓ Transcription complete!")
print(f"  Language detected: {transcript.language}")
print(f"  Duration: {transcript.duration:.2f} seconds ({transcript.duration/60:.1f} minutes)")
print(f"  Segments: {len(transcript.segments)}")
print(f"  Cost: ${whisper_cost:.4f}")
print(f"\nFirst 200 chars (Kannada script):\n{transcript.text[:200]}...")

## Step 2: Transliterate Full Text with GPT-4o-mini

In [None]:
# Stage banner.
step2_rule = "=" * 60
print("\n" + step2_rule)
print("STEP 2: Transliterating full text to Roman script...")
print(step2_rule)

# Transliterate the whole transcript in one call. The helper returns a pair:
# the romanized text and a metrics dict (tokens, latency_ms, cost_usd).
full_text_result = transliterate_text(text=transcript.text, source_language="Kannada")
romanized_text, full_text_metrics = full_text_result

# Summary of the call's usage figures plus a preview of the output.
print("✓ Transliteration complete!")
print(f"  Tokens used: {full_text_metrics['tokens']['total']}")
print(f"  Latency: {full_text_metrics['latency_ms']:.0f}ms")
print(f"  Cost: ${full_text_metrics['cost_usd']:.6f}")
print(f"\nFirst 200 chars (Romanized):\n{romanized_text[:200]}...")

## Step 3: Transliterate Individual Segments

In [None]:
# Stage banner.
step3_rule = "=" * 60
print("\n" + step3_rule)
print("STEP 3: Transliterating individual segments...")
print(step3_rule)

# Turn the transcript object into a plain dict and keep only its segments,
# which is the form passed to the segment transliterator below.
transcript_as_dict = transcription_to_dict(transcript)
segments_dict = transcript_as_dict["segments"]

# Transliterate each segment's "text" field. The helper returns the segments
# and an aggregate metrics dict.
segment_result = transliterate_segments(
    segments=segments_dict,
    text_field="text",
    source_language="Kannada",
)
romanized_segments, segment_metrics = segment_result

# Aggregate usage figures across all segments.
print(f"\n✓ All {segment_metrics['total_segments']} segments transliterated!")
print(f"  Total cost: ${segment_metrics['total_cost_usd']:.6f}")
print(f"  Total tokens: {segment_metrics['total_tokens']:,}")
print(f"  Total time: {segment_metrics['total_latency_ms']/1000:.1f}s")
print(f"  Avg per segment: {segment_metrics['avg_latency_per_segment_ms']:.0f}ms")

## Step 4: Calculate Total Pipeline Costs

In [None]:
print("\n" + "="*60)
print("PIPELINE COST BREAKDOWN")
print("="*60)

# The total must equal the sum of every line item printed above the rule.
# Previously the full-text transliteration cost was listed but left out of
# the total, so the reported total understated what this run actually cost.
full_text_cost = full_text_metrics['cost_usd']
segments_cost = segment_metrics['total_cost_usd']
total_cost = whisper_cost + full_text_cost + segments_cost

print(f"\nWhisper API (transcription):   ${whisper_cost:.6f}")
print(f"GPT-4o-mini (full text):        ${full_text_cost:.6f}")
print(f"GPT-4o-mini (segments):         ${segments_cost:.6f}")
print(f"{'-'*60}")
print(f"Total pipeline cost:            ${total_cost:.6f}")

# Per-unit costs. Guard the divisions so a zero-length recording or an empty
# transcript yields a readable message instead of a ZeroDivisionError.
audio_minutes = transcript.duration / 60
if audio_minutes > 0:
    cost_per_minute = total_cost / audio_minutes
    print(f"\n📊 Cost per minute of audio:    ${cost_per_minute:.6f}")
else:
    cost_per_minute = None
    print("\n📊 Cost per minute of audio:    n/a (zero-length audio)")

char_count = len(transcript.text)
if char_count:
    print(f"📊 Cost per character:           ${total_cost / char_count:.8f}")
else:
    print("📊 Cost per character:           n/a (empty transcript)")

# Extrapolate to full dataset: scale the per-minute cost of this sample
# linearly to the dataset's total minutes.
dataset_duration_hours = 9.0
if cost_per_minute is not None:
    dataset_cost_estimate = cost_per_minute * dataset_duration_hours * 60
    print(f"\n💰 Estimated cost for full dataset (9 hours): ${dataset_cost_estimate:.2f}")
else:
    dataset_cost_estimate = None
    print("\n💰 Estimated cost for full dataset (9 hours): n/a")

## Step 5: Save Combined Results

In [None]:
# Stage banner.
step5_rule = "=" * 60
print("\n" + step5_rule)
print("STEP 5: Saving results...")
print(step5_rule)

# Assemble the output document piece by piece. Key order is kept as-is so
# the serialized JSON layout does not change.
costs_usd = {
    "whisper_transcription": whisper_cost,
    "gpt_transliteration_full_text": full_text_metrics['cost_usd'],
    "gpt_transliteration_segments": segment_metrics['total_cost_usd'],
    "total_pipeline": total_cost,
}

run_metrics = {
    "total_tokens": segment_metrics['total_tokens'],
    "total_segments": segment_metrics['total_segments'],
    # Stored in seconds; the metrics dict reports milliseconds.
    "processing_time_seconds": segment_metrics['total_latency_ms'] / 1000,
}

metadata = {
    "file": test_file,
    "language": transcript.language,
    "duration": transcript.duration,
    "whisper_model": "whisper-1",
    "transliteration_provider": "openai-gpt-4o-mini",
    "costs_usd": costs_usd,
    "metrics": run_metrics,
}

combined_response = {
    "metadata": metadata,
    "transcription": {
        "text_kannada": transcript.text,
        "text_romanized": romanized_text,
    },
    "segments": romanized_segments,
}

# ensure_ascii=False keeps the Kannada text as readable characters in the
# file rather than \u escape sequences.
serialized = json.dumps(combined_response, indent=2, ensure_ascii=False)
output_path = write_file(
    "whisper_gpt4o_combined_response.json",
    serialized,
    base_dir="files/transcriptions/whisper_gpt4o_test",
)

print(f"✓ Combined results saved to: {output_path}")

## Step 6: Display Sample Results

In [None]:
def _preview(text, limit=80):
    """Return *text* as-is, or its first *limit* characters plus '...' if longer."""
    return f"{text[:limit]}..." if len(text) > limit else text


# Number of segments shown in the preview below.
sample_count = 3

print("\n" + "="*60)
print("SAMPLE SEGMENTS (First 3)")
print("="*60)

for i, seg in enumerate(romanized_segments[:sample_count], 1):
    print(f"\n[Segment {i}] {seg['start']:.2f}s → {seg['end']:.2f}s")
    print(f"  Kannada:    {_preview(seg['text'])}")
    # A segment without a romanized form is shown as N/A.
    print(f"  Romanized:  {_preview(seg.get('text_romanized', 'N/A'))}")

# Only mention leftover segments when there are any. With fewer segments
# than the sample size the old code printed a negative count.
remaining = len(romanized_segments) - sample_count
if remaining > 0:
    print(f"\n... ({remaining} more segments)")
print("\n✅ Pipeline test successful!")

## Summary

This notebook demonstrates the **OpenAI-only pipeline** using:
- Whisper API for transcription (Kannada native script)
- GPT-4o-mini for transliteration (Roman script)

**Key advantages:**
1. **Single API provider** - Simplified integration and billing
2. **Low cost** - Estimated ~$3.27 for the entire 9-hour dataset
3. **Consistent quality** - Phonetic romanization with diacritics
4. **Flexible** - Can customize romanization style via prompts

**Ready for batch processing!**