# Test Whisper API + Sarvam Transliteration Pipeline

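This notebook tests the full pipeline:
1. Transcribe audio with OpenAI Whisper API (Kannada script)
2. Transliterate the full transcript to Roman script using Sarvam AI
3. Transliterate each segment individually
4. Save both versions (Kannada and romanized) in one combined JSON file
5. Display sample segments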

In [None]:
import os
import json
import requests
from openai import OpenAI
from dotenv import load_dotenv
from src.voice_eval.config import load_config
from src.voice_eval.storage import write_file

In [None]:
# Read API credentials from the process environment; load_dotenv() runs first
# so values can be supplied through a .env file.
load_dotenv()

# Fetch and validate each key in turn so a missing key fails fast with a
# message naming exactly which variable is absent.
openai_key = os.environ.get('OPENAI_API_KEY')
if not openai_key:
    raise ValueError("OPENAI_API_KEY not found in .env file")

sarvam_key = os.environ.get('SARVAM_API_KEY')
if not sarvam_key:
    raise ValueError("SARVAM_API_KEY not found in .env file")

# The OpenAI client is only built once both keys are known to be present.
openai_client = OpenAI(api_key=openai_key)
print("✓ API keys loaded")

In [None]:
# Read the two settings this notebook uses from the project configuration:
# the 'language' entry of the 'whisper' section and the 'audio_dir' entry of
# the 'input' section.
language = load_config('whisper', 'language')
audio_dir = load_config('input', 'audio_dir')

# Echo both values so a wrong configuration is visible before any API call.
print(
    f"Language: {language}",
    f"Audio directory: {audio_dir}",
    sep="\n",
)

In [None]:
# Pick an MP3 file for testing
test_file = f"{audio_dir}/GHPS.  Bammanakatti.mp3"
print(f"Test file: {test_file}")

# Check the file size against the upload limit this notebook assumes for the
# Whisper API (25 MB). os.path.getsize raises FileNotFoundError when the file
# is missing, which is the desired early failure.
max_upload_mb = 25
file_size_mb = os.path.getsize(test_file) / (1024 * 1024)

# Stop here rather than reporting success for a file the transcription
# request would reject anyway.
if file_size_mb > max_upload_mb:
    raise ValueError(
        f"Audio file is {file_size_mb:.2f} MB, which exceeds the "
        f"{max_upload_mb} MB upload limit: {test_file}"
    )

print(f"File size: {file_size_mb:.2f} MB (limit: {max_upload_mb} MB) ✓")

## Step 1: Transcribe with Whisper API

In [None]:
# Step banner: a 60-character rule above and below the title.
banner = "=" * 60
print("\n" + banner)
print("STEP 1: Transcribing with OpenAI Whisper API...")
print(banner)

# Send the audio to the transcription endpoint. "verbose_json" with segment
# granularity is requested because later cells read transcript.language,
# transcript.duration and transcript.segments.
with open(test_file, 'rb') as audio_file:
    transcript = openai_client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
        language=language,
        response_format="verbose_json",
        timestamp_granularities=["segment"],
    )

# Summarise the response: language, duration, segment count and a short
# preview of the transcript text.
print("✓ Transcription complete!")
detected_language = transcript.language
print(f"  Language detected: {detected_language}")
duration_seconds = transcript.duration
print(f"  Duration: {duration_seconds:.2f} seconds")
segment_count = len(transcript.segments)
print(f"  Segments: {segment_count}")
preview = transcript.text[:200]
print(f"\nFirst 200 chars (Kannada script):\n{preview}...")

## Step 2: Transliterate with Sarvam AI

In [None]:
print("\n" + "="*60)
print("STEP 2: Transliterating to Roman script with Sarvam AI...")
print("="*60)

# Endpoint and headers are module-level names because the per-segment cell
# reuses them.
transliterate_url = "https://api.sarvam.ai/transliterate"
transliterate_headers = {
    "api-subscription-key": sarvam_key,
    "Content-Type": "application/json"
}

# Transliterate full text
# NOTE(review): the whole transcript is sent in a single request; if the
# endpoint caps input length a long transcript will be rejected here -
# confirm against the Sarvam API documentation.
transliterate_payload = {
    "input": transcript.text,
    "source_language_code": "kn-IN",
    "target_language_code": "en-IN"
}

# requests has no default timeout, so without one a stalled connection would
# hang the notebook indefinitely.
transliterate_timeout_s = 60

try:
    transliterate_response = requests.post(
        transliterate_url,
        headers=transliterate_headers,
        json=transliterate_payload,
        timeout=transliterate_timeout_s
    )
except requests.RequestException as exc:
    # Network-level failures (timeout, DNS, connection reset) take the same
    # "failed" path as a non-200 reply instead of aborting the cell.
    print(f"✗ Transliteration failed: {exc}")
    romanized_text = None
else:
    if transliterate_response.status_code == 200:
        transliterate_result = transliterate_response.json()
        romanized_text = transliterate_result.get('transliterated_text', '')
        print("✓ Transliteration complete!")
        print(f"\nFirst 200 chars (Romanized):\n{romanized_text[:200]}...")
    else:
        print(f"✗ Transliteration failed: {transliterate_response.status_code}")
        print(transliterate_response.text)
        # None marks the failure; the save cell writes it out as JSON null.
        romanized_text = None

## Step 3: Transliterate Each Segment

In [None]:
print("\n" + "="*60)
print("STEP 3: Transliterating individual segments...")
print("="*60)

romanized_segments = []
failed_segments = []  # 1-based positions of segments that kept their original text
total_segments = len(transcript.segments)

# requests has no default timeout; bound each call so one stalled request
# cannot hang the whole loop.
segment_timeout_s = 30

# One Session for the whole loop so the HTTP connection is reused instead of
# being re-established for every segment.
with requests.Session() as session:
    for i, segment in enumerate(transcript.segments, 1):
        # Transliterate each segment's text
        payload = {
            "input": segment.text,
            "source_language_code": "kn-IN",
            "target_language_code": "en-IN"
        }

        # Default to the original text; it is replaced only on a clean success.
        romanized_segment_text = segment.text
        failure = None
        try:
            response = session.post(
                transliterate_url,
                headers=transliterate_headers,
                json=payload,
                timeout=segment_timeout_s
            )
            if response.status_code == 200:
                result = response.json()
                romanized_segment_text = result.get('transliterated_text', segment.text)
            else:
                failure = f"HTTP {response.status_code}"
        except (requests.RequestException, ValueError) as exc:
            # Covers network errors and a 200 reply whose body is not valid
            # JSON; either way this segment falls back to its original text
            # rather than aborting the remaining segments.
            failure = str(exc)

        if failure is not None:
            print(f"  Warning: Segment {i} transliteration failed ({failure}), keeping original")
            failed_segments.append(i)

        # Carry the Whisper segment fields through alongside both scripts.
        romanized_segments.append({
            "id": segment.id,
            "start": segment.start,
            "end": segment.end,
            "text_kannada": segment.text,
            "text_romanized": romanized_segment_text,
            "tokens": segment.tokens,
            "temperature": segment.temperature,
            "avg_logprob": segment.avg_logprob,
            "compression_ratio": segment.compression_ratio,
            "no_speech_prob": segment.no_speech_prob
        })

        if i % 10 == 0:
            print(f"  Processed {i}/{total_segments} segments...")

# Report what actually happened: only claim full success when no segment
# fell back to its original text.
if failed_segments:
    succeeded = total_segments - len(failed_segments)
    print(
        f"⚠ {succeeded}/{total_segments} segments transliterated; "
        f"{len(failed_segments)} kept original text (segments: {failed_segments})"
    )
else:
    print(f"✓ All {len(romanized_segments)} segments transliterated!")

## Step 4: Save Combined Results

In [None]:
# Step banner: a 60-character rule above and below the title.
banner = "=" * 60
print("\n" + banner)
print("STEP 4: Saving results...")
print(banner)

# Assemble the output document from three parts: run metadata, the full-text
# transcription in both scripts, and the per-segment list.
run_metadata = {
    "file": test_file,
    "language": transcript.language,
    "duration": transcript.duration,
    "whisper_model": "whisper-1",
    "transliteration_provider": "sarvam-ai",
}
full_transcription = {
    "text_kannada": transcript.text,
    "text_romanized": romanized_text,
}
combined_response = {
    "metadata": run_metadata,
    "transcription": full_transcription,
    "segments": romanized_segments,
}

# ensure_ascii=False keeps the Kannada text as readable characters in the
# file instead of \u escapes.
serialized = json.dumps(combined_response, indent=2, ensure_ascii=False)
output_path = write_file(
    "whisper_sarvam_combined_response.json",
    serialized,
    base_dir="files/transcriptions/whisper_sarvam_test",
)

print(f"✓ Combined results saved to: {output_path}")

## Step 5: Display Sample Results

In [None]:
print("\n" + "="*60)
print("SAMPLE SEGMENTS (First 3)")
print("="*60)

# Number of segments to preview; slicing tolerates shorter lists.
sample_size = 3

for i, seg in enumerate(romanized_segments[:sample_size], 1):
    print(f"\n[Segment {i}] {seg['start']:.2f}s → {seg['end']:.2f}s")
    print(f"  Kannada:    {seg['text_kannada']}")
    print(f"  Romanized:  {seg['text_romanized']}")

# Only mention the remainder when there is one; with fewer than three
# segments the subtraction would otherwise print a negative count.
remaining = len(romanized_segments) - sample_size
if remaining > 0:
    print(f"\n... ({remaining} more segments)")

# romanized_text is None when the full-text transliteration step failed, so
# do not announce success in that case.
if romanized_text is None:
    print("\n⚠ Pipeline finished, but full-text transliteration failed")
else:
    print("\n✅ Pipeline test successful!")