# Test the OpenAI Whisper API on a Single File

Test the OpenAI Whisper API with one audio file to verify it works before batch processing.

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from src.voice_eval.config import load_config

In [None]:
# Populate the process environment from the .env file
load_dotenv()

# Build the client only when a non-empty key is present; otherwise stop
# here so no later cell runs without credentials.
api_key = os.environ.get('OPENAI_API_KEY')
if api_key:
    client = OpenAI(api_key=api_key)
    print("✓ OpenAI client initialized")
else:
    raise ValueError("OPENAI_API_KEY not found in .env file")

In [None]:
# Read the two settings this notebook needs from the project configuration:
# the transcription language and the directory holding the audio files.
language = load_config('whisper', 'language')
audio_dir = load_config('input', 'audio_dir')

# Echo both values so a wrong configuration is visible before any API call
print(f"Language: {language}", f"Audio directory: {audio_dir}", sep="\n")

In [None]:
# Pick an MP3 file for testing (explicitly supported format)
test_file = f"{audio_dir}/GHPS.  Bammanakatti.mp3"
print(f"Test file: {test_file}")

# Check the file size against the upload limit before sending anything.
# os.path.getsize raises an OSError subclass (e.g. FileNotFoundError) when
# the file is missing, which also stops the notebook here rather than at
# the API call.
MAX_UPLOAD_MB = 25
file_size_mb = os.path.getsize(test_file) / (1024 * 1024)
if file_size_mb > MAX_UPLOAD_MB:
    # Only report success when the file really is within the limit.
    raise ValueError(
        f"File size {file_size_mb:.2f} MB exceeds the {MAX_UPLOAD_MB} MB limit: {test_file}"
    )
print(f"File size: {file_size_mb:.2f} MB (limit: {MAX_UPLOAD_MB} MB) ✓")

In [None]:
# Request a transcription of the test file from the OpenAI Whisper API.
# The verbose_json format is requested together with segment-level
# timestamp granularity.
print("\nSending request to OpenAI Whisper API...\n")

request_options = {
    "model": "whisper-1",
    "language": language,
    "response_format": "verbose_json",
    "timestamp_granularities": ["segment"],
}

# The audio is opened in binary mode and closed as soon as the call returns
with open(test_file, 'rb') as audio_file:
    transcript = client.audio.transcriptions.create(file=audio_file, **request_options)

print("✓ Transcription complete!")

In [None]:
# Save raw response to JSON for inspection
import json
from src.voice_eval.storage import write_file

# The response may carry no segments (attribute missing, None, or empty);
# fall back to an empty list so the rest of the response is still saved
# instead of failing while iterating.
segments = getattr(transcript, "segments", None) or []

# Convert response to a plain dict of JSON-serializable values
response_dict = {
    "text": transcript.text,
    "language": transcript.language,
    "duration": transcript.duration,
    "segments": [
        {
            "id": seg.id,
            "start": seg.start,
            "end": seg.end,
            "text": seg.text,
            "tokens": seg.tokens,
            "temperature": seg.temperature,
            "avg_logprob": seg.avg_logprob,
            "compression_ratio": seg.compression_ratio,
            "no_speech_prob": seg.no_speech_prob,
        }
        for seg in segments
    ],
}

# Save to JSON; ensure_ascii=False keeps non-ASCII transcript text readable
output_path = write_file(
    "whisper_api_test_response.json",
    json.dumps(response_dict, indent=2, ensure_ascii=False),
    base_dir="files/transcriptions/whisper_api_test"
)

print(f"✓ Response saved to: {output_path}")

In [None]:
# Print the response metadata followed by the full transcription text,
# each under a banner framed by 60-character rules.
rule = "=" * 60

print(f"\n{rule}")
print("METADATA:")
print(rule)
print(f"Language: {transcript.language}")
print(f"Duration: {transcript.duration:.2f} seconds")

print(f"\n{rule}")
print("FULL TRANSCRIPTION:")
print(rule)
print(transcript.text)

In [None]:
# Print every segment with its start/end time and text, then the total count
rule = "=" * 60
print(f"\n{rule}")
print("TIMESTAMPED SEGMENTS:")
print(rule)

# A missing attribute, None, and an empty list all take the fallback branch
segments = getattr(transcript, 'segments', None)
if not segments:
    print("No segments available (may need different response_format)")
else:
    for number, segment in enumerate(segments, start=1):
        print(f"\n[Segment {number}]")
        print(f"Time: {segment.start:.2f}s -> {segment.end:.2f}s")
        print(f"Text: {segment.text}")
    print(f"\nTotal segments: {len(segments)}")

In [None]:
# Debug aid: print the response object's type and the names it exposes
rule = "=" * 60
print(f"\n{rule}\nRAW RESPONSE STRUCTURE:\n{rule}")
print(f"Type: {type(transcript)}")
print(f"Available attributes: {dir(transcript)}")