# FinSpeak STT Demo
## Speech-to-Text with Whisper

This notebook demonstrates the speech-to-text functionality using Whisper.

In [None]:
import sys
# Add the parent directory to the import search path before importing the
# project package below.
# NOTE(review): a relative entry such as '..' is resolved against the current
# working directory, so this presumably expects the notebook to be run from
# its own folder — confirm.
sys.path.append('..')

from fin_speak.stt import transcribe, transcribe_local
from fin_speak.config import Config

## Configure Model

In [None]:
# Select the Whisper model size by overriding the value on the shared Config
# class, then echo the choice so it is visible in the notebook output.
# Options: tiny, base, small, medium, large
# NOTE(review): presumably fin_speak.stt reads this value when it loads the
# model, so this cell should run before the first transcription — confirm.
Config.WHISPER_MODEL_SIZE = 'small'
print(f"Using Whisper model: {Config.WHISPER_MODEL_SIZE}")

## Generate Sample Audio (using gTTS)

In [None]:
import os

from gtts import gTTS

# Prompts to synthesise. The same strings are reused further down as the
# reference text when the transcripts are scored.
queries = [
    "What is the current NAV of Vanguard S&P 500 Fund?",
    "Show me six month returns for Fidelity Growth Fund",
    "How has Wellington Fund performed over one year?"
]

# Destination folder for the generated clips; created if it does not exist.
audio_dir = '../demo_assets'
os.makedirs(audio_dir, exist_ok=True)

# Synthesise one MP3 per prompt (files are numbered from 1) and remember the
# paths in prompt order so they can be paired with `queries` later.
audio_files = []
for i, query in enumerate(queries):
    audio_path = f'{audio_dir}/query_{i+1}.mp3'
    gTTS(text=query, lang='en', slow=False).save(audio_path)
    audio_files.append(audio_path)
    print(f"Generated: {audio_path}")

## Transcribe Audio Files

In [None]:
# Run every generated clip through `transcribe` and collect one record per
# clip, pairing the transcript with the prompt it was synthesised from.
results = []

for audio_path, original_query in zip(audio_files, queries):
    print(f"\nTranscribing: {audio_path}")
    print(f"Original: {original_query}")

    # NOTE(review): force_local=True presumably selects the local Whisper
    # model rather than a remote API — confirm against fin_speak.stt.
    transcript = transcribe(audio_path, force_local=True)
    print(f"Transcript: {transcript}")

    # Key order is kept as-is: it determines the column order of the
    # DataFrame built from `results` in the evaluation cell.
    record = {
        'original': original_query,
        'transcript': transcript,
        'audio_file': audio_path
    }
    results.append(record)

## Evaluate Word Error Rate (WER)

In [None]:
import string

import pandas as pd
from jiwer import wer, cer

# Translation table that deletes ASCII punctuation during normalisation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _normalize(text):
    """Return *text* lower-cased, without ASCII punctuation, single-spaced."""
    return ' '.join(text.lower().translate(_PUNCTUATION_TABLE).split())


# References are the prompts that were synthesised; hypotheses are the
# transcripts produced for them. jiwer expects (reference, hypothesis).
originals = [r['original'] for r in results]
transcripts = [r['transcript'] for r in results]

# Raw scores: the strings are compared as-is, so differences in letter case
# or punctuation between prompt and transcript count as errors.
word_error_rate = wer(originals, transcripts)
char_error_rate = cer(originals, transcripts)

print(f"\nWord Error Rate (WER): {word_error_rate:.2%}")
print(f"Character Error Rate (CER): {char_error_rate:.2%}")

# Normalised scores: fold case and strip punctuation on both sides first, so
# the numbers reflect recognition mistakes rather than formatting differences.
normalized_originals = [_normalize(text) for text in originals]
normalized_transcripts = [_normalize(text) for text in transcripts]

normalized_word_error_rate = wer(normalized_originals, normalized_transcripts)
normalized_char_error_rate = cer(normalized_originals, normalized_transcripts)

print(f"Normalized WER (case/punctuation ignored): {normalized_word_error_rate:.2%}")
print(f"Normalized CER (case/punctuation ignored): {normalized_char_error_rate:.2%}")

# Display comparison
df = pd.DataFrame(results)
df

## Performance Notes

- **tiny**: Fastest, less accurate
- **base**: Good balance of speed and accuracy
- **small**: Recommended for most use cases (default)
- **medium**: Higher accuracy, slower
- **large**: Best accuracy, slowest

For production, consider using the OpenAI Whisper API for better accuracy without local compute requirements.