In [None]:
# Comparing audio transcription (speech-to-text) APIs for speed and cost,
# using cesar.wav, a 10-second test file

# Run on Colab, and set the API keys in the cell below

In [None]:
# Required for colab. No need for openai packages already included by google colab
!pip install -U "mistralai"

In [None]:
# Make both API keys available as environment variables.
# OPENAI_API_KEY is used directly by the openai python client;
# MISTRAL_API_KEY is read explicitly in the Mistral cell.
import os  # imported here so this cell runs on a fresh kernel
from getpass import getpass

# Keep a key that is already set in the environment; otherwise ask for it
# interactively so that no secret has to be written into the notebook.
for _key_name in ("OPENAI_API_KEY", "MISTRAL_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f"{_key_name}: ")

In [None]:
# Transcribe cesar.wav with OpenAI gpt-4o-transcribe and report the elapsed
# time of the request together with the token usage.
import os
from datetime import datetime

from openai import OpenAI

# No key is passed here: the client is expected to pick up OPENAI_API_KEY
# from the environment (set in the key cell).
client = OpenAI()

# Timed region matches the Mistral cell: file opening + API call, no printing.
start_time = datetime.now()

# Context manager so the audio file handle is closed once the request is done
with open("cesar.wav", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=audio_file,
    )

# Stop the clock before any output so printing is not counted in the duration
end_time = datetime.now()
duration = end_time - start_time

print(transcript.text)
print(f"{duration.total_seconds():.3f}")
print(f"Number of tokens: {transcript.usage}")

In [None]:
# Transcribe cesar.wav with Mistral voxtral-mini-latest, then print the
# elapsed time of the request, the raw response and the transcribed text.
import os
from datetime import datetime

from mistralai import Mistral

model = "voxtral-mini-latest"
api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)

# Timed region: file opening + API call (printing happens afterwards)
start_time = datetime.now()
with open("cesar.wav", "rb") as f:
    transcription_response = client.audio.transcriptions.complete(
        model=model,
        file={"content": f, "file_name": "cesar.wav"},
        # language="fr"  (language hint, currently disabled)
    )
end_time = datetime.now()

duration = end_time - start_time
print(f"{duration.total_seconds():.3f}")

# Full response object first, then only the transcribed text
print(transcription_response)
print(transcription_response.text)


In [None]:
# Results for a 10-second French audio file
# openai 2.593 s- Number of tokens: UsageTokens(input_tokens=107, output_tokens=38, total_tokens=145, type='tokens', input_token_details=UsageTokensInputTokenDetails(audio_tokens=107, text_tokens=0))
# mistral 0.507 s - usage=UsageInfo(prompt_tokens=3, completion_tokens=36, total_tokens=414, prompt_audio_seconds=10) 

# Duration comparison
# openai gpt-4o-transcribe (faster than whisper-1 and the other OpenAI models) takes 2.6 seconds (4x faster than the original, and 2x in other tests)
# mistral voxtral-mini-latest takes 0.5 second
# In our test, the Mistral model is 5 times faster than OpenAI and 20x faster than the original

# Price comparison
# openai price: 6 usd per million audio tokens, cost = 6/1000000*107 = 0.00064 usd
# mistral price: 0.002 usd/min, so 10 seconds = 0.0003 usd
# Mistral is half the price of OpenAI (and a third according to the official price: OpenAI estimates 0.006 usd/min in https://platform.openai.com/docs/pricing)