# Mistral Voxtral
- Announcement: https://mistral.ai/news/voxtral
- Docs: https://docs.mistral.ai/capabilities/audio/

In [28]:
import getpass
from io import BytesIO
import json
import math
#from mistralai import Mistral # pip install mistralai
import os
from pydub import AudioSegment # pip install pydub
import requests # pip install requests

In [2]:
def _set_env(var: str):
    """Make sure the environment variable ``var`` holds a non-empty value.

    If ``var`` is missing or empty, the value is requested through a hidden
    ``getpass`` prompt and stored in ``os.environ``; otherwise nothing happens.
    """
    if os.environ.get(var):
        return  # already configured - do not prompt again
    prompt = f"Enter {var}: "
    os.environ[var] = getpass.getpass(prompt)

# Make sure MISTRAL_API_KEY is set; prompts for it (hidden input) when it is missing or empty
_set_env("MISTRAL_API_KEY")

In [29]:
# Transcription is only offered by "voxtral-mini-2507", not by the small model
# (https://docs.mistral.ai/capabilities/audio/#transcription).
# Model overview: https://docs.mistral.ai/getting-started/models/models_overview/
MODEL_NAME = "voxtral-mini-2507"

# REST endpoints (https://docs.mistral.ai/capabilities/audio/), built from one shared base URL
API_BASE_URL = "https://api.mistral.ai/v1"
TRANSCR_ENDPOINT = f"{API_BASE_URL}/audio/transcriptions"
FILE_ENDPOINT = f"{API_BASE_URL}/files"

In [30]:
# Authentication header sent with every API request; the key is read from the environment
headers = {"x-api-key": os.environ.get('MISTRAL_API_KEY')}

Upload the Audio File (trying Latvian)
- audio sent for transcription must be no longer than 15 min (https://docs.mistral.ai/capabilities/audio/#faq)

In [None]:
# Source recording: ~101 min long, so it has to be split into 8 parts before uploading
str_input_file = "LAT.mp3"
# Decode the whole recording into memory as a pydub AudioSegment
audio = AudioSegment.from_file(str_input_file)

In [None]:
# Each chunk is 14 min long: 14 min * 60 sec * 1000 ms = 840'000 ms
chunk_length_ms = 14 * 60 * 1000

# Number of chunks = full chunks plus one more for any leftover audio (8 chunks for this recording)
full_chunks, remainder_ms = divmod(len(audio), chunk_length_ms)
total_chunks = full_chunks + (1 if remainder_ms else 0)

In [None]:
# Output directory - where all chunked audio files and transcriptions will be stored
output_dir = "audio_chunks"
os.makedirs(output_dir, exist_ok=True)

**Audio in Latvian**

In [None]:
# Split the recording into chunks; for each chunk: save it as MP3, upload it,
# request a signed URL for the upload and transcribe it. The transcription of
# chunk N is written to <output_dir>/part_NN.json.
request_timeout_s = 130  # applied to every HTTP call so a stalled connection cannot hang the loop
failed_chunks = []  # 1-based numbers of the chunks whose requests failed (candidates for a re-run)

for i in range(total_chunks):
    print(f"Processing chunk {i+1}/{total_chunks}")

    start_ms = i * chunk_length_ms
    end_ms = min((i + 1) * chunk_length_ms, len(audio))
    chunk = audio[start_ms:end_ms]

    # Encode the chunk to MP3 only once, straight to disk. export() returns the
    # open output file, so close it explicitly instead of leaking a handle per chunk.
    filename = f"part_{i+1:02d}.mp3"
    filepath = os.path.join(output_dir, filename)
    chunk.export(filepath, format="mp3").close()

    try:
        # Upload the MP3 that was just written (no second in-memory encode needed)
        with open(filepath, "rb") as audio_file:
            files = {
                "purpose": (None, "audio"),
                "file": (filename, audio_file, "audio/mpeg")  # (filename, file_object, MIME type)
            }
            response = requests.post(FILE_ENDPOINT, headers=headers, files=files, timeout=request_timeout_s)
        response.raise_for_status()

        fileid = response.json()['id']

        # Ask for a signed URL of the uploaded file
        signedurlreq = requests.get(
            f"{FILE_ENDPOINT}/{fileid}/url",
            params={"expiry": 24},
            headers={**headers, "Accept": "application/json"},
            timeout=request_timeout_s,
        )
        # Without this check a failed request would only show up as a KeyError on ['url']
        signedurlreq.raise_for_status()

        files = {
            "file_url": (None, signedurlreq.json()['url']),
            "model": (None, MODEL_NAME),
            "language": (None, "lv"),
            "timestamp_granularities": (None, "segment")
        }

        resp = requests.post(TRANSCR_ENDPOINT, headers=headers, files=files, timeout=request_timeout_s)
        resp.raise_for_status() # raises for 4xx/5xx

        with open(os.path.join(output_dir, f"part_{i+1:02d}.json"), "w", encoding="utf-8") as f:
            json.dump(resp.json(), f, ensure_ascii=False, indent=2)
    except requests.RequestException as err:
        # Best effort: report the failure, remember the chunk and continue with the next one
        print(f"Request failed: {err}")
        failed_chunks.append(i + 1)

Processing chunk 1/8
Request failed: 404 Client Error: Not Found for url: https://api.mistral.ai/v1/audio/transcriptions
Processing chunk 2/8
Processing chunk 3/8
Processing chunk 4/8
Request failed: 404 Client Error: Not Found for url: https://api.mistral.ai/v1/audio/transcriptions
Processing chunk 5/8
Processing chunk 6/8
Processing chunk 7/8
Request failed: 404 Client Error: Not Found for url: https://api.mistral.ai/v1/audio/transcriptions
Processing chunk 8/8


Three chunks (1, 4 and 7) failed with a 404 error and would need to be re-run, but since the Latvian transcription is of poor quality, there is no point in re-running them.

Upload the Audio File (English audio)  
**.m4a**

In [None]:
# Upload the .m4a recording to the Files endpoint as a multipart form.
# The file is opened in binary mode and stays open while the request is sent.
with open("Recording.m4a", "rb") as f:
    files = {
        "purpose": (None, "audio"),  # plain form field (no filename)
        "file": ("Recording.m4a", f, "audio/mp4")  # (filename, file_object, MIME type)
    }

    # Best effort: an HTTP/network failure is printed instead of being raised
    try:
        response = requests.post(FILE_ENDPOINT, headers=headers, files=files)
        response.raise_for_status()  # raises for 4xx/5xx
    except requests.RequestException as err:
        print(f"Request failed: {err}") 

Request failed: 422 Client Error: Unprocessable Entity for url: https://api.mistral.ai/v1/files


The .m4a upload failed with a 422 error, so the next attempt uses the recording converted to .mp3 format.

**.mp3**

In [None]:
# Upload the .mp3 recording to the Files endpoint as a multipart form.
# The file is opened in binary mode and stays open while the request is sent.
with open("Recording.mp3", "rb") as f:
    files = {
        "purpose": (None, "audio"),  # plain form field (no filename)
        "file": ("Recording.mp3", f, "audio/mpeg")  # (filename, file_object, MIME type)
    }

    response = requests.post(FILE_ENDPOINT, headers=headers, files=files)
    response.raise_for_status()  # raises for 4xx/5xx; not caught here, so a failed upload stops the cell
    # Uncomment to inspect the upload response:
    #print(response.json())

In [33]:
# ID from the upload response; it is used to build the signed-URL request
fileid = response.json()['id']

Get the Signed URL

In [34]:
# Request a signed URL for the uploaded file. The URL is built from
# FILE_ENDPOINT and the shared auth headers are reused, so the endpoint and
# the API key header are defined in one place only.
signedurlreq = requests.get(
    f"{FILE_ENDPOINT}/{fileid}/url",
    params={"expiry": 24},
    headers={**headers, "Accept": "application/json"},
    timeout=60,  # do not wait forever on a stalled connection
)
# Fail here on 4xx/5xx; otherwise the error would only show up later as a KeyError on ['url']
signedurlreq.raise_for_status()

**Transcription**

Using link to the audio file

In [None]:
# Multipart form for the transcription request; every entry is a plain form field (no filename)
files = {
    "file_url": (None, signedurlreq.json()['url']),  # signed URL of the uploaded recording
    # Alternative input - public example file:
    #"file_url": (None, "https://docs.mistral.ai/audio/obama.mp3"), # Example
    "model": (None, MODEL_NAME),
    # Language is not forced for the English recording (the Latvian run sent "lv"):
    #"language": (None, "lv"),
    "timestamp_granularities": (None, "segment")  # without it the 'segments' list in the response is empty
}

In [37]:
# Send the transcription request; an HTTP/network failure is printed instead of being raised.
# NOTE(review): after a failure `resp` is either undefined or left over from an earlier cell,
# so the cells that read `resp` should only be run after a successful request.
try:
    resp = requests.post(TRANSCR_ENDPOINT, headers=headers, files=files, timeout=60)
    resp.raise_for_status() # raises for 4xx/5xx
except requests.RequestException as err:
    print(f"Request failed: {err}")

In [39]:
# Store the complete English transcription response next to the audio chunks
eng_json_path = os.path.join(output_dir, "out_ENG.json")
with open(eng_json_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file
    json.dump(resp.json(), f, ensure_ascii=False, indent=2)

In [42]:
# Sample: show only the first five transcribed segments (text with start/end times)
resp.json()['segments'][:5]

[{'text': ' Mathematical imprints. How does mathematics make you feel? For many people, mathematics can seem forbidding, too hard, too cold, too abstract.',
  'start': 0.0,
  'end': 12.0},
 {'text': ' That sense of dread may have started at school, especially when mathematics lessons were a matter of manipulating symbols and doing obscure calculations.',
  'start': 12.0,
  'end': 25.0},
 {'text': ' or it can be inherited or passed on by others.',
  'start': 24.8,
  'end': 30.4},
 {'text': ' If people around you are talking about maths being difficult or pointless,',
  'start': 30.4,
  'end': 35.5},
 {'text': " you're very likely to think the same.",
  'start': 35.5,
  'end': 37.9}]

Uploading audio file from the filesystem

In [None]:
# Send the local file directly to the transcription endpoint (no separate upload or signed URL).
# The file is opened in binary mode and stays open while the request is sent.
with open("out.mp3", "rb") as f:
    files = {
        "model": (None, MODEL_NAME),  # plain form field (no filename)
        "file": ("out.mp3", f, "audio/mpeg"),  # (filename, file_object, MIME type)
        # Left out on purpose here; without it the 'segments' list in the response is empty:
        #"timestamp_granularities": (None, "segment")
    }

    resp = requests.post(TRANSCR_ENDPOINT, headers=headers, files=files)
    resp.raise_for_status()  # raises for 4xx/5xx

ENG result
- the 'text' key is excluded so that the full transcript is not displayed
- if "timestamp_granularities" is not set, the 'segments' value is empty

In [None]:
# Copy the response and drop the 'text' field so the full transcript is not displayed
filtered_data = dict(resp.json())
filtered_data.pop('text', None)
filtered_data

{'model': 'voxtral-mini-2507',
 'language': 'en',
 'segments': [],
 'usage': {'prompt_audio_seconds': 843,
  'prompt_tokens': 4,
  'total_tokens': 12810,
  'completion_tokens': 1931}}

## Versions

In [26]:
from importlib.metadata import version
from IPython.display import Markdown, display # pip install ipython
import sys

packages = ['pydub', 'requests']

# One markdown paragraph per entry: the Python version first, then each package
# as a PyPI link followed by its installed version.
version_lines = [f"Python version: {sys.version}\n\n"]
version_lines.extend(
    f"[{name}](https://pypi.org/project/{name}/) version: {version(name)}\n\n"
    for name in packages
)
text = "".join(version_lines)
display(Markdown(text))

Python version: 3.13.3 (tags/v3.13.3:6280bb5, Apr  8 2025, 14:47:33) [MSC v.1943 64 bit (AMD64)]

[pydub](https://pypi.org/project/pydub/) version: 0.25.1

[requests](https://pypi.org/project/requests/) version: 2.32.3

