# Audio Recording and Transcription Notebook

This notebook allows you to record audio directly and transcribe it using faster-whisper.

The required packages are installed by the `pip install` cell further down; on a fresh environment, run that cell before any of the others.

In [3]:
from aiohttp import ClientSession
import json
async with ClientSession() as session:
            headers = {
                "Content-Type": "application/json",
                "X-Gladia-Key": "8dbb5eec-a3e2-470f-9916-e2576c42e83d",
            }
            request_body = json.dumps(
                {
                    "encoding": "wav/pcm",
                    "sample_rate": 16000,
                    "bit_depth": 16,
                    "channels": 1,
                }
            )
            response = await session.post(
                "https://api.gladia.io/v2/live",
                headers=headers,
                data=request_body,
            )
            print(response)

<ClientResponse(https://api.gladia.io/v2/live) [429 Too Many Requests]>
<CIMultiDictProxy('Access-Control-Allow-Origin': '*', 'Content-Length': '269', 'Content-Type': 'application/json; charset=utf-8', 'Cross-Origin-Opener-Policy': 'same-origin', 'Date': 'Wed, 08 Jan 2025 11:21:38 GMT', 'Etag': 'W/"10d-Sj0N78X4EIwEbMkwEdMGFCWjNWY"', 'Origin-Agent-Cluster': '?1', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'Strict-Transport-Security': 'max-age=15552000; includeSubDomains', 'X-Content-Type-Options': 'nosniff', 'X-Dns-Prefetch-Control': 'off', 'X-Download-Options': 'noopen', 'X-Permitted-Cross-Domain-Policies': 'none', 'X-Xss-Protection': '0')>



In [1]:
from openai import OpenAI

# OpenAI SDK client aimed at a server on localhost:8001 instead of the hosted API.
# The api_key value is a dummy string; presumably the SDK only needs it to be non-empty.
client = OpenAI(
    base_url="http://localhost:8001/v1/",
    api_key="cant-be-empty",
)

In [17]:
# Path of the WAV file to transcribe.
# NOTE(review): presumably written by an earlier `record_audio` run; update as needed.
audio_path = "recording_1736137538.wav"

try:
    # Pass the open file object rather than the raw bytes: the upload then
    # carries the file name (and its .wav extension), and the file does not have
    # to be read into memory first.
    with open(audio_path, "rb") as wav_file:
        response = client.audio.transcriptions.create(
            file=wav_file,
            model="Systran/faster-whisper-large-v3",
            response_format="verbose_json",
        )
    print(response)
except Exception as e:
    # Broad on purpose: this demo cell reports any failure (missing file,
    # unreachable server, API error) instead of stopping the notebook.
    print(f"Transcription error: {str(e)}")

TranscriptionVerbose(duration=5.0, language='en', text='Hello.', segments=[TranscriptionSegment(id=1, avg_logprob=-0.410546875, compression_ratio=0.42857142857142855, end=5.0, no_speech_prob=0.15087890625, seek=500, start=0.0, temperature=0.0, text=' Hello.', tokens=[50365, 2425, 13, 50615], words=None)], words=None, task='transcribe')


In [2]:
!pip install faster-whisper sounddevice scipy wavio


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import sounddevice as sd
import wavio
import numpy as np
from faster_whisper import WhisperModel
from IPython.display import Audio, display
import time

## Initialize the Whisper Model

Choose the appropriate configuration based on your hardware:

In [3]:
# Model size identifier passed to WhisperModel below.
model_size = "large-v3"

# Keep exactly one of the following configurations active and leave the
# others commented out.

# Option 1: GPU, FP16 (active)
model = WhisperModel(
    model_size,
    device="cuda",
    compute_type="float16",
)

# Option 2: GPU, INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")

# Option 3: CPU, INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")

## Audio Recording Function

In [5]:
def record_audio(duration=5, samplerate=16000):
    """
    Capture single-channel audio and store it as a WAV file.

    Progress messages are printed before and after the capture.

    Args:
        duration (int): Length of the capture in seconds
        samplerate (int): Sample rate used for both the capture and the file

    Returns:
        str: Name of the WAV file that was written
    """
    print(f"Recording for {duration} seconds...")
    frame_count = int(samplerate * duration)
    samples = sd.rec(
        frame_count,
        samplerate=samplerate,
        channels=1,
        dtype="float32",
    )
    # Wait for the capture started above before reporting completion.
    sd.wait()
    print("Recording finished!")

    # The file name embeds the current Unix time in whole seconds.
    timestamp = int(time.time())
    output_name = f"recording_{timestamp}.wav"
    wavio.write(output_name, samples, samplerate, sampwidth=2)

    return output_name

In [1]:
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection

  from .autonotebook import tqdm as notebook_tqdm
INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


## Record and Transcribe

Run this cell to start recording and get the transcription:

In [6]:
# Capture a clip; pass a different `duration` for a longer or shorter one.
audio_file = record_audio(duration=5)

# Show a player for the file that was just written.
print("\nRecorded audio:")
playback_widget = Audio(audio_file)
display(playback_widget)

# Run the Whisper model on the saved file.
print("\nTranscribing...")
segments, info = model.transcribe(audio_file)

# Report the detected language, then one line per segment with its time span.
language_summary = f"{info.language} (probability: {info.language_probability:.2f})"
print(f"\nDetected language: {language_summary}")
print("\nTranscription:")
for segment in segments:
    time_span = f"[{segment.start:.2f}s -> {segment.end:.2f}s]"
    print(f"{time_span} {segment.text}")

Recording for 5 seconds...
Recording finished!

Recorded audio:



Transcribing...


INFO:faster_whisper:Processing audio with duration 00:05.000


: 