# 🎙️ Step 1: Speech-to-Text (STT) from an Audio File Using Whisper

This notebook loads an audio file and transcribes it using OpenAI's Whisper model.

In [18]:
# 📚 Import the Whisper library
import whisper
import os
from pathlib import Path
import librosa
import torch
import numpy as np


# Pick the compute device: CUDA when PyTorch reports one is available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [19]:
# 🧠 Load the Whisper checkpoint and move it onto the selected device
# Available sizes: tiny, base, small, medium, large
model = whisper.load_model("base").to(device)

In [20]:
# Remember the directory the notebook is running from; later paths are built on it
current_directory = Path.cwd()
print("Current directory: " + str(current_directory))

Current directory: c:\Users\BERAT\Desktop\custom-ai-agent


In [21]:
# 🔊 Location of the audio file to transcribe
# Built on top of the notebook's working directory; change the path
# components below to point at your own recording.
AUDIO_PATH = current_directory.joinpath("audio", "harvard.wav")

print(f"Absolute path to audio file: {AUDIO_PATH!s}")

Absolute path to audio file: c:\Users\BERAT\Desktop\custom-ai-agent\audio\harvard.wav


In [22]:
# Transcribe the audio file at AUDIO_PATH with the loaded Whisper model.
# `result` is the dict returned by model.transcribe on success, or None
# when the audio file does not exist.
if not AUDIO_PATH.is_file():
    print(f"Audio file not found: {AUDIO_PATH}")
    result = None
else:
    # Load the very file that was just validated (not a separate hard-coded
    # relative path), resampled to 16 kHz and downmixed to a single channel.
    audio_data, _ = librosa.load(AUDIO_PATH, sr=16000, mono=True)

    # Ensure float32 dtype without copying when the array already is float32
    audio_data = audio_data.astype(np.float32, copy=False)

    # Transcribe the in-memory waveform
    result = model.transcribe(audio_data)

    print("Transcription:\n")

    print(result["text"])

Transcription:

 The stale smell of old beer lingers. It takes heat to bring out the odor. A cold dip restores health and zest. A salt pickle tastes fine with ham. Tacos al pastor are my favorite. A zestful food is the hot cross bun.
