In [8]:
from IPython.display import Audio, display
from IPython.display import Javascript
from base64 import b64decode
import io
import os

In [9]:
# Step 1: Install dependencies (run once)
# !pip install -q transformers torchaudio accelerate librosa

# Step 2: Record audio from microphone in Colab


RECORD_SECONDS = 60  # Max recording duration
AUDIO_PATH = "user_input.wav"


# JS-based audio recorder
def record_audio():
    display(Javascript("""
    const sleep = time => new Promise(resolve => setTimeout(resolve, time))
    const b2text = blob => new Promise(resolve => {
        const reader = new FileReader()
        reader.onloadend = () => resolve(reader.result)
        reader.readAsDataURL(blob)
    })

    async function record() {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
        const mediaRecorder = new MediaRecorder(stream)
        let audioChunks = []

        mediaRecorder.ondataavailable = e => audioChunks.push(e.data)
        mediaRecorder.start()

        await sleep(%d * 1000)
        mediaRecorder.stop()

        await new Promise(resolve => mediaRecorder.onstop = resolve)

        const audioBlob = new Blob(audioChunks)
        const base64 = await b2text(audioBlob)
        google.colab.kernel.invokeFunction('notebook.save_audio', [base64], {})
    }
    record()
    """ % RECORD_SECONDS))

# Python-side: save base64 audio blob
from google.colab import output
import base64

def save_audio(base64_audio):
    header, encoded = base64_audio.split(",", 1)
    data = base64.b64decode(encoded)
    with open(AUDIO_PATH, "wb") as f:

        f.write(data)
    print("✅ Audio saved as", AUDIO_PATH)

output.register_callback('notebook.save_audio', save_audio)

# Start recording
print("Recoording started for 10 seconds")
record_audio()


Recoording started for 10 seconds


<IPython.core.display.Javascript object>

✅ Audio saved as user_input.wav


In [15]:
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperTokenizer

SAMPLE_RATE = 16000
MODEL_NAME = "openai/whisper-large-v3"



In [40]:
# Load audio
waveform, sr = torchaudio.load("/content/common_voice_ur_26574929.wav")
if sr != SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
    waveform = resampler(waveform)

waveform = waveform.squeeze()


In [17]:

# Load model and processor
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tr

In [41]:

# Process input
inputs = processor(waveform, sampling_rate=SAMPLE_RATE, return_tensors="pt", task="transcribe")
inputs = {k: v.to(device) for k, v in inputs.items()}


In [42]:

# Transcribe
with torch.no_grad():
    generated_ids = model.generate(inputs["input_features"])

tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
lang_token_id = generated_ids[0][1].item()
detected_lang = tokenizer.convert_ids_to_tokens(lang_token_id)
print("🌍 Detected language:", detected_lang)

transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("📝 Transcription:\n", transcription)


🌍 Detected language: Ĩ
📝 Transcription:
  आ जाता है कि डाउन लीग्स पहले एक रूम था।
