In [6]:
pip install transformers datasets torchaudio

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install pydub

Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor



# Single source of truth for the checkpoint so the processor and the model
# can never be loaded from two different checkpoints by accident.
MODEL_NAME = "facebook/wav2vec2-base-960h"

# `processor` prepares raw audio for the model and decodes predicted ids back to text;
# `model` produces the per-frame logits used for decoding.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
model.eval()  # inference only: switch off training-time behaviour such as dropout

def load_audio(file_path, target_sr=16000):
    """Read an audio file and return it as a single-channel signal at ``target_sr``.

    Args:
        file_path: Path of the audio file, passed straight to ``torchaudio.load``.
        target_sr: Sample rate (Hz) the returned signal should have.

    Returns:
        A ``(signal, target_sr)`` tuple. ``signal`` is the loaded tensor with its
        leading (channel) dimension squeezed away after down-mixing.
    """
    signal, native_sr = torchaudio.load(file_path)

    # Dim 0 is treated as the channel axis: average several channels into one,
    # keeping that axis so the resampler still sees a 2-D tensor.
    channel_count = signal.shape[0]
    if channel_count > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Only build a resampler when the file's rate differs from the requested one.
    if native_sr != target_sr:
        to_target = torchaudio.transforms.Resample(orig_freq=native_sr, new_freq=target_sr)
        signal = to_target(signal)

    mono = signal.squeeze(0)
    return mono, target_sr

def chunk_audio(audio, chunk_len=15 * 16000, min_chunk_len=400):
    """Split ``audio`` into consecutive, non-overlapping pieces of ``chunk_len`` samples.

    The last piece holds whatever remains. If that remainder is shorter than
    ``min_chunk_len`` and there is a preceding piece, it is folded into that
    preceding piece instead of being returned on its own: a tail of only a few
    samples is too short for the wav2vec2 convolutional feature encoder and makes
    the forward pass fail (the default of 400 samples is 25 ms at 16 kHz —
    NOTE(review): confirm against the checkpoint's conv kernel/stride config).

    Args:
        audio: Any sliceable sequence supporting ``len()`` (e.g. a 1-D tensor or list).
        chunk_len: Number of samples per chunk; must be positive.
        min_chunk_len: Smallest tail that is allowed to stand as its own chunk.

    Returns:
        A list of slices of ``audio``; empty when ``audio`` is empty.

    Raises:
        ValueError: If ``chunk_len`` is not positive (a negative value would
            otherwise silently produce no chunks at all).
    """
    if chunk_len <= 0:
        raise ValueError(f"chunk_len must be a positive number of samples, got {chunk_len}")

    total = len(audio)
    starts = list(range(0, total, chunk_len))

    # Drop the start of a too-short tail so the previous chunk runs to the end.
    if len(starts) > 1 and total - starts[-1] < min_chunk_len:
        starts.pop()

    bounds = starts + [total]
    return [audio[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

def speech_to_text(audio_file):
    """Transcribe an audio file to lower-case text using the module-level model.

    The file is loaded with ``load_audio``, split with ``chunk_audio`` and each
    chunk is decoded independently; the per-chunk texts are joined with single
    spaces. Progress is printed after every chunk.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The full transcription as one string ("" when there is nothing to decode).
    """
    print("Processing audio...")

    audio, sr = load_audio(audio_file)
    chunks = chunk_audio(audio)

    # Feed inputs to whatever device the model currently lives on, so the
    # function keeps working if the model has been moved to a GPU.
    device = next(model.parameters()).device
    pieces = []

    for i, chunk in enumerate(chunks):
        input_values = processor(chunk, sampling_rate=sr, return_tensors="pt").input_values

        with torch.no_grad():
            logits = model(input_values.to(device)).logits

        # Greedy decoding: take the highest-scoring token id per frame and let
        # the processor turn the id sequence into text.
        pred_ids = torch.argmax(logits, dim=-1)
        text = processor.decode(pred_ids[0].cpu()).lower().strip()

        # Skip chunks that decoded to nothing so the result has no double spaces.
        if text:
            pieces.append(text)

        print(f"Chunk {i+1}/{len(chunks)} done.")

    # Release cached GPU blocks once, and only if a GPU was actually used.
    if device.type == "cuda":
        torch.cuda.empty_cache()

    return " ".join(pieces)

def clean_record_path(raw):
    """Normalise a path typed or pasted by the user.

    Removes surrounding whitespace first and then any wrapping double or single
    quotes, so a pasted value such as ``  "C:\\audio\\clip.mp3" `` resolves to
    the bare path. Quotes are only stripped once whitespace is gone; otherwise a
    trailing space would keep the closing quote attached to the path.

    Args:
        raw: The string exactly as entered.

    Returns:
        The path without surrounding whitespace or quote characters.
    """
    return raw.strip().strip("\"'")


if __name__ == "__main__":
    # Ask for the recording, transcribe it and show the result.
    record = clean_record_path(input("Enter the record path: "))
    text = speech_to_text(record)
    print("\nFinal Transcription:\n", text)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter the record path: EN_1.mp3
Processing audio...
Chunk 1/13 done.
Chunk 2/13 done.
Chunk 3/13 done.
Chunk 4/13 done.
Chunk 5/13 done.
Chunk 6/13 done.
Chunk 7/13 done.
Chunk 8/13 done.
Chunk 9/13 done.
Chunk 10/13 done.
Chunk 11/13 done.
Chunk 12/13 done.
Chunk 13/13 done.

Final Transcription:
 what's best a morning shower or an evening shower what can magic teach us about the brain making friends as you get older these are all topics you'll discover in the reading rem are new series of magazine style articles on our webside they each come with highlighted vocabulary comprehension questions and an audio download learn about the world and improve your reading skirls with the reading ram at b b c learning english doccom hallo and welcome to the english we speak where we explain expressions used by fluent english speakers so that you can use them too i'm faifa an am joined by georgy and that is an interesting hairst iw georg oh fa fa i was really bored of my usual herstyle so i tried 