# Speech - Transcribe

# Imports

In [19]:
import whisper

import os
import json
from datetime import datetime

from pydub import AudioSegment

from IPython.display import Audio, display

# Load Whisper

In [14]:
whisper.available_models()

['tiny.en',
 'tiny',
 'base.en',
 'base',
 'small.en',
 'small',
 'medium.en',
 'medium',
 'large-v1',
 'large-v2',
 'large-v3',
 'large',
 'large-v3-turbo',
 'turbo']

In [5]:
# Load Whisper model
model = whisper.load_model("large")

100%|█████████████████████████████████████| 2.88G/2.88G [26:16<00:00, 1.96MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Took me 26 minutes to download 2.88G large model.

In [15]:
model_small = whisper.load_model("small")

100%|███████████████████████████████████████| 461M/461M [04:18<00:00, 1.87MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [6]:
output_folder = "data/transcripts/"

# Define helper functions

In [31]:
# Function to convert MP3 to WAV
def convert_mp3_to_wav(mp3_file, wav_file):
    audio = AudioSegment.from_mp3(mp3_file)
    audio.export(wav_file, format="wav")


# Function to transcribe an audio file
def transcribe_audio_file(audio_file, model, debug=True):
    # Store start time
    start_time = datetime.now()
    if debug:
        print(f"Start time: {start_time}")
    
    # Transcribe the audio file using Whisper
    result = model.transcribe(audio_file)
    
    # Store stop time
    stop_time = datetime.now()
    
    # Calculate elapsed time in seconds
    elapsed_time = (stop_time - start_time).total_seconds()

    if debug:
        # Print stop time and elapsed time
        print(f"Stop time: {stop_time}")
        print(f"{elapsed_time:.2f} seconds taken")
        
        # Print the transcription result
        print("Transcribed", audio_file)
        print()
        print()
        print(result['text'])
        print()
        print()
        print(json.dumps(result, indent=4))
        print()
    else:
        print(f"Transcribed {audio_file}")
        print(f"Stop time: {stop_time}")
        print(f"{elapsed_time:.2f} seconds taken")        
    
    return result['text']


# Main function to process a folder of MP3 files
def transcribe_folder(folder_path, output_folder):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Loop through all MP3 files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".mp3"):
            mp3_file = os.path.join(folder_path, filename)
            wav_file = os.path.join(output_folder, os.path.splitext(filename)[0] + ".wav")
            txt_file = os.path.join(output_folder, os.path.splitext(filename)[0] + ".txt")

            # Convert MP3 to WAV
            convert_mp3_to_wav(mp3_file, wav_file)

            # Transcribe the WAV file
            transcription = transcribe_audio_file(wav_file, model, debug=False)
            
            # Delete the WAV file after transcription
            os.remove(wav_file)
            
            # Save the transcription to a text file
            with open(txt_file, 'w') as f:
                f.write(transcription)

            print(f"Transcription for {filename} saved to {txt_file}")

# Demo

In [28]:
# Load and display the audio player in the notebook
audio_player = Audio(filename="data/mp3s/output000.mp3", autoplay=False)
display(audio_player)

## large model

In [32]:
transcribe_audio_file("data/mp3s/output000.mp3", model)

Start time: 2024-10-02 14:43:13.105156




Stop time: 2024-10-02 14:43:30.313052
17.21 seconds taken
Transcribed data/mp3s/output000.mp3


 Do you have advice for a programming beginner on how to learn Python the right way? Find something you actually want to do with it. If you say, I want to learn skill X, that's not enough motivation. You need to pick something and it can be...


{
    "text": " Do you have advice for a programming beginner on how to learn Python the right way? Find something you actually want to do with it. If you say, I want to learn skill X, that's not enough motivation. You need to pick something and it can be...",
    "segments": [
        {
            "id": 0,
            "seek": 0,
            "start": 0.0,
            "end": 6.54,
            "text": " Do you have advice for a programming beginner on how to learn Python the right way?",
            "tokens": [
                50365,
                1144,
                291,
                362,
                5192,
                337,
            

" Do you have advice for a programming beginner on how to learn Python the right way? Find something you actually want to do with it. If you say, I want to learn skill X, that's not enough motivation. You need to pick something and it can be..."

17.21 seconds taken - still fast enough to do "in real time"

## small model

In [33]:
transcribe_audio_file("data/mp3s/output000.mp3", model_small)

Start time: 2024-10-02 14:43:30.317192




Stop time: 2024-10-02 14:43:33.806880
3.49 seconds taken
Transcribed data/mp3s/output000.mp3


 Do you have advice for a programming beginner on how to learn Python the right way? Find something you actually want to do with it. If you say, I want to learn skill X, that's not enough motivation. You need to pick something and it can be a...


{
    "text": " Do you have advice for a programming beginner on how to learn Python the right way? Find something you actually want to do with it. If you say, I want to learn skill X, that's not enough motivation. You need to pick something and it can be a...",
    "segments": [
        {
            "id": 0,
            "seek": 0,
            "start": 0.0,
            "end": 7.0,
            "text": " Do you have advice for a programming beginner on how to learn Python the right way?",
            "tokens": [
                50364,
                1144,
                291,
                362,
                5192,
                337,
          

" Do you have advice for a programming beginner on how to learn Python the right way? Find something you actually want to do with it. If you say, I want to learn skill X, that's not enough motivation. You need to pick something and it can be a..."

3.49 seconds taken

**Note 1: the same transcription results but 5 times slower with large model**

**Note 2: small transcription model is transcribing around 10 seconds of audio per second**

# Transcribe folder

In [36]:
# Path to the folder containing MP3 files
folder_path = "data/mp3s/"

# Path to the folder where output WAV files and transcriptions will be saved
output_folder = "data/transcriptions/"

# Transcribe all MP3 files in the folder
transcribe_folder(folder_path, output_folder)



Transcribed data/transcriptions/output013.wav
Stop time: 2024-10-02 14:46:16.573795
15.77 seconds taken
Transcription for output013.mp3 saved to data/transcriptions/output013.txt




Transcribed data/transcriptions/output007.wav
Stop time: 2024-10-02 14:46:47.594808
30.70 seconds taken
Transcription for output007.mp3 saved to data/transcriptions/output007.txt




Transcribed data/transcriptions/output006.wav
Stop time: 2024-10-02 14:47:15.895681
27.95 seconds taken
Transcription for output006.mp3 saved to data/transcriptions/output006.txt




Transcribed data/transcriptions/output012.wav
Stop time: 2024-10-02 14:47:39.024382
22.80 seconds taken
Transcription for output012.mp3 saved to data/transcriptions/output012.txt




Transcribed data/transcriptions/output004.wav
Stop time: 2024-10-02 14:48:03.633859
24.19 seconds taken
Transcription for output004.mp3 saved to data/transcriptions/output004.txt




Transcribed data/transcriptions/output010.wav
Stop time: 2024-10-02 14:48:35.764727
31.79 seconds taken
Transcription for output010.mp3 saved to data/transcriptions/output010.txt




Transcribed data/transcriptions/output011.wav
Stop time: 2024-10-02 14:49:02.352222
26.24 seconds taken
Transcription for output011.mp3 saved to data/transcriptions/output011.txt




Transcribed data/transcriptions/output005.wav
Stop time: 2024-10-02 14:49:35.777127
33.08 seconds taken
Transcription for output005.mp3 saved to data/transcriptions/output005.txt




Transcribed data/transcriptions/output001.wav
Stop time: 2024-10-02 14:50:02.450091
26.37 seconds taken
Transcription for output001.mp3 saved to data/transcriptions/output001.txt




Transcribed data/transcriptions/output014.wav
Stop time: 2024-10-02 14:50:09.893922
7.14 seconds taken
Transcription for output014.mp3 saved to data/transcriptions/output014.txt




Transcribed data/transcriptions/output000.wav
Stop time: 2024-10-02 14:50:26.854327
16.69 seconds taken
Transcription for output000.mp3 saved to data/transcriptions/output000.txt




Transcribed data/transcriptions/output002.wav
Stop time: 2024-10-02 14:50:54.958681
27.82 seconds taken
Transcription for output002.mp3 saved to data/transcriptions/output002.txt




Transcribed data/transcriptions/output003.wav
Stop time: 2024-10-02 14:51:23.200608
27.90 seconds taken
Transcription for output003.mp3 saved to data/transcriptions/output003.txt




Transcribed data/transcriptions/output008.wav
Stop time: 2024-10-02 14:51:49.165517
25.64 seconds taken
Transcription for output008.mp3 saved to data/transcriptions/output008.txt




Transcribed data/transcriptions/output009.wav
Stop time: 2024-10-02 14:52:12.869887
23.44 seconds taken
Transcription for output009.mp3 saved to data/transcriptions/output009.txt
