In [None]:
# System dependencies, installed through Homebrew (`brew`).
# The chunking code in this notebook shells out to the `ffmpeg` and `ffprobe`
# command-line tools.
# NOTE(review): nothing in the visible cells uses portaudio directly - confirm
# it is still required.
!brew install portaudio
!brew install ffmpeg

In [None]:
!pip install pydub nltk pyaudioop
!pip install git+https://github.com/openai/whisper.git

In [None]:
# Directory scanned by process_audio_folder() for audio files to transcribe.
INPUT_FOLDER = "audio_files"
# Directory for generated output; created on demand by process_audio_folder().
OUTPUT_FOLDER = "transcripts"


In [None]:
import math
import os
import subprocess

import nltk
import whisper

def format_timestamp(ms):
    """Render a millisecond offset as a zero-padded ``MM:SS`` string.

    Sub-second precision is discarded. Minutes are not folded into hours, so
    an offset of one hour is rendered as ``60:00``.

    Args:
        ms: Offset in whole milliseconds.

    Returns:
        The offset formatted as ``MM:SS``.
    """
    whole_minutes, leftover_seconds = divmod(ms // 1000, 60)
    return f"{whole_minutes:02d}:{leftover_seconds:02d}"

def load_audio_model(model_name="base"):
    """Load a Whisper speech-recognition model.

    Args:
        model_name: Model name forwarded to ``whisper.load_model``. Defaults
            to ``"base"``, the value this notebook previously hard-coded, so
            existing zero-argument calls behave as before.

    Returns:
        The model object returned by ``whisper.load_model``.
    """
    return whisper.load_model(model_name)

def get_audio_chunks_ffmpeg(input_file, output_folder, chunk_length_sec=60):
    """Split a media file into fixed-length 16 kHz PCM WAV chunks with ffmpeg.

    Args:
        input_file: Path of the media file to split.
        output_folder: Directory that receives the ``chunk_<n>.wav`` files. It
            is created if missing; existing chunk files are overwritten.
        chunk_length_sec: Length of each chunk in seconds. Must be positive.

    Returns:
        List of chunk file paths in playback order. The last chunk may be
        shorter than ``chunk_length_sec``.

    Raises:
        ValueError: If ``chunk_length_sec`` is not positive, or if ffprobe
            does not report a numeric duration.
        subprocess.CalledProcessError: If ffprobe or ffmpeg exits with a
            non-zero status.
    """
    if chunk_length_sec <= 0:
        raise ValueError("chunk_length_sec must be a positive number of seconds")

    os.makedirs(output_folder, exist_ok=True)

    # Commands are passed as argument lists (no shell), so file names that
    # contain quotes, spaces or shell metacharacters are taken literally.
    duration_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        input_file,
    ]
    duration = float(subprocess.check_output(duration_cmd).decode().strip())

    # Round up so that a fractional tail (e.g. the last 0.7 s of a 120.7 s
    # file) and clips shorter than one second still get a chunk.
    chunk_count = math.ceil(duration / chunk_length_sec)

    chunk_files = []
    for index in range(chunk_count):
        start_sec = index * chunk_length_sec
        output_file = os.path.join(output_folder, f"chunk_{index}.wav")
        cmd = [
            "ffmpeg",
            "-i", input_file,
            "-ss", str(start_sec),
            "-t", str(chunk_length_sec),
            "-c:a", "pcm_s16le",
            "-ar", "16000",
            output_file,
            "-y",
        ]
        # check=True: fail loudly instead of returning the path of a chunk
        # that was never written (or a stale one left by an earlier run).
        subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
        chunk_files.append(output_file)

    return chunk_files

def transcribe_chunk(chunk_file, model):
    """Transcribe one audio chunk and keep only the recognised text.

    Args:
        chunk_file: Path of the audio chunk handed to ``model.transcribe``.
        model: Object exposing ``transcribe(path)`` whose result can be
            indexed with ``"text"``.

    Returns:
        A dict with a single ``"text"`` key holding the transcription.
    """
    transcription = model.transcribe(chunk_file)
    recognised_text = transcription["text"]
    return {"text": recognised_text}

def chop_and_transcribe_ffmpeg(input_file, output_folder, lecture_title, chunk_length_sec=60,
                               include_timestamps=False, model=None):
    """Split an audio file into chunks and transcribe them in order.

    Args:
        input_file: Path of the audio file to transcribe.
        output_folder: Directory handed to ``get_audio_chunks_ffmpeg`` for the
            intermediate chunk files.
        lecture_title: Currently unused; kept so existing callers keep working.
        chunk_length_sec: Chunk length in seconds.
        include_timestamps: When true, return the timestamped transcript
            (each chunk preceded by a ``--MM:SS--`` marker) instead of the
            plain one. Defaults to false, matching the previous behaviour.
        model: Optional pre-loaded model to reuse across calls. When omitted,
            a model is loaded with ``load_audio_model()`` on every call, as
            before.

    Returns:
        The transcript as a single string. In plain mode every chunk's text is
        followed by one space; in timestamped mode every chunk is rendered as
        ``"\\n--MM:SS--\\n<text>\\n"``.
    """
    if model is None:
        model = load_audio_model()

    chunk_files = get_audio_chunks_ffmpeg(input_file, output_folder, chunk_length_sec)

    # Collect the pieces and join once instead of growing a string in the loop.
    pieces = []
    for index, chunk_file in enumerate(chunk_files):
        text = transcribe_chunk(chunk_file, model)["text"]
        if include_timestamps:
            # Chunks are fixed-length, so the start offset follows from the index.
            timestamp = format_timestamp(index * chunk_length_sec * 1000)
            pieces.append(f"\n--{timestamp}--\n{text}\n")
        else:
            pieces.append(text + " ")

    return "".join(pieces)

def process_audio_folder():
    """Transcribe every supported audio file found in ``INPUT_FOLDER``.

    Files ending in ``.m4a``, ``.mp3`` or ``.mp4`` (case-insensitive) are
    processed in sorted order. Each transcript is written as
    ``<base name>.md`` into ``OUTPUT_FOLDER``, which is also passed to the
    transcription step as the folder for its intermediate chunk files.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Create both folders up front so listing and writing work on a fresh setup.
    os.makedirs(INPUT_FOLDER, exist_ok=True)
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Sort because os.listdir() returns entries in arbitrary order.
    supported_formats = ('.m4a', '.mp3', '.mp4')
    audio_files = sorted(
        f for f in os.listdir(INPUT_FOLDER) if f.lower().endswith(supported_formats)
    )

    if not audio_files:
        print("No supported audio files found in the input folder!")
        return

    print(f"Found {len(audio_files)} audio files to process...")

    for audio_file in audio_files:
        print(f"Processing {audio_file}...")
        input_path = os.path.join(INPUT_FOLDER, audio_file)

        # Base filename without extension, reused for the transcript name.
        base_name = os.path.splitext(audio_file)[0]

        transcript = chop_and_transcribe_ffmpeg(input_path, OUTPUT_FOLDER, base_name)

        # Write the transcript to the output folder (previously it was written
        # next to the source audio in INPUT_FOLDER).
        output_path = os.path.join(OUTPUT_FOLDER, f"{base_name}.md")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(transcript)

        print(f"Saved transcript to {output_path}")

In [None]:
# Run the batch job: transcribe every supported file found in INPUT_FOLDER.
process_audio_folder()