In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%aimport yt_dlp
from pathlib import Path
from pydub import AudioSegment
from openai import OpenAI
from pydub import AudioSegment
%aimport whisper
%aimport logging
from typing import List, Dict
%aimport json

In [22]:
# Route INFO-and-above records through the root handler, stamping each line
# with the timestamp, the emitting logger's name and the level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Notebook-wide logger used by the helper functions defined in later cells.
logger = logging.getLogger("audio_extraction_testing")

In [5]:
from data_processing.gpt_processing import token_count

In [23]:
# Base directory for this notebook's audio: the source .mp3 and the chunk
# sub-directory are both resolved relative to it.
# NOTE(review): several recorded outputs below mention 'extracted_audio/...',
# so they appear to come from runs made before this path was changed.
audio_storage_dir = Path("test_extracted_data")

In [7]:
def download_audio_yt(url: str, output_dir: Path) -> Path:
    """
    Fetch the audio track of a YouTube video as an MP3 via yt_dlp.

    The output directory is created if needed. yt_dlp is asked for the best
    available audio stream and to run its FFmpegExtractAudio post-processor
    (mp3, quality '192'); the file is named after the video title.

    Args:
        url (str): URL of the YouTube video.
        output_dir (Path): Directory that receives the downloaded audio file.

    Returns:
        Path: The name yt_dlp prepared for the download, with its extension
            replaced by '.mp3'.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Post-processing step that converts whatever was downloaded to MP3.
    mp3_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    # Template is expanded by yt_dlp; files land inside output_dir.
    name_template = str(output_dir / '%(title)s.%(ext)s')
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [mp3_extractor],
        'outtmpl': name_template,
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        info = downloader.extract_info(url, download=True)
        prepared_name = downloader.prepare_filename(info)

    # prepare_filename reports the pre-conversion extension, so swap in '.mp3'.
    return Path(prepared_name).with_suffix('.mp3')

In [8]:
def split_audio_into_chunks(audio_file: Path, chunk_duration: int, output_dir: Path = None) -> Path:
    """
    Splits an audio file into consecutive chunks of a fixed duration.

    Chunks are written as ``chunk_1.mp3``, ``chunk_2.mp3``, ... in playback
    order; the last chunk holds whatever audio remains and may be shorter.

    Args:
        audio_file (Path): Path to the input audio file.
        chunk_duration (int): Duration of each chunk in milliseconds
            (e.g., 10 * 60 * 1000 for 10 minutes). Must be positive.
        output_dir (Path): Directory where chunks will be saved. If None, a
            ``<stem>_chunks`` directory is created next to the input file.

    Returns:
        Path: Path to the directory containing the chunks.

    Raises:
        ValueError: If ``chunk_duration`` is not positive.
    """
    # Validate before the audio load: a zero step would only fail later inside
    # range(), and a negative step would silently export nothing.
    if chunk_duration <= 0:
        raise ValueError(
            f"chunk_duration must be a positive number of milliseconds, got {chunk_duration}"
        )

    # Load the audio file; len() of the segment is its duration in milliseconds
    audio = AudioSegment.from_file(audio_file)
    total_duration = len(audio)

    # Create output directory based on filename
    if output_dir is None:
        output_dir = audio_file.parent / f"{audio_file.stem}_chunks"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Walk the timeline in chunk_duration steps; chunk numbering starts at 1
    chunk_starts = range(0, total_duration, chunk_duration)
    for chunk_number, start in enumerate(chunk_starts, start=1):
        chunk_path = output_dir / f"chunk_{chunk_number}.mp3"
        audio[start:start + chunk_duration].export(chunk_path, format="mp3")
        print(f"Exported: {chunk_path}")

    return output_dir

In [9]:
def detect_boundaries(audio_file: Path, model_size: str = 'tiny') -> List[Dict[str, float]]:
    """
    Transcribe an audio file with a local Whisper model and return one
    boundary record per transcription segment.

    Args:
        audio_file (Path): Path to the audio file.
        model_size (str): Whisper model size (e.g., 'tiny', 'base', 'small').

    Returns:
        List[Dict[str, float]]: One dict per segment in Whisper's result, holding:
            - "start": the segment's 'start' value (seconds).
            - "end": the segment's 'end' value (seconds).
            - "text": the segment's text.
    """
    # Model loading is logged on both sides of the call
    logger.info("Loading model...")
    whisper_model = whisper.load_model(model_size)
    logger.info(f"Model '{model_size}' loaded.")

    # Whisper is given the path as a plain string
    transcription = whisper_model.transcribe(
        str(audio_file),
        task="transcribe",
        word_timestamps=True,
    )

    # Keep only the timing and text of each segment
    return [
        {"start": seg['start'], "end": seg['end'], "text": seg['text']}
        for seg in transcription['segments']
    ]

In [10]:
def split_audio_at_boundaries(audio_file: Path, boundaries: List[Dict[str, float]], output_dir: Path = None, max_duration: int = 10 * 60) -> Path:
    """
    Split the audio file into chunks close to a specified duration while respecting boundaries.

    The audio of each boundary is appended to the current chunk until adding
    the next one would exceed ``max_duration``; the chunk is then exported as
    ``chunk_<n>.mp3`` and a new chunk is started with that segment. Only the
    audio inside the given boundaries is kept, so any gap between one
    boundary's end and the next one's start is not part of any chunk.

    A single boundary longer than ``max_duration`` is never cut: it is
    exported as a chunk of its own that exceeds the limit.

    Args:
        audio_file (Path): Path to the audio file.
        boundaries (List[Dict[str, float]]): Boundaries with "start" and "end"
            times in seconds, in playback order.
        output_dir (Path): Directory to save the chunks. If None, a
            ``<stem>_chunks`` directory is created next to the input file.
        max_duration (int): Maximum duration for each chunk in seconds (default is 10 minutes).

    Returns:
        Path: Path to the directory containing the chunks. The directory is
            created even when ``boundaries`` is empty, in which case it
            receives no chunks.
    """
    # Load the audio file
    audio = AudioSegment.from_file(audio_file)

    # Create output directory based on filename
    if output_dir is None:
        output_dir = audio_file.parent / f"{audio_file.stem}_chunks"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Nothing to split: report it instead of failing on boundaries[0]
    if not boundaries:
        logger.warning(f"No boundaries supplied for {audio_file}; no chunks exported.")
        return output_dir

    max_duration_ms = max_duration * 1000
    current_chunk = AudioSegment.empty()
    chunk_count = 0  # number of chunks exported so far

    for boundary in boundaries:
        # Boundary times are in seconds; audio slicing works in milliseconds
        segment = audio[boundary["start"] * 1000:boundary["end"] * 1000]

        # Add segment to the current chunk if it fits
        if len(current_chunk) + len(segment) <= max_duration_ms:
            current_chunk += segment
            continue

        # Segment does not fit: flush what has been collected. The emptiness
        # guard avoids writing a zero-length file when the very first segment
        # is already longer than max_duration.
        if len(current_chunk) > 0:
            chunk_count += 1
            chunk_path = output_dir / f"chunk_{chunk_count}.mp3"
            current_chunk.export(chunk_path, format="mp3")
            logger.info(f"Exported: {chunk_path}")

        # Start a new chunk with the segment that did not fit
        current_chunk = segment

    # Export the final chunk
    if len(current_chunk) > 0:
        chunk_count += 1
        chunk_path = output_dir / f"chunk_{chunk_count}.mp3"
        current_chunk.export(chunk_path, format="mp3")
        logger.info(f"Exported: {chunk_path}")

    return output_dir

In [11]:
def _natural_sort_key(path: Path) -> list:
    """Sort key ordering digit runs in a file name by value (chunk_2 before chunk_10)."""
    import re  # local import so this cell does not depend on the import cell

    # re.split with a capturing group alternates text / digit-run pieces, so
    # odd positions are always digit runs; keys therefore never compare a str
    # against an int at the same index.
    pieces = re.split(r"(\d+)", path.name)
    return [int(piece) if idx % 2 else piece for idx, piece in enumerate(pieces)]


def process_audio_chunks(
    directory: Path,
    output_file: Path,
    jsonl_file: Path,
    model: str = "whisper-1",
    prompt: str = ""
) -> None:
    """
    Processes all audio chunks in the specified directory using OpenAI's transcription API,
    saves the transcription objects into a JSONL file, and stitches the transcriptions
    into a single text file.

    Chunks (``*.mp3``) are processed in natural file-name order, so
    ``chunk_2.mp3`` is transcribed before ``chunk_10.mp3`` and the stitched
    text follows playback order. A chunk whose transcription fails is logged
    and skipped (best effort); its text is then missing from both outputs and
    a summary warning lists the affected files.

    Args:
        directory (Path): Path to the directory containing audio chunks.
        output_file (Path): Path to the output file to save the stitched transcription.
        jsonl_file (Path): Path to save the transcription objects as a JSONL file.
        model (str): The transcription model to use (default is "whisper-1").
        prompt (str): Optional prompt to provide context for better transcription.
    """
    client = OpenAI()

    # Ensure the output directories exist
    output_file.parent.mkdir(parents=True, exist_ok=True)
    jsonl_file.parent.mkdir(parents=True, exist_ok=True)

    # Collect all audio chunks; a plain lexicographic sort would put
    # chunk_10 ahead of chunk_2 and scramble the stitched transcript.
    audio_files = sorted(directory.glob("*.mp3"), key=_natural_sort_key)
    logger.info(f"Audio files found:\n\t{audio_files}")
    if not audio_files:
        logger.warning(f"No .mp3 files found in {directory}; outputs will be empty.")

    stitched_transcription = []
    failed_files = []

    # Open the JSONL file for writing
    with jsonl_file.open("w", encoding="utf-8") as jsonl_out:
        # Process each audio chunk
        for audio_file in audio_files:
            logger.info(f"Processing {audio_file.name}...")
            try:
                with audio_file.open("rb") as file:
                    transcript = client.audio.transcriptions.create(
                        model=model,
                        prompt=prompt,
                        file=file
                    )
                    # Save the full response object to the JSONL file
                    jsonl_out.write(json.dumps(transcript.to_dict()) + "\n")

                    # Append the transcribed text to the stitched output
                    stitched_transcription.append(transcript.text)
            except Exception as e:
                # Best effort: keep going with the remaining chunks
                logger.error(f"Error processing {audio_file.name}: {e}", exc_info=True)
                failed_files.append(audio_file.name)

    # Write the stitched transcription to the output file
    with output_file.open("w", encoding="utf-8") as out_file:
        out_file.write(" ".join(stitched_transcription))

    if failed_files:
        logger.warning(
            f"{len(failed_files)} chunk(s) could not be transcribed and are missing "
            f"from the output: {failed_files}"
        )

    logger.info(f"Stitched transcription saved to {output_file}")
    logger.info(f"Full transcript objects saved to {jsonl_file}")

In [12]:
# download_audio_yt("https://www.youtube.com/watch?v=SEc28BCHgu8&ab_channel=DeerParkMonastery", audio_storage_dir)

In [28]:
# Source recording used by the boundary-detection and chunking cells below.
# The name is expected to exist inside audio_storage_dir already (the download
# cell above is commented out).
audio_file_path = audio_storage_dir / "Taking Care of Our Fear ｜ Br. Phap Luu ｜ 2024-11-06.mp3"

In [29]:
# Sanity check: confirm the source recording is present before processing it.
audio_file_path.exists()

True

In [15]:
# Run Whisper ('tiny' model) over the recording to get per-segment start/end times.
# `boundaries` is required by the split_audio_at_boundaries cell further down.
boundaries = detect_boundaries(audio_file_path, "tiny")

2024-12-08 11:09:55,086 - audio_extraction_testing - INFO - Loading model...
2024-12-08 11:09:55,524 - audio_extraction_testing - INFO - Model 'tiny' loaded.


RuntimeError: Failed to load audio: ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --disable-indev=jack --enable-videotoolbox --enable-audiotoolbox --enable-neon
  libavutil      59. 39.100 / 59. 39.100
  libavcodec     61. 19.100 / 61. 19.100
  libavformat    61.  7.100 / 61.  7.100
  libavdevice    61.  3.100 / 61.  3.100
  libavfilter    10.  4.100 / 10.  4.100
  libswscale      8.  3.100 /  8.  3.100
  libswresample   5.  3.100 /  5.  3.100
  libpostproc    58.  3.100 / 58.  3.100
[in#0 @ 0x13e004200] Error opening input: No such file or directory
Error opening input file extracted_audio/Taking Care of Our Fear ｜ Br. Phap Luu ｜ 2024-11-06.mp3.
Error opening input files: No such file or directory


In [16]:
# Destination directory for the boundary-aligned chunks.
test_output_dir = audio_storage_dir / "test_output_dir"

In [17]:
# Pack the detected segments into chunks of at most 3 minutes (max_duration is in seconds).
# Needs `boundaries` from the detect_boundaries cell; the recorded NameError
# shows that cell had not completed successfully in that run.
split_audio_at_boundaries(audio_file_path, boundaries, test_output_dir, max_duration=3 * 60)

NameError: name 'boundaries' is not defined

In [18]:

# Fixed-length alternative to boundary-based splitting. No output_dir is
# passed, so chunks go to a "<stem>_chunks" directory next to the input file.
chunk_duration_ms = 8 * 60 * 1000  # 8 minutes, in milliseconds
output_directory = split_audio_into_chunks(audio_file_path, chunk_duration_ms)
print(f"Chunks saved to: {output_directory}")

FileNotFoundError: [Errno 2] No such file or directory: 'extracted_audio/Taking Care of Our Fear ｜ Br. Phap Luu ｜ 2024-11-06.mp3'

In [31]:
# Inputs/outputs for the transcription step.
# chunks_directory is the same path as test_output_dir defined earlier.
chunks_directory = audio_storage_dir / "test_output_dir"
# Both output files are relative paths, i.e. written to the current working directory.
output_transcription_file = Path("taking_care_of_fear_transcript.txt")
output_jsonl_file = Path("taking_care_of_fear_data.jsonl")

In [None]:


# Transcribe every chunk with the default model and stitch the results.
# The prompt passes a list of names and terms from the talk as transcription context.
process_audio_chunks(
    directory=chunks_directory,
    output_file=output_transcription_file,
    jsonl_file=output_jsonl_file,
    prompt="Dharma, Deer Park, Thay, Thich Nhat Hanh, Bodhicitta, Bodhisattva, Mahayana"
)

2024-12-06 14:23:55,293 - audio_extraction_testing - INFO - Audio files found:
	[PosixPath('extracted_audio/test_output_dir/chunk_1.mp3'), PosixPath('extracted_audio/test_output_dir/chunk_2.mp3'), PosixPath('extracted_audio/test_output_dir/chunk_3.mp3'), PosixPath('extracted_audio/test_output_dir/chunk_4.mp3'), PosixPath('extracted_audio/test_output_dir/chunk_5.mp3')]


Processing chunk_1.mp3...


2024-12-06 14:24:03,538 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/audio/transcriptions "HTTP/1.1 200 OK"


Processing chunk_2.mp3...


2024-12-06 14:24:11,661 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/audio/transcriptions "HTTP/1.1 200 OK"


Processing chunk_3.mp3...


2024-12-06 14:24:17,874 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/audio/transcriptions "HTTP/1.1 200 OK"


Processing chunk_4.mp3...


2024-12-06 14:24:26,367 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/audio/transcriptions "HTTP/1.1 200 OK"


Processing chunk_5.mp3...


2024-12-06 14:24:30,977 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/audio/transcriptions "HTTP/1.1 200 OK"
2024-12-06 14:24:30,982 - audio_extraction_testing - INFO - Stitched transcription saved to taking_care_of_fear_transcript.txt
2024-12-06 14:24:30,983 - audio_extraction_testing - INFO - Full transcript objects saved to taking_care_of_fear_data.jsonl


In [32]:
# Initialize OpenAI client
client = OpenAI()

# Transcribe only the first chunk, asking for verbose JSON so the response
# carries per-segment data (inspected in the following cells).
# Open the chunk that is actually sent: the previous version opened the
# full-length source recording, never used that handle, and passed a bare path.
first_chunk_path = chunks_directory / "chunk_1.mp3"
with first_chunk_path.open("rb") as chunk_file:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        response_format="verbose_json",
        prompt="Dharma, Deer Park, Thay, Thich Nhat Hanh, Bodhicitta, Bodhisattva, Mahayana",
        file=chunk_file
    )

2024-12-08 11:14:34,516 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/audio/transcriptions "HTTP/1.1 200 OK"


In [37]:
# Spot-check one segment of the verbose response (index 5 is an arbitrary sample).
transcript.segments[5].text

' When I do that I feel stable, I feel'

In [38]:
# Check which response class the SDK returned for response_format="verbose_json".
type(transcript)

openai.types.audio.transcription_verbose.TranscriptionVerbose

In [39]:
from openai.types.audio.transcription_verbose import TranscriptionVerbose

Formatting prompt (for turning the raw transcript into paragraphs):

Format the following text into paragraphs. Make minimal corrections to grammar if required for logical flow. Make no other changes; add no content. Output the final text only.

In [1]:
# Show the full transcript text.
# Depends on `transcript` from the transcription cell above; the recorded
# NameError (execution count 1) indicates it was run before that cell in a fresh kernel.
print(transcript.text)

NameError: name 'transcript' is not defined

In [24]:
# Size of the transcript as reported by the project's token_count helper
# (presumably a model-token count; confirm in data_processing.gpt_processing).
token_count(transcript.text)

911