In [None]:
from pathlib import Path
import argparse
import os
import sys
import logging
import logging_config
from typing import Generator

from logging_config import setup_logging, get_child_logger
from dp_video_processing import (
    get_youtube_urls_from_csv, 
    download_audio_yt,
    detect_boundaries, 
    split_audio_at_boundaries,
    split_audio_on_silence, 
    process_audio_chunks
)

# Initialise the project's logging helper with this script's log filename,
# then obtain the child logger used by every cell below.
setup_logging(log_filename="transcription_repair.log")
logger = get_child_logger("yt_transcribe_repair")

# Parameters
# Root directory that is scanned for per-video subdirectories.
DEFAULT_OUTPUT_DIR = "./video_transcriptions"
# Passed as `max_duration` to split_audio_at_boundaries.
DEFAULT_CHUNK_DURATION_S = 10 * 60  # in seconds. 10 minute default duration
# Not referenced by the visible code.
# NOTE(review): this evaluates to 5 minutes (300000 ms), while the seconds
# constant above is 10 minutes and the old comment here said "10m" --
# confirm which duration is intended.
DEFAULT_CHUNK_DURATION_MS = 5 * 60 * 1000 # in milliseconds. 5 minutes
# Passed as `prompt` to process_audio_chunks.
DEFAULT_PROMPT = "Dharma, Deer Park, Thay, Thich Nhat Hanh, Bodhicitta, Bodhisattva, Mahayana"

In [None]:
def iterate_subdir(directory: Path) -> Generator[Path, None, None]:
    """
    Lazily walk the tree below ``directory`` and yield every directory in it.

    The walk is recursive, so nested directories at any depth are produced.
    Regular files are filtered out, and ``directory`` itself is not yielded.

    Args:
        directory (Path): Root of the tree to walk.

    Yields:
        Path: One path per directory found beneath ``directory``.

    Example:
        >>> for subdir in iterate_subdir(Path('/root')):
        ...     print(subdir)
    """
    # rglob('*') produces every descendant entry; keep only the directories.
    yield from filter(Path.is_dir, directory.rglob('*'))

In [None]:
# Root directory scanned by the repair loop, built from DEFAULT_OUTPUT_DIR.
video_output_dir = Path(DEFAULT_OUTPUT_DIR)

In [None]:
# Bare expression: in a notebook cell this displays the configured path.
video_output_dir

In [None]:

# Repair pass: for every directory under the output root that holds a
# "<name>/<name>.mp3" audio file but no "<name>.txt" transcript yet, re-run
# boundary detection, chunk splitting and transcription.
for video_dir in iterate_subdir(video_output_dir):
    # Parameters:
    video_name = video_dir.name
    chunks_dir = video_dir / "chunks"
    audio_file = video_dir / f"{video_name}.mp3"
    transcript_file = video_dir / f"{video_name}.txt"
    jsonl_file = video_dir / f"{video_name}.jsonl"

    # iterate_subdir walks recursively, so it also yields nested directories
    # that are not videos (for example the "chunks" folder written below).
    # Without this guard each of them fails in detect_boundaries and logs a
    # spurious error.
    if not audio_file.is_file():
        logger.info(f"skipping '{video_dir}': no '{audio_file.name}' audio file found")
        continue

    logger.info(f"Processing video: '{video_name}'")

    if transcript_file.exists():
        logger.info(f"skipping '{video_name}': .txt file exists")
        continue  # already transcribed; nothing to repair

    # begin processing:
    try:
        # Step 2: Detect boundaries
        logger.info("Detecting boundaries...")
        boundaries = detect_boundaries(audio_file)
        logger.info("Boundaries generated.")

        # Step 3: Split audio into chunks
        logger.info("Splitting audio into chunks...")
        split_audio_at_boundaries(
            audio_file=audio_file,
            boundaries=boundaries,
            output_dir=chunks_dir,
            max_duration=DEFAULT_CHUNK_DURATION_S,
        )
        logger.info(f"Audio chunks saved to: {chunks_dir}")

        # Step 4: Transcribe audio chunks
        logger.info("Transcribing audio chunks...")
        process_audio_chunks(
            directory=chunks_dir,
            output_file=transcript_file,
            jsonl_file=jsonl_file,
            prompt=DEFAULT_PROMPT,
            translate=False,
        )
    except Exception as e:
        # Best-effort batch: one failing video must not stop the others.
        # exc_info=True keeps the traceback in the log for later diagnosis.
        logger.error(f"Failed to process video {video_name}: {e}", exc_info=True)
    else:
        logger.info(f"Transcription completed for {video_name}")
        logger.info(f"Transcript saved to: {transcript_file}")
        logger.info(f"Raw transcription data saved to: {jsonl_file}")