In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import argparse
import os
import sys
import logging
import tnh_scholar.logging_config


In [None]:
from tnh_scholar.logging_config import setup_logging, get_child_logger
from tnh_scholar.video_processing import (
    get_youtube_urls_from_csv, 
    download_audio_yt,
)
from tnh_scholar.audio_processing import (
    detect_whisper_boundaries, 
    split_audio_at_boundaries,
    process_audio_chunks,
)

In [None]:
from tnh_scholar import PROJECT_ROOT_DIR

In [None]:
# Configure project logging for this notebook (log file path and DEBUG level are
# handed to setup_logging), then obtain the child logger used by every step below.
setup_logging(
    log_filepath="transcription_process_stepwise.log",
    log_level=logging.DEBUG,
)
logger = get_child_logger("yt_transcribe")

In [None]:
# Parameter settings
# Everything a user is expected to tune lives in this cell; the step cells below only read it.
output_base_dir = PROJECT_ROOT_DIR / "sandbox/video_transcriptions"
start_time = None  # forwarded to download_audio_yt as start_time
url = None  # source URL; required by the download step
prompt = "Dharma, Deer Park, Thay, Thich Nhat Hanh, Bodhicitta, Bodhisattva, Mahayana"  # passed to process_audio_chunks
translate_flag = False  # passed to process_audio_chunks as translate
max_chunk_duration = 5 * 60 # 5 minutes (presumably seconds -- confirm against split_audio_at_boundaries)
process_setting = "split"  # selects which step cell runs: "download", "split" or "transcribe"

# Directory settings
video_name = "Thiền Là Gì？ [TTSĐCTTĐB 02] TS Thích Nhất Hạnh (24-11-1994, Xóm Thượng, Làng Mai)"
if video_name:
    video_output_dir = output_base_dir / video_name
    video_output_dir.mkdir(parents=True, exist_ok=True)
    chunks_dir = video_output_dir / "chunks"
    chunks_dir.mkdir(parents=True, exist_ok=True)
elif process_setting != "download":
    # Without a video name the directories above are never assigned, so the split and
    # transcribe steps would hit a NameError (or silently reuse values left over from an
    # earlier kernel run). Fail here with an explicit message instead.
    raise ValueError(
        "video_name must be set unless process_setting == 'download' "
        "(the download step derives it from the downloaded file)."
    )

In [None]:
# Step 1: Download audio
# Runs only when process_setting == "download". Fetches the audio for `url`, then derives
# video_name and the per-video output directories from the downloaded file.
if process_setting == "download":
    if not url:
        # url defaults to None in the parameter cell; stop before calling the downloader.
        raise ValueError("url must be set when process_setting == 'download'.")

    logger.info("Downloading audio...")
    audio_file = download_audio_yt(url, output_base_dir, start_time=start_time)
    logger.info(f"Downloaded audio file: {audio_file}")

    video_name = audio_file.stem  # Use the stem of the audio file (title without extension)

    # video_name has just changed, so re-derive the directories that depend on it.
    # Otherwise later steps would write under the name set in the parameter cell
    # (or find the directories undefined if that name was empty).
    video_output_dir = output_base_dir / video_name
    video_output_dir.mkdir(parents=True, exist_ok=True)
    chunks_dir = video_output_dir / "chunks"
    chunks_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the audio file is not moved into video_output_dir; presumably it
    # stays wherever download_audio_yt wrote it (output_base_dir was passed) -- confirm.

In [None]:
# Expected location of the audio: "<video_name>.mp3" directly under output_base_dir.
audio_file = output_base_dir.joinpath(f"{video_name}.mp3")

In [None]:
# Print the resolved audio path so it can be checked by eye before processing.
print(audio_file)

In [None]:
# Displays True when the path above exists on disk; the boundary-detection step reads this file.
audio_file.exists()

In [None]:
import os

# Set KMP_WARNINGS=0 in this process's environment.
# NOTE(review): presumably silences OpenMP runtime warnings raised during audio processing -- confirm.
os.environ.update({"KMP_WARNINGS": "0"})

In [None]:
# Step 2: Detect boundaries
# Runs only in "split" mode. Produces `boundaries` (consumed by the splitting step) and
# `boundary_transcription` (inspected in the next cell). Language is fixed to Vietnamese.
if process_setting == "split":
    logger.info("Detecting boundaries...")
    boundaries, boundary_transcription = detect_whisper_boundaries(
        audio_file,
        language="vi",
    )
    logger.info("Boundaries generated.")

In [None]:
# Inspection aid: show the type of the transcription object returned by boundary detection.
# boundary_transcription is only assigned in "split" mode, so the lookup is guarded; in any
# other mode the expression evaluates to None (which the notebook does not display) instead
# of raising NameError during Restart & Run All.
type(boundary_transcription) if process_setting == "split" else None

In [None]:
# Step 3: Split audio into chunks
# Runs only in "split" mode; relies on `boundaries` from the detection step and writes
# into chunks_dir, passing max_chunk_duration as the max_duration argument.
if process_setting == "split":
    logger.info("Splitting audio into chunks...")
    split_audio_at_boundaries(
        audio_file=audio_file,
        boundaries=boundaries,
        max_duration=max_chunk_duration,
        output_dir=chunks_dir,
    )
    logger.info(f"Audio chunks saved to: {chunks_dir}")

In [None]:
# Step 4: Transcribe audio chunks
# Runs only in "transcribe" mode. Processes the chunk files in chunks_dir and writes two
# outputs next to them in video_output_dir: "<video_name>.txt" and "<video_name>.jsonl".
if process_setting == "transcribe":
    logger.info("Transcribing audio chunks...")
    transcript_file = video_output_dir / f"{video_name}.txt"
    jsonl_file = video_output_dir / f"{video_name}.jsonl"

    process_audio_chunks(
        directory=chunks_dir,
        output_file=transcript_file,
        jsonl_file=jsonl_file,
        prompt=prompt,
        translate=translate_flag
    )

    # url is only needed by the download step and is None by default, so identify the
    # run by video_name when no URL is set (previously this logged "... for None").
    logger.info(f"Transcription completed for {url or video_name}")
    logger.info(f"Transcript saved to: {transcript_file}")
    logger.info(f"Raw transcription data saved to: {jsonl_file}")