In [85]:
import os
import shutil
import subprocess
import tempfile
import warnings
from pathlib import Path
from typing import Optional

from IPython.display import Audio, display


In [86]:
def process_audio_for_asr(audio_file_path, output_dir=None,
                          codec="pcm_f32le", sample_rate="48000", channels="2",
                          compression_level="minimal", include_gate=True, 
                          include_eq=True, force_mono=False, target_rate=None):
    """
    Process audio file for ASR using SoX with configurable quality settings.
    
    Parameters:
    -----------
    audio_file_path : Path or str
        Path to the input audio file
    output_dir : Path or str, optional
        Directory to save processed file. If None, uses same directory as input.
    play_result : bool, default=True
        Whether to play and display the processed audio in the notebook
    codec : str, default="pcm_f32le"
        Audio codec: "pcm_f32le" (32-bit float), "pcm_s24le" (24-bit), "pcm_s16le" (16-bit)
    sample_rate : str, default="48000"
        Sample rate in Hz: "48000", "44100", "16000", etc.
    channels : str, default="2"
        Number of channels: "1" (mono), "2" (stereo)
    compression_level : str, default="minimal"
        Processing intensity: "minimal", "light", "moderate", "aggressive"
    include_gate : bool, default=False
        Include noise gate processing
    include_eq : bool, default=False
        Include bass/treble EQ adjustments
    force_mono : bool, default=False
        Force conversion to mono using remix
    target_rate : str, optional
        If specified, resample to this rate (e.g. "16000")
        
    Returns:
    --------
    Path
        Path to the processed audio file
    """
    audio_file_path = Path(audio_file_path)

    # Check if required tools are installed
    for tool in ["sox", "ffmpeg"]:
        try:
            subprocess.run(["which", tool], capture_output=True, text=True, check=True)
        except (subprocess.SubprocessError, FileNotFoundError) as e:
            raise RuntimeError(f"{tool} is not installed. Please install it first.") from e

    # Define compression settings
    compression_settings = {
        "minimal": ["0.1,0.3", "3:-50,-40,-30,-20", "-3", "-80", "0.2"],
        "light": ["0.05,0.2", "6:-60,-50,-40,-30,-20,-10", "-3", "-85", "0.1"],
        "moderate": ["0.03,0.15", "6:-65,-50,-40,-30,-20,-10", "-4", "-85", "0.1"],
        "aggressive": ["0.02,0.1", "8:-70,-55,-45,-35,-25,-15", "-5", "-90", "0.05"],
        "whisper_optimized": ["0.005,0.06", "12:-75,-65,-55,-45,-35,-25,-15,-8", "-8", "-95", "0.03"],
        "whisper_aggressive": ["0.005,0.06", "12:-75,-45,-55,-30,-35,-18,-15,-8", "-8", "-95", "0.03"],
        "primary_speech_only": ["0.005,0.06", "12:-60,-45,-55,-30,-35,-18,-15,-8", "-8", "-60", "0.03"],
    }

    if compression_level not in compression_settings:
        compression_level = "whisper_optimized"
        
    comp_args = compression_settings.get(compression_level)

    # Setup file paths
    if output_dir is None:
        output_dir = audio_file_path.parent
    else:
        output_dir = Path(output_dir)
        os.makedirs(output_dir, exist_ok=True)

    base_name = audio_file_path.stem
    orig_wav = output_dir / f"{base_name}_orig.wav"
    processed_wav = output_dir / f"{base_name}_processed.wav"

    # Convert to WAV with specified settings (only if doesn't exist)
    if not orig_wav.exists():
        print(f"Converting to WAV: {codec} @ {sample_rate}Hz, {channels} channels...")
        subprocess.run([
            "ffmpeg", "-i", str(audio_file_path), 
            "-acodec", codec,
            "-ar", sample_rate,
            "-ac", channels,
            str(orig_wav),
            "-y"
        ], check=True, capture_output=True)
        print(f"Intermediate WAV saved as: {orig_wav}")
    else:
        print(f"Using existing intermediate WAV: {orig_wav}")

    # Build SoX processing command
    sox_cmd = ["sox", str(orig_wav), str(processed_wav)]
    
    # Optional: Force mono conversion
    if force_mono:
        sox_cmd.extend(["remix", "1,2"])
        
    # Optional: Resample to target rate
    if target_rate:
        sox_cmd.extend(["rate", "-v", target_rate])
        
    # Core processing
    sox_cmd.extend([
        "gain", "-n", "-1",           # Gentle normalization
        "highpass", "-1", "175",             # Remove low-frequency noise
        "lowpass", "-1", "15000",          # Preserve speech clarity
        # Speech-focused EQ before compression
        "equalizer", "100", "0.9", "-20",    # Boost lower speech (500Hz)
        "equalizer", "1500", "1", "+4",   # Boost mid speech (1.5kHz) 
        "equalizer", "4000", "0.6", "+15",   # Boost upper speech (3kHz)
        "equalizer", "10000", "1", "-10",
        "compand", *comp_args, ":",   # MUST ADD COLON AFTER COMPAND!! 
                                      # (To prevent following commands to be interpreted as compand args)
    ])
    
    # Optional: Noise gate
    if include_gate:
        sox_cmd.extend(["gate", "0.1", "0.05", "-inf", "0.1", "-90", "0.1"])
        
    # Optional: EQ adjustments
    if include_eq:
        sox_cmd.extend([
            "contrast", "75",
            "bass", "-5", "200",
            "treble", "+3", "3000"
        ])
        
    # Final normalization
    sox_cmd.extend(["norm", "-1"])

    # Run SoX processing
    print(f"Applying {compression_level} audio processing...")
    result = subprocess.run(sox_cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"SoX Error: {result.stderr}")
        raise RuntimeError(f"SoX processing failed: {result.stderr}")

    print(f"Processing complete! File saved to: {processed_wav}")

    return processed_wav


In [87]:
def compress_wav_to_mp4_vbr(
    input_wav: str | Path, output_path: Optional[str | Path] = None, quality: int = 8
    ) -> Path:
    """
    Compress WAV to M4A (AAC VBR) using ffmpeg.
    
    Parameters:
    -----------
    input_wav : str or Path
        Path to the input .wav file
    output_path : str or Path, optional
        Output .mp4 file path. If None, auto-generated from input
    quality : int, default=8
        VBR quality level: 1 = good (~96kbps), 2 = very good (~128kbps), 3+ = higher bitrate
    
    Returns:
    --------
    Path
        Path to the compressed .m4a file
    """
    input_wav = Path(input_wav)
    if output_path is None:
        output_path = input_wav.with_suffix(".mp4")
    else:
        output_path = Path(output_path)

    cmd = [
        "ffmpeg", "-y", "-i", str(input_wav),
        "-c:a", "aac",
        "-q:a", str(quality),
        str(output_path)
    ]
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    if result.returncode != 0:
        print("Error compressing audio:")
        print(result.stderr)
        raise RuntimeError("FFmpeg compression failed.")
    
    print(f"Compressed audio saved to: {output_path}")
    return output_path

In [88]:
def compress_wav_to_mp4_vbr(
    input_wav: str | Path, output_path: Optional[str | Path] = None, quality: int = 8
    ) -> Path:
    """
    Compress WAV to M4A (AAC VBR) using ffmpeg.
    
    Parameters:
    -----------
    input_wav : str or Path
        Path to the input .wav file
    output_path : str or Path, optional
        Output .mp4 file path. If None, auto-generated from input
    quality : int, default=8
        VBR quality level: 1 = good (~96kbps), 2 = very good (~128kbps), 3+ = higher bitrate
    
    Returns:
    --------
    Path
        Path to the compressed .m4a file
    """
    input_wav = Path(input_wav)
    if output_path is None:
        output_path = input_wav.with_suffix(".mp4")
    else:
        output_path = Path(output_path)

    cmd = [
        "ffmpeg", "-y", "-i", str(input_wav),
        "-c:a", "aac",
        "-q:a", str(quality),
        str(output_path)
    ]
    
    result = subprocess.run(cmd)
    
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg compression failed with return code {result.returncode}")
    
    print(f"Compressed audio saved to: {output_path}")
    return output_path

In [89]:
def get_audio_info(file_path):
    """Get detailed audio information from a file"""
    import json
    
    cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json", 
        "-show_streams", "-select_streams", "a:0", str(file_path)
    ]
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        data = json.loads(result.stdout)
        stream = data["streams"][0]
        
        print(f"File: {file_path}")
        print(f"Codec: {stream.get('codec_name', 'Unknown')}")
        print(f"Sample Rate: {stream.get('sample_rate', 'Unknown')} Hz")
        print(f"Channels: {stream.get('channels', 'Unknown')}")
        print(f"Bit Rate: {stream.get('bit_rate', 'Unknown')} bps")
        print(f"Duration: {stream.get('duration', 'Unknown')} seconds")
        print(f"Sample Format: {stream.get('sample_fmt', 'Unknown')}")
        
        return stream
    else:
        print(f"Error: {result.stderr}")

# Usage:
# audio_info = get_audio_info(Path("test_audio.mp4"))

In [90]:
def get_sox_info(file_path):
    """Get audio info using SoX"""
    result = subprocess.run(["sox", "--info", str(file_path)], 
                          capture_output=True, text=True)
    if result.returncode == 0:
        print(result.stdout)
    else:
        print(f"Error: {result.stderr}")

# Usage:
# get_sox_info(Path("test_audio.mp4"))

In [91]:
WORKING_DIR = Path("/Users/phapman/Desktop/wouter_video")

In [92]:
audio_file = WORKING_DIR / "Spring Rolls, Sr. Tinh Nghiem, Sr. Ho Nghiem_audio.mp4"

In [93]:
get_audio_info(audio_file)

File: /Users/phapman/Desktop/wouter_video/Spring Rolls, Sr. Tinh Nghiem, Sr. Ho Nghiem_audio.mp4
Codec: aac
Sample Rate: 48000 Hz
Channels: 2
Bit Rate: 317374 bps
Duration: 4554.000000 seconds
Sample Format: fltp


{'index': 0,
 'codec_name': 'aac',
 'codec_long_name': 'AAC (Advanced Audio Coding)',
 'profile': 'LC',
 'codec_type': 'audio',
 'codec_tag_string': 'mp4a',
 'codec_tag': '0x6134706d',
 'sample_fmt': 'fltp',
 'sample_rate': '48000',
 'channels': 2,
 'channel_layout': 'stereo',
 'bits_per_sample': 0,
 'initial_padding': 0,
 'id': '0x1',
 'r_frame_rate': '0/0',
 'avg_frame_rate': '0/0',
 'time_base': '1/48000',
 'start_pts': 0,
 'start_time': '0.000000',
 'duration_ts': 218592000,
 'duration': '4554.000000',
 'bit_rate': '317374',
 'nb_frames': '213469',
 'extradata_size': 2,
 'disposition': {'default': 1,
  'dub': 0,
  'original': 0,
  'comment': 0,
  'lyrics': 0,
  'karaoke': 0,
  'forced': 0,
  'hearing_impaired': 0,
  'visual_impaired': 0,
  'clean_effects': 0,
  'attached_pic': 0,
  'timed_thumbnails': 0,
  'non_diegetic': 0,
  'captions': 0,
  'descriptions': 0,
  'metadata': 0,
  'dependent': 0,
  'still_image': 0,
  'multilayer': 0},
 'tags': {'language': 'eng',
  'handler_name':

In [94]:
out_file = process_audio_for_asr(audio_file_path=audio_file, compression_level="primary_speech_only")

Using existing intermediate WAV: /Users/phapman/Desktop/wouter_video/Spring Rolls, Sr. Tinh Nghiem, Sr. Ho Nghiem_audio_orig.wav
Applying primary_speech_only audio processing...
Processing complete! File saved to: /Users/phapman/Desktop/wouter_video/Spring Rolls, Sr. Tinh Nghiem, Sr. Ho Nghiem_audio_processed.wav


In [83]:
1750 / 75

23.333333333333332

In [95]:
compress_wav_to_mp4_vbr(out_file)

ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex

Compressed audio saved to: /Users/phapman/Desktop/wouter_video/Spring Rolls, Sr. Tinh Nghiem, Sr. Ho Nghiem_audio_processed.mp4


[out#0/mp4 @ 0x136710d40] video:0KiB audio:147586KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.567031%
size=  148423KiB time=01:15:54.00 bitrate= 267.0kbits/s speed=39.8x    
[aac @ 0x136714360] Qavg: 944.000


PosixPath('/Users/phapman/Desktop/wouter_video/Spring Rolls, Sr. Tinh Nghiem, Sr. Ho Nghiem_audio_processed.mp4')