In [1]:

import librosa
import librosa.display
import numpy as np
from scipy.ndimage import gaussian_filter1d


# ===========================
#    Structure Detection
# ===========================

import librosa
import numpy as np
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks

# ===========================
#    Structure Detection
# ===========================

def detect_structural_boundaries(audio, sr, kernel_size=32, delta=0.2, hop_length=512, min_section_s=2.0):
    """
    Detect musical section boundaries using chroma-based novelty detection.
    Returns sorted, de-duplicated boundary times in seconds.

    Parameters:
        audio : np.ndarray
            Audio signal.
        sr : int
            Sample rate.
        kernel_size : float
            Standard deviation (in frames) of the Gaussian used to smooth
            the novelty curve.
        delta : float
            Minimum height of a novelty peak for it to count as a boundary.
        hop_length : int
            Hop length (in samples) between analysis frames. It is used
            consistently for the chroma features, the minimum peak spacing
            and the frame-to-time conversion.
            NOTE(review): librosa's CQT may restrict which hop lengths are
            valid - confirm before using a non-default value.
        min_section_s : float
            Minimum spacing (in seconds) enforced between two boundaries.

    Returns:
        list of float
            Boundary times in seconds, ascending.
    """
    # 1. Harmonic component & chroma (framed with the caller's hop length)
    y_harm = librosa.effects.harmonic(audio)
    chroma = librosa.feature.chroma_cqt(y=y_harm, sr=sr, hop_length=hop_length)

    # 2. Self-similarity (recurrence) matrix
    R = librosa.segment.recurrence_matrix(chroma, mode='affinity', sym=True)

    # 3. Novelty = L1 distance between consecutive rows of R.
    #    Frame 0 has no predecessor and stays at 0; every later frame,
    #    including the last one, gets its difference to the previous row.
    novelty = np.zeros(R.shape[0])
    if R.shape[0] > 1:
        novelty[1:] = np.abs(np.diff(R, axis=0)).sum(axis=1)

    # 4. Smooth novelty to reduce spurious peaks
    novelty = gaussian_filter1d(novelty, sigma=kernel_size)

    # 5. Peak picking. find_peaks rejects distance < 1, which would happen
    #    for low sample rates or large hop lengths, so clamp it.
    distance_frames = max(1, int(min_section_s * sr / hop_length))
    peaks, _ = find_peaks(novelty, height=delta, distance=distance_frames)
    times = librosa.frames_to_time(peaks, sr=sr, hop_length=hop_length)
    return np.unique(times).tolist()


# ===========================
#     Tempo Detection
# ===========================

def estimate_local_bpm(audio, sr, start_s, end_s):
    """
    Estimate BPM for a specific time segment.

    Parameters:
        audio : np.ndarray
            Audio signal.
        sr : int
            Sample rate.
        start_s : float
            Segment start in seconds. Values before the start of the
            signal are clamped to 0.
        end_s : float
            Segment end in seconds. Values past the end of the signal are
            truncated by slicing.

    Returns:
        float or None
            The estimated tempo, or None when the segment is shorter than
            two seconds or no positive tempo is found.
    """
    # Clamp so a negative time cannot wrap around via negative slice indices.
    start_sample = max(0, int(start_s * sr))
    end_sample = max(start_sample, int(end_s * sr))

    segment = audio[start_sample:end_sample]
    if len(segment) < sr * 2:
        return None

    tempo, _ = librosa.beat.beat_track(y=segment, sr=sr)
    # Accept either a scalar or a one-element array from beat_track without
    # relying on NumPy's deprecated array-to-scalar conversion.
    tempo = float(np.asarray(tempo).reshape(-1)[0])
    return tempo if tempo > 0 else None


def compute_tempo_curve(audio, sr, hop_s=0.5, win_s=6.0):
    """
    Estimate tempo over sliding windows of the signal.

    Parameters:
        audio : np.ndarray
            Audio signal.
        sr : int
            Sample rate.
        hop_s : float
            Step between consecutive window starts, in seconds.
        win_s : float
            Window length in seconds.

    Returns:
        (times, tempos) : tuple of np.ndarray
            `times` holds the centre of each window in seconds; `tempos`
            holds the matching tempo estimate, or NaN where no positive
            tempo was found. Both are empty when the signal is shorter
            than one window.

    Raises:
        ValueError
            If `hop_s` or `win_s` spans less than one sample.
    """
    hop = int(hop_s * sr)
    win = int(win_s * sr)
    if hop < 1 or win < 1:
        raise ValueError("hop_s and win_s must each span at least one sample")

    times = []
    tempos = []

    # "+ 1" keeps the last full window, i.e. the one ending exactly on the
    # final sample.
    for start in range(0, len(audio) - win + 1, hop):
        segment = audio[start:start + win]
        tempo, _ = librosa.beat.beat_track(y=segment, sr=sr)
        # Accept either a scalar or a one-element array from beat_track.
        tempo = float(np.asarray(tempo).reshape(-1)[0])
        tempos.append(tempo if tempo > 0 else np.nan)
        times.append((start + win / 2) / sr)

    return np.array(times), np.array(tempos)


def detect_tempo_change_boundaries(times, tempos, bpm_change_thresh):
    """
    Find the times at which the tempo curve jumps by at least a threshold.

    Parameters:
        times : array-like of float
            Time (seconds) of each tempo estimate.
        tempos : array-like of float
            Tempo estimates aligned with `times`; NaN entries are ignored.
        bpm_change_thresh : float
            Minimum absolute difference between two consecutive valid
            estimates for the later one to be reported as a boundary.

    Returns:
        list of float
            Times of the estimates that follow each qualifying jump.
            Empty when fewer than two valid estimates remain.
    """
    # Coerce first so plain Python lists work with boolean-mask indexing.
    times = np.asarray(times, dtype=float)
    tempos = np.asarray(tempos, dtype=float)

    valid = ~np.isnan(tempos)
    tempos = tempos[valid]
    times = times[valid]

    if tempos.size < 2:
        return []

    jumps = np.abs(np.diff(tempos))
    change_idxs = np.flatnonzero(jumps >= bpm_change_thresh)

    # diff[i] compares estimate i with i + 1; the boundary is the later one.
    return times[change_idxs + 1].tolist()


# ================================
#  Structure + Tempo Segmentation
# ================================

def get_audio_sections(
    audio_filepath,
    min_section_s=8.0,
    bpm_change_thresh=8.0):
    """
    Split an audio file into tempo sections.

    Steps:
    - detect structural section changes
    - detect tempo changes
    - combine both sets of boundaries with the first beat and the end of
      the file
    - estimate BPM per resulting section

    A candidate section that is shorter than `min_section_s`, or whose BPM
    cannot be estimated, is not emitted; its start is kept and the
    candidate is extended to the next boundary instead.

    Parameters:
        audio_filepath : str
            Path of the audio file to load.
        min_section_s : float
            Minimum length (seconds) of an emitted section.
        bpm_change_thresh : float
            Minimum BPM jump treated as a tempo boundary.

    Returns:
      tempo_sections = [(bpm, start_s), ...]
      duration_s
      first_beat_s

    Raises:
        RuntimeError
            If no section could be produced.
    """
    print("Loading audio...")
    audio, sr = librosa.load(audio_filepath, sr=None)
    duration_s = len(audio) / sr

    _, beat_frames = librosa.beat.beat_track(y=audio, sr=sr)
    if len(beat_frames) > 0:
        # Cast so both branches hand back a plain Python float.
        first_beat_s = float(librosa.frames_to_time(beat_frames[0], sr=sr))
    else:
        first_beat_s = 0.0

    print("Detecting structural boundaries...")
    structural_bounds = detect_structural_boundaries(audio, sr)
    print(f"Structural boundaries start times: {structural_bounds}")


    print("Detecting tempo change boundaries...")
    times, tempos = compute_tempo_curve(audio, sr)
    tempo_bounds = detect_tempo_change_boundaries(
        times, tempos, bpm_change_thresh
    )
    print(f"Tempo boundaries start times: {tempo_bounds}")

    # Combine all boundaries; np.unique also sorts them ascending.
    boundaries = np.unique(
        np.concatenate((
            [first_beat_s],
            structural_bounds,
            tempo_bounds,
            [duration_s],
        ))
    ).tolist()

    print(f"Combined boundaries start times: {boundaries}")

    print("Estimating BPM per section...")
    tempo_sections = []
    start_s = boundaries[0]
    for i in range(len(boundaries) - 1):
        end_s = boundaries[i + 1]
        if end_s - start_s < min_section_s:
            # Keep start_s so the candidate grows to the next boundary.
            print(f" section is too short: {end_s - start_s}")
            continue

        bpm = estimate_local_bpm(audio, sr, start_s, end_s)
        if bpm is not None:
            tempo_sections.append((bpm, start_s))
            start_s = end_s
        else:
            # start_s is left untouched, so this span merges into the next.
            print(f" failed to estimate BPM for section {i}")

    if not tempo_sections:
        raise RuntimeError("Failed to detect any sections")

    print("Detected tempo sections:")
    for bpm, start in tempo_sections:
        print(f"  BPM {bpm:.1f} @ {start:.2f}s")

    return tempo_sections, duration_s, first_beat_s


# Demo tracks, keyed by a 1-based selection number.
song_list = dict(enumerate(
    (
        "./data/bedroomTalk_opening.wav",
        "./data/janeDoe.wav",
        "./data/supernatural_opening.wav",
        "./data/tattoo_opening.wav",
        "./data/weWillRockYou.wav",
        "./data/byeSummer_opening.wav",
    ),
    start=1,
))

# Run the segmentation pipeline on demo track 2 only when this module is the
# entry point (not when it is imported); the results are bound at module level.
if __name__ == "__main__":
    tempo_sections, duration_s, first_beat_s = get_audio_sections(song_list[2])

ImportError: dlopen(/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/matplotlib/_c_internal_utils.cpython-310-darwin.so, 0x0002): tried: '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/matplotlib/_c_internal_utils.cpython-310-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64e' or 'arm64')), '/System/Volumes/Preboot/Cryptexes/OS/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/matplotlib/_c_internal_utils.cpython-310-darwin.so' (no such file), '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/matplotlib/_c_internal_utils.cpython-310-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64e' or 'arm64'))

In [None]:
import IPython.display as ipd

def add_boundary_beeps(audio, sr, boundaries, beep_freq=1000, beep_duration=0.05):
    """
    Add short beeps at boundary times.

    Parameters
    ----------
    audio : np.ndarray
        Original audio waveform (1-D). It is not modified.
    sr : int
        Sample rate.
    boundaries : list of float
        Times (seconds) of section boundaries. Boundaries that fall outside
        the waveform are skipped.
    beep_freq : float
        Frequency of beep in Hz.
    beep_duration : float
        Duration of each beep in seconds.

    Returns
    -------
    audio_with_beeps : np.ndarray
        Copy of the waveform with beeps mixed in, limited to [-1.0, 1.0].
    """
    audio_out = audio.copy()
    n_samples = len(audio_out)
    beep_samples = int(beep_duration * sr)
    t = np.arange(beep_samples) / sr
    beep = 0.3 * np.sin(2 * np.pi * beep_freq * t)  # short sine wave

    for b in boundaries:
        idx = int(b * sr)
        # A negative index would wrap around and an index past the end
        # would produce mismatched slice lengths, so ignore both.
        if idx < 0 or idx >= n_samples:
            continue
        # Truncate the beep when it would run past the end of the signal.
        end = min(idx + beep_samples, n_samples)
        audio_out[idx:end] += beep[:end - idx]

    # Limit the mix to the nominal [-1, 1] range after adding the beeps.
    audio_out = np.clip(audio_out, -1.0, 1.0)
    return audio_out

# Segment demo track 2 and keep only the start time of each section.
tempo_sections, duration_s, first_beat_s = get_audio_sections(song_list[2])
boundaries = [section[1] for section in tempo_sections]

# Load the same file at its native sample rate and mix a beep in at every
# section start.
audio, sr = librosa.load(song_list[2], sr=None)
audio_with_beeps = add_boundary_beeps(audio, sr, boundaries)

# Keep this as the final expression so the notebook renders the audio player.
ipd.Audio(audio_with_beeps, rate=sr)
