In [1]:
from datetime import datetime
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import subprocess
import moviepy.editor as mp
from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
import re
import random
from matplotlib.lines import Line2D
import plotly.graph_objs as go
import numpy as np


In [2]:
#!pip install plotly
#!pip install numba==0.56.4 librosa==0.10.1


In [7]:
def mmss_to_seconds(time_str):
    """Convert mm:ss format to seconds."""
    match = re.match(r"(\d+):(\d+)", time_str)
    if match:
        minutes, seconds = map(int, match.groups())
        return minutes * 60 + seconds
    else:
        raise ValueError("Time format must be mm:ss")


def detect_audio_peaks(audio_path, start_time, end_time, sr=22050, hop_length=512):
    
    # Load audio file
    y, sr = librosa.load(audio_path, sr=sr, offset=start_time, duration=end_time - start_time)

    # Compute the onset envelope
    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)

    # Detect peaks
    peak_indices = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, hop_length=hop_length)

    # Convert peak indices to time (in seconds)
    peak_times = librosa.frames_to_time(peak_indices, sr=sr, hop_length=hop_length)

    return peak_times


def detect_audio_peaks_with_fourier_transform(audio_path, start_time, end_time, sr=22050, hop_length=512, backtrack=True, plot=False):

    # Load audio
    y, sr = librosa.load(audio_path, sr=sr, offset=start_time, duration=end_time - start_time)

    # Compute STFT
    S = np.abs(librosa.stft(y, hop_length=hop_length))
    
    # Spectral flux
    flux = np.sqrt(np.sum(np.diff(S, axis=1)**2, axis=0))
    flux = np.pad(flux, (1, 0))  # match length

    # Normalize flux
    flux = (flux - np.min(flux)) / (np.max(flux) - np.min(flux) + 1e-6)

    # Onset envelope
    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    onset_env = (onset_env - np.min(onset_env)) / (np.max(onset_env) - np.min(onset_env) + 1e-6)

    # Combined novelty function
    combined_env = 0.5 * flux + 0.5 * onset_env

    # Detect peaks
    peak_indices = librosa.onset.onset_detect(onset_envelope=combined_env, sr=sr, hop_length=hop_length, backtrack=backtrack)
    peak_times = librosa.frames_to_time(peak_indices, sr=sr, hop_length=hop_length)

    # Plotting
    

    if plot:
        times = librosa.frames_to_time(np.arange(len(combined_env)), sr=sr, hop_length=hop_length)

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=times, y=onset_env, mode='lines', name='Onset Envelope', opacity=0.7))
        fig.add_trace(go.Scatter(x=times, y=flux, mode='lines', name='Spectral Flux', opacity=0.7))
        fig.add_trace(go.Scatter(x=times, y=combined_env, mode='lines', name='Combined Signal', line=dict(width=2)))

        # Add vertical lines for peaks
        for pt in peak_times:
            fig.add_vline(x=pt, line=dict(color='red', dash='dash'), opacity=0.6)

        fig.update_layout(
            title='Peak Detection Plot',
            xaxis_title='Time (s)',
            yaxis_title='Normalized Energy',
            legend=dict(x=0.01, y=0.99),
            height=600
        )

        fig.show()

    return peak_times



def follow_the_peaks(
    image_folder,
    audio_path,
    output_video,
    start_time_str,
    end_time_str,
    fps=30,
    manual_intervals=None  # List of tuples: (start_str, end_str, image_path)
):
    start_time = mmss_to_seconds(start_time_str)
    end_time = mmss_to_seconds(end_time_str)

    # Load and shuffle all images
    all_images = [os.path.join(image_folder, img) for img in os.listdir(image_folder)
                  if img.endswith(('.png', '.jpg', '.jpeg', '.JPG'))]
    random.shuffle(all_images)

    if not all_images:
        raise ValueError("No images found in the specified folder.")

    # Convert manual interval strings to seconds relative to video segment
    manual_intervals = manual_intervals or []
    manual_times = []
    for start_str, end_str, img in manual_intervals:
        s = mmss_to_seconds(start_str)
        e = mmss_to_seconds(end_str)
        if start_time <= s < e <= end_time:
            manual_times.append((s - start_time, e - start_time, img))

    # Block out manual ranges
    blocked_ranges = [(s, e) for s, e, _ in manual_times]

    def is_in_manual_ranges(t):
        return any(s <= t <= e for s, e in blocked_ranges)

    # Detect peaks
    peak_times = detect_audio_peaks_with_fourier_transform(audio_path, start_time, end_time)
    peak_times = [t for t in peak_times if not is_in_manual_ranges(t)]

    # Prepare video segments
    all_segments = []

    # Manual intervals
    for s, e, img in manual_times:
        all_segments.append((s, img, e - s))

    # Peak-based segments
    peak_times = [0.0] + peak_times + [end_time - start_time]
    for i in range(1, len(peak_times)):
        t_start = peak_times[i - 1]
        t_end = peak_times[i]
        if not is_in_manual_ranges(t_start):
            img = all_images[(i - 1) % len(all_images)]
            all_segments.append((t_start, img, t_end - t_start))

    # Sort segments by time
    all_segments.sort(key=lambda x: x[0])  # Sort by segment start time

    final_images = [seg[1] for seg in all_segments]
    final_durations = [seg[2] for seg in all_segments]

    # Create video
    clip = ImageSequenceClip(final_images, durations=final_durations)
    audio = mp.AudioFileClip(audio_path).subclip(start_time, end_time)
    clip = clip.set_audio(audio)
    clip.write_videofile(output_video, fps=fps)



In [4]:
start_time_str="0:40"  
end_time_str="1:13"  
start_time = mmss_to_seconds(start_time_str)
end_time = mmss_to_seconds(end_time_str)

detect_audio_peaks_with_fourier_transform("sources\\audio\\berlioz - jazz is for ordinary people.mp3",  start_time, end_time, plot=True)


array([ 0.        ,  0.11609977,  0.2554195 ,  0.37151927,  0.41795918,
        0.53405896,  0.65015873,  0.7662585 ,  0.90557823,  1.13777778,
        1.41641723,  1.64861678,  1.85759637,  2.29877551,  2.64707483,
        2.90249433,  2.90249433,  3.11147392,  3.36689342,  3.36689342,
        3.66875283,  3.78485261,  3.90095238,  4.13315193,  4.27247166,
        4.66721088,  4.94585034,  5.15482993,  5.2244898 ,  5.38702948,
        5.38702948,  5.68888889,  5.94430839,  6.31582766,  6.59446712,
        6.80344671,  6.89632653,  7.15174603,  7.40716553,  7.70902494,
        7.82512472,  7.94122449,  8.19664399,  8.33596372,  8.68426304,
        8.91646259,  8.98612245,  9.10222222,  9.19510204,  9.42730159,
        9.42730159,  9.70594104,  9.93814059, 10.00780045, 10.10068027,
       10.24      , 10.33287982, 10.4954195 , 10.68117914, 10.93659864,
       11.0062585 , 11.19201814, 11.44743764, 11.44743764, 11.63319728,
       11.86539683, 11.95827664, 12.00471655, 12.14403628, 12.35

In [12]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

follow_the_peaks(
    image_folder="results/stedelijk",
    audio_path="sources/audio/berlioz - jazz is for ordinary people.mp3",
    output_video=f"results/video/output_{timestamp}.mp4",
    start_time_str="0:35",
    end_time_str="1:13",
    fps=30,
    manual_intervals=[
        ("0:35", "0:43", "results/stedelijk/1747433311.JPG"),
    ]
)

Moviepy - Building video results/video/output_20250517_002605.mp4.
MoviePy - Writing audio in output_20250517_002605TEMP_MPY_wvf_snd.mp3


                                                                   

MoviePy - Done.
Moviepy - Writing video results/video/output_20250517_002605.mp4



                                                                

Moviepy - Done !
Moviepy - video ready results/video/output_20250517_002605.mp4
