# Music segmentation

<!-- {{ add_binder_block(page) }} -->

## Introduction

Music segmentation is an instance of change point detection and can be performed with `ruptures`.
Roughly, it consists of finding the temporal boundaries of homogeneous segments, e.g. the intro, verse, chorus and outro in a song.

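To that end, the music track is first transformed into an informative representation, i.e. a multivariate signal of features, on which change points are then detected.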

To do so, we run `ruptures` segmentation algorithms on two different sets of features:

* a chromagram, which maps the input signal into a feature space of (usually) 12 dimensions representing the 12 classical pitch classes;
* a tempogram, computed from the onset strength envelope of the input signal, which captures the tempo spectrum.

By doing so, we provide two ways of segmenting a music soundtrack, based on two different representations of the information carried by the input signal, respectively:

* a harmonic spectral representation,
* a rhythmic representation.


To load and manipulate sound data, we use the [librosa package](https://librosa.org/doc/latest/index.html).

## Setup

First, we make the necessary imports.

In [None]:
import time
from itertools import chain, product

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import ruptures as rpt  # our package
from IPython.display import Audio

## Load the data

A number of music files are available in [Librosa](https://librosa.org/doc/latest/index.html).
See [here](https://librosa.org/doc/latest/recordings.html#description-of-examples) for a complete list.
In this example, we choose the *Dance of the Sugar Plum Fairy* from *The Nutcracker* by Tchaikovsky. 

In [None]:
# Length of the excerpt to load, in seconds
duration = 30
# `y` holds the audio samples, `sampling_rate` the rate they were loaded at
y, sampling_rate = librosa.load(
    librosa.ex("nutcracker"),
    duration=duration,
)
# Audio widget to listen to the loaded excerpt
Audio(rate=sampling_rate, data=y)

## Compute features

### Chromagram

In [None]:
# hop_length: number of samples between successive chroma frames
# n_chroma: number of chroma bins to produce
hop_length, n_chroma = 512, 12
# Compute the chromagram using the usual 12 chromas.
# The result has one row per chroma bin and one column per frame,
# i.e. it is of shape [n_chroma, n_frames].
chromagram = librosa.feature.chroma_cqt(
    y=y,
    sr=sampling_rate,
    n_chroma=n_chroma,
    hop_length=hop_length,
)

### Tempogram

In [None]:
# Onset strength envelope of the soundtrack (one value per frame)
oenv = librosa.onset.onset_strength(
    y=y,
    sr=sampling_rate,
    hop_length=hop_length,
)
# Length of the onset autocorrelation window (in frames)
win_length = 384
# Tempogram computed from the onset envelope, of shape [win_length, n_frames]
tempogram = librosa.feature.tempogram(
    onset_envelope=oenv,
    sr=sampling_rate,
    win_length=win_length,
    hop_length=hop_length,
)

## Compute and display segmentations

In [None]:
def plot_util(
    rpt_bkps,
    feature,
    sampling_rate,
    hop_length,
    y_axis,
    vline_min,
    vline_max,
    ax,
    title,
):
    """Display a feature matrix and overlay the detected change points.

    Args:
        rpt_bkps: sequence of break points expressed as frame indexes, in the
            `ruptures` convention (the last element marks the end of the
            signal and is not drawn).
        feature: 2D array forwarded to `librosa.display.specshow`.
        sampling_rate: sampling rate of the soundtrack.
        hop_length: number of samples between successive frames of `feature`.
        y_axis: value of the `y_axis` option of `librosa.display.specshow`
            (e.g. "chroma" or "tempo").
        vline_min: lower end of the vertical lines, in y-axis data units.
        vline_max: upper end of the vertical lines, in y-axis data units.
        ax: Matplotlib axes to draw on.
        title: title of the axes.
    """
    # `ruptures` returns the end index of each segment while the plot needs
    # the start index of each segment: prepend 0 and drop the last element.
    # Unpacking (rather than `[0] + rpt_bkps[:-1]`) also accepts arrays/tuples.
    segment_starts = [0, *rpt_bkps[:-1]]
    # Translate frame indexes into time stamps of the original soundtrack.
    # `hop_length` must be forwarded: without it librosa silently assumes its
    # own default hop, which misplaces the lines for any other value.
    bkps_times = librosa.frames_to_time(
        segment_starts, sr=sampling_rate, hop_length=hop_length
    )
    # Plot the feature matrix
    librosa.display.specshow(
        feature,
        x_axis="time",
        y_axis=y_axis,
        ax=ax,
        hop_length=hop_length,
        cmap="magma",
        sr=sampling_rate,
    )
    # Mark the beginning of each segment with a dashed vertical line
    ax.vlines(
        bkps_times,
        vline_min,
        vline_max,
        color="linen",
        linestyle="--",
        linewidth=1.5,
        alpha=0.9,
    )
    ax.set(title=title)
    ax.label_outer()


# Create figure: raw soundtrack on top, then one panel per segmentation
fig, axs = plt.subplots(nrows=3, sharex=True, figsize=(8, 6), dpi=150)

# Display raw soundtrack
# `waveplot` is no longer available in recent librosa releases, where
# `waveshow` replaces it; use whichever one the installed version provides.
if hasattr(librosa.display, "waveshow"):
    librosa.display.waveshow(y, sr=sampling_rate, ax=axs[0])
else:
    librosa.display.waveplot(y, sr=sampling_rate, ax=axs[0])
axs[0].set(title="nutcracker")
axs[0].label_outer()

# Display results when asking for 5 break points
n_bkps = 5
# `perf_counter` is a monotonic, high-resolution clock meant for measuring
# short durations (unlike the wall clock returned by `time.time`)
start_time = time.perf_counter()
algo = rpt.KernelCPD(kernel="linear").fit(chromagram.T)
rpt_bkps = algo.predict(n_bkps=n_bkps)
end_time = time.perf_counter()
plot_util(
    rpt_bkps,
    chromagram,
    sampling_rate,
    hop_length,
    "chroma",
    0,
    chromagram.shape[0],
    axs[1],
    f"Algo : {type(algo).__name__}, Cost : {algo.kernel_name}, n_bkps : {n_bkps}, execution time : {(end_time-start_time)*1000:.1f}ms",
)

# Display results when using the penalized search
# method (the number of returned break points is not known in advance)
penalty = 50.0
# The algorithm is fitted again on purpose so that both timings cover
# the same work (fit + predict)
start_time = time.perf_counter()
algo = rpt.KernelCPD(kernel="linear").fit(chromagram.T)
rpt_bkps = algo.predict(pen=penalty)
end_time = time.perf_counter()
plot_util(
    rpt_bkps,
    chromagram,
    sampling_rate,
    hop_length,
    "chroma",
    0,
    chromagram.shape[0],
    axs[2],
    f"Algo : {type(algo).__name__}, Cost : {algo.kernel_name}, penalty : {penalty}, execution time : {(end_time-start_time)*1000:.1f}ms",
)

In [None]:
# Create figure: raw soundtrack, onset strength, then one panel per segmentation
fig, axs = plt.subplots(nrows=4, sharex=True, figsize=(8, 9), dpi=150)

# Display raw soundtrack
# `waveplot` is no longer available in recent librosa releases, where
# `waveshow` replaces it; use whichever one the installed version provides.
if hasattr(librosa.display, "waveshow"):
    librosa.display.waveshow(y, sr=sampling_rate, ax=axs[0])
else:
    librosa.display.waveplot(y, sr=sampling_rate, ax=axs[0])
axs[0].set(title="nutcracker")
axs[0].label_outer()
# Display onset strength against the time stamp of each frame
times = librosa.times_like(oenv, sr=sampling_rate, hop_length=hop_length)
axs[1].plot(times, oenv, label="Onset strength")
axs[1].label_outer()
axs[1].legend(frameon=True)

# Display results when asking for 5 break points
n_bkps = 5
# `perf_counter` is a monotonic, high-resolution clock meant for measuring
# short durations (unlike the wall clock returned by `time.time`)
start_time = time.perf_counter()
algo = rpt.KernelCPD(kernel="rbf").fit(tempogram.T)
rpt_bkps = algo.predict(n_bkps=n_bkps)
end_time = time.perf_counter()
plot_util(
    rpt_bkps,
    tempogram,
    sampling_rate,
    hop_length,
    "tempo",
    16,
    480,
    axs[2],
    f"Algo : {type(algo).__name__}, Cost : {algo.kernel_name}, n_bkps : {n_bkps}, execution time : {(end_time-start_time)*1000:.1f}ms",
)

# Display results when using the penalized search
# method (the number of returned break points is not known in advance)
penalty = 50.0
# The algorithm is fitted again on purpose so that both timings cover
# the same work (fit + predict)
start_time = time.perf_counter()
algo = rpt.KernelCPD(kernel="rbf").fit(tempogram.T)
rpt_bkps = algo.predict(pen=penalty)
end_time = time.perf_counter()
plot_util(
    rpt_bkps,
    tempogram,
    sampling_rate,
    hop_length,
    "tempo",
    16,
    480,
    axs[3],
    f"Algo : {type(algo).__name__}, Cost : {algo.kernel_name}, penalty : {penalty}, execution time : {(end_time-start_time)*1000:.1f}ms",
)

## Study each segment independently

Each segment of the music can be characterized by a set of features. Here, for each resulting segment, we compute the mean of every feature.

We aim at representing how the chosen algorithm detects mean-shifts in the multi-dimensional input signal.

In [None]:
# Run `ruptures` algorithm
n_bkps = 3
algo = rpt.KernelCPD(kernel="linear").fit(tempogram.T)
rpt_bkps = algo.predict(n_bkps=n_bkps)
# Start time of each segment. `hop_length` is forwarded explicitly so that
# the conversion stays correct if the hop used for the features changes.
rpt_bpks_times = librosa.frames_to_time(
    [0, *rpt_bkps[:-1]], sr=sampling_rate, hop_length=hop_length
)

# Compute the mean of every feature for each segment
# (one row per segment, one column per tempogram feature)
heat_map = np.zeros((n_bkps + 1, tempogram.shape[0]))
for segment_number, segment in enumerate(np.split(tempogram.T, rpt_bkps[:-1])):
    heat_map[segment_number, :] = segment.mean(axis=0)
# Stretch x-axis to capture segment time duration:
# each row is repeated as many times as its segment has frames
heat_map = np.repeat(heat_map, np.diff([0] + rpt_bkps), axis=0)
# Plot features means on segments
fig, ax = plt.subplots(figsize=(8, 4), dpi=150)
plot_util(
    rpt_bkps,
    heat_map.T,
    sampling_rate,
    hop_length,
    "tempo",
    16,
    480,
    ax,
    "Features means on each segment",
)

# Output audio for each segment
# Despite its name, `bkps_times` holds the break points as sample indexes of
# `y`. They are obtained with integer arithmetic (frame index * hop length)
# rather than through a float round trip via seconds, whose truncation can
# be off by one sample.
bkps_times = np.asarray(rpt_bkps[:-1], dtype=int) * hop_length
for segment_number, segment in enumerate(np.split(y, bkps_times), start=1):
    print(f"Segment n°{segment_number}")
    display(Audio(data=segment, rate=sampling_rate))