**DSP DEMIXING**


In [None]:
# Install the packages listed in requirements.txt using the IPython %pip magic
%pip install -r requirements.txt

Let's start by loading the dataset. The folder "musdb18hq_trimmed" contains a 30-second excerpt of every track. Since not all the stems were non-silent in the first 30 seconds of each track, we trimmed the dataset so that each excerpt is a 30-second segment in which every stem is non-silent, in order to have a more accurate measure.

In [None]:
import tqdm, os, torchaudio

def load_dataset(
    root="/Users/alessandromanattini/Desktop/MAE/SELECTED TOPIC/PROJECT STMAE/musdb18hq_trimmed",
    stem_names=("mixture", "drums", "bass", "vocals", "other", "new_mixture"),
):
    """
    Load every stem of every track found under the dataset root.

    Each sub-folder of ``root`` is treated as one song; for each song the
    files ``<stem_name>.wav`` are loaded with ``torchaudio.load``.

    Parameters:
        root (str): Dataset folder. Defaults to the original hard-coded
            musdb18hq_trimmed location, so ``load_dataset()`` keeps working.
        stem_names (iterable of str): Stem file names (without ``.wav``)
            to look for in every track folder.

    Returns:
        dataset_dict (dict): ``{track_folder: {stem_name: waveform}}`` where
            ``waveform`` is the first value returned by ``torchaudio.load``.
            Stems whose file is missing are reported with a warning and
            omitted from the inner dictionary.
    """
    dataset_dict = {}

    # Sort the listing: os.listdir() order is arbitrary, and callers index
    # tracks by position, so a fixed order makes runs reproducible.
    for track_folder in tqdm.tqdm(sorted(os.listdir(root))):
        track_path = os.path.join(root, track_folder)
        if not os.path.isdir(track_path):
            # Skip stray files in the dataset root
            continue

        # Sub-dictionary holding the stems of this song
        stems_dict = {}

        for stem_name in stem_names:
            file_path = os.path.abspath(os.path.join(track_path, f"{stem_name}.wav"))

            if not os.path.isfile(file_path):
                print(f"Warning: file not found {file_path}")
                continue

            # Load the full audio; the sample rate is not stored
            waveform, _ = torchaudio.load(file_path)
            stems_dict[stem_name] = waveform

        dataset_dict[track_folder] = stems_dict

    return dataset_dict

In [None]:
# Build the {track: {stem: waveform}} dictionary
dataset_dict = load_dataset()

print("Number of keys in dataset_dict:", len(dataset_dict))

# Inspect one track: its folder name and the shape of each loaded stem
first_track_folder = list(dataset_dict)[0]
first_track_stems = dataset_dict[first_track_folder]
print("First track folder:", first_track_folder)
print("Contents of the first track folder:")
for stem_name, waveform in first_track_stems.items():
    print(f" - {stem_name}: {waveform.shape}")

Let's collect the paths of all the mixtures (`new_mixture.wav`) in a list ***mixture_files[]***.


In [None]:
# Collect the path of every track's new_mixture.wav (paths only, no audio is read)
dataset_root = "/Users/alessandromanattini/Desktop/MAE/SELECTED TOPIC/PROJECT STMAE/musdb18hq_trimmed"
mixture_files = []
for track_folder in dataset_dict:
    new_mixture_path = os.path.join(dataset_root, track_folder, "new_mixture.wav")
    if not os.path.isfile(new_mixture_path):
        # Report the missing file and move on to the next track
        print(f"Warning: file not found {new_mixture_path}")
        continue
    mixture_files.append(new_mixture_path)


Define the parameters of the STFT and compute the magnitude and phase of the STFT of every mixture:

In [None]:
import librosa
# STFT parameters
n_fft = 2048
hop_length = 512
win = 'hann'

# Initialize lists to store STFT results (one entry per mixture file)
S_full_list = []
phase_list = []

# Loop through each mixture file and compute STFT
# NOTE: `win` and, after the loop, `sr` and `phase` (values of the LAST file)
# remain defined at notebook level; vocal_extraction and
# bass_other_extraction read these names as globals.
for mixture_path in mixture_files:
    # Load the audio from file (sr=None: no resampling is requested)
    audio, sr = librosa.load(mixture_path, sr=None)
    # Compute the STFT
    D = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length, window=win)
    # Extract magnitude and phase
    mag, phase = librosa.magphase(D)
    
    S_full_list.append(mag)
    phase_list.append(phase)

Now we are going to define some functions:
- drums_extraction
- vocal_extraction
- bass_other_extraction


**DRUMS EXTRACTION**
(using HPSS):

- STFT Magnitude Input: The function receives the magnitude (mix_mag) and phase (mix_phase) of the mixture’s STFT.

- HPSS Decomposition: It utilizes the **Harmonic-Percussive Source Separation (HPSS)** algorithm to split the mixture’s magnitude into two components:
    1) A ***harmonic component*** that captures the tonal content.
    2) A ***percussive component*** that emphasizes transient, drum-like features.

- Drums Reconstruction: The function then reconstructs the time-domain drums signal by combining the percussive component with the original phase information using the iSTFT.

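- Output: The function returns the time-domain drums signal, which represents the extracted percussive (drum) elements of the mixture, together with the harmonic magnitude component, which can then be passed to the bass/other extraction.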

In [None]:
# Drums extraction using HPSS
import numpy as np

def drums_extraction(mix_mag, mix_phase, n_fft=2048, hop_length=512, window=None):
    """
    Extract drums from a mixture spectrogram using Harmonic-Percussive
    Source Separation (HPSS).

    Parameters:
        mix_mag (ndarray): Magnitude of the mixture STFT.
        mix_phase (ndarray): Phase factor of the mixture STFT, multiplied
            element-wise with the percussive magnitude before inversion
            (e.g. the second output of librosa.magphase).
        n_fft (int): FFT window size; passed to the iSTFT as win_length.
        hop_length (int): Hop length for the iSTFT.
        window: Window specification for the iSTFT. When None, the
            notebook-level ``win`` setting is used (original behaviour).

    Returns:
        y_drums (ndarray): Time-domain signal rebuilt from the percussive component.
        S_harmonic (ndarray): Harmonic magnitude component from HPSS.
    """
    if window is None:
        # Backward-compatible fallback to the global STFT window setting
        window = win

    # Decompose the mixture magnitude into harmonic and percussive components
    S_harmonic, S_percussive = librosa.decompose.hpss(mix_mag)

    # Reconstruct the drums from the percussive magnitude and the mixture phase
    y_drums = librosa.istft(
        S_percussive * mix_phase,
        hop_length=hop_length,
        win_length=n_fft,
        window=window,
    )

    return y_drums, S_harmonic

**Vocal Extraction** (using REPET-SIM):

- STFT Magnitude Input: Similar to the drums extraction, this function takes the mixture’s magnitude (mix_mag) and phase (mix_phase) as input.

- NN Filter Processing: The function applies a ***nearest-neighbor (nn) filter*** to the magnitude. This filter:
    - Uses a median aggregation to estimate a smooth background signal.
    - Operates with a cosine similarity metric and a time window (converted from 2.0 seconds into frames) to capture repeating patterns.

- Filter Application: The filtered version (S_filter) is then constrained by taking the element-wise minimum with the original magnitude, ensuring that only components present in both are retained.

- Soft Mask Creation: A soft mask is computed using librosa.util.softmask that emphasizes differences between the original magnitude and the filtered background. This mask is tuned (with a factor of 100 and power 2) to highlight vocal components.

- Vocals Reconstruction: The function applies this mask to the original magnitude to produce a modified magnitude focused on the vocal content. It then reconstructs the time-domain vocal signal by combining this modified magnitude with the original phase via the iSTFT.

- Output: The function returns the time-domain signal of the extracted vocals together with the soft vocal mask that was applied to the mixture.

In [None]:
import warnings

# Vocal extraction using REPET-SIM
def vocal_extraction(mix_mag, mix_phase, n_fft=2048, hop_length=512,
                     sample_rate=None, window=None):
    """
    Extract vocals from a mixture spectrogram using REPET-SIM
    (nearest-neighbour median filtering plus a soft mask).

    Parameters:
        mix_mag (ndarray): Magnitude of the mixture STFT.
        mix_phase (ndarray): Phase factor of the mixture STFT, multiplied
            element-wise with the vocal magnitude before inversion.
        n_fft (int): FFT window size; passed to the iSTFT as win_length.
        hop_length (int): Hop length of the STFT; used both for the iSTFT
            and to convert the 2-second similarity-exclusion width to frames.
        sample_rate (int | None): Sample rate of the analysed audio. When
            None, the notebook-level ``sr`` is used (original behaviour).
        window: Window specification for the iSTFT. When None, the
            notebook-level ``win`` setting is used (original behaviour).

    Returns:
        y_vocals (ndarray): Time-domain vocal estimate.
        mask_vocal (ndarray): Soft mask applied to the mixture magnitude.
    """
    # Backward-compatible fallbacks to the notebook globals
    if sample_rate is None:
        sample_rate = sr
    if window is None:
        window = win

    # add small epsilon to avoid zero-vectors in cosine similarity
    eps = 1e-8
    mix_mag_eps = mix_mag + eps

    # Frames closer than 2 s are excluded from the neighbour search. The
    # conversion must use the same hop length as the spectrogram itself.
    width = int(librosa.time_to_frames(2.0, sr=sample_rate, hop_length=hop_length))

    S_filter = librosa.decompose.nn_filter(
        mix_mag_eps,
        aggregate=np.median,
        metric='cosine',
        width=width,
    )
    # The repeating background cannot exceed the mixture itself
    S_filter = np.minimum(S_filter, mix_mag_eps)
    S_filter = np.nan_to_num(S_filter, nan=0.0, posinf=0.0, neginf=0.0)

    # create and sanitize soft mask
    mask_vocal = librosa.util.softmask(
        mix_mag_eps - S_filter,
        100 * S_filter,
        power=2
    )
    mask_vocal = np.nan_to_num(mask_vocal, nan=0.0, posinf=0.0, neginf=0.0)

    # Masked magnitude recombined with the mixture phase, then inverted
    S_vocal = mask_vocal * mix_mag_eps
    y_vocals = librosa.istft(
        S_vocal * mix_phase,
        hop_length=hop_length,
        win_length=n_fft,
        window=window,
    )

    return y_vocals, mask_vocal

**BASS & OTHERS EXTRACTION**

For bass and other non-vocal components, the process begins with isolating the non-vocal harmonic residue by subtracting the estimated vocal contribution from the harmonic component. Frequency information is then used to create masks: one that selects frequencies below a certain threshold (250 Hz) to capture bass elements, and a complementary mask that selects the remaining frequencies for other elements. These masks are applied to the non-vocal residue, and by incorporating the original phase data, two distinct time-domain signals are reconstructed—one corresponding to the bass and the other to the rest of the non-vocal components.

In [None]:
# Bass and Other extraction
def bass_other_extraction(S_harmonic, mask_vocal, n_fft=2048, hop_length=512,
                          mix_phase=None, sample_rate=None, window=None,
                          bass_cutoff=250.0):
    """
    Split the non-vocal harmonic residue into bass and "other" using a
    frequency threshold.

    Parameters:
        S_harmonic (ndarray): The harmonic magnitude component of the mixture.
        mask_vocal (ndarray): The vocal mask obtained from the vocal extraction.
        n_fft (int): FFT window size; used to compute the bin frequencies.
        hop_length (int): Hop length for the iSTFT.
        mix_phase (ndarray | None): Phase factor of the mixture STFT that
            S_harmonic was derived from. Pass it explicitly: when None the
            notebook-level ``phase`` is used (original behaviour), which is
            the phase of whichever file the STFT loop processed last.
        sample_rate (int | None): Sample rate of the analysed audio. When
            None, the notebook-level ``sr`` is used (original behaviour).
        window: Window specification for the iSTFT. When None, the
            notebook-level ``win`` setting is used (original behaviour).
        bass_cutoff (float): Bins strictly below this frequency (Hz) go to
            the bass stem, all remaining bins to the other stem.

    Outputs:
        y_bass (ndarray): Extracted bass component.
        y_other (ndarray): Extracted other component.
    """
    # Backward-compatible fallbacks to the notebook globals
    if mix_phase is None:
        mix_phase = phase
    if sample_rate is None:
        sample_rate = sr
    if window is None:
        window = win

    # Non-vocal harmonic residue
    S_resid = S_harmonic - (mask_vocal * S_harmonic)

    # Complementary per-bin masks, shaped (n_bins, 1) to broadcast over frames
    freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    bass_mask = (freqs[:, None] < bass_cutoff).astype(float)
    other_mask = 1.0 - bass_mask

    y_bass = librosa.istft((S_resid * bass_mask) * mix_phase, hop_length=hop_length, window=window)
    y_other = librosa.istft((S_resid * other_mask) * mix_phase, hop_length=hop_length, window=window)

    return y_bass, y_other

Now we are going to do the separation for just the first track of the dataset to see the results.

In [None]:
import IPython.display as ipd

# Select the first mixture path and load it (sr=None: no resampling is requested).
# No separation is performed in this cell.
mixture_path = mixture_files[0]
mixture, sr = librosa.load(mixture_path, sr=None)



In [None]:
import librosa
# Load the mixture at mixture_path (sr=None: no resampling is requested) and play it
mixture, sr = librosa.load(mixture_path, sr=None)
print("Mixture:")
# Last expression of the cell: the notebook renders the audio player
ipd.Audio(mixture, rate=sr)


In [None]:
# hpss_stereo.py
# HPSS “from scratch” su segnali stereo

from __future__ import annotations
import numpy as np
import librosa
import scipy.ndimage
import soundfile as sf
from pathlib import Path


def hpss_stereo(
    path_in: str,
    *,
    sr: int = 44_100,
    n_fft: int = 4096,
    hop: int = 1024,
    harmonic_filt: int = 31,
    percussive_filt: int = 31,
    mask: str = "soft",        # "soft" or "binary"
    power: float = 2.0,        # 2 => power spectrum; 1 => magnitude
    out_harm: str | None = None,
    out_perc: str | None = None,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Median-filter HPSS applied independently to each channel of a file.

    Parameters
    ----------
    path_in         : path of the input audio file
    sr              : working sample rate passed to librosa.load
    n_fft, hop      : STFT parameters
    harmonic_filt   : length of the horizontal median filter (frames)
    percussive_filt : length of the vertical median filter (frequency bins)
    mask            : "binary" for a hard mask; any other value selects
                      the soft (ratio) mask
    power           : exponent applied to the STFT magnitude before
                      filtering (1 = magnitude, 2 = power)
    out_harm/out_perc : optional paths; when given, the harmonic /
                      percussive stem is written there with soundfile

    Returns
    -------
    y_h, y_p : np.ndarray
        Harmonic and percussive signals, shape (n_channels, n_samples).
        A mono input is duplicated to two channels first.
    """
    # 1. load without down-mixing; keep the sample rate actually used
    y, sr_loaded = librosa.load(path_in, sr=sr, mono=False)
    if y.ndim == 1:
        # mono input: duplicate it so the output shape stays consistent
        y = np.vstack([y, y])
    n_channels, n_samples = y.shape

    # containers for the separated stems
    y_h = np.zeros_like(y)
    y_p = np.zeros_like(y)

    # 2-6. run the HPSS pipeline on each channel
    for ch in range(n_channels):
        X = librosa.stft(y[ch], n_fft=n_fft, hop_length=hop, window="hann")
        mag = np.abs(X)
        Y = mag**power

        # anisotropic median filtering: along time for the harmonic part,
        # along frequency for the percussive part
        Y_h = scipy.ndimage.median_filter(Y, size=(1, harmonic_filt))
        Y_p = scipy.ndimage.median_filter(Y, size=(percussive_filt, 1))

        # mask construction
        if mask == "binary":
            Mh = (Y_h >= Y_p).astype(np.float32)
            Mp = 1.0 - Mh
        else:  # soft
            eps = 1e-10  # avoids 0/0 where both filtered spectra vanish
            denom = Y_h + Y_p + eps
            Mh = Y_h / denom
            Mp = Y_p / denom

        # apply the masks to the complex STFT and reconstruct
        X_h = X * Mh
        X_p = X * Mp
        y_h[ch] = librosa.istft(X_h, hop_length=hop, window="hann", length=n_samples)
        y_p[ch] = librosa.istft(X_p, hop_length=hop, window="hann", length=n_samples)

    # 7. optionally save the stems; soundfile expects (frames, channels)
    if out_harm is not None:
        sf.write(out_harm, y_h.T, sr_loaded)
    if out_perc is not None:
        sf.write(out_perc, y_p.T, sr_loaded)

    return y_h, y_p



In [None]:
# Call the function and save the two components
# Soft-mask HPSS on the current mixture
hpss_settings = dict(
    sr=44100,
    n_fft=2048,
    hop=512,
    harmonic_filt=31,
    percussive_filt=31,
    mask="soft",
    power=2.0,
)
harmonic_comp, percussive_comp = hpss_stereo(path_in=mixture_path, **hpss_settings)

# Listen to both components at the sample rate used for the separation
playback_rate = hpss_settings["sr"]
print("Percussive Component:")
print(percussive_comp.shape)
display(ipd.Audio(percussive_comp, rate=playback_rate))
print("Harmonic Component:")
display(ipd.Audio(harmonic_comp, rate=playback_rate))

In [None]:
# Binary-mask HPSS with longer median filters on the current mixture
hpss_settings = dict(
    sr=44100,
    n_fft=2048,
    hop=512,
    harmonic_filt=51,
    percussive_filt=101,
    mask="binary",
    power=2.0,
)
harmonic_comp, percussive_comp = hpss_stereo(path_in=mixture_path, **hpss_settings)

# Listen to both components at the sample rate used for the separation
playback_rate = hpss_settings["sr"]
print("Percussive Component:")
print(percussive_comp.shape)
display(ipd.Audio(percussive_comp, rate=playback_rate))
print("Harmonic Component:")
display(ipd.Audio(harmonic_comp, rate=playback_rate))

**REPET VOCAL EXTRACTION** (period-based repeating model)

In [None]:
import numpy as np
import librosa
import scipy.ndimage
import soundfile as sf
from pathlib import Path

def repet_separate_stereo(
    y: np.ndarray,
    *,
    sr: int = 44100,
    n_fft: int = 2048,
    hop_length: int = 512,
    mask_type: str = "soft",  # "soft" or "binary"
) -> tuple[np.ndarray, np.ndarray]:
    """
    Separate vocals and music from a mix using the REPET algorithm.

    The repeating model is estimated once on the mid channel (mean of the
    channels); a median-smoothed mask is then derived and applied to every
    channel separately.

    Parameters
    ----------
    y : np.ndarray
        Audio samples, shape (n_samples,) or (n_channels, n_samples).
        A 1-D input is duplicated to two channels.
    sr : int
        Sampling rate of ``y``.
    n_fft : int
        FFT window size for STFT.
    hop_length : int
        Hop length between STFT frames.
    mask_type : {"soft", "binary"}
        "binary" selects a hard mask; any other value selects the
        proportional (soft) mask.

    Returns
    -------
    vocals : np.ndarray
        Separated vocal signal, shape=(n_channels, n_samples).
    music  : np.ndarray
        Separated music signal, shape=(n_channels, n_samples).
        ``vocals + music`` equals the input, channel by channel.

    Raises
    ------
    ValueError
        If no usable tempo is found, or the audio holds fewer than two
        repetitions of the estimated period.
    """
    # 1. Make sure we work on a (channels, samples) array
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.vstack([y, y])
    n_ch, n_samples = y.shape

    # 2. Compute mid-channel for mask estimation
    y_mid = np.mean(y, axis=0)

    # 3. Power spectrogram of the mid-channel
    X_mid = librosa.stft(y_mid, n_fft=n_fft, hop_length=hop_length, window='hann', center=True)
    V_mid = np.abs(X_mid) ** 2

    # 4. Estimate the repeating period (one beat, in frames) from the tempo.
    # The tempo estimator lives in librosa.feature in recent librosa
    # releases; older releases only provide librosa.beat.tempo.
    onset_env = librosa.onset.onset_strength(y=y_mid, sr=sr, hop_length=hop_length)
    tempo_fn = getattr(librosa.feature, "tempo", None) or librosa.beat.tempo
    tempos = np.atleast_1d(tempo_fn(onset_envelope=onset_env, sr=sr, hop_length=hop_length))
    if tempos.size == 0:
        raise ValueError("Could not estimate tempo. Check audio quality.")
    tempo = float(tempos[0])
    if not np.isfinite(tempo) or tempo <= 0:
        raise ValueError("Could not estimate tempo. Check audio quality.")
    period = int(round(60 * sr / (tempo * hop_length)))
    if period < 1:
        raise ValueError("Estimated period too small. Check audio or parameters.")
    print(f"Estimated tempo: {tempo:.2f} BPM, repeating period: {period} frames")

    # 5. Build repeating pattern W
    n_bins, n_frames = V_mid.shape
    n_segments = n_frames // period
    if n_segments < 2:
        raise ValueError("Not enough repeating segments. Try a smaller period or longer audio.")
    V_trim = V_mid[:, :n_segments * period]
    # Frame index = segment * period + offset, so the segment axis has to
    # come before the in-period offset axis for the reshape to group the
    # frames that share the same position inside the period.
    segments = V_trim.reshape(n_bins, n_segments, period)
    W = np.median(segments, axis=1)  # (n_bins, period)

    # 6. Expand W to full length (whole periods plus a partial last one)
    W_full = np.tile(W, (1, n_segments))
    tail = n_frames - W_full.shape[1]
    if tail > 0:
        W_full = np.hstack((W_full, W[:, :tail]))

    # 7. Separate per channel with smoothing
    music = np.zeros((n_ch, n_samples))
    vocals = np.zeros((n_ch, n_samples))
    eps = 1e-10
    for ch in range(n_ch):
        # STFT per channel
        X = librosa.stft(y[ch], n_fft=n_fft, hop_length=hop_length, window='hann', center=True)
        V = np.abs(X)**2

        # Music (repeating) mask per channel
        if mask_type == "binary":
            # Hard version of the soft mask: 1 exactly where the soft mask
            # saturates, i.e. the repeating model covers the bin's energy.
            M = (W_full >= V).astype(float)
        else:
            M = W_full / (V + eps)
        M = np.clip(M, 0, 1)

        # Power-domain mask -> amplitude mask, then smooth to reduce artefacts
        mask_amp = np.sqrt(M)
        mask_amp = scipy.ndimage.median_filter(mask_amp, size=(3, 3))

        # Apply mask and reconstruct music
        X_music = X * mask_amp
        y_music = librosa.istft(X_music, hop_length=hop_length, window='hann', center=True, length=n_samples)

        # The music estimate is kept at its original gain: rescaling it
        # before the subtraction would leave scaled music in the vocals.
        music[ch] = y_music
        vocals[ch] = y[ch] - y_music

    return vocals, music



In [None]:
# Run REPET on the mixture currently loaded in `mixture`
vocal, music = repet_separate_stereo(
    y=mixture,
    sr=44100,
    n_fft=2048,
    hop_length=512,
    mask_type="soft"
)
# NOTE(review): repet_separate_stereo builds vocals as (input - music), so
# this residual is zero up to floating-point rounding — confirm that this
# is the signal meant to be auditioned below.
other = mixture - vocal - music
print("Mixture:")
display(ipd.Audio(mixture, rate=44100))
print("Difference between original and music component:")
display(ipd.Audio(other, rate=44100))

# Disabled experiment: write `other` to disk and run HPSS on it to get drums.
# output_path = Path("/Users/alessandromanattini/Desktop/MAE/SELECTED TOPIC/PROJECT STMAE")
# output_other= output_path / "other.wav"
# sf.write(output_other, other.T, 44100)

# _, drums = hpss_stereo(
#     path_in=output_other,
#     sr=44100,
#     n_fft=2048,
#     hop=512,
#     harmonic_filt=31,
#     percussive_filt=31,
#     mask="soft",
#     power=2.0
# )

# print("Drums Component:")

# display(ipd.Audio(drums, rate=44100))

# Play the separated components
print("Vocal Component:")
display(ipd.Audio(vocal, rate=44100))
print("Music Component:")
display(ipd.Audio(music, rate=44100))

HPSS+REPET-SIM



In [None]:
import scipy.signal

def separate_sources(
    mixture_path,
    sr=None,
    hpss_margin=1.0,
    bass_cutoff=200,
    bass_order=4,
    nn_width_sec=1.0,
    verbose=True,
):
    """
    Load an audio file and separate it into drums, bass, vocals and other.

    Pipeline: HPSS splits the mix into percussive and harmonic parts; the
    percussive part is cleaned with a nearest-neighbour median filter
    (drums); the harmonic part is low-pass filtered (bass) and its
    non-repeating residue is taken as vocals; "other" is what remains of
    the mix after subtracting the three estimates.

    Parameters:
        mixture_path (str): Path of the audio file to separate.
        sr (int | None): Target sample rate for librosa.load
            (None: no resampling is requested).
        hpss_margin (float): Margin passed to librosa.decompose.hpss.
        bass_cutoff (float): Cut-off frequency (Hz) of the bass low-pass.
        bass_order (int): Order of the Butterworth low-pass.
        nn_width_sec (float): Similarity-exclusion width of the
            nearest-neighbour filter, in seconds.
        verbose (bool): When True (default, original behaviour) the three
            intermediate drum signals are reconstructed and played in the
            notebook through ``ipd``. Pass False for batch processing.

    Returns:
        dict: keys 'drums', 'bass', 'vocals', 'other' (numpy arrays, all
        mono) and 'sr' (sample rate returned by librosa.load).
    """
    # 1) load & mono
    y, sr = librosa.load(mixture_path, sr=sr)
    if y.ndim > 1:
        y = librosa.to_mono(y)
    n_samples = len(y)

    # 2) HPSS for initial drums extraction
    D = librosa.stft(y)
    D_harm, D_perc = librosa.decompose.hpss(D, margin=hpss_margin, power=2.0)
    y_drums_initial = librosa.istft(D_perc, length=n_samples)
    y_harmonic = librosa.istft(D_harm, length=n_samples)

    # 3) Apply REPET-SIM to drums to remove vocal residuals
    S_drums, phase_drums = librosa.magphase(librosa.stft(y_drums_initial))
    width = int(librosa.time_to_frames(nn_width_sec, sr=sr))
    S_filter = librosa.decompose.nn_filter(
        S_drums, aggregate=np.median, metric='cosine', width=width
    )
    # Energy above the repeating estimate = vocal residuals (to be discarded)
    vocal_residuals_in_drums = np.clip(S_drums - S_filter, 0, None)

    if verbose:
        # Audition the intermediate spectrograms, all with the drums phase
        auditions = (
            ("Vocal residuals in drums:", vocal_residuals_in_drums),
            ("Drums before cleaning:", S_drums),
            ("Filter applied to drums:", S_filter),
        )
        for label, magnitude in auditions:
            y_audition = librosa.istft(magnitude * phase_drums, length=n_samples)
            print(label)
            ipd.display(ipd.Audio(y_audition, rate=sr))

    # Clean drums = original drums - vocal residuals
    S_drums_clean = S_drums - vocal_residuals_in_drums
    y_drums = librosa.istft(S_drums_clean * phase_drums, length=n_samples)

    # 4) low-pass for bass (cut-off normalised by the Nyquist frequency)
    nyq = 0.5 * sr
    b, a = scipy.signal.butter(bass_order, bass_cutoff/nyq, btype='low')
    y_bass = scipy.signal.lfilter(b, a, y_harmonic)

    # 5) NN-median filter for vocals from harmonic component
    S_full, phase = librosa.magphase(librosa.stft(y_harmonic))
    S_filter = librosa.decompose.nn_filter(
        S_full, aggregate=np.median, metric='cosine', width=width
    )
    mask_voc = np.clip(S_full - S_filter, 0, None)
    y_vocals = librosa.istft(mask_voc * phase, length=n_samples)

    # 6) residual "other"
    y_other = y - y_drums - y_bass - y_vocals

    return {
        'drums':   y_drums,
        'bass':    y_bass,
        'vocals':  y_vocals,
        'other':   y_other,
        'sr': sr
    }


Single Track

In [None]:
# Separate the current mixture and expose each stem under its own name
sources = separate_sources(mixture_path)
drums, bass, vocals, other = (
    sources[key] for key in ('drums', 'bass', 'vocals', 'other')
)

# Play the separated components, in the same order as before
playback_rate = sources['sr']
for heading, stem_audio in (
    ("Drums Component:", drums),
    ("Bass Component:", bass),
    ("Vocals Component:", vocals),
    ("Other Component:", other),
):
    print(heading)
    display(ipd.Audio(stem_audio, rate=playback_rate))

In [None]:

# Switch the working track to mixture_files[49] (needs at least 50 entries)
# and load it (sr=None: no resampling is requested). No separation is performed here.
mixture_path = mixture_files[49]
mixture, sr = librosa.load(mixture_path, sr=None)

In [None]:
import librosa
import numpy as np
import scipy.signal

def separate_sources_v2(
    mixture_path,
    sr=None,
    hpss_margin=1.0,
    hpss_power=2.0,
    # Parameters for removing vocal residue from the drums
    drum_clean_vocal_freq_min=300.0,   # Hz, start of the attenuated vocal band
    drum_clean_vocal_freq_max=3000.0,  # Hz, end of the attenuated vocal band
    drum_clean_vocal_atten_factor=0.4, # gain in the band (0.0 = mute, 1.0 = unchanged)
    # Parameters for the bass
    bass_cutoff=200.0,
    bass_order=4,
    # Parameters for the main vocal separation
    nn_width_vocals_sec=2.0
):
    """
    Load an audio file and separate it into drums, bass, vocals and other,
    with a dedicated step that attenuates the vocal band in the drums.

    Parameters:
        mixture_path (str): Path of the audio file to separate.
        sr (int | None): Target sample rate for librosa.load
            (None: no resampling is requested).
        hpss_margin (float): Margin passed to librosa.decompose.hpss.
        hpss_power (float): Power passed to librosa.decompose.hpss.
        drum_clean_vocal_freq_min / drum_clean_vocal_freq_max (float):
            Inclusive frequency band (Hz) attenuated in the percussive part.
        drum_clean_vocal_atten_factor (float): Gain applied inside that band.
        bass_cutoff (float): Cut-off frequency (Hz) of the bass low-pass.
        bass_order (int): Order of the Butterworth low-pass.
        nn_width_vocals_sec (float): Similarity-exclusion width of the
            nearest-neighbour filter, in seconds.

    Returns:
        dict: keys 'drums', 'bass', 'vocals', 'other' (numpy arrays, all
        mono) and 'sr' (sample rate returned by librosa.load).
    """
    # 1) Load; librosa.load returns the sample rate actually used
    y, sr = librosa.load(mixture_path, sr=sr)
    if y.ndim > 1:
        y = librosa.to_mono(y)
    n_samples = len(y)

    # 2) HPSS on the STFT of the mix: harmonic and percussive parts
    D_mixture = librosa.stft(y)
    n_fft = (D_mixture.shape[0] - 1) * 2  # infer n_fft from the number of bins
    hop_length = n_fft // 4               # hop used by librosa.stft when none is given

    D_harmonic_mixture, D_percussive_mixture = librosa.decompose.hpss(
        D_mixture,
        margin=hpss_margin,
        power=hpss_power
    )

    # Overall harmonic signal (source for both bass and vocals)
    y_harmonic_overall = librosa.istft(D_harmonic_mixture, length=n_samples)

    # --- 3) Remove vocal residue from the drums ---
    D_perc_mag, D_perc_phase = librosa.magphase(D_percussive_mixture)

    # Work on a copy so the HPSS output is left untouched
    D_perc_mag_cleaned = np.copy(D_perc_mag)

    # Attenuate every STFT bin whose centre frequency lies in the vocal band
    frequencies = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    vocal_band = (
        (frequencies >= drum_clean_vocal_freq_min)
        & (frequencies <= drum_clean_vocal_freq_max)
    )
    D_perc_mag_cleaned[vocal_band, :] *= drum_clean_vocal_atten_factor

    # Rebuild the cleaned drums with the percussive phase
    D_drums_cleaned_stft = D_perc_mag_cleaned * D_perc_phase
    y_drums = librosa.istft(D_drums_cleaned_stft, length=n_samples)

    # --- 4) Bass: low-pass filter of the overall harmonic signal ---
    nyquist = 0.5 * sr
    # Keep the cut-off below the Nyquist frequency (1 Hz safety margin)
    actual_bass_cutoff = min(bass_cutoff, nyquist - 1)
    if actual_bass_cutoff <= 0:
        print(f"Attenzione: bass_cutoff ({bass_cutoff} Hz) non valido con sr={sr} Hz. Il basso non verrà filtrato.")
        # No valid cut-off: the bass estimate is silence
        y_bass = np.zeros_like(y_harmonic_overall)
    else:
        b, a = scipy.signal.butter(bass_order, actual_bass_cutoff / nyquist, btype='low', analog=False)
        y_bass = scipy.signal.lfilter(b, a, y_harmonic_overall)

    # --- 5) Vocals: non-repeating part of the overall harmonic signal ---
    # The bass estimate is not removed first; y_harmonic_overall is used as is.
    S_harmonic_overall, phase_harmonic_overall = librosa.magphase(librosa.stft(y_harmonic_overall))

    # Convert the width from seconds to frames. It is a duration, not a
    # timestamp, so no n_fft offset is applied; only the hop length matters.
    width_vocals_frames = int(
        librosa.time_to_frames(nn_width_vocals_sec, sr=sr, hop_length=hop_length)
    )

    S_instrumental_repeating = librosa.decompose.nn_filter(
        S_harmonic_overall,
        aggregate=np.median,
        metric='cosine',
        width=width_vocals_frames
    )

    # Vocal magnitude = harmonic energy above the repeating estimate
    S_vocals_mag = np.clip(S_harmonic_overall - S_instrumental_repeating, 0, None)

    y_vocals = librosa.istft(S_vocals_mag * phase_harmonic_overall, length=n_samples)

    # --- 6) "Other": the mix minus the three estimated stems ---
    y_other = y - y_drums - y_bass - y_vocals

    return {
        'drums': y_drums,
        'bass': y_bass,
        'vocals': y_vocals,
        'other': y_other,
        'sr': sr
    }

In [None]:
# Separate the current mixture with the v2 pipeline
sources = separate_sources_v2(mixture_path)
drums, bass, vocals, other = (
    sources[key] for key in ('drums', 'bass', 'vocals', 'other')
)

# Play the mixture loaded earlier in the notebook, at its own sample rate
print("Mixture:")
display(ipd.Audio(mixture, rate=sr))

# Play the separated components, in the same order as before
playback_rate = sources['sr']
for heading, stem_audio in (
    ("Drums Component:", drums),
    ("Bass Component:", bass),
    ("Vocals Component:", vocals),
    ("Other Component:", other),
):
    print(heading)
    display(ipd.Audio(stem_audio, rate=playback_rate))

Whole Dataset

In [None]:
# import the libraries
import torch
import numpy as np
from torchmetrics import SignalDistortionRatio
import librosa

In [None]:
sdr_metric = SignalDistortionRatio()

# Calculate average SDR for each stem across all tracks
stems = ['drums', 'vocals', 'bass', 'other']
stem_sdrs = {stem: [] for stem in stems}

for track_folder, reference_stems in dataset_dict.items():
    mixture_path = os.path.join("/Users/alessandromanattini/Desktop/MAE/SELECTED TOPIC/PROJECT STMAE/musdb18hq_trimmed", track_folder, "new_mixture.wav")
    # Separate each track once and score all of its stems, instead of
    # re-running the separation for every stem.
    separated_sources = separate_sources(mixture_path)

    for stem in stems:
        if stem not in reference_stems:
            continue

        # as_tensor avoids copying references that are already tensors
        ref_tensor = torch.as_tensor(reference_stems[stem], dtype=torch.float32)
        est_tensor = torch.as_tensor(separated_sources[stem], dtype=torch.float32)

        # Down-mix any 2-D (channels, samples) signal to mono
        if ref_tensor.dim() == 2:
            ref_tensor = torch.mean(ref_tensor, dim=0)
        if est_tensor.dim() == 2:
            est_tensor = torch.mean(est_tensor, dim=0)

        # Compare over the common length only
        min_len = min(len(ref_tensor), len(est_tensor))
        sdr_value = sdr_metric(est_tensor[:min_len], ref_tensor[:min_len])
        stem_sdrs[stem].append(sdr_value.item())

# 0 is reported for a stem that was never scored
average_sdr = {
    stem: (np.mean(values) if values else 0)
    for stem, values in stem_sdrs.items()
}

# Create bar plot
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
bars = plt.bar(average_sdr.keys(), average_sdr.values(), color=['blue', 'green', 'red', 'orange'])
plt.xlabel('Stem Category')
plt.ylabel('Average SDR (dB)')
plt.title('Average SDR Performance by Stem Category')
plt.grid(True, alpha=0.3)

# Add value labels on top of bars
for bar, value in zip(bars, average_sdr.values()):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1, 
             f'{value:.2f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("Average SDR by stem:")
for stem, avg_sdr in average_sdr.items():
    print(f"{stem}: {avg_sdr:.4f} dB")


In [None]:
sdr_metric = SignalDistortionRatio()

# Calculate average SDR for each stem across all tracks (v2 pipeline)
stems = ['drums', 'vocals', 'bass', 'other']
stem_sdrs = {stem: [] for stem in stems}

for track_folder, reference_stems in dataset_dict.items():
    mixture_path = os.path.join("/Users/alessandromanattini/Desktop/MAE/SELECTED TOPIC/PROJECT STMAE/musdb18hq_trimmed", track_folder, "new_mixture.wav")
    # Separate each track once and score all of its stems, instead of
    # re-running the separation for every stem.
    separated_sources = separate_sources_v2(mixture_path)

    for stem in stems:
        if stem not in reference_stems:
            continue

        # as_tensor avoids copying references that are already tensors
        ref_tensor = torch.as_tensor(reference_stems[stem], dtype=torch.float32)
        est_tensor = torch.as_tensor(separated_sources[stem], dtype=torch.float32)

        # Down-mix any 2-D (channels, samples) signal to mono
        if ref_tensor.dim() == 2:
            ref_tensor = torch.mean(ref_tensor, dim=0)
        if est_tensor.dim() == 2:
            est_tensor = torch.mean(est_tensor, dim=0)

        # Compare over the common length only
        min_len = min(len(ref_tensor), len(est_tensor))
        sdr_value = sdr_metric(est_tensor[:min_len], ref_tensor[:min_len])
        stem_sdrs[stem].append(sdr_value.item())

# 0 is reported for a stem that was never scored
average_sdr = {
    stem: (np.mean(values) if values else 0)
    for stem, values in stem_sdrs.items()
}

# Create bar plot
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
bars = plt.bar(average_sdr.keys(), average_sdr.values(), color=['blue', 'green', 'red', 'orange'])
plt.xlabel('Stem Category')
plt.ylabel('Average SDR (dB)')
plt.title('Average SDR Performance by Stem Category')
plt.grid(True, alpha=0.3)

# Add value labels on top of bars
for bar, value in zip(bars, average_sdr.values()):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1, 
             f'{value:.2f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("Average SDR by stem:")
for stem, avg_sdr in average_sdr.items():
    print(f"{stem}: {avg_sdr:.4f} dB")