In [None]:
!apt install ffmpeg
!pip install spleeter
!pip install audio-separator
!pip install museval
!pip install protobuf==3.20.0

# Automashup : Automatic mashup generator.

Automatic mashup generator algorithm. A Mashup is defined as a mix of different songs, usually formed by mixing the instrumental line of one song with the vocal line of another. In the present project we seek to implement a source separation algorithm, followed by the creation of a new song, mixing the instrumental and vocal lines of the supplied songs.




In [None]:
import matplotlib.pyplot as plt
import librosa.display

import numpy as np
import pandas as pd
import librosa

from IPython.display import Audio

In [None]:
# Decode the mixture, resampled to 44100 Hz, and embed a player for it
audio, sr = librosa.load('morat.mp3', sr=44100)
Audio(audio, rate=sr)

In [None]:
# Report the sampling rate and the number of samples in the loaded mixture
print(
    f"Sampling rate: {sr}",
    f"Audio array length (duration*sr): {audio.shape[0]}",
    sep="\n",
)

In [None]:
# Time-domain view of the mixture
plt.figure(figsize=(10, 5))
librosa.display.waveshow(y=audio, sr=sr)
plt.show()

In [None]:
# Power spectrogram of the mixture (squared STFT magnitude)
S = np.abs(librosa.stft(audio))**2
# Convert to dB relative to the loudest bin so the colour scale tops out at 0 dB
S_db = librosa.power_to_db(S, ref=np.max)

fig, ax = plt.subplots(figsize=(10, 5))
img = librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='log', ax=ax)
ax.set_title('Log Power Spectrogram')
fig.colorbar(img, ax=ax, format="%+2.0f dB")
plt.show()

The spectrogram shows that the audio contains energy above 11 kHz, so we cannot rely on the standard Spleeter models (**U-Net, Bi-LSTM**), which are trained to perform separation only up to 11 kHz.

Spleeter also provides a newer configuration that separates audio up to 16 kHz. It uses the same pretrained model, but the spectrogram estimation at separation time is carried out up to 16 kHz. This may lead to unexpected artefacts; nevertheless, we use this configuration instead of the 11 kHz one to test its performance.

## Source separation algorithms


### Spleeter method


In [None]:
#Run source separation (mixture -> 4 stems: vocals, drums, bass, other)
!spleeter separate -o source_separation -p spleeter:4stems-16kHz morat.mp3

In [None]:
# Listen to the vocal stem estimated by Spleeter
Audio(data='source_separation/morat/vocals.wav')

In [None]:
# Listen to the drum stem estimated by Spleeter
Audio(data='source_separation/morat/drums.wav')

In [None]:
# Listen to the bass stem estimated by Spleeter
Audio(data='source_separation/morat/bass.wav')

In [None]:
# Listen to the remaining accompaniment ("other") stem estimated by Spleeter
Audio(data='source_separation/morat/other.wav')

In [None]:
# Ground-truth stems, decoded at 44100 Hz
bass, _ = librosa.load('bass.mp3', sr=44100)
drums, _ = librosa.load('drums.mp3', sr=44100)
vocals, _ = librosa.load('vocals.mp3', sr=44100)
other, _ = librosa.load('other.mp3', sr=44100)

# Stems written by Spleeter, decoded at the same rate
bass_est, _ = librosa.load('source_separation/morat/bass.wav', sr=44100)
drums_est, _ = librosa.load('source_separation/morat/drums.wav', sr=44100)
vocals_est, _ = librosa.load('source_separation/morat/vocals.wav', sr=44100)
other_est, _ = librosa.load('source_separation/morat/other.wav', sr=44100)

# (signal, colour, legend label) in drawing order; each stem keeps the same colour
# in both panels so the reference and the estimate can be compared at a glance
truth_layers = [
    (bass, 'r', 'bass'),
    (other, 'g', 'other'),
    (vocals, 'b', 'vocals'),
    (drums, 'orange', 'drums'),
]
estimate_layers = [
    (bass_est, 'r', 'bass_est'),
    (other_est, 'g', 'other_est'),
    (vocals_est, 'b', 'vocals_est'),
    (drums_est, 'orange', 'drums_est'),
]

fig, ax = plt.subplots(nrows=3, figsize=(12,10), sharex=True)

# Top panel: the full mixture
librosa.display.waveshow(audio, sr=sr, ax=ax[0])
ax[0].set(title='Mixture')
ax[0].label_outer()

# Middle panel: ground-truth stems overlaid
for signal, colour, name in truth_layers:
    librosa.display.waveshow(signal, sr=sr, color=colour, alpha=0.6, ax=ax[1], label=name)
ax[1].set(title='Ground truth stems')
ax[1].legend()
ax[1].label_outer()

# Bottom panel: Spleeter estimates overlaid
for signal, colour, name in estimate_layers:
    librosa.display.waveshow(signal, sr=sr, color=colour, alpha=0.6, ax=ax[2], label=name)
ax[2].set(title='Estimated stems')
ax[2].legend()
plt.show()

In [None]:
# Full-length FFT of every stem estimated by Spleeter
fft_vocals = np.fft.fft(vocals_est)
fft_drums = np.fft.fft(drums_est)
fft_bass = np.fft.fft(bass_est)
fft_other = np.fft.fft(other_est)

# The input signals are real, so the spectrum is symmetric: keep the first half only
magnitude_vocals = np.abs(fft_vocals)[:len(fft_vocals)//2]
magnitude_drums = np.abs(fft_drums)[:len(fft_drums)//2]
magnitude_bass = np.abs(fft_bass)[:len(fft_bass)//2]
magnitude_other = np.abs(fft_other)[:len(fft_other)//2]

nyquist = sr / 2
# Exact frequency (Hz) of each retained bin: k * sr / N.
# The shared axis is derived from the vocals length, so all stems are assumed to have the same length.
left_frequency = np.fft.fftfreq(len(fft_vocals), d=1/sr)[:len(fft_vocals)//2]

# Only the low end of the spectrum is plotted
max_frequency = 1000
# Index of the first bin above max_frequency (the array length if there is none)
max_index = np.searchsorted(left_frequency, max_frequency, side='right')

# One curve per stem, each in its own colour
plt.figure(figsize=(12, 6))
for magnitude, colour, name in [
    (magnitude_vocals, 'r', 'Vocals'),
    (magnitude_drums, 'b', 'Drums'),
    (magnitude_bass, 'g', 'Bass'),
    (magnitude_other, 'y', 'Other'),
]:
    plt.plot(left_frequency[:max_index], magnitude[:max_index], color=colour, label=name)

# Title is built from max_frequency so it always matches the plotted range
plt.title(f'Frequency vs Magnitude (Up to {max_frequency} Hz)')
plt.xlabel('Frequency (Hz)')
plt.ylabel('Magnitude')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Power spectrograms of the ground-truth stems
S_bass = np.abs(librosa.stft(bass))**2
S_drums = np.abs(librosa.stft(drums))**2
S_vocals = np.abs(librosa.stft(vocals))**2
S_other = np.abs(librosa.stft(other))**2

# Power spectrograms of the Spleeter estimates
S_bass_est = np.abs(librosa.stft(bass_est))**2
S_drums_est = np.abs(librosa.stft(drums_est))**2
S_vocals_est = np.abs(librosa.stft(vocals_est))**2
S_other_est = np.abs(librosa.stft(other_est))**2

# One grid row per stem: ground truth on the left, estimate on the right
spectrogram_rows = [
    ('bass', S_bass, S_bass_est),
    ('drums', S_drums, S_drums_est),
    ('vocals', S_vocals, S_vocals_est),
    ('other stems', S_other, S_other_est),
]
bottom_row = len(spectrogram_rows) - 1

fig, ax = plt.subplots(ncols=2, nrows=4, figsize=(10,12), sharex=True)

for row, (stem, S_true, S_est) in enumerate(spectrogram_rows):
    left, right = ax[row, 0], ax[row, 1]

    librosa.display.specshow(librosa.power_to_db(S_true, ref=np.max), y_axis='log', sr=sr, x_axis='time', ax=left)
    left.set(title=f'Ground truth {stem}')
    if row != bottom_row:
        # The bottom row keeps its time-axis labels
        left.label_outer()

    librosa.display.specshow(librosa.power_to_db(S_est, ref=np.max), y_axis='log', sr=sr, x_axis='time', ax=right)
    right.set(title=f'Estimated {stem}')
    # Put the frequency axis of the right column on the outer (right) edge
    right.yaxis.set_label_position("right")
    right.yaxis.tick_right()
    if row != bottom_row:
        right.label_outer()

plt.show()

In [None]:
import museval

# Reference and estimated stems in the same order: bass, drums, vocals, other
reference_stems = [bass, drums, vocals, other]
estimated_stems = [bass_est, drums_est, vocals_est, other_est]

# The decoded files may differ in length by a few samples; trim everything to the
# shortest signal so the stacked arrays line up sample by sample instead of raising.
n_samples = min(len(signal) for signal in reference_stems + estimated_stems)

# Stack into (n_sources, n_samples), then add a trailing channel axis (mono)
src = np.stack([signal[:n_samples] for signal in reference_stems], axis=0)
src = np.expand_dims(src, axis=2) #channels = 1

estim = np.stack([signal[:n_samples] for signal in estimated_stems], axis=0)
estim = np.expand_dims(estim, axis=2)

# Each returned array is indexed by source first (same order as above)
SDR, ISR, SIR, SAR = museval.evaluate(src, estim)

In [None]:
# Mean SDR / SIR / SAR per stem; the row order matches the stacking order used for museval
for idx, stem in enumerate(("Bass", "Drums", "Vocals", "Other")):
    print(f"{stem} SDR: {np.mean(SDR[idx]):+.2f}, SIR: {np.mean(SIR[idx]):+.2f}, SAR: {np.mean(SAR[idx]):+.2f}")

In [None]:
#Residual between vocal_est and ground truth vocal

import matplotlib.pyplot as plt

# Trim both signals to a common length so the subtraction cannot fail on a size mismatch
n_common = min(len(vocals), len(vocals_est))
vocals_residual = vocals[:n_common] - vocals_est[:n_common]

# Time axis in seconds; both signals were loaded at 44100 Hz
t = np.linspace(0, n_common/44100, n_common)

fig, ax = plt.subplots()
ax.plot(t, vocals_residual)
ax.set(title='Vocals residual (ground truth - Spleeter estimate)',
       xlabel='Time (s)',
       ylabel='Amplitude')
plt.show()

### DEMUCS

Demucs is a state-of-the-art music source separation model, currently capable of separating drums, bass, and vocals from the rest of the accompaniment. Demucs is based on a U-Net convolutional architecture inspired by Wave-U-Net. The v4 version features Hybrid Transformer Demucs, a hybrid spectrogram/waveform separation model using Transformers. It is based on Hybrid Demucs (also available in the Demucs repository), with the innermost layers replaced by a cross-domain Transformer Encoder. This Transformer uses self-attention within each domain and cross-attention across domains. The model achieves an SDR of 9.00 dB on the MUSDB HQ test set. Moreover, when sparse attention kernels are used to extend its receptive field, together with per-source fine-tuning, it achieves a state-of-the-art SDR of 9.20 dB.

In [None]:
!pip install demucs

In [None]:
# Separate morat.mp3 with Demucs using its defaults (no model or output directory is
# passed); the cells below read the resulting stems from separated/htdemucs/morat/
!python -m demucs.separate morat.mp3

In [None]:
# Listen to the vocal stem estimated by Demucs
Audio(data='separated/htdemucs/morat/vocals.wav')

In [None]:
# Listen to the bass stem estimated by Demucs
Audio(data='separated/htdemucs/morat/bass.wav')

In [None]:
# Listen to the drum stem estimated by Demucs
Audio(data='separated/htdemucs/morat/drums.wav')

In [None]:
# Listen to the remaining accompaniment ("other") stem estimated by Demucs
Audio(data='separated/htdemucs/morat/other.wav')

In [None]:
# Ground-truth stems, decoded at 44100 Hz
dm_bass, _ = librosa.load('bass.mp3', sr=44100)
dm_drums, _ = librosa.load('drums.mp3', sr=44100)
dm_vocals, _ = librosa.load('vocals.mp3', sr=44100)
dm_other, _ = librosa.load('other.mp3', sr=44100)

# Stems written by Demucs, decoded at the same rate
dm_bass_est, _ = librosa.load('separated/htdemucs/morat/bass.wav', sr=44100)
dm_drums_est, _ = librosa.load('separated/htdemucs/morat/drums.wav', sr=44100)
dm_vocals_est, _ = librosa.load('separated/htdemucs/morat/vocals.wav', sr=44100)
dm_other_est, _ = librosa.load('separated/htdemucs/morat/other.wav', sr=44100)

# Drawing order and colour of each stem; used for both the reference and estimate panels
stem_colours = [('bass', 'r'), ('other', 'g'), ('vocals', 'b'), ('drums', 'orange')]
dm_truth = {'bass': dm_bass, 'other': dm_other, 'vocals': dm_vocals, 'drums': dm_drums}
dm_estimate = {'bass': dm_bass_est, 'other': dm_other_est, 'vocals': dm_vocals_est, 'drums': dm_drums_est}

fig, ax = plt.subplots(nrows=3, figsize=(12,10), sharex=True)

# Top panel: the full mixture
librosa.display.waveshow(audio, sr=sr, ax=ax[0])
ax[0].set(title='Mixture')
ax[0].label_outer()

# Middle panel: ground-truth stems overlaid
for stem, colour in stem_colours:
    librosa.display.waveshow(dm_truth[stem], sr=sr, color=colour, alpha=0.6, ax=ax[1], label=stem)
ax[1].set(title='Ground truth stems')
ax[1].legend()
ax[1].label_outer()

# Bottom panel: Demucs estimates overlaid
for stem, colour in stem_colours:
    librosa.display.waveshow(dm_estimate[stem], sr=sr, color=colour, alpha=0.6, ax=ax[2], label=f'{stem}_est')
ax[2].set(title='Estimated stems')
ax[2].legend()
plt.show()

In [None]:

# Full-length FFT of every stem estimated by Demucs
fft_vocals = np.fft.fft(dm_vocals_est)
fft_drums = np.fft.fft(dm_drums_est)
fft_bass = np.fft.fft(dm_bass_est)
fft_other = np.fft.fft(dm_other_est)

# The input signals are real, so the spectrum is symmetric: keep the first half only
magnitude_vocals = np.abs(fft_vocals)[:len(fft_vocals)//2]
magnitude_drums = np.abs(fft_drums)[:len(fft_drums)//2]
magnitude_bass = np.abs(fft_bass)[:len(fft_bass)//2]
magnitude_other = np.abs(fft_other)[:len(fft_other)//2]

nyquist = sr / 2
# Exact frequency (Hz) of each retained bin: k * sr / N.
# The shared axis is derived from the vocals length, so all stems are assumed to have the same length.
left_frequency = np.fft.fftfreq(len(fft_vocals), d=1/sr)[:len(fft_vocals)//2]

# Only the low end of the spectrum is plotted
max_frequency = 1000
# Index of the first bin above max_frequency (the array length if there is none)
max_index = np.searchsorted(left_frequency, max_frequency, side='right')

# One curve per stem, each in its own colour
plt.figure(figsize=(12, 6))
for magnitude, colour, name in [
    (magnitude_vocals, 'r', 'Vocals'),
    (magnitude_drums, 'b', 'Drums'),
    (magnitude_bass, 'g', 'Bass'),
    (magnitude_other, 'y', 'Other'),
]:
    plt.plot(left_frequency[:max_index], magnitude[:max_index], color=colour, label=name)

# Title is built from max_frequency so it always matches the plotted range
plt.title(f'Frequency vs Magnitude (Up to {max_frequency} Hz)')
plt.xlabel('Frequency (Hz)')
plt.ylabel('Magnitude')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Power spectrograms of the ground-truth stems
S_dm_bass = np.abs(librosa.stft(dm_bass))**2
S_dm_drums = np.abs(librosa.stft(dm_drums))**2
S_dm_vocals = np.abs(librosa.stft(dm_vocals))**2
S_dm_other = np.abs(librosa.stft(dm_other))**2

# Power spectrograms of the Demucs estimates
S_dm_bass_est = np.abs(librosa.stft(dm_bass_est))**2
S_dm_drums_est = np.abs(librosa.stft(dm_drums_est))**2
S_dm_vocals_est = np.abs(librosa.stft(dm_vocals_est))**2
S_dm_other_est = np.abs(librosa.stft(dm_other_est))**2

# One grid row per stem: ground truth on the left, estimate on the right
dm_spectrogram_rows = [
    ('bass', S_dm_bass, S_dm_bass_est),
    ('drums', S_dm_drums, S_dm_drums_est),
    ('vocals', S_dm_vocals, S_dm_vocals_est),
    ('other stems', S_dm_other, S_dm_other_est),
]
bottom_row = len(dm_spectrogram_rows) - 1

fig, ax = plt.subplots(ncols=2, nrows=4, figsize=(10,12), sharex=True)

for row, (stem, S_true, S_est) in enumerate(dm_spectrogram_rows):
    left, right = ax[row, 0], ax[row, 1]

    librosa.display.specshow(librosa.power_to_db(S_true, ref=np.max), y_axis='log', sr=sr, x_axis='time', ax=left)
    left.set(title=f'Ground truth {stem}')
    if row != bottom_row:
        # The bottom row keeps its time-axis labels
        left.label_outer()

    librosa.display.specshow(librosa.power_to_db(S_est, ref=np.max), y_axis='log', sr=sr, x_axis='time', ax=right)
    right.set(title=f'Estimated {stem}')
    # Put the frequency axis of the right column on the outer (right) edge
    right.yaxis.set_label_position("right")
    right.yaxis.tick_right()
    if row != bottom_row:
        right.label_outer()

plt.show()

## Evaluation of source separation methods

In [None]:
import museval

# Reference stems and Demucs estimates in the same order: bass, drums, vocals, other
reference_stems = [bass, drums, vocals, other]
estimated_stems = [dm_bass_est, dm_drums_est, dm_vocals_est, dm_other_est]

# The decoded files may differ in length by a few samples; trim everything to the
# shortest signal so the stacked arrays line up sample by sample instead of raising.
n_samples = min(len(signal) for signal in reference_stems + estimated_stems)

# Stack into (n_sources, n_samples), then add a trailing channel axis (mono)
src = np.stack([signal[:n_samples] for signal in reference_stems], axis=0)
src = np.expand_dims(src, axis=2) #channels = 1

estim = np.stack([signal[:n_samples] for signal in estimated_stems], axis=0)
estim = np.expand_dims(estim, axis=2)

# Each returned array is indexed by source first (same order as above)
SDR, ISR, SIR, SAR = museval.evaluate(src, estim)

In [None]:
# Mean SDR / SIR / SAR per stem; rows follow the stacking order bass, drums, vocals, other
stem_names = ("Bass", "Drums", "Vocals", "Other")
for stem, sdr, sir, sar in zip(stem_names, SDR, SIR, SAR):
    print(f"{stem} SDR: {np.mean(sdr):+.2f}, SIR: {np.mean(sir):+.2f}, SAR: {np.mean(sar):+.2f}")

### Separator model

In [None]:
# Separator is provided by the audio-separator package; it must be imported before
# use, otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): the constructor call below (audio file passed to Separator, then
# separate() without arguments) matches older audio-separator releases -- confirm
# that the installed version still exposes this interface.
from audio_separator import Separator

# Initialize the Separator with the audio file and model name
separator = Separator('morat.mp3', model_name='UVR_MDXNET_KARA_2')

# Perform the separation; the call returns the paths of the two stems it produced
primary_stem_path, secondary_stem_path = separator.separate()

print(f'Primary stem saved at {primary_stem_path}')
print(f'Secondary stem saved at {secondary_stem_path}')

In [None]:
# Listen to the instrumental stem produced by the UVR_MDXNET_KARA_2 model
Audio(data='morat_(Instrumental)_UVR_MDXNET_KARA_2.wav')

In [None]:
# Listen to the vocal stem produced by the UVR_MDXNET_KARA_2 model
Audio(data='morat_(Vocals)_UVR_MDXNET_KARA_2.wav')

In [None]:
# Ground-truth vocals next to the vocal stem written by UVR_MDXNET_KARA_2 (both at 44100 Hz)
vocals_, _ = librosa.load('vocals.mp3', sr=44100)

vocals_est_, _ = librosa.load('morat_(Vocals)_UVR_MDXNET_KARA_2.wav', sr=44100)
# Demucs "other" stem: loaded here but not drawn in the figure below
other_est_, _ = librosa.load('separated/htdemucs/morat/other.wav', sr=44100)

fig, ax = plt.subplots(nrows=3, figsize=(12,10), sharex=True)
mixture_ax, truth_ax, estimate_ax = ax

# Top panel: the full mixture
librosa.display.waveshow(audio, sr=sr, ax=mixture_ax)
mixture_ax.set(title='Mixture')
mixture_ax.label_outer()

# Middle panel: reference vocals
librosa.display.waveshow(vocals_, sr=sr, ax=truth_ax, label='vocals', color='b', alpha=0.6)
truth_ax.set(title='Ground truth stems')
truth_ax.legend()
truth_ax.label_outer()

# Bottom panel: estimated vocals (keeps its time-axis labels)
librosa.display.waveshow(vocals_est_, sr=sr, ax=estimate_ax, label='vocals_est', color='b', alpha=0.6)
estimate_ax.set(title='Estimated stems')
estimate_ax.legend()
plt.show()