In [None]:
!sudo apt install ffmpeg

In [None]:
# Fetch the MusicSeparation repository. The rest of the notebook changes into
# this checkout, imports `utils` from it, and reads files under `sample/` and
# `models/`, which the repository is expected to provide.
!git clone https://github.com/bimapras/MusicSeparation.git

In [None]:
# Install the audio/data dependencies into the kernel's environment.
# TensorFlow is deliberately not installed by the active line below.
# NOTE(review): presumably the runtime already ships a compatible TensorFlow — confirm.
# Alternative that pins TensorFlow explicitly:
# %pip install tensorflow==2.17 numpy musdb librosa soundfile
%pip install numpy musdb librosa soundfile # tensorflow 2.19 still compatible

In [None]:
# Work from the repository root so that `from utils import ...` and the
# relative `sample/` and `models/` paths used below resolve.
%cd MusicSeparation

In [None]:
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
from IPython.display import display, Audio
from utils import read_audio, inference

In [None]:
# Inference expects a model with input shape (time, 2).
# Make sure segment_length equals the time length of your model's input
# (the default model uses 88064).
# Use a GPU and the Keras format for faster inference.

reader = read_audio.AudioReader()
# Change this path to your own uploaded audio file.
audio_data, samplerate = reader.read(r'sample/Pierce The Veil - So Far So Fake (Visualizer).mp4')

keras_model_path = r'models/DPTCN.keras'
tflite_model_path = r'models/DPTCN.tflite'

# Choose the model file once. `is_tflite` is derived from the extension so the
# flag can never disagree with the file that is actually loaded.
model_path = tflite_model_path  # switch to keras_model_path to use the Keras model

# Bind the instance to its own name. Rebinding `inference` would shadow the
# imported `inference` module, and re-running this cell would then fail with
# an AttributeError on `inference.AudioInference`.
separator = inference.AudioInference(model=model_path,
                                     is_tflite=model_path.endswith('.tflite'),
                                     segment_length=88064,
                                     overlap=0.5,
                                     batch_size=4,
                                     use_wiener=True,
                                     stft_frame_length=4096,
                                     stft_frame_step=1024,
                                     wiener_iterations=3)

# Only the first 44100*20 samples are separated
# (20 seconds if the audio is sampled at 44.1 kHz — TODO confirm against the reader).
pred = separator.predict(audio_data[:44100*20],
                         segment_length_sec=30,
                         export=True,
                         export_dir="output")

In [None]:
# Playback rate handed to every audio widget below.
SR = 44100

# Show one audio player per separated source (indices 0..3 along axis 1 of
# `pred`). The slice is transposed before being passed to Audio.
for stem_index in range(4):
    stem_audio = pred[:, stem_index, :].numpy().T
    display(Audio(stem_audio, rate=SR))

In [None]:
# Visualization: one log-frequency dB spectrogram per stem on a 2x2 grid.
stems = ['Vocals', 'Drums', 'Bass', 'Other']
fig, axes = plt.subplots(2, 2, figsize=(15, 8))

# axes.flat walks the grid row by row, so stem k lands on axes[k // 2, k % 2].
for stem_idx, (stem_name, ax) in enumerate(zip(stems, axes.flat)):
    waveform = pred[:, stem_idx, :].numpy()

    # Average the two channels down to mono before computing the spectrogram.
    if waveform.shape[1] == 2:
        waveform = waveform.mean(axis=1)

    # Magnitude STFT converted to dB relative to this stem's own peak.
    magnitude = np.abs(librosa.stft(waveform))
    spec_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    mesh = librosa.display.specshow(spec_db, sr=samplerate, x_axis='time', y_axis='log', ax=ax)
    ax.set_title(f'Spectrogram of {stem_name}')
    fig.colorbar(mesh, ax=ax, format='%+2.0f dB')

plt.tight_layout()
plt.show()