This notebook tests whether the DMC (Differentiable Mixing Console) models (https://dl4am.github.io/tutorial/landing-page.html) can improve the quality of the mashup. The short conclusion is that yes, they help. They should be integrated into the project, although this has not been done yet.

Note: if the package build fails during installation, change the `sklearn` dependency to `scikit-learn` in `setup.py`.

In [None]:
import os
import glob
import torch
import torchaudio
import numpy as np

import IPython
import IPython.display as ipd
import matplotlib.pyplot as plt
import librosa.display

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Folders holding the audio, and the file extension used in each of them.
track_dir = "./test_mix"
track_ext = "mp3"
track_tech = "./techniques"
track_ext_tech = "wav"

# Collect the input tracks and the files of the techniques folder,
# each list in sorted (deterministic) order.
track_filepaths = sorted(glob.glob(os.path.join(track_dir, f"*.{track_ext}")))
track_filepaths_tech = sorted(
    glob.glob(os.path.join(track_tech, f"*.{track_ext_tech}"))
)

# Excerpt taken from every input file, expressed in seconds.
excerpt_start_s = 0
excerpt_len_s = 40

tracks = []       # one (1, n_samples) tensor per channel fed to the mixer
track_names = []  # one label per entry of `tracks`
for idx, track_filepath in enumerate(track_filepaths):
    # NOTE: sr1 is overwritten by every file and reused after the loop, so all
    # input files are assumed to share the same sample rate.
    x, sr1 = torchaudio.load(track_filepath)
    start_sample = int(excerpt_start_s * sr1)
    end_sample = start_sample + int(excerpt_len_s * sr1)
    track_name = os.path.basename(track_filepath)

    # The left channel is always used; no level normalisation is applied.
    x_L = x[0:1, start_sample:end_sample]

    # Vocal and Bass files contribute their left channel only. A file with a
    # single channel is treated the same way: slicing x[1:2] on it would give
    # an empty (0, n) tensor that cannot be stacked with the other tracks.
    left_only = (
        "Vocal" in track_filepath
        or "Bass" in track_filepath
        or x.shape[0] < 2
    )
    if left_only:
        tracks.append(x_L)
        track_names.append(track_name)
    else:
        x_R = x[1:2, start_sample:end_sample]
        tracks.extend([x_L, x_R])
        track_names.extend([track_name + "-L", track_name + "-R"])

    # Preview: waveform and audio player for the left channel of this file.
    x_L_np = x_L.view(-1).numpy()
    plt.figure(figsize=(10, 2))
    librosa.display.waveshow(x_L_np, sr=sr1, zorder=3)
    plt.title(f"{idx+1} {track_name}")
    plt.ylim([-1, 1])
    plt.grid(c="lightgray")
    plt.show()
    IPython.display.display(ipd.Audio(x_L_np, rate=sr1, normalize=True))

# Stack the per-channel excerpts into one tensor of shape (1, num_tracks, seq_len).
if not tracks:
    raise RuntimeError(f"No '*.{track_ext}' files were loaded from {track_dir!r}")

# Excerpts differ in length when a file is shorter than the requested window;
# right-pad with zeros (silence) up to the longest one so they can be stacked.
seq_len = max(t.shape[-1] for t in tracks)
tracks = torch.stack(
    [torch.nn.functional.pad(t, (0, seq_len - t.shape[-1])) for t in tracks],
    dim=1,  # each entry is (1, n), so stacking on dim 1 gives (1, num_tracks, n)
)

# Listen to the plain mono sum before mixing. The sum is hard-clipped to
# [-1, 1] (Audio with normalize=False expects samples in that range).
input_mix = tracks.sum(dim=1, keepdim=True).clamp(-1, 1)
input_mix_np = input_mix.view(-1).numpy()

plt.figure(figsize=(10, 2))
plt.title("Mono Mix")
librosa.display.waveshow(input_mix_np, sr=sr1, zorder=3, color="tab:orange")
plt.ylim([-1, 1])
plt.grid(c="lightgray")
plt.show()
IPython.display.display(ipd.Audio(input_mix_np, rate=sr1, normalize=False))

# Preview every file of the techniques folder: waveform plus an audio player
# for the first 40 seconds of its first channel.
for track_filepath in track_filepaths_tech:
    x, sr2 = torchaudio.load(track_filepath)
    n_samples = int(40 * sr2)
    first_channel = x[0, 0:n_samples].numpy()
    label = os.path.basename(track_filepath)

    plt.figure(figsize=(10, 2))
    librosa.display.waveshow(first_channel, sr=sr2, zorder=3, color="tab:green")
    plt.title(f"{label}")
    plt.ylim([-1, 1])
    plt.grid(c="lightgray")
    plt.show()

    player = ipd.Audio(first_channel, rate=sr2, normalize=True)
    ipd.display(player)

In [None]:
!git clone https://github.com/csteinmetz1/automix-toolkit.git
!cd automix-toolkit
!pip install -e .

In [None]:
from automix.system import System

In [None]:
os.makedirs("checkpoints", exist_ok=True)

!wget https://huggingface.co/csteinmetz1/automix-toolkit/resolve/main/medleydb-16-dmc.ckpt
!mv medleydb-16-dmc.ckpt checkpoints/medleydb-16-dmc.ckpt

In [None]:
# Path of the downloaded checkpoint.
dmc_ckpt_path = "checkpoints/medleydb-16-dmc.ckpt"

# Restore the pretrained system with its weights mapped to the CPU,
# then switch it to evaluation mode.
medley_dmc_system = System.load_from_checkpoint(
    dmc_ckpt_path,
    pretrained_encoder=False,
    map_location="cpu",
)
medley_dmc_system = medley_dmc_system.eval()

In [None]:
# Sizes (in samples) passed to block_based_forward; the second is half the first.
# NOTE(review): presumably block size and hop between blocks — confirm against automix.
dmc_block_size = 262144
dmc_block_step = dmc_block_size // 2

with torch.no_grad():  # inference only, no gradients needed
    mix = medley_dmc_system.model.block_based_forward(
        tracks, dmc_block_size, dmc_block_step
    )

# Peak-normalise the mix for display. The clamp keeps a completely silent
# output from turning into NaNs through a 0 / 0 division.
mix = mix / mix.abs().max().clamp(min=1e-8)

# View and listen to the mix as two channels.
# NOTE(review): view(2, -1) assumes a single stereo item in `mix` — confirm.
mix_np = mix.view(2, -1).numpy()
plt.figure(figsize=(10, 2))
plt.title("Differentiable Mixing Console")
librosa.display.waveshow(mix_np, sr=sr1, zorder=3)
plt.ylim([-1, 1])
plt.grid(c="lightgray")
plt.show()
IPython.display.display(ipd.Audio(mix_np, rate=sr1, normalize=True))