In [None]:
!pip install musdb
!pip install librosa
!pip install youtube-dl

In [None]:
# Generic imports
import tensorflow as tf
from IPython.display import Audio, display
import numpy as np
import musdb
import librosa
import librosa.display
import soundfile as sf

In [None]:
# Mount Google Drive at /gdrive (Colab only) so files stored there, such as the
# saved model loaded below, are reachable through the local filesystem.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Load the trained network; estimate_sources() reads it through the global name
# `model`. The path is a placeholder and must be replaced with the real location
# of model.h5. compile=False loads the model without compiling it, which is
# enough for predict()-only use.
model = tf.keras.models.load_model("your-model-saving-path-to-gdrive/model.h5", compile=False)

In [None]:
def estimate_sources(audio, batch_size=128):
  """Split a mixture into vocals and accompaniment using the global `model`.

  The signal is resampled from 44.1 kHz to 22.05 kHz and transformed with an
  STFT. For every STFT frame, a context window of 9 frames (the frame itself
  plus 4 on each side) is flattened and fed to the network, and the centre
  frame of the network output is kept as the vocal soft mask for that frame.
  The masks are thresholded, applied to the mixture magnitude, recombined
  with the mixture phase, inverted with an iSTFT and resampled to 44.1 kHz.

  Args:
    audio: 1-D array of samples, treated as a single channel sampled at
      44.1 kHz (the resampling step hard-codes orig_sr=44100).
    batch_size: number of frames passed to `model.predict` per call. Only
      affects speed and memory use.

  Returns:
    dict with keys 'vocals' and 'accompaniment', each a time-domain signal
    at 44.1 kHz.

  Raises:
    ValueError: if `audio` is empty.
  """
  if np.size(audio) == 0:
    raise ValueError("audio is empty; nothing to separate")

  # STFT and framing parameters
  n_fft = 4096
  win_length = 1024
  hop_length = 256
  num_frames = 9                                     # Each input for neural net is ~104.49 msec
  medium_frame = num_frames // 2                     # 4

  # Resampling to 22.05 kHz
  channel = librosa.core.resample(audio, orig_sr=44100, target_sr=22050)

  # Short Time Fourier Transform, computed once and split into magnitude/phase
  mixture_stft = librosa.stft(channel, n_fft=n_fft, win_length=win_length, hop_length=hop_length, window='hann')
  mixture_ft_magn = np.abs(mixture_stft)
  mixture_ft_phase = np.angle(mixture_stft)

  freq_bins, time_bins = mixture_ft_magn.shape       # freq_bins = n_fft/2 + 1 = 2049
  num_ft_bins = freq_bins * num_frames               # 2049(0~11.025kHz) * 9 (~104.49 msec)

  # Pad both ends with the global minimum magnitude so that the first and
  # last frames also get a full context window.
  pad_min = np.full((freq_bins, medium_frame), mixture_ft_magn.min())
  mixture_padded = np.concatenate((pad_min, mixture_ft_magn, pad_min), axis=1)
  frames_padded = mixture_padded.T                   # (time_bins + 2*medium_frame, freq_bins)

  # Estimating soft masks, one chunk of frames per predict() call. Each input
  # row is the frame-major flattening of a 9-frame window (all bins of frame
  # 0, then all bins of frame 1, ...), and the network output uses the same
  # layout, so row `medium_frame` of the reshaped output is the centre frame.
  vocals_soft_mask = np.zeros((freq_bins, time_bins))
  for chunk_start in range(0, time_bins, batch_size):
    chunk_stop = min(chunk_start + batch_size, time_bins)
    net_input = np.stack([
        frames_padded[i:i + num_frames].reshape(num_ft_bins)
        for i in range(chunk_start, chunk_stop)
    ])
    predictions = model.predict(net_input, verbose=0)
    window_masks = np.reshape(predictions, (-1, num_frames, freq_bins))
    vocals_soft_mask[:, chunk_start:chunk_stop] = window_masks[:, medium_frame, :].T
  accompaniment_soft_mask = 1 - vocals_soft_mask

  # Applying thresholds to make signals cleaner
  vocals_soft_mask[vocals_soft_mask < 0.15] = 0
  accompaniment_soft_mask[accompaniment_soft_mask < 0.8] = 0
  voc_ft_magn = vocals_soft_mask * mixture_ft_magn
  acc_ft_magn = accompaniment_soft_mask * mixture_ft_magn

  # Computing complex signals: magnitude * exp(j * phase). Multiplying by the
  # raw phase angle instead would give a real-valued, phase-less spectrogram.
  mixture_phase_factor = np.exp(1j * mixture_ft_phase)
  voc_complex_signal = voc_ft_magn * mixture_phase_factor
  acc_complex_signal = acc_ft_magn * mixture_phase_factor

  # iSTFT reconstruction of time domain signals and resampling to 44.1 kHz
  vocals_audio = librosa.istft(voc_complex_signal, hop_length=hop_length, win_length=win_length, window='hann')
  vocals_audio = librosa.core.resample(vocals_audio, orig_sr=22050, target_sr=44100)
  accompaniment_audio = librosa.istft(acc_complex_signal, hop_length=hop_length, win_length=win_length, window='hann')
  accompaniment_audio = librosa.core.resample(accompaniment_audio, orig_sr=22050, target_sr=44100)

  estimates = {
        'vocals': vocals_audio,
        'accompaniment': accompaniment_audio,
    }

  return estimates

In [None]:
#Youtube
# Pick the YouTube video (by id) and the excerpt, in seconds, to separate, and
# show an embedded player limited to that excerpt.
import html
import youtube_dl
from IPython.display import HTML
url = "nv2rp5JCWj0" #@param {type:"string"}
start =  90#@param {type:"number"}
stop =  120#@param {type:"number"}
# Plain embed URL with ordinary '&' separators; it is HTML-escaped exactly once
# below, at the point where it is placed inside the attribute.
embed_url = "https://www.youtube.com/embed/%s?rel=0&start=%d&end=%d&controls=0&showinfo=0" % (url, start, stop)
# The src value is quoted and followed by a space so that frameborder is parsed
# as its own attribute instead of being glued onto the end of the URL.
HTML('<iframe width="560" height="315" src="' + html.escape(embed_url, quote=True) + '" frameborder="0" allowfullscreen></iframe>')

In [None]:
def my_hook(d):
    """Progress callback: print a notice once a download reports 'finished'.

    `d` is the status mapping passed by the downloader; only its 'status'
    entry is read. Any other status is ignored.
    """
    if d['status'] != 'finished':
        return
    print('Done downloading...')


import os

# Download the best available audio stream and have ffmpeg convert it to WAV.
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        # NOTE(review): youtube-dl treats this as an encoder quality/bitrate,
        # not a sample rate; presumably it has no effect for WAV. Confirm.
        'preferredquality': '44100',
    }],
    # Save the download under its real container extension. Hard-coding ".wav"
    # here would label the raw (non-WAV) download as WAV; as far as I can tell
    # youtube-dl then skips the conversion because source and target paths match.
    'outtmpl': '%(title)s.%(ext)s',
    'progress_hooks': [my_hook],
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info(url, download=False)
    status = ydl.download([url])
    # prepare_filename() yields the sanitized name youtube-dl used on disk,
    # which can differ from the raw title (e.g. when it contains '/' or ':').
    # The post-processor writes the WAV next to it with the extension swapped.
    wav_path = os.path.splitext(ydl.prepare_filename(info))[0] + '.wav'

audio, rate = librosa.load(wav_path, sr=44100, mono=True)
# Keep only the requested excerpt; int() keeps the slice valid when the form
# fields hold non-integer numbers.
audio = audio[int(start * rate):int(stop * rate)]
display(Audio(audio, rate=rate))

In [None]:
# Separate the excerpt; the result is a dict with 'vocals' and 'accompaniment' signals.
estimates = estimate_sources(audio)

In [None]:
# Play the estimated vocals (estimate_sources resamples its outputs to 44.1 kHz).
Audio(estimates['vocals'], rate=44100)

In [None]:
# Play the estimated accompaniment (estimate_sources resamples its outputs to 44.1 kHz).
Audio(estimates['accompaniment'], rate=44100)

In [None]:
# Save both separated stems as 24-bit PCM WAV files at 44.1 kHz.
# The file names are placeholders; replace them with real output paths.
sf.write(
    file='your-saving-track-path_vocals.wav',
    data=estimates['vocals'],
    samplerate=44100,
    subtype='PCM_24',
)
sf.write(
    file='your-saving-track-path_accompaniment.wav',
    data=estimates['accompaniment'],
    samplerate=44100,
    subtype='PCM_24',
)