<a href="https://colab.research.google.com/github/architb1703/Hostel_Allocation/blob/master/Single_song.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This file contains the code for experimenting with human-in-the-loop (HITL) adaptation on a single song, with only part of the music track annotated by the user. The purpose of this experiment was to check whether our approach could be used for real-time audio editing.

In [None]:
# Mount Google Drive at /content/drive; the later cells read data, model
# checkpoints and write predictions under this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install stempeg
!pip install mir_eval
!pip install museval
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset
from torch.nn import functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm, trange
import soundfile as sf
import librosa
import stempeg as st
import mir_eval
import pickle
import museval
# Seed NumPy's global RNG (only NumPy is seeded here, not torch).
np.random.seed(42)
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

Collecting stempeg
[?25l  Downloading https://files.pythonhosted.org/packages/8d/e5/84adc8506b61ca9f205d9dcc5558b6b5b1fa477c45616f553a0ca1b8020d/stempeg-0.2.3-py3-none-any.whl (963kB)
[K     |▍                               | 10kB 15.6MB/s eta 0:00:01[K     |▊                               | 20kB 19.2MB/s eta 0:00:01[K     |█                               | 30kB 20.6MB/s eta 0:00:01[K     |█▍                              | 40kB 18.7MB/s eta 0:00:01[K     |█▊                              | 51kB 11.8MB/s eta 0:00:01[K     |██                              | 61kB 11.6MB/s eta 0:00:01[K     |██▍                             | 71kB 11.4MB/s eta 0:00:01[K     |██▊                             | 81kB 12.2MB/s eta 0:00:01[K     |███                             | 92kB 12.9MB/s eta 0:00:01[K     |███▍                            | 102kB 9.8MB/s eta 0:00:01[K     |███▊                            | 112kB 9.8MB/s eta 0:00:01[K     |████                            | 122kB 9.8MB/

In [None]:
# Copy every .npz archive from the Drive training folder into the local
# ./data/ directory, eight files at a time.
basepath = '/content/drive/MyDrive/UGP/UGP/ugp data/train/'
from shutil import copyfile
from joblib import Parallel, delayed

# exist_ok=True tolerates a re-run of this cell, while any other failure
# (e.g. permissions) is raised instead of being silently swallowed.
os.makedirs('./data/', exist_ok=True)

files = [f for f in os.listdir(basepath) if f.split('.')[-1]=='npz']

# copyfile returns the destination path, so the cell output lists the copies made.
Parallel(n_jobs=8)(delayed(copyfile)(basepath+f, './data/'+f) for f in files)

['./data/Clara Berry And Wooldog - Air Traffic.npz',
 './data/Clara Berry And Wooldog - Stella.npz',
 './data/Clara Berry And Wooldog - Waltz For My Victims.npz',
 './data/Cnoc An Tursa - Bannockburn.npz',
 './data/Creepoid - OldTree.npz',
 './data/Dark Ride - Burning Bridges.npz',
 './data/Dreamers Of The Ghetto - Heavy Love.npz',
 './data/Drumtracks - Ghost Bitch.npz',
 './data/Faces On Film - Waiting For Ga.npz',
 './data/Fergessen - Back From The Start.npz',
 './data/Fergessen - Nos Palpitants.npz',
 './data/Fergessen - The Wind.npz',
 './data/Flags - 54.npz',
 './data/Giselle - Moss.npz',
 './data/Grants - PunchDrunk.npz',
 './data/Helado Negro - Mitad Del Mundo.npz',
 './data/Hezekiah Jones - Borrowed Heart.npz',
 './data/Hollow Ground - Left Blind.npz',
 './data/Hop Along - Sister Cities.npz',
 './data/Invisible Familiars - Disturbing Wildlife.npz',
 './data/James May - All Souls Moon.npz',
 './data/James May - Dont Let Go.npz',
 './data/James May - If You Say.npz',
 './data/Jam

In [None]:
# Copy every .npz archive found directly under the Drive project folder
# (the hitl*.npz files listed in the output below) into ./data/.
basepath = '/content/drive/MyDrive/UGP/'
from shutil import copyfile
from joblib import Parallel, delayed

# exist_ok=True tolerates a re-run of this cell, while any other failure
# (e.g. permissions) is raised instead of being silently swallowed.
os.makedirs('./data/', exist_ok=True)

files = [f for f in os.listdir(basepath) if f.split('.')[-1]=='npz']

# copyfile returns the destination path, so the cell output lists the copies made.
Parallel(n_jobs=8)(delayed(copyfile)(basepath+f, './data/'+f) for f in files)

['./data/hitl0.npz',
 './data/hitl1.npz',
 './data/hitl2.npz',
 './data/hitl3.npz',
 './data/hitl4.npz',
 './data/hitl5.npz',
 './data/hitl6.npz',
 './data/hitl7.npz',
 './data/hitl8.npz',
 './data/hitl9.npz',
 './data/hitl10.npz',
 './data/hitl11.npz',
 './data/hitl12.npz',
 './data/hitl13.npz',
 './data/hitl14.npz',
 './data/hitl15.npz',
 './data/hitl16.npz',
 './data/hitl17.npz',
 './data/hitl18.npz',
 './data/hitl19.npz',
 './data/hitl20.npz',
 './data/hitl21.npz']

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

class Source_Pitch(nn.Module):
  """Convolutional encoder-decoder with skip connections that masks its own input.

  The encoder applies six stride-2 convolutions (1 -> 512 channels). Each decoder
  stage upsamples with a transposed convolution to the spatial size of the matching
  encoder activation, concatenates that activation along the channel axis and fuses
  the result with a stride-1 convolution. The last transposed convolution yields a
  single channel that goes through a sigmoid and a batch norm and is multiplied
  element-wise with the input.

  Input: a tensor with one channel, i.e. shape (N, 1, H, W).
  Output: a tensor with the same shape as the input.
  """

  def __init__(self):
    super().__init__()

    kernel, pad = (5,5), (2,2)
    down, unit = (2,2), (1,1)

    # Encoder convolutions (stride 2).
    self.conv1 = nn.Conv2d(1, 16, kernel, down, pad)
    self.conv2 = nn.Conv2d(16, 32, kernel, down, pad)
    self.conv3 = nn.Conv2d(32, 64, kernel, down, pad)
    self.conv4 = nn.Conv2d(64, 128, kernel, down, pad)
    self.conv5 = nn.Conv2d(128, 256, kernel, down, pad)
    self.conv6 = nn.Conv2d(256, 512, kernel, down, pad)

    # Fusion convolutions (stride 1) applied after each skip concatenation.
    self.conv7 = nn.Conv2d(512, 256, kernel, unit, pad)
    self.conv8 = nn.Conv2d(256, 128, kernel, unit, pad)
    self.conv9 = nn.Conv2d(128, 64, kernel, unit, pad)
    self.conv10 = nn.Conv2d(64, 32, kernel, unit, pad)
    self.conv11 = nn.Conv2d(32, 16, kernel, unit, pad)

    # Decoder transposed convolutions (stride 2).
    self.convT1 = nn.ConvTranspose2d(512, 256, kernel, down, pad)
    self.convT2 = nn.ConvTranspose2d(256, 128, kernel, down, pad)
    self.convT3 = nn.ConvTranspose2d(128, 64, kernel, down, pad)
    self.convT4 = nn.ConvTranspose2d(64, 32, kernel, down, pad)
    self.convT5 = nn.ConvTranspose2d(32, 16, kernel, down, pad)
    self.convT6 = nn.ConvTranspose2d(16, 1, kernel, down, pad)

    self.dropout = nn.Dropout(0.5)

    # Encoder batch norms. bn0 is not referenced by forward(); it stays
    # registered so the set of layers in the module is unchanged.
    self.bn0 = nn.BatchNorm2d(1)
    self.bn1 = nn.BatchNorm2d(16)
    self.bn2 = nn.BatchNorm2d(32)
    self.bn3 = nn.BatchNorm2d(64)
    self.bn4 = nn.BatchNorm2d(128)
    self.bn5 = nn.BatchNorm2d(256)
    self.bn6 = nn.BatchNorm2d(512)

    # Decoder batch norms (bn1N pairs with the N-channel decoder stage).
    self.bn10 = nn.BatchNorm2d(1)
    self.bn11 = nn.BatchNorm2d(16)
    self.bn12 = nn.BatchNorm2d(32)
    self.bn13 = nn.BatchNorm2d(64)
    self.bn14 = nn.BatchNorm2d(128)
    self.bn15 = nn.BatchNorm2d(256)

  def forward(self, x):
    """Return x multiplied element-wise by the mask the network predicts from x."""
    encoder = (
      (self.conv1, self.bn1),
      (self.conv2, self.bn2),
      (self.conv3, self.bn3),
      (self.conv4, self.bn4),
      (self.conv5, self.bn5),
      (self.conv6, self.bn6),
    )
    # (upsample, fuse, norm, apply_dropout) - only the three deepest stages use dropout.
    decoder = (
      (self.convT1, self.conv7, self.bn15, True),
      (self.convT2, self.conv8, self.bn14, True),
      (self.convT3, self.conv9, self.bn13, True),
      (self.convT4, self.conv10, self.bn12, False),
      (self.convT5, self.conv11, self.bn11, False),
    )

    # skips collects the input of every encoder stage: [x, x1, ..., x5].
    skips = []
    feat = x
    for conv, norm in encoder:
      skips.append(feat)
      feat = norm(F.leaky_relu(conv(feat), 0.2))

    # Walk back up, consuming the skips deepest-first (x5, x4, ..., x1).
    for upsample, fuse, norm, use_dropout in decoder:
      skip = skips.pop()
      feat = upsample(feat, output_size=skip.shape[2:])
      feat = fuse(torch.cat((feat, skip), dim=1))
      feat = norm(F.relu(feat))
      if use_dropout:
        feat = self.dropout(feat)

    mask = self.convT6(feat, output_size=x.shape[2:])
    mask = self.bn10(torch.sigmoid(mask))
    return torch.mul(mask, x)

Using cuda device


In [None]:
# Sorted, de-duplicated names of everything in the Drive training folder.
train_path = '/content/drive/MyDrive/UGP/UGP/ugp data/train/'
train_files = np.array(sorted(set(os.listdir(train_path))))

In [None]:
# Names of the held-out validation archives (directory part stripped).
val_files = np.load('/content/drive/MyDrive/UGP/UGP/val.npz.npy')
val_files = [x.split('/')[-1] for x in val_files]

# Keep only .npz archives that are not held out for validation.
# The result is sorted rather than taken from list(set(...)): set iteration
# order for strings depends on the per-process hash seed, and the seeded
# sampling in NewTrainGenerator indexes into this list, so an unordered list
# makes the sampled training files differ between kernel sessions.
held_out = set(val_files)
train_files = sorted({x for x in train_files if x not in held_out and x.split('.')[-1]=='npz'})

len(val_files), len(train_files)

(10, 90)

In [None]:
# Source to separate: passed to NewTrainGenerator as the archive key of the
# target spectrogram and used by eval() as the output .wav file name.
STEM = 'vocal'

In [None]:
class NewTrainGenerator(Dataset):
  """Dataset whose items are whole batches mixing HITL windows with supervised windows.

  Every item is a pair (X, Y) of float32 tensors built from 512-frame windows cut
  out of .npz archives: the first `hitl_size` windows come from a user-annotated
  (HITL) archive, the remaining `batch_size - hitl_size` windows come from randomly
  chosen supervised archives. X is cut from the archive's 'mix' array and Y from the
  array stored under `stem`; both are divided by the maximum of that archive's 'mix'
  and have their last row along the first axis dropped.

  Args:
    sup_files: names of the supervised archives (relative to sup_basepath).
    hitl_files: names of the HITL archives (relative to hitl_basepath).
    sup_basepath: string prefix prepended to each supervised file name.
    hitl_basepath: string prefix prepended to each HITL file name.
    stem: archive key of the target array.
    batch_size: number of windows per batch.
    hitl_size: number of HITL windows per batch.
    hitl_iter: number of batches generated per HITL file.
    data_len_path: pickle file holding a dict that maps a supervised file's base
      name to its number of frames. Defaults to the project's Drive location.

  Notes:
    * The constructor draws from NumPy's global RNG and calls np.random.seed(42)
      after sampling the HITL offsets, so it resets the global seed for the caller.
    * Indexing past the last batch raises IndexError, which is what ends a plain
      `for batch in dataset` loop.
  """

  WINDOW = 512              # frames per training window
  SUP_OFFSETS_PER_FILE = 10  # candidate window offsets sampled per supervised file

  def __init__(self, sup_files, hitl_files, sup_basepath, hitl_basepath, stem, batch_size, hitl_size, hitl_iter,
               data_len_path='/content/drive/MyDrive/UGP/UGP/data_len'):
    self.sup_files = sup_files
    self.hitl_files = hitl_files
    self.sup_basepath = sup_basepath
    self.hitl_basepath = hitl_basepath
    self.stem = stem
    self.batch_size = batch_size

    self.hitl_size = hitl_size
    self.hitl_iter = hitl_iter

    # Row b of hitl_mapping holds the window offsets of batch b, row b of
    # hitl_mapping_file the index of the file each window is cut from.
    self.hitl_mapping = []
    self.hitl_mapping_file = []

    for r, f in enumerate(self.hitl_files):
      # The archive is only needed for its length, so read it once per file
      # (not once per iteration) and close it straight away.
      with np.load(self.hitl_basepath + f) as archive:
        l = archive['mix'].shape[-1] - self.WINDOW
      for k in range(self.hitl_iter):
        self.hitl_mapping.append([np.random.randint(0, l) for _ in range(self.hitl_size)])
        self.hitl_mapping_file.append([r] * self.hitl_size)

    self.sup_mapping = []
    self.sup_mapping_file = []

    np.random.seed(42)

    # NOTE: pickle.load executes arbitrary code from the file; only point
    # data_len_path at a file you created yourself.
    with open(data_len_path, 'rb') as handle:
      out = pickle.load(handle)
    for r, f in enumerate(self.sup_files):
      l = out[f.split('/')[-1]] - self.WINDOW
      self.sup_mapping.append([np.random.randint(0, l) for _ in range(self.SUP_OFFSETS_PER_FILE)])
      self.sup_mapping_file.append(r)

    # Fill every batch up to batch_size with supervised windows.
    for i in range(len(self.hitl_mapping)):
      for m in range(self.batch_size - self.hitl_size):
        j = np.random.randint(len(self.sup_mapping))
        k = np.random.randint(len(self.sup_mapping[j]))
        # Store the sampled frame offset itself. Storing the slot index k
        # (0..9) would make every supervised window start within the first
        # ten frames of its track.
        self.hitl_mapping[i].append(self.sup_mapping[j][k])
        self.hitl_mapping_file[i].append(self.sup_mapping_file[j])

    self.idx_to_idx = np.arange(len(self.hitl_mapping))
    np.random.shuffle(self.idx_to_idx)

  def __len__(self):
    return(len(self.hitl_mapping))

  def __getitem__(self, idx):
    """Return batch `idx` (after the fixed shuffle) as (X, Y) float32 tensors."""
    X, Y = [], []
    idx = self.idx_to_idx[idx]

    for pos, j in enumerate(self.hitl_mapping[idx]):
      file_idx = self.hitl_mapping_file[idx][pos]
      # The first hitl_size entries of a row refer to HITL files, the rest to
      # supervised files.
      if pos < self.hitl_size:
        path = self.hitl_basepath + self.hitl_files[file_idx]
      else:
        path = self.sup_basepath + self.sup_files[file_idx]

      # Read each array once and release the file handle.
      with np.load(path) as mask_data:
        mix = mask_data['mix']
        target = mask_data[self.stem]
      norm = np.max(mix)
      X.append(np.expand_dims(mix[:-1, j:j+self.WINDOW], axis=0) / norm)
      Y.append(np.expand_dims(target[:-1, j:j+self.WINDOW], axis=0) / norm)

    # Stack in NumPy first: building a tensor from a list of arrays is slow.
    return(torch.from_numpy(np.stack(X)).float(), torch.from_numpy(np.stack(Y)).float())

In [None]:
STEM = 'vocal'

def eval(files, model, savePath, step_size):
  """Run `model` over each track's mixture spectrogram and write the separated audio.

  For every archive name in `files` the 'mix' array is cut into 512-frame windows
  (dropping the last row along the first axis), scaled by its maximum, pushed
  through the model in batches of 32 windows and reassembled. The prediction is
  combined with the stored phase, inverted with an ISTFT, resampled from 22050 Hz
  to 44100 Hz and written to `<savePath><name>.stem/<STEM>.wav`. The difference
  between the original mono mixture and that estimate is written next to it as
  `accom.wav`.

  Args:
    files: archive file names (e.g. 'Song.npz') inside the Drive training folder.
    model: network mapping a (N, 1, F, T) tensor to a tensor of the same shape.
    savePath: output directory prefix (string, expected to end with '/').
    step_size: unused; kept so existing calls keep working.

  Note: this function shadows the builtin `eval` inside the notebook.
  """
  SR =  22050
  window_size = 2048
  hop_length = 512
  orig_sr=44100
  train_dir = '/content/drive/MyDrive/UGP/UGP/ugp data/train/'
  width = 512      # frames per window fed to the model
  per_batch = 32   # windows per forward pass

  model.eval()

  for f in files:
    # File name without its last extension (leading dots dropped).
    strname = '.'.join(f.split('.')[:-1]).lstrip('.')
    file_name = strname+'.stem'
    print(savePath+file_name)
    # exist_ok tolerates re-runs; any other failure is reported right here
    # instead of surfacing later when the .wav files are written.
    os.makedirs(savePath+file_name, exist_ok=True)

    # Read the array once: every data['mix'] access on an open archive reads
    # it from disk again.
    with np.load(train_dir+f) as data:
      mix = data['mix']
    norm = mix.max()
    prediction = np.zeros(mix.shape)

    total = mix.shape[1]
    n_full = total // width          # windows that fit entirely
    tail = total - n_full*width      # leftover frames after the last full window
    n_windows = n_full + (1 if tail else 0)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
      for first in range(0, n_windows, per_batch):
        last = min(first+per_batch, n_windows)
        # The leftover frames are covered by one extra window holding the last
        # `width` frames; only its final `tail` columns are kept below.
        X = np.array([
          (mix[:-1, w*width:(w+1)*width] if w < n_full else mix[:-1, -width:]) / norm
          for w in range(first, last)
        ])
        X = torch.unsqueeze(torch.from_numpy(X), dim=1).to(device)
        preds = model(X).cpu().numpy()[:, 0]
        # Write back exactly the windows in this batch. Iterating over a fixed
        # 32 slots fails when the last batch is short and the track length is
        # an exact multiple of the window width.
        for w in range(first, last):
          if w < n_full:
            prediction[:-1, w*width:(w+1)*width] = preds[w-first]
          else:
            prediction[:-1, n_full*width:] = preds[w-first][:, -tail:]

    f_name = train_dir+f[:-4]+'.stem.mp4'
    S, rate = st.read_stems(f_name)
    y_mix_o = librosa.to_mono(S[0].T)
    prediction *= norm
    with np.load(train_dir+'phase/'+f[:-4]+".npz") as phase_data:
      phase = phase_data["arr_0"]
    y = librosa.istft(prediction*phase,win_length=window_size,hop_length=hop_length)
    # Keyword sample rates work on both old and new librosa releases.
    y = librosa.resample(y, orig_sr=SR, target_sr=orig_sr, fix=True)

    file_path = (savePath+strname+'.stem/'+STEM+'.wav')
    sf.write(file_path, y, orig_sr)

    # NOTE(review): assumes y and y_mix_o have the same length - confirm.
    accom = y_mix_o-y
    file_path = (savePath+strname+'.stem/'+'accom.wav')
    sf.write(file_path, accom, orig_sr)

In [None]:
def evaluate(files):
  """Score the predicted accompaniment/vocal audio of each track with museval.

  For every archive name in `files`, the reference stems
  `<name>_accom.wav` / `<name>_vocal.wav` are read from the stems folder and the
  estimates `accom.wav` / `vocal.wav` from `<name>.stem/` in the predictions
  folder. Both are stacked in the order [accom, vocal] and passed to
  `museval.metrics.bss_eval`.

  Returns:
    A list with one `bss_eval` result per file, in the order of `files`.
  """
  est_path = '/content/drive/MyDrive/UGP/UGP/preds/'
  src_path = '/content/drive/MyDrive/UGP/UGP/stems/'
  scores = []
  for f in files:
    # File name without its last extension (leading dots dropped).
    strname = '.'.join(f.split('.')[:-1]).lstrip('.')

    reference = np.array([sf.read(src_path+strname+'_accom.wav')[0],
                          sf.read(src_path+strname+'_vocal.wav')[0]])
    vocal = sf.read(est_path + strname+'.stem/vocal.wav')[0]
    accom = sf.read(est_path + strname+'.stem/accom.wav')[0]
    estimate = np.array([accom, vocal])

    # bss_eval receives a trailing axis of size 1 on both arrays.
    scores.append(museval.metrics.bss_eval(np.expand_dims(reference, -1),
                                           np.expand_dims(estimate, -1)))

  return(scores)

Complete pipeline for fine-tuning the base model on a synthetic music track created from partial annotations of a single song. First, the dataloader is defined with appropriate hyperparameter values. Next, the base model is fine-tuned. Finally, the fine-tuned model is evaluated using the mean and median SDR metrics.

In [None]:
BATCH_SIZE = 16

def _sum_track_metrics(scores, reduce_fn):
  """Reduce each track's per-window metrics with reduce_fn and sum over tracks.

  Each entry of `scores` is one result of evaluate(); entries 0, 2 and 3 of it
  are read as SDR, SIR and SAR, each indexed by source (0 and 1).
  Returns (sdr, sir, sar), each a float64 array of length 2.
  """
  sdr = np.array([0,0], dtype=np.float64)
  sir = np.array([0,0], dtype=np.float64)
  sar = np.array([0,0], dtype=np.float64)
  for track in scores:
    sdr += np.array([reduce_fn(track[0][0]), reduce_fn(track[0][1])])
    sir += np.array([reduce_fn(track[2][0]), reduce_fn(track[2][1])])
    sar += np.array([reduce_fn(track[3][0]), reduce_fn(track[3][1])])
  return sdr, sir, sar

# Run i fine-tunes a freshly loaded base model on i+1 HITL batches
# (the number of runs happens to equal BATCH_SIZE).
for i in range(BATCH_SIZE):
  print(i)
  np.random.seed(42)
  # Seed torch as well. Assigning to torch.random.seed only overwrites that
  # function and seeds nothing, which leaves dropout unseeded between runs.
  torch.manual_seed(42)

  files = ['hitl16.npz']

  train_datagen = NewTrainGenerator(train_files, files, './data/', './data/', STEM, BATCH_SIZE, 1, i+1)

  model = torch.load('/content/drive/MyDrive/UGP/UGP/baseline_models/pytorch_hitl_new.pt').to(device)
  optimizer = optim.Adam(model.parameters(), lr= 0.00001)
  # NOTE: the scheduler is created but never stepped in this cell.
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', threshold=2e-5)
  mae = torch.nn.L1Loss()
  mae.to(device)
  for epoch in range(1):
    model.train()
    t_loss = 0
    # Each item of train_datagen is already a full batch.
    for batch in train_datagen:
      X, Y_mask = batch[0].to(device), batch[1].to(device)

      output = model(X)

      loss = mae(output, Y_mask)

      loss.backward()

      t_loss += loss.item()

      optimizer.step()
      model.zero_grad()

    if epoch in (0, 4, 9):
      eval(["Phre The Eon - Everybody's Falling Apart.npz"], model, '/content/drive/MyDrive/UGP/UGP/preds/', 512)
      s = evaluate(["Phre The Eon - Everybody's Falling Apart.npz"])

      # First printed line: median SDR per source; second: mean SDR per source.
      sdr, sir, sar = _sum_track_metrics(s, np.nanmedian)
      print(sdr)

      sdr, sir, sar = _sum_track_metrics(s, np.nanmean)
      print(sdr)
  


0
/content/drive/MyDrive/UGP/UGP/preds/Phre The Eon - Everybody's Falling Apart.stem
[15.02324905  6.7520741 ]
[18.49024168  1.64544417]
1
/content/drive/MyDrive/UGP/UGP/preds/Phre The Eon - Everybody's Falling Apart.stem
[15.02197728  6.79287478]
[18.51309886  1.71821654]
2
/content/drive/MyDrive/UGP/UGP/preds/Phre The Eon - Everybody's Falling Apart.stem
[15.06721867  6.79709608]
[18.57782665  1.89556815]
3
/content/drive/MyDrive/UGP/UGP/preds/Phre The Eon - Everybody's Falling Apart.stem
[15.0702291   6.78218315]
[18.58704964  1.93310067]
4
/content/drive/MyDrive/UGP/UGP/preds/Phre The Eon - Everybody's Falling Apart.stem
[15.0929759   6.88351916]
[18.67160189  2.17256917]
5
/content/drive/MyDrive/UGP/UGP/preds/Phre The Eon - Everybody's Falling Apart.stem
[15.11951212  6.96062252]
[18.65717853  2.07353363]
6
/content/drive/MyDrive/UGP/UGP/preds/Phre The Eon - Everybody's Falling Apart.stem
[15.13323324  6.93427173]
[18.68422557  2.16947657]
7
/content/drive/MyDrive/UGP/UGP/preds/Ph