# Setup

Install the Python requirements.

In [None]:
!pip install -r requirements.txt &> /dev/null

In [None]:
# install fluidsynth
!apt-get install fluidsynth

Imports

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook
import pretty_midi
from util import crop_or_pad
from plot_listen.listen import play
from ddsp.multi_scale_spectral_loss import multi_scale_spectral_loss
from save_load import load_weights_from_file
from plot_listen.plot import plot, plot_multi_stfts, plot_diff_multi_stfts, plot_midi_conditioning
from globals import *
from data.gset_midi_dataset import GsetMidiDataset
from data.guitarset_loader import GuitarSetLoader
from midi_synth.midi_synth import MidiSynth
from train.model_trainer import ModelTrainer
from util import torch_to_numpy, numpy_to_torch
from midi2audio import FluidSynth
from sample_bank_synth.sample_bank_synth import naiveOnsetFrameSynth
from save_load import save_audio
import librosa
from torch import stft

# Load Test Dataset

In [None]:

# Test split, loaded by file name.
TEST_DATASET_FILE = "test_players-0304_fingerstyle_gset-midi_3.0s.npz"
test_dataset = GsetMidiDataset(TEST_DATASET_FILE)

# Load Synth Model

In [None]:
# Build the synth with its context network enabled.
synth = MidiSynth(use_context_net=True)

# your path here
synth_load_path = ""

# Restore the trained weights from the checkpoint.
synth.load_state_dict(load_weights_from_file(checkpoint_path=synth_load_path))

# Move the model to the compute device and put it in evaluation mode.
synth = synth.to(DEVICE).eval()

In [None]:
# Print the loaded model for inspection.
print(synth)

# Evaluate Model

Multi-scale spectral (MSS) loss and harmonic–percussive source separation (HPSS) calculations

In [None]:
def eval_mss(o, r):
    '''Score a resynthesis against the original with multi scale spectral (MSS) loss.

    o: original audio, numpy array
    r: resynthesized audio, numpy array

    Returns three floats: the MSS loss between the normalized full signals,
    the MSS loss between the harmonic parts, and the MSS loss between the
    percussive parts (harmonic/percussive parts come from librosa's HPSS).

    NOTE(review): only the full-signal loss is computed on normalized audio;
    the harmonic/percussive branches use `o` and `r` as given -- confirm that
    this asymmetry is intended.
    '''

    def as_float(loss_tensor):
        # pull the loss value out of the tensor as a plain Python float
        return float(loss_tensor.data.cpu().numpy())

    def split_harmonic_percussive(audio):
        # STFT -> harmonic/percussive decomposition -> back to the time domain
        harmonic_spec, percussive_spec = librosa.decompose.hpss(librosa.stft(audio))
        harmonic = numpy_to_torch(librosa.istft(harmonic_spec))
        percussive = numpy_to_torch(librosa.istft(percussive_spec))
        return harmonic, percussive

    # full-signal loss, on normalized audio converted to tensors
    normalized_original = numpy_to_torch(librosa.util.normalize(o))
    normalized_resynth = numpy_to_torch(librosa.util.normalize(r))
    full_loss = as_float(multi_scale_spectral_loss(normalized_original, normalized_resynth))

    # harmonic / percussive parts of each signal
    o_harm, o_perc = split_harmonic_percussive(o)
    r_harm, r_perc = split_harmonic_percussive(r)

    # component-wise losses
    harm_loss = as_float(multi_scale_spectral_loss(o_harm, r_harm))
    perc_loss = as_float(multi_scale_spectral_loss(o_perc, r_perc))

    return full_loss, harm_loss, perc_loss




In [None]:
# Per-clip losses for the trained synth over the whole test set.
MSS_losses = []
MSS_harm_losses = []
MSS_perc_losses = []

# Inference only: no_grad stops autograd from recording a graph for every clip.
with torch.no_grad():
    for item in tqdm(test_dataset):
        original_audio = item["mic_audio"]
        conditioning = item["conditioning"]

        # add a batch dimension for the model, then strip it from the output audio
        model_input = numpy_to_torch(conditioning).unsqueeze(0)
        resynth_audio = torch_to_numpy(synth(model_input)['audio'].squeeze(0))

        loss, h_loss, p_loss = eval_mss(original_audio, resynth_audio)
        MSS_losses.append(loss)
        MSS_harm_losses.append(h_loss)
        MSS_perc_losses.append(p_loss)

# Print MSS results: mean and standard deviation over the test set
print(f"\nMSS mean: {np.mean(MSS_losses)}")
print(f"MSS std: {np.std(MSS_losses)}")

print(f"\nHarmonic mean: {np.mean(MSS_harm_losses)}")
print(f"Harmonic std: {np.std(MSS_harm_losses)}")

print(f"\nPercussive mean: {np.mean(MSS_perc_losses)}")
print(f"Percussive std: {np.std(MSS_perc_losses)}")

In [None]:
# Plot the original audio left in `original_audio` by the most recently run
# evaluation loop (i.e. the last test item it processed).
plot(original_audio)

In [None]:
# Plot the resynthesized audio left in `resynth_audio` by the most recently run
# evaluation loop (i.e. the last test item it processed).
plot(resynth_audio)

# Sonify with FluidSynth

In [None]:
def conditioning_to_MIDI(conditioning):
    """Convert per-string frame conditioning into a PrettyMIDI object.

    Args:
        conditioning: array indexed as [frame, string, feature] with 6 strings;
            feature 0 is the pitch (0 = no pitch) and feature 1 is the onset
            velocity (0 = no onset). Values are rounded to integers.

    Returns:
        A pretty_midi.PrettyMIDI holding a single instrument (program 25 --
        presumably steel-string acoustic guitar in General MIDI, confirm) with
        one note per note found in the conditioning.

    Note start/end times are placed at frame centers, using FRAME_RATE.
    Notes whose pitch never became nonzero are dropped instead of being
    written out as MIDI pitch 0.
    """
    n_strings = 6
    n_frames = conditioning.shape[0]
    frame_period = 1 / FRAME_RATE

    # each note is [pitch, vel, start time, end time]
    active_notes = [None] * n_strings  # at most one open note per string
    notes = []
    frame_time = 0.0

    def close_note(string_ind, end_time):
        # finish the open note on this string and move it to the finished list
        note = active_notes[string_ind]
        note[3] = end_time
        notes.append(note)
        active_notes[string_ind] = None

    for frame_ind in range(n_frames):
        # use the center of the frame by adding half of a frame period
        frame_time = (frame_ind * frame_period) + (frame_period / 2)

        for string_ind in range(n_strings):
            pitch = int(round(conditioning[frame_ind][string_ind][0]))
            onset_vel = int(round(conditioning[frame_ind][string_ind][1]))

            if onset_vel > 0:
                # a new onset ends whatever was still sounding on this string
                if active_notes[string_ind] is not None:
                    close_note(string_ind, frame_time)
                active_notes[string_ind] = [pitch, onset_vel, frame_time, None]

            elif frame_ind == 0 and pitch > 0:
                # note already sounding when the clip starts: the onset velocity
                # is unknown, so guess the middle value, 64
                active_notes[string_ind] = [pitch, 64, frame_time, None]

            elif (active_notes[string_ind] is not None
                  and active_notes[string_ind][0] > 0
                  and pitch == 0):
                # pitch dropped to 0: the active note is over
                close_note(string_ind, frame_time)

            # an onset can arrive before its pitch does; fill the pitch in
            # as soon as a nonzero value shows up
            note = active_notes[string_ind]
            if note is not None and note[0] == 0 and pitch > 0:
                note[0] = pitch

    # end any still active notes at the last frame time
    for string_ind in range(n_strings):
        if active_notes[string_ind] is not None:
            close_note(string_ind, frame_time)

    midi_obj = pretty_midi.PrettyMIDI()
    inst = pretty_midi.Instrument(program=25)

    for pitch, velocity, start, end in notes:
        if pitch <= 0:
            # the onset never received a pitch; there is nothing valid to play
            continue
        inst.notes.append(pretty_midi.Note(velocity=velocity,
                                           pitch=pitch,
                                           start=start,
                                           end=end))

    midi_obj.instruments.append(inst)

    return midi_obj

def conditioning_thru_fluid(conditioning, fname="temp"):
    """Render conditioning to audio with FluidSynth.

    The conditioning is converted to MIDI, synthesized to a WAV file at sample
    rate SR, and loaded back with librosa.

    Args:
        conditioning: conditioning array accepted by conditioning_to_MIDI.
        fname: base name for the intermediate .mid/.wav files.

    Returns:
        The audio that librosa loads from the rendered WAV file.

    The intermediate files live in a private temporary directory that is
    removed even if synthesis or loading fails, so nothing is left behind in
    the working directory.
    """
    import tempfile  # local import: only this function needs it

    midi_obj = conditioning_to_MIDI(conditioning)

    with tempfile.TemporaryDirectory() as tmp_dir:
        midi_path = os.path.join(tmp_dir, f"{fname}.mid")
        wav_path = os.path.join(tmp_dir, f"{fname}.wav")

        midi_obj.write(midi_path)
        fs = FluidSynth(sample_rate=SR)
        fs.midi_to_audio(midi_path, wav_path)
        y, _ = librosa.load(wav_path, sr=SR)

    return y

In [None]:
# Pick a test item to audition (here: the first one the dataset yields), so the
# cell also works on a freshly restarted kernel.
example = next(iter(test_dataset))

# Render its conditioning with FluidSynth and listen to the result.
fluid_audio = conditioning_thru_fluid(example["conditioning"])
play(fluid_audio)

In [None]:
# Listen to the example's "mic_audio" recording for comparison.
# Requires `example` (a dataset item) to already exist in the kernel.
play(example["mic_audio"])

# Evaluate FluidSynth on Test Data

In [None]:
# Per-clip losses for the FluidSynth baseline over the whole test set.
MSS_losses = []
MSS_harm_losses = []
MSS_perc_losses = []

# tqdm wraps the dataset itself (not the enumerate object) so it can pick up
# the dataset's length, when available, and show real progress.
for i, item in enumerate(tqdm(test_dataset)):
    original_audio = item["mic_audio"]
    conditioning = item["conditioning"]

    # render with FluidSynth, then match the length of the original recording
    fluid_render = conditioning_thru_fluid(conditioning, fname=f"file_{i}")
    resynth_audio = crop_or_pad(fluid_render, len(original_audio))

    loss, h_loss, p_loss = eval_mss(original_audio, resynth_audio)
    MSS_losses.append(loss)
    MSS_harm_losses.append(h_loss)
    MSS_perc_losses.append(p_loss)

# Print MSS results: mean and standard deviation over the test set
print(f"\nMSS mean: {np.mean(MSS_losses)}")
print(f"MSS std: {np.std(MSS_losses)}")

print(f"\nHarmonic mean: {np.mean(MSS_harm_losses)}")
print(f"Harmonic std: {np.std(MSS_harm_losses)}")

print(f"\nPercussive mean: {np.mean(MSS_perc_losses)}")
print(f"Percussive std: {np.std(MSS_perc_losses)}")



# Sonify with GuitarSet Sample Bank

### Conditioning to onsets/frames representation

In [None]:
def conditioning_to_onsets_frames(conditioning):
    """Convert per-string frame conditioning into onset and frame piano rolls.

    Args:
        conditioning: array indexed as [frame, string, feature] with 6 strings;
            feature 0 is the pitch (0 = no pitch) and feature 1 is the onset
            velocity (0 = no onset). Values are rounded to integers.

    Returns:
        (onsets_tr, frames_tr): two numpy arrays of shape (1, 44, n_frames).
        Row k corresponds to MIDI pitch 40 + k. onsets_tr holds the scaled
        velocity (velocity / MIDI_NORM) at each note's first frame; frames_tr
        holds it on every frame from the note's first to its last, inclusive.

    Notes whose pitch lies outside the 44 rows (including onsets whose pitch
    never became nonzero) are skipped: indexing with them would either wrap
    around to an unrelated row or raise IndexError.
    """
    lowest_pitch = 40  # lowest midi note playable on a standard acoustic guitar
    n_pitches = 44     # 44 notes playable on guitar
    n_strings = 6

    n_frames = conditioning.shape[0]

    # each note is [pitch, vel, first frame, last frame]
    active_notes = [None] * n_strings  # at most one open note per string
    notes = []

    def close_note(string_ind, last_frame):
        # finish the open note on this string and move it to the finished list
        note = active_notes[string_ind]
        note[3] = last_frame
        notes.append(note)
        active_notes[string_ind] = None

    for frame_ind in range(n_frames):
        for string_ind in range(n_strings):
            pitch = int(round(conditioning[frame_ind][string_ind][0]))
            onset_vel = int(round(conditioning[frame_ind][string_ind][1]))

            if onset_vel > 0:
                # a new onset ends whatever was still sounding on this string
                if active_notes[string_ind] is not None:
                    close_note(string_ind, frame_ind)
                active_notes[string_ind] = [pitch, onset_vel, frame_ind, None]

            elif frame_ind == 0 and pitch > 0:
                # note already sounding when the clip starts: the onset velocity
                # is unknown, so guess the middle value, 64
                active_notes[string_ind] = [pitch, 64, frame_ind, None]

            elif (active_notes[string_ind] is not None
                  and active_notes[string_ind][0] > 0
                  and pitch == 0):
                # pitch dropped to 0: the active note is over
                close_note(string_ind, frame_ind)

            # an onset can arrive before its pitch does; fill the pitch in
            # as soon as a nonzero value shows up
            note = active_notes[string_ind]
            if note is not None and note[0] == 0 and pitch > 0:
                note[0] = pitch

    # end any still active notes on the final frame
    for string_ind in range(n_strings):
        if active_notes[string_ind] is not None:
            close_note(string_ind, n_frames - 1)

    onsets_tr = np.zeros((1, n_pitches, n_frames))
    frames_tr = np.zeros((1, n_pitches, n_frames))

    for pitch, velocity, first_frame, last_frame in notes:
        pitch_index = pitch - lowest_pitch
        if not 0 <= pitch_index < n_pitches:
            # unplayable / missing pitch: do not let it alias onto another row
            continue

        # scale velocity by MIDI_NORM
        scaled_vel = velocity / MIDI_NORM

        onsets_tr[:, pitch_index, first_frame] = scaled_vel
        frames_tr[:, pitch_index, first_frame:(last_frame + 1)] = scaled_vel

    return onsets_tr, frames_tr

def conditioning_thru_sample_bank(conditioning, ignore_envs=True):
    """Render conditioning to audio with the naive onset/frame sample-bank synth.

    The conditioning is turned into onset and frame rolls, both are converted
    to torch tensors, and a freshly constructed naiveOnsetFrameSynth (moved to
    DEVICE) synthesizes them. `ignore_envs` is passed straight through to the
    synth. Returns the synth output with its first dimension squeezed out.
    """
    onsets, frames = (
        numpy_to_torch(roll) for roll in conditioning_to_onsets_frames(conditioning)
    )

    bank_synth = naiveOnsetFrameSynth()
    bank_synth.to(DEVICE)

    return bank_synth(onsets, frames, ignore_envs=ignore_envs).squeeze(0)

In [None]:
# Render the example's conditioning through the sample bank synth.
# Requires `example` (a dataset item) to already exist in the kernel.
sample_bank_audio = conditioning_thru_sample_bank(example["conditioning"])

# Print the mean sample value, then listen to the render.
print(sample_bank_audio.mean())
play(sample_bank_audio)

In [None]:
# Listen to the example's "mic_audio" recording for comparison.
# Requires `example` (a dataset item) to already exist in the kernel.
play(example["mic_audio"])

In [None]:
# Plot the sample bank render produced for the example.
plot(sample_bank_audio)

In [None]:
# Plot the example's "mic_audio" recording for comparison.
# Requires `example` (a dataset item) to already exist in the kernel.
plot(example["mic_audio"])

# Evaluate Sample Bank Synth on Test Data

In [None]:
# Per-clip losses for the sample bank baseline over the whole test set.
MSS_losses = []
MSS_harm_losses = []
MSS_perc_losses = []

# tqdm wraps the dataset itself (not the enumerate object) so it can pick up
# the dataset's length, when available, and show real progress.
for i, item in enumerate(tqdm(test_dataset)):
    original_audio = item["mic_audio"]
    conditioning = item["conditioning"]

    # render with the sample bank, match the original's length, convert to numpy
    bank_render = conditioning_thru_sample_bank(conditioning)
    resynth_audio = torch_to_numpy(crop_or_pad(bank_render, len(original_audio)))

    loss, h_loss, p_loss = eval_mss(original_audio, resynth_audio)
    MSS_losses.append(loss)
    MSS_harm_losses.append(h_loss)
    MSS_perc_losses.append(p_loss)

# Print MSS results: mean and standard deviation over the test set
print(f"\nMSS mean: {np.mean(MSS_losses)}")
print(f"MSS std: {np.std(MSS_losses)}")

print(f"\nHarmonic mean: {np.mean(MSS_harm_losses)}")
print(f"Harmonic std: {np.std(MSS_harm_losses)}")

print(f"\nPercussive mean: {np.mean(MSS_perc_losses)}")
print(f"Percussive std: {np.std(MSS_perc_losses)}")