<a href="https://colab.research.google.com/github/asigalov61/Tegridy-MIDI-Dataset/blob/master/Advanced_MIR_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Advanced MIR Tool (Ver. 1.2.)

***

## Based on a work of love by Willie Payne and Ana Elisa Mendez Mendez, whose repo and code this tool builds on: https://github.com/Huriphoonado

***

#### Project Los Angeles

#### Tegridy Code 2020

***

In [None]:
#@title Install Dependencies
#!pip install ipython -U
#!pip install ipython_genutils -U
!git clone https://github.com/marl/medleydb.git
%cd /content/medleydb
!python setup.py install
!sudo apt-get install sox libsox-fmt-mp3
!pip install pyyaml
!!pip install numpy
!pip install six
!pip install librosa
!pip install pydrive
!pip install scipy
!pip install scikit-learn

!pip install mir-eval
!pip install pretty_midi
!pip install pypianoroll
!apt install fluidsynth #Pip does not work for some reason. Only apt works
!pip install midi2audio
!cp /usr/share/sounds/sf2/FluidR3_GM.sf2 /content/font.sf2

In [None]:
#@title Original helpers.py module/functions
# Useful Helper Functions that get called throughout all of our code
# This includes data conversion, array concatenation, etc...

# ----------------- Imports
import numpy as np
import librosa
import librosa.display
from collections import Counter


# For plotting
import mido
import pretty_midi
import pypianoroll
from pypianoroll import Multitrack, Track
import matplotlib
import matplotlib.pyplot as plt
import mir_eval.display

%matplotlib inline

from mido import MidiFile


from midi2audio import FluidSynth

from google.colab import output, drive

from IPython.display import display, Javascript, HTML, Audio, Image

# ----------------- Helper Functions
def hz_to_note_zeros(annotation):
    '''
        Converts frequencies to note names while keeping silence marked
        A value of 0 represents silence and becomes the string '0'; every
        other value is converted with librosa.hz_to_note
        Input: Sequence of frequencies in Hz (0 for silence)
        Output: 1d np.array containing note names instead of frequencies
    '''
    # Collect into a list and convert once: np.append copies the whole array
    # on every call, which is quadratic in the number of frames
    return np.array([librosa.hz_to_note(a, cents=False) if a != 0 else '0'
                     for a in annotation])


def note_to_hz_zeros(annotation):
    '''
        Converts note names to frequencies while keeping silence marked
        The string '0' represents silence and becomes 0; every other value
        is converted with librosa.note_to_hz
        Input: Sequence of note names ('0' for silence)
        Output: 1d float np.array containing frequencies instead of note names
    '''
    # Collect into a list and convert once: np.append copies the whole array
    # on every call, which is quadratic in the number of frames
    # dtype=float keeps an all-silence result as floats, as before
    return np.array([librosa.note_to_hz(a) if a != '0' else 0
                     for a in annotation], dtype=float)


def midi_to_hz_zeros(annotation):
    '''
        Converts MIDI note numbers to frequencies while keeping silence marked
        A value of 0 represents silence and stays 0; every other value is
        converted with librosa.midi_to_hz
        Input: Sequence of MIDI note numbers (0 for silence)
        Output: 1d float np.array containing frequencies instead of MIDI
                numbers
    '''
    # Collect into a list and convert once: np.append copies the whole array
    # on every call, which is quadratic in the number of frames
    return np.array([librosa.midi_to_hz(a) if a != 0 else 0
                     for a in annotation], dtype=float)


def hz_to_midi_zeros(annotation):
    '''
        Converts frequencies to MIDI note numbers while keeping silence marked
        A value of 0 represents silence and stays 0; every other value is
        converted with librosa.hz_to_midi
        Input: Sequence of frequencies in Hz (0 for silence)
        Output: 1d float np.array containing MIDI note numbers instead of
                frequencies
    '''
    # Collect into a list and convert once: np.append copies the whole array
    # on every call, which is quadratic in the number of frames
    return np.array([librosa.hz_to_midi(a) if a != 0 else 0
                     for a in annotation], dtype=float)


def note_to_midi_zeros(annotation):
    '''
        Converts note names to MIDI note numbers while keeping silence marked
        The string '0' represents silence and becomes 0; every other value
        is converted with librosa.note_to_midi
        Input: Sequence of note names ('0' for silence)
        Output: 1d float np.array containing MIDI note numbers instead of
                note names
    '''
    # Collect into a list and convert once: np.append copies the whole array
    # on every call, which is quadratic in the number of frames
    # dtype=float matches the float array the np.append version produced
    return np.array([librosa.note_to_midi(a) if a != '0' else 0
                     for a in annotation], dtype=float)


def midi_to_note_zeros(annotation):
    '''
        Converts MIDI note numbers to note names while keeping silence marked
        A value of 0 represents silence and becomes the string '0'; every
        other value is converted with librosa.midi_to_note
        Input: Sequence of MIDI note numbers (0 for silence)
        Output: 1d np.array containing note names instead of MIDI numbers
    '''
    # Collect into a list and convert once: np.append copies the whole array
    # on every call, which is quadratic in the number of frames
    return np.array([librosa.midi_to_note(a) if a != 0 else '0'
                     for a in annotation])


def concat(data, feature_type):
    '''
    Joins one field of every track into a single np.array for the model
    Inputs:
        List of dicts containing all song data
        String naming the field (feature type) to pull from each dict
    Output: 1d or 2d np.array, concatenated along the first axis
    '''
    per_track = []
    for track in data:
        per_track.append(track[feature_type])

    joined = np.concatenate(per_track)
    print(feature_type, 'array has shape: ', joined.shape)
    return joined


def make_output_name(e, n, s):
    '''
        Builds the JSON output file name from the three mode strings set
        at the start of the program (evaluation, feature, split), joined
        with underscores inside the results/ directory
    '''
    mode_part = '_'.join((e, n, s))
    return 'results/predict_' + mode_part + '.json'


def count_pitches(annotation):
    '''
        Counts the unique classes in an annotation
        Input: 1d np.array of either note or voicing annotations
        Output: dict where class is the key and count (int) is the value
    '''
    unique, counts = np.unique(annotation, return_counts=True)

    # Pair labels with counts directly. Stacking both into one array forces a
    # common dtype, so note names used to turn the counts into strings that
    # then had to be parsed back into ints
    return dict(zip(unique.tolist(), counts.tolist()))


def common_pitches(data, threshold):
    '''
        Sums the per-track pitch counts over the entire inputted dataset
        Inputs:
            Dataset (list of dicts) containing a 'class_counts' field
            Threshold value: pitches counted fewer times are marked for removal
        Outputs (in this order):
            Dict of the pitches to remove, mapped to their total counts
                (everything below the threshold, plus the silence class '0')
            Dict containing total counts for every pitch
    '''
    totals = Counter()
    for track in data:
        totals.update(track['class_counts'])

    to_remove = {}
    for pitch, total in totals.items():
        if total < threshold or pitch == '0':
            to_remove[pitch] = total

    return to_remove, dict(totals)


def keep_some_frames(track_dict, to_remove):
    '''
        Given a collection of labels we do not want to train on, this
        function updates the labels, features, and times of a track so that
        only the remaining frames are kept
        Inputs:
            An audio track dict to be altered; must hold the keys 'labels',
                'times' and 'features'
            A dict/list containing labels we wish to remove
        Output: The same dict, modified in place
    '''
    lbls = np.asarray(track_dict['labels'])
    features = np.asarray(track_dict['features'])
    times = track_dict['times']

    # Indices of the frames whose label survives the filter
    i_to_keep = np.array(
        [i for i, lbl in enumerate(lbls) if lbl not in to_remove],
        dtype=np.intp)

    # Index the arrays instead of rebuilding them from lists: when no frame
    # survives, features keep their (0, num_features) shape and labels keep
    # their dtype, so the track can still be concatenated with the others
    track_dict['labels'] = lbls[i_to_keep]
    track_dict['times'] = [times[i] for i in i_to_keep.tolist()]  # plain list
    track_dict['features'] = features[i_to_keep]

    return track_dict


# TODO - Make the ordering of the input list stay constant
def input_string(prompt_type, options_dict):
    '''
        Builds the text that prompts the user to pick one numbered option
        Inputs:
            String to indicate what the user is being asked for
            Dict containing numbered options (number -> option name)
        Outputs:
            String which will be printed to the console
    '''
    pieces = ['Please choose a(n) ' + prompt_type + ' mode: \n']
    for number, label in options_dict.items():
        pieces.append(str(number) + ': ' + label + '\n')
    pieces.append('Your (integer) choice: ')

    return ''.join(pieces)


In [None]:
#@title Original exporter.py module/functions
# Any exports done to memory contained in this file

# ----------------- Imports
import json
#import helpers as hr
import os


# ----------------- Types of Data to Export
def predictions(all_test_data, mode, file_name):
    '''
        Exports the predictions into a json file so that results can
        be loaded and graphed in another program
        Inputs:
            List of dicts containing all test results; each dict needs the
                keys 'labels', 'guesses', 't_id' and 'times'
            String containing ['voicing' | 'melody' | 'all']
            Path of the JSON file to write
        Outputs:
            JSON File
    '''
    exported = []
    for d in all_test_data:
        if mode != 'voicing':  # Convert Note Names to MIDI vals for plotting
            labels = note_to_midi_zeros(d['labels']).tolist()
            guesses = note_to_midi_zeros(d['guesses']).tolist()
        else:
            labels = d['labels'].tolist()
            guesses = d['guesses'].tolist()

        exported.append({'labels': labels, 'guesses': guesses,
                         't_id': d['t_id'], 'times': d['times']})

    # Predictions are written to a results directory by default, so make
    # sure it exists; exist_ok avoids the check-then-create race
    os.makedirs('results', exist_ok=True)

    # Also create the directory the caller actually asked to write into
    parent = os.path.dirname(file_name)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(file_name, 'w') as file:
        json.dump(exported, file)


def train_test(train_test_data, train_test_name):
    '''
        Serializes the train/test split to a JSON file
        Inputs:
            Dict containing the train/test split
            Name to write the file as (Should be based on the task)
        Output:
            JSON File
    '''
    with open(train_test_name, 'w') as split_file:
        json.dump(train_test_data, split_file)

In [None]:
#@title Original split.py module/functions
# File used for generating or loading train/test splits
# In practice, only the final function should be used, based on how the end
# user has selected to split up the data

# ----------------- Imports

import medleydb as mdb
import os.path
import json


# ----------------- Global Variables
# Passed as test_size when make_test_data builds the train/test split
train_test_split = 0.2  # Used for original dataset creation
# Passed as test_size when validation_mode re-splits the training ids
train_validate_split = 0.2  # Change if you would like new validation split
# JSON file, relative to the working directory, that stores the split's ids
train_test_name = 'train_test.json'


# ----------------- Functions
def quick_mode():
    '''
        Uses three short audio files for train and test data
        Should be used for testing that the system runs all the way through
        without crashing and not for estimating the strength of the features
        Outputs:
            List of medleydb tracks holding training data
            List of medleydb tracks holding test data
    '''
    train_ids = ('MusicDelta_Reggae', 'MusicDelta_Rockabilly')
    test_ids = ('MusicDelta_Shadows',)

    train = [mdb.MultiTrack(t_id) for t_id in train_ids]
    test = [mdb.MultiTrack(t_id) for t_id in test_ids]
    return train, test


def validation_mode():
    '''
        Creates a random split of the stored training ids into
        train/validation sets; the stored test ids are left untouched
        The split size comes from the global train_validate_split
        Inputs: None
        Outputs:
            train: List of multitrack objects to be used for training
            test: List of multitrack objects to be used for validation
    '''
    # No train/test file on disk yet - generate it first
    split_file_missing = not os.path.isfile(train_test_name)
    if split_file_missing:
        make_test_data()

    with open(train_test_name, 'r') as split_file:
        stored = json.load(split_file)

    first_split = mdb.utils.artist_conditional_split(
        trackid_list=stored['train'],  # Do nothing with test data
        test_size=train_validate_split,
        num_splits=1)[0]

    train = [mdb.MultiTrack(t_id) for t_id in first_split['train']]
    test = [mdb.MultiTrack(t_id) for t_id in first_split['test']]
    return train, test


def test_mode():
    '''
        Loads the stored train/test split from disk
        If no split file exists yet, one is generated, a warning is printed
        and the program quits so the user can reconsider the chosen mode
        Outputs:
            train: List of multitrack objects to be used for training
            test: List of multitrack objects to be used for testing
    '''
    if not os.path.isfile(train_test_name):
        make_test_data()
        for warning in (
                'Uh oh, you are starting to test before any validation?!',
                'Run again if you are really sure, but consider switching modes'):
            print(warning)
        quit()

    with open(train_test_name, 'r') as split_file:
        stored = json.load(split_file)
        train_ids, test_ids = stored['train'], stored['test']

    train = [mdb.MultiTrack(t_id) for t_id in train_ids]
    test = [mdb.MultiTrack(t_id) for t_id in test_ids]
    return train, test


def make_test_data():
    '''
        Builds one train/test split over all melody multitracks and exports
        it. Should only be used once to generate train/test datasets at
        the beginning of research
        Outputs: JSON file named by the global train_test_name
                 ('train_test.json')
    '''
    melody_ids = []
    for mtrack in mdb.load_melody_multitracks():
        melody_ids.append(mtrack.track_id)

    first_split = mdb.utils.artist_conditional_split(
        trackid_list=melody_ids,
        test_size=train_test_split,
        num_splits=1)[0]

    split_ids = {'train': first_split['train'], 'test': first_split['test']}
    train_test(split_ids, train_test_name)

    print('Generated Train/Test Data Split!')


# ----------------- Function Generator
def generate_split(split_type):
    '''
        Returns the right split function based on the string inputted
        If the string 'options' is inputted, it will return a dict containing
        the numbered mode names rather than a function
        Input: String containing ['options' | 'quick' | 'validate' | 'test']
        Output: Split function corresponding to input, or the options dict
    '''
    splits_dict = {
        'quick': quick_mode,
        'validate': validation_mode,
        'test': test_mode
    }

    if split_type == 'options':
        return dict(enumerate(splits_dict))

    return splits_dict[split_type]

In [None]:
#@title Original evaluate.py module/functions
# File containing functions used for evaluation of all data
# In practice, only the run_eval function should be used given a string
# variable in the main function pertaining to the mode

# Functions convert to cents and voicing following mirex guidelines:
#    http://craffel.github.io/mir_eval/#module-mir_eval.melody

# ----------------- Imports
from mir_eval import melody as mel_eval
import numpy as np

# ----------------- Evaluation Functions
def evaluate_model_voicing(test_guesses, test_labels):
    '''
        If we are only looking at voicing, then we only care about some metrics
        Inputs:
            1d Boolean np.array containing all predictions made by the model
            1d Boolean np.array containing all ground truth labels
        Output: Dict containing the voicing recall, the voicing false alarm
                rate and the overall (frame-wise) accuracy
    '''
    ref_voicing = test_labels.astype(bool)
    est_voicing = test_guesses.astype(bool)

    print('Evaluating voicing...')
    vx_recall, vx_false_alarm = mel_eval.voicing_measures(ref_voicing,
                                                          est_voicing)
    print('Evaluating overall accuracy...')
    correct_tries = (ref_voicing == est_voicing)
    # Count the matches inside numpy; the builtin sum() walks the array one
    # Python object at a time, which is slow over every frame of a test set
    overall_accuracy = np.count_nonzero(correct_tries) / correct_tries.size

    # NOTE: the 'vx_false_alarm ' key carries a trailing space; it is kept
    # as-is so anything reading the returned dict keeps working
    metrics = {
        'vx_recall': vx_recall,
        'vx_false_alarm ': vx_false_alarm,
        'overall_accuracy': overall_accuracy
    }

    for m, v in metrics.items():
        print(m, ':', v)
    return metrics


def evaluate_model_melody(test_guesses, test_labels):
    '''
        Run standard pitch and chroma evaluations on all test data
        Every frame is treated as voiced for both reference and estimate
        Inputs:
            1d np.array containing all predictions made by the model
            1d np.array containing all ground truth labels
        Outputs:
            Dict holding results of all evaluations
    '''
    # The helpers live in this notebook rather than in an imported `hr`
    # module, so they are called directly (hr.* raised a NameError)
    ref_freq = note_to_hz_zeros(test_labels)
    est_freq = note_to_hz_zeros(test_guesses)

    ref_cent = mel_eval.hz2cents(ref_freq)
    est_cent = mel_eval.hz2cents(est_freq)

    all_voiced = np.ones(len(ref_cent), dtype=bool)

    print('Evaluating pitch...')
    raw_pitch = mel_eval.raw_pitch_accuracy(all_voiced, ref_cent,
                                            all_voiced, est_cent,
                                            cent_tolerance=50)

    print('Evaluating chroma...')
    raw_chroma = mel_eval.raw_chroma_accuracy(all_voiced, ref_cent,
                                              all_voiced, est_cent,
                                              cent_tolerance=50)
    metrics = {
        'raw_pitch': raw_pitch,
        'raw_chroma': raw_chroma,
    }

    for m, v in metrics.items():
        print(m, ':', v)

    return metrics


def evaluate_model_all(test_guesses, test_labels):
    '''
        Run standard Mirex evaluations on all test data
        Inputs:
            1d np.array containing all predictions made by the model
            1d np.array containing all ground truth labels
        Outputs:
            Dict holding results of all evaluations
    '''
    print('Running conversions...')
    # The helpers live in this notebook rather than in an imported `hr`
    # module, so they are called directly (hr.* raised a NameError)
    ref_freq = note_to_hz_zeros(test_labels)  # And back to Hz!
    est_freq = note_to_hz_zeros(test_guesses)

    ref_cent = mel_eval.hz2cents(ref_freq)  # Then to cents...
    est_cent = mel_eval.hz2cents(est_freq)

    ref_voicing = mel_eval.freq_to_voicing(ref_freq)[1]  # And voicings!
    est_voicing = mel_eval.freq_to_voicing(est_freq)[1]

    print('Evaluating voicing...')
    vx_recall, vx_false_alarm = mel_eval.voicing_measures(ref_voicing,
                                                          est_voicing)

    print('Evaluating pitch...')
    raw_pitch = mel_eval.raw_pitch_accuracy(ref_voicing, ref_cent,
                                            est_voicing, est_cent,
                                            cent_tolerance=50)

    print('Evaluating chroma...')
    raw_chroma = mel_eval.raw_chroma_accuracy(ref_voicing, ref_cent,
                                              est_voicing, est_cent,
                                              cent_tolerance=50)

    print('Evaluating overall accuracy...')
    overall_accuracy = mel_eval.overall_accuracy(ref_voicing, ref_cent,
                                                 est_voicing, est_cent,
                                                 cent_tolerance=50)

    # NOTE: the 'vx_false_alarm ' key carries a trailing space; it is kept
    # as-is so anything reading the returned dict keeps working
    metrics = {
        'vx_recall': vx_recall,
        'vx_false_alarm ': vx_false_alarm,
        'raw_pitch': raw_pitch,
        'raw_chroma': raw_chroma,
        'overall_accuracy': overall_accuracy
    }

    for m, v in metrics.items():
        print(m, ':', v)

    return metrics


# ----------------- Function Generator
def generate_eval(mode):
    '''
        Returns the right evaluation function based on the string inputted
        If the string 'options' is inputted, it will return a dict containing
        the numbered mode names rather than a function
        Input: String containing ['options' | 'voicing' | 'melody' | 'all']
        Output: Evaluation function corresponding to input, or the options
                dict
    '''
    evaluations = {
        'voicing': evaluate_model_voicing,
        'melody': evaluate_model_melody,
        'all': evaluate_model_all
    }

    if mode == 'options':
        return dict(enumerate(evaluations))

    return evaluations[mode]

In [None]:
#@title Original features.py module/functions
# Possible Feature Representations to convert input audio
# Style Note: Feature Representations meant to be called by user should begin
# with 'with_' and should end with a call to 'final_steps'
# Once a feature representation is ready, add it to the dict created in the
# generator function and it will be accessible to the user

# TODO - Figure out how to make final_steps a higher order function to
# reduce code duplication for functions below that have the same steps (stft)

# ----------------- Imports
import numpy as np
import librosa

from scipy.fftpack import fft
from scipy.fftpack import ifft


# ----------------- Global Variables
target_sr = 22050  # Lower this if you decide to downsample
original_sr = 44100  # Sampling rate for tracks in Medleydb
n_fft = 1024  # FFT size used by the transforms below
win_length = 1024  # Number of samples in each analysis window
# 256 scaled by target_sr / original_sr, i.e. 128 with the values above
hop_length = int(256 * (target_sr / original_sr))  # So time points line up
window = 'hann'  # Window name handed to librosa.stft


# ----------------- Transformation Functions
def with_stft(y):
    '''
        Computes the magnitude of a short-term Fourier transform
        Input: Audio file
        Output: 2d np.Array of size (time_points, num_features)
    '''
    spectrum = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                            win_length=win_length, window=window)

    # Magnitudes turn the complex bins into floats before normalizing
    magnitude = np.absolute(spectrum)
    return final_steps(magnitude)


def with_cube_root(y):
    '''
        Takes the cube root of the STFT magnitude - supposed to model human
        hearing
        Input: Audio file
        Output: 2d np.Array of size (time_points, num_features)
    '''
    spectrum = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                            win_length=win_length, window=window)

    compressed = np.cbrt(np.absolute(spectrum))
    return final_steps(compressed)


def with_autocorrelation(y):
    '''
        Uses autocorrelation to detect periodic signals
        Each frame's autocorrelation is the inverse FFT of its power
        spectrum; only lags min_lag..max_lag are kept and every lag is
        scaled by 1 / (win_length - lag)
        Input: Audio file
        Output: 2d np.Array of size (time_points, num_features)
    '''
    min_lag = 15
    max_lag = 400
    lag_count = max_lag - min_lag + 1

    # Column vector holding 1 / (win_length - lag) for every kept lag
    overlap = np.array(np.linspace(win_length - min_lag,
                                   win_length - max_lag, lag_count), ndmin=2)
    lag_scale = 1 / np.transpose(overlap)

    spectrum = stft_no_loss(y)

    acf = np.zeros(spectrum.shape) + 0j
    for frame in range(spectrum.shape[1]):
        power = np.power(np.absolute(spectrum[:, frame]), 2)
        acf[:, frame] = ifft(power)

    kept_lags = np.real(acf)[min_lag:max_lag+1, :]

    return final_steps(lag_scale * kept_lags)


# TODO - We have determined that the entire frame contains all zeros
# Figure out why this function is not working
def with_cepstrum(y):
    '''
        Cepstrum may be computed as the following:
            FT -> abs() -> log() -> IFT -> real()
        Input: Audio file
        Output: 2d np.Array of size (time_points, num_features)
    '''
    magnitude = np.absolute(stft_no_loss(y))

    # Replace exact zeros with a small value before taking the log
    magnitude[magnitude == 0] = 0.00001

    quefrency = ifft(np.log(magnitude), axis=0)

    return final_steps(np.real(quefrency))


def with_salience(y):
    '''
        Measurement of salience (perceived amplitude/energy) over time
        Input: Audio file
        Output: 2d np.Array of size (time_points, num_features)
    '''
    magnitude = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                                    win_length=win_length, window=window))
    freqs = librosa.fft_frequencies(sr=target_sr, n_fft=n_fft)
    # Harmonics handed to librosa.salience: one sub-harmonic, then 1 to 12
    h_range = [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

    salience = librosa.salience(magnitude, freqs, h_range)
    # NaN entries are replaced with zeros before normalizing
    salience[np.isnan(salience)] = 0

    return final_steps(salience)


def stft_no_loss(y):
    '''
        Unlike Librosa, this version of STFT does not remove any redundant
        information: every frame keeps all of its FFT bins
        Input: Audio file
        Output: 2d complex np.Array of size (num_features, time_points)
    '''
    # Output size is taken from the unpadded signal length; any column that
    # no frame fills stays zero
    n_columns = int(np.ceil(len(y)/hop_length)+1)
    spectrum = np.zeros((win_length, n_columns)) + 0j

    padded = np.pad(y, int(n_fft / 2), mode='reflect')
    frames = librosa.util.frame(padded, frame_length=win_length,
                                hop_length=hop_length)

    for column in range(frames.shape[1]):
        spectrum[:, column] = fft(frames[:, column], n_fft)

    return spectrum


def final_steps(s):
    '''
    Wraps around normalize function and runs any final steps common
    to all functions to finish processing a feature vector array
    A constant input (e.g. digital silence) comes back as all zeros
    Input: 2d np.array
    Output: Modified 2d np.array with range 0-1 and axes swapped
    '''
    # Normalize entire matrix from 0 to 1 - useful since amplitude differs
    s = s - np.amin(s)
    max_val = np.amax(s)

    # A constant matrix has max_val == 0 after the shift; dividing by it
    # would turn every entry into NaN, so divide by one instead
    if max_val == 0:
        max_val = np.ones_like(max_val)
    s = s / max_val

    # Swap axes so that dimensions line up with annotation
    return np.swapaxes(s, 0, 1)


# ----------------- Function Generator
def generate_transform(type):
    '''
        Returns the right transform function based on the string inputted
        If the string 'options' is inputted, it will return a dict containing
        the numbered mode names rather than a function
        Input: String naming a transformation, or 'options'
        Output: Either:
            Transform function corresponding to input
            Dict containing possible transformation names for user input
    '''
    transformations = {
        'stft': with_stft,
        'cube_root': with_cube_root,
        'autocorr': with_autocorrelation,
        'cepstrum': with_cepstrum,
        'salience': with_salience
    }

    if type == 'options':
        return dict(enumerate(transformations))

    return transformations[type]

In [None]:
#@title Original playground.py module/functions { run: "auto" }
# Colab form fields: the worker count and the three run modes, which are
# copied into num_processes / e_mode / n_mode / s_mode further down this cell
number_of_processing_threads = 4 #@param {type:"slider", min:1, max:64, step:1}
evaluation_mode_selection = "melody" #@param ["melody", "voicing", "all"]
feature_transformation_selection = "stft" #@param ["stft", "cube_root", "autocorr", "cepstrum", "salience"]
split_train_val_test_selection = "quick" #@param ["quick", "validate", "test"]

%cd /content/medleydb/

# Willie Payne
# Ana Elisa Mendez Mendez
# Run Command: python3 playground.py

# ----------------- Imports
from multiprocessing import Pool  # For parallel processing

import numpy as np  # For numerous calculations

from sklearn import svm  # Machine learning algorithms
from sklearn.ensemble import RandomForestClassifier

import librosa  # Audio processing and data conversions
from medleydb import *
# Functions that we have written
#import split  # For splitting train/validation/test
#import features  # Feature transformation methods
#import helpers as hr  # Useful helper functions
#import exporter  # For exporting data to JSON
#import evaluate  # Final step evaluation functions


# ----------------- Global Variables
# Number of parallel processes for supported code blocks
num_processes = number_of_processing_threads

# The three run modes, taken from the form fields at the top of this cell.
# They are plain module-level names, so no `global` statement is needed here
e_mode = evaluation_mode_selection  # Evaluation mode
n_mode = feature_transformation_selection  # Type of Feature transformation to use
s_mode = split_train_val_test_selection  # Type of train/test/validate split to use

# ----------------- Functions
def get_started():
    '''
        Signals that the run configuration is in place.
        The evaluation, feature and split modes are taken from the form
        fields at the top of this cell (e_mode, n_mode, s_mode), which
        replaced the interactive prompts this function used to show.
        Input: None
        Output: None, prints a confirmation message
    '''
    # The old prompt code had drifted out of the function body, leaving the
    # function empty and printing this message when the cell was defined
    # rather than when get_started() was called
    print('Ready! :)')


def normalize_all(n_func, train_or_test):
    '''
        Runs load_and_normalize on every provided mtrack using a pool of
        num_processes worker processes
        Inputs:
            Function referring to the normalization function to run
            List containing either training data or test data
        Outputs: List of dicts, one per mtrack
    '''
    jobs = []
    for mtrack in train_or_test:
        jobs.append((n_func, mtrack))

    with Pool(processes=num_processes) as workers:
        return workers.starmap(load_and_normalize, jobs)


def load_and_normalize(n_func, mtrack):
    '''
        Loads the selected audio file and runs normalization on it
        Features and annotation may differ in length by one frame, in which
        case the longer one is trimmed; any larger difference prints an
        error and quits
        Inputs:
            Normalization function to use (begins with 'with_')
            mtrack object from Medleydb
        Output: Dict containing
                    t_id: track id
                    features: 2d np.array containing normalized feature vector
                    labels: 1d np.array containing all labels
                    times: list containing times for all annotations
                    class_counts: dict counting each label
    '''
    t_id = mtrack.track_id
    # The returned sample rate is not used any further
    y, _ = librosa.load(mtrack.mix_path, res_type='kaiser_fast',
                        sr=target_sr, mono=True)  # or 'kaiser_best'

    # Each annotation row contains a time stamp and a pitch value
    time_column, pitch_column = zip(*mtrack.melody2_annotation)
    annotation = list(pitch_column)
    times = list(time_column)

    normalized = n_func(y)  # Transform to feature representation

    surplus = len(normalized) - len(annotation)
    if surplus == 1:
        normalized = normalized[:-1]  # remove extra vector from end
    elif surplus == -1:
        annotation = annotation[:-1]
        times = times[:-1]
    elif surplus != 0:  # Something really went wrong otherwise!
        print('Error! Feature vector differs in length from labels.')
        print(t_id, 'labels has size:', len(annotation))
        print(t_id, 'features has size:', len(normalized))
        quit()

    if e_mode == 'voicing':
        # Voicing only needs to know whether a frame has a pitch at all
        annotation = np.array([int(bool(v)) for v in annotation])
    else:
        annotation = hz_to_note_zeros(annotation)

    # count unique pitches/voicings
    class_counts = count_pitches(annotation)

    print('Normalized', t_id, 'with', len(normalized), 'feature vectors')
    return {'t_id': t_id, 'features': normalized,
            'labels': annotation, 'times': times,
            'class_counts': class_counts}


def only_voiced_frames(data, to_remove=None):
    '''
        Mainly used by the melody mode classification to hold voiced frames
        Inputs:
            List of dicts containing all train or test data
            Dict/list of labels to cut out, e.g. pitches too rare to classify
                in addition to '0'; defaults to just the silence label '0'
        Outputs:
            Modified data list where undesirable frames/times/classes are
            cut out
    '''
    # Build the default per call rather than in the signature, so one list
    # object is not shared between calls
    if to_remove is None:
        to_remove = ['0']

    arg_list = [(track, to_remove) for track in data]
    with Pool(processes=num_processes) as pool:
        updated_tracks = pool.starmap(keep_some_frames, arg_list)

    return updated_tracks


def train_model_svm(train_features, train_labels):
    '''
        Fits an SVM (svm.SVC with default settings) as the melody
        prediction model
        Inputs:
            2d np.array containing all feature vectors for each time
            1d np.array containing labels for all feature vectors*
            * The length of both lists must be equal
        Output: A classifier to be used for melody prediction
    '''
    classifier = svm.SVC()
    classifier.fit(train_features, train_labels)
    return classifier


def train_model_forest(train_features, train_labels):
    '''
        Fits a Random Forest Classifier as the melody prediction model,
        using num_processes parallel jobs and a fixed random seed
        Inputs:
            2d np.array containing all feature vectors for each time
            1d np.array containing labels for all feature vectors*
            * The length of both lists must be equal
        Output: A classifier to be used for melody prediction
    '''
    forest = RandomForestClassifier(max_depth=100, random_state=0,
                                    n_jobs=num_processes)
    forest.fit(train_features, train_labels)
    return forest


# TODO - Calculate accuracy per track to evaluate performance across genres
def predict(clf, all_test_data):
    '''
        Run predictions on all tracks
        Inputs:
            Classifier created by a train_model function
            List of dicts, each holding a 'features' field
        Output: The same list, with a 'guesses' field added to every dict
    '''
    for entry in all_test_data:
        entry.update(guesses=clf.predict(entry['features']))

    return all_test_data

In [None]:
# Fix long path access:
# Rebinds ntpath.realpath to ntpath.abspath for the rest of the session, so
# any later ntpath.realpath() call runs ntpath.abspath instead
# NOTE(review): presumably a workaround for a path lookup failing in one of
# the libraries used here - confirm it is still needed
import ntpath
ntpath.realpath = ntpath.abspath
# Fix long path access.

In [None]:
#@title RUN THE CODE with your selections
import os
os.chdir('/content/medleydb')
get_started()  # Choose your weapon!

# Resolve the selected mode names into the functions that implement them
e_func = generate_eval(e_mode)
n_func = generate_transform(n_mode)
s_func = generate_split(s_mode)

print('You chose to evaluate', e_mode, 'training with', n_mode,
      'using', s_mode, 'data')
print('Here we go!')

print('Splitting Train and Test Sets..........', end='')
train, test = s_func()
print('Done')

print('Extracting Training Features..........')
all_training_data = normalize_all(n_func, train)
# Pitches seen fewer than 50 times in training (and silence) get removed
to_remove, train_counts = common_pitches(all_training_data, 50)
print('Extracting Training Features..........Done')

if e_mode == 'melody':
    print('Removing Unvoiced Frames From Train..........', end='')
    all_training_data = only_voiced_frames(all_training_data, to_remove)
    print('Done')

print('Concatenating all Feature Vectors..........')
train_features = concat(all_training_data, 'features')
train_labels = concat(all_training_data, 'labels')
print('Concatenating all Feature Vectors..........Done')

print('Training  Model..........', end='')
clf = train_model_forest(train_features, train_labels)
print('Done')

print('Extracting Test Features..........')
all_test_data = normalize_all(n_func, test)
print('Extracting Test Features..........Done')

if e_mode == 'melody':
    print('Removing Unvoiced Frames From Test..........', end='')
    all_test_data = only_voiced_frames(all_test_data)
    print('Done')

print('Making Predictions..........', end='')
# Kept under its own name: binding the result to `predictions` would replace
# the export function of the same name that is called just below
predicted_tracks = predict(clf, all_test_data)
print('Done')

print('Exporting Predictions..........')
f_name = make_output_name(e_mode, n_mode, s_mode)
# The exporter code is defined in this notebook, so predictions() is called
# directly; there is no `exporter` module to go through
predictions(all_test_data, e_mode, f_name)
print('Done: Exported file as', f_name)

print('Evaluating Results..........')
test_guesses = concat(predicted_tracks, 'guesses')
test_labels = concat(predicted_tracks, 'labels')
e_func(test_guesses, test_labels)
print('Evaluating Results..........Done')

In [None]:
import matplotlib.pyplot as plt
import matplotlib.style as ms
import json
import os

# Load the predictions exported by the run above. train_test.json only holds
# the train/test id lists, not the per-track 't_id' / 'labels' / 'guesses' /
# 'times' records that are inspected here
predictions_path = os.path.join('/content/medleydb',
                                make_output_name(e_mode, n_mode, s_mode))

# Stored under a name of its own so the predictions() export function is
# not overwritten
with open(predictions_path, 'r') as file:
    exported_predictions = json.load(file)

first_track = exported_predictions[0]
print(first_track['t_id'])
print(type(first_track['labels']), len(first_track['labels']))
print(type(first_track['guesses']), len(first_track['guesses']))
print(type(first_track['times']), len(first_track['times']))