In [7]:
import json
import os
import wave

import numpy as np
import pandas as pd

In [9]:
meta_fpath = 'landing/nsynth-test/examples.json'

# Use a context manager so the file handle is closed as soon as the JSON is parsed.
with open(meta_fpath) as meta_file:
    meta = json.load(meta_file)

# pd.DataFrame(meta) puts the top-level JSON keys on the columns; transpose so
# that each top-level key (a note id) becomes one row.
meta_df = pd.DataFrame(meta).T
display(meta_df.head())

Unnamed: 0,qualities,pitch,note,instrument_source_str,velocity,instrument_str,instrument,sample_rate,qualities_str,instrument_source,note_str,instrument_family,instrument_family_str
bass_synthetic_068-049-025,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",49,217499,synthetic,25,bass_synthetic_068,656,16000,[dark],2,bass_synthetic_068-049-025,0,bass
keyboard_electronic_001-021-127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",21,299359,electronic,127,keyboard_electronic_001,40,16000,[],1,keyboard_electronic_001-021-127,4,keyboard
guitar_acoustic_010-066-100,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",66,72288,acoustic,100,guitar_acoustic_010,219,16000,[],0,guitar_acoustic_010-066-100,3,guitar
reed_acoustic_037-068-127,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",68,22259,acoustic,127,reed_acoustic_037,387,16000,[reverb],0,reed_acoustic_037-068-127,7,reed
flute_acoustic_002-077-100,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",77,72001,acoustic,100,flute_acoustic_002,86,16000,[reverb],0,flute_acoustic_002-077-100,2,flute


In [62]:
def sample_rows(meta_df, n, random_state=None):
    """
    Sample n rows from the meta dataframe, each from a different instrument family.

    One row is drawn at random from every distinct ``instrument_family`` value,
    and n of those per-family rows are then kept at random.

    Parameters
    ----------
    meta_df : pd.DataFrame
        Metadata table; must contain an ``instrument_family`` column.
    n : int
        Number of rows to return. Must not exceed the number of distinct families.
    random_state : int, np.random.RandomState or None, optional
        Seed or random state for reproducible sampling. ``None`` (the default)
        draws fresh randomness on every call.

    Returns
    -------
    pd.DataFrame
        n rows of ``meta_df`` with a two-level index
        ``(instrument_family, original index label)``.

    Raises
    ------
    ValueError
        If n is larger than the number of distinct instrument families.
    """
    # Share one random state between both sampling steps so that a single seed
    # fixes the whole draw without giving every family the same seed.
    if random_state is None or isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    n_families = meta_df['instrument_family'].nunique()
    if n > n_families:
        raise ValueError(
            f'Cannot sample {n} rows from only {n_families} instrument families'
        )

    # groupby(...).sample avoids groupby(...).apply, whose handling of the
    # grouping column is deprecated in recent pandas releases.
    one_per_family = meta_df.groupby('instrument_family').sample(n=1, random_state=rng)

    # Rebuild the (family, original label) index that the apply-based version produced.
    one_per_family.index = pd.MultiIndex.from_arrays(
        [one_per_family['instrument_family'].to_numpy(), one_per_family.index],
        names=['instrument_family', one_per_family.index.name],
    )
    return one_per_family.sample(n, random_state=rng)

# Example usage
# Draw n rows from the metadata table and render the result in the notebook.
n = 5
sampled_rows = sample_rows(meta_df, n)
display(sampled_rows)


Unnamed: 0_level_0,Unnamed: 1_level_0,qualities,pitch,note,instrument_source_str,velocity,instrument_str,instrument,sample_rate,qualities_str,instrument_source,note_str,instrument_family,instrument_family_str
instrument_family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8,string_acoustic_056-047-050,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0]",47,115180,acoustic,50,string_acoustic_056,436,16000,"[fast_decay, reverb]",0,string_acoustic_056-047-050,8,string
10,vocal_synthetic_003-036-127,"[1, 0, 1, 0, 1, 0, 1, 0, 0, 0]",36,49889,synthetic,127,vocal_synthetic_003,37,16000,"[bright, distortion, long_release, nonlinear_env]",2,vocal_synthetic_003-036-127,10,vocal
7,reed_acoustic_018-062-025,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",62,20713,acoustic,25,reed_acoustic_018,183,16000,[],0,reed_acoustic_018-062-025,7,reed
2,flute_acoustic_002-069-025,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0]",69,15522,acoustic,25,flute_acoustic_002,86,16000,"[long_release, reverb]",0,flute_acoustic_002-069-025,2,flute
0,bass_synthetic_134-057-025,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0]",57,186082,synthetic,25,bass_synthetic_134,914,16000,"[dark, reverb]",2,bass_synthetic_134-057-025,0,bass


In [70]:
from pydub import AudioSegment

def combine_wav_files(output_path, *input_paths):
    """
    Combine multiple WAV files into one such that they play simultaneously.

    Parameters
    ----------
    output_path : str
        Currently unused: nothing is written to disk here. It is kept so that
        existing callers keep working; the caller exports the returned segment.
    *input_paths : str
        Paths of the WAV files to mix. At least one is required.

    Returns
    -------
    AudioSegment
        The first file with every other file overlaid on top of it.

    Raises
    ------
    ValueError
        If no input paths are given.

    Notes
    -----
    NOTE(review): per pydub's documentation, ``overlay`` keeps the length of the
    base segment, so clips longer than the first one would be cut short —
    confirm that all inputs have the same duration.
    """
    if not input_paths:
        raise ValueError('combine_wav_files requires at least one input path')

    first_path, *other_paths = input_paths

    # The first file is the base track that the remaining files are mixed onto.
    combined = AudioSegment.from_wav(first_path)
    for path in other_paths:
        combined = combined.overlay(AudioSegment.from_wav(path))

    # Hand the mix back to the caller; saving it is the caller's job.
    return combined

In [71]:
def generate_training_sample(meta_df, n, input_dir='landing/nsynth-test/audio'):
    """
    Generate a training sample by combining n random rows from the meta dataframe.

    Parameters
    ----------
    meta_df : pd.DataFrame
        Metadata table passed through to ``sample_rows``; its ``note_str``
        column is used to build the audio file names.
    n : int
        Number of notes to mix together.
    input_dir : str, optional
        Directory holding the ``<note_str>.wav`` files. Defaults to the
        location that was previously hard-coded here.

    Returns
    -------
    tuple
        ``(note_ids, combined_wav)``: the list of sampled ``note_str`` values
        and the mixed audio returned by ``combine_wav_files``.
    """
    sampled_rows = sample_rows(meta_df, n)
    note_ids = sampled_rows['note_str'].tolist()
    input_paths = [f'{input_dir}/{note_id}.wav' for note_id in note_ids]

    # combine_wav_files requires an output path argument, but the mix is only
    # returned here; writing it to disk is left to the caller.
    output_path = 'raw/combined.wav'
    combined_wav = combine_wav_files(output_path, *input_paths)
    return note_ids, combined_wav

N = 100  # Number of training samples
n = 3  # Number of instruments per sample

# Make sure the output directory exists before the first export.
output_dir = 'raw'
os.makedirs(output_dir, exist_ok=True)

# Collect the label rows in a list and build the DataFrame once at the end,
# rather than growing it one row at a time.
records = []
for i in range(N):
    note_ids, combined_wav = generate_training_sample(meta_df, n)
    combined_wav.export(f'{output_dir}/{i}.wav', format='wav')
    # The sample id doubles as the WAV file name (without shadowing builtin `id`).
    records.append([i] + note_ids)

train_df = pd.DataFrame(
    records,
    columns=['id'] + [f'instrument_{k}' for k in range(1, n + 1)],
)

In [72]:
# Show the label table: one row per generated mix, holding its id and the
# note ids of the instruments that were combined.
train_df

Unnamed: 0,id,instrument_1,instrument_2,instrument_3
0,0,string_acoustic_014-061-100,organ_electronic_057-063-127,keyboard_electronic_003-049-100
1,1,keyboard_electronic_078-080-100,flute_acoustic_002-087-127,guitar_acoustic_015-021-127
2,2,vocal_synthetic_003-099-100,keyboard_electronic_002-075-100,reed_acoustic_037-052-025
3,3,brass_acoustic_006-079-050,mallet_acoustic_062-071-100,flute_synthetic_000-039-100
4,4,reed_acoustic_037-050-127,organ_electronic_104-074-075,keyboard_electronic_001-068-075
...,...,...,...,...
95,95,mallet_acoustic_056-119-025,flute_synthetic_000-095-050,brass_acoustic_006-068-127
96,96,organ_electronic_104-066-127,guitar_acoustic_010-070-025,bass_electronic_025-023-100
97,97,bass_synthetic_098-078-075,brass_acoustic_059-051-025,organ_electronic_007-009-050
98,98,keyboard_electronic_003-031-025,bass_electronic_018-037-050,vocal_synthetic_003-068-100


In [None]:
# Return the max amplitude within each bin; consecutive entries of `bins` are the bin boundaries.
def get_amplitude_in_frequency_bins(wav, bins):
    """
    Return an array of the max amplitude within binned frequency ranges.

    Parameters
    ----------
    wav : sequence or np.ndarray
        Values to bin. It is sliced directly with the bin boundaries.
        NOTE(review): this presumably expects a magnitude spectrum, so that
        positions correspond to frequencies — confirm against the caller.
    bins : sequence of int
        Boundary indices; bin i covers ``wav[bins[i]:bins[i + 1]]``.

    Returns
    -------
    np.ndarray
        Float array of length ``len(bins) - 1`` with the maximum of each bin.
        A bin whose slice is empty yields 0.0, and fewer than two boundaries
        yield an empty array.
    """
    n_bins = max(len(bins) - 1, 0)
    amplitudes = np.zeros(n_bins)
    for i in range(n_bins):
        segment = np.asarray(wav[bins[i]:bins[i + 1]])
        # An empty slice has no maximum; leave the pre-filled 0.0 in place.
        if segment.size:
            amplitudes[i] = segment.max()
    return amplitudes