In [18]:
import io
import os
import tarfile

import librosa
import numpy as np
import soundfile as sf

### CONFIGURATION VARIABLES
### Set the NSYNTH_TRAIN_PATH environment variable to point at your local copy of the
### archive; the hard-coded default below is only used when the variable is not set.
PATH = os.environ.get(
    "NSYNTH_TRAIN_PATH",
    "C:/Users/alexc/Downloads/nsynth-train.jsonwav.tar.gz",
)

# Known target names. The position of a name in this list is the integer label
# that get_files yields for it, so the order must stay stable.
targets_list = [
    "bass", "brass", "flute", "guitar", "keyboard",
    "mallet", "organ", "reed", "string", "synth", "vocal"
]

# recover wav files from tarball
def get_files(PATH=PATH, num_files=100):
    """Lazily yield decoded wav records from a gzip-compressed tar archive.

    Members are read sequentially. Anything whose name does not end in
    ``.wav`` is skipped and does not count towards ``num_files``.

    :param PATH: path to compressed dataset file
    :param num_files: maximum number of wav files to yield
    :returns: one generator of ``(data, sr, target, target_index)`` tuples:
            data and sr (sample rate) as returned by ``sf.read``, the target name
            (e.g. 'guitar', 'mallet', etc.) and its index in ``targets_list``
    :raises ValueError: if the target parsed from a file name is not in ``targets_list``
    """

    # open the tar file
    with tarfile.open(PATH, 'r:gz') as tar:

        # Number of wav records yielded so far.
        yielded = 0
        while yielded < num_files:
            member = tar.next()

            # Stop when the archive has no more members.
            if member is None:
                break

            # Only wav members are of interest.
            if not member.name.endswith(".wav"):
                continue

            # extractfile() returns None for members that carry no file data.
            handle = tar.extractfile(member)
            if handle is None:
                continue

            # Decode the raw bytes into samples and a sample rate.
            data, sr = sf.read(io.BytesIO(handle.read()))

            # The target is the first '_'-separated token of the file's base name.
            # Using the last path component (instead of a fixed depth) keeps this
            # working whatever directory layout the archive uses.
            base_name = member.name.rsplit('/', 1)[-1]
            target = base_name.split('_')[0]

            yield data, sr, target, targets_list.index(target)
            yielded += 1

# collect raw data into an array
def get_dataset(num_files=10):
    """Collect up to ``num_files`` wav records from the archive at ``PATH`` into a list.

    :param num_files: maximum number of wav files to read
    :returns: list of the ``(data, sr, target, target_index)`` tuples yielded by
            ``get_files``; shorter than ``num_files`` if the archive holds fewer wav files
    """
    # list() drains the generator and stops cleanly once it is exhausted. Calling
    # next() a fixed number of times raised StopIteration on a short archive and
    # left the generator suspended inside its `with tarfile.open(...)` block.
    return list(get_files(PATH, num_files))

# calculate mfccs
def get_mfccs(data_tuple):
    """Compute the MFCCs for a single dataset record.

    :param data_tuple: record of the form (data, sr, target, target_index);
            only the first two entries are read
    :returns: the value returned by ``librosa.feature.mfcc`` for that audio
    """
    samples, sample_rate = data_tuple[0], data_tuple[1]
    return librosa.feature.mfcc(y=samples, sr=sample_rate)

# prepare the data for input to model
def prepare_data(dataset):
    """Get the mfccs for each record in the dataset and the associated target values

    :param dataset: list of (data, sr, target, target_index) tuples, as built by get_dataset
    :returns: X, an array holding the mfcc of every record, and t, an array of the
            matching target indices
    """

    # get_mfccs expects the whole record tuple and picks out data and sr itself.
    # Passing record[0] (the raw samples) made it use two sample values as y and sr.
    X = np.array([get_mfccs(record) for record in dataset])
    t = np.array([record[3] for record in dataset])

    return X, t

In [19]:
# Load up to 100 wav records from the archive configured in PATH.
dataset = get_dataset(num_files=100)

In [27]:
# Inspect the loaded records; each one is a (data, sr, target, target_index) tuple.
# NOTE(review): this displays every record — consider showing a slice such as dataset[:3].
dataset

[(array([-0.00012207, -0.00445557, -0.00973511, ...,  0.        ,
          0.        ,  0.        ]),
  16000,
  'mallet',
  5),
 (array([ 0.09405518,  0.48147583, -0.1960144 , ...,  0.        ,
          0.        ,  0.        ]),
  16000,
  'bass',
  0),
 (array([ 0.01577759,  0.00390625, -0.02041626, ...,  0.        ,
          0.        ,  0.        ]),
  16000,
  'mallet',
  5),
 (array([0., 0., 0., ..., 0., 0., 0.]), 16000, 'keyboard', 4),
 (array([0., 0., 0., ..., 0., 0., 0.]), 16000, 'string', 8),
 (array([0., 0., 0., ..., 0., 0., 0.]), 16000, 'keyboard', 4),
 (array([0., 0., 0., ..., 0., 0., 0.]), 16000, 'brass', 1),
 (array([ 0.        , -0.00094604, -0.00268555, ...,  0.        ,
          0.        ,  0.        ]),
  16000,
  'reed',
  7),
 (array([ 0.00112915, -0.0015564 ,  0.00216675, ...,  0.        ,
          0.        ,  0.        ]),
  16000,
  'keyboard',
  4),
 (array([0., 0., 0., ..., 0., 0., 0.]), 16000, 'string', 8),
 (array([0., 0., 0., ..., 0., 0., 0.]), 1600

In [20]:
# Target index (fourth field) of every record, gathered into a NumPy array.
t = np.array([record[3] for record in dataset])

In [21]:
# Show the target-index array built from the dataset.
t

array([ 5,  0,  5,  4,  8,  4,  1,  7,  4,  8,  4,  3,  6,  9,  0,  4,  4,
        1,  4,  3,  4,  8,  2,  0,  8,  4,  6,  5,  4,  3,  7,  6,  5,  0,
        6,  0,  4,  0,  0,  5,  5,  4,  3,  1,  8,  6,  4,  3,  6,  5,  4,
        7,  6,  4,  4,  1,  5,  0,  5,  0,  8,  8,  7,  8,  4,  8,  2,  0,
        2,  3,  0,  3,  4,  1,  3,  8,  8,  4,  4,  0,  3, 10,  2,  6,  9,
        0,  0,  0,  2,  3,  5,  0,  3,  3,  4,  4,  3,  4,  4,  5])

In [22]:
from tensorflow.keras.utils import to_categorical

In [23]:
# Pin the one-hot width to the number of known classes. Without num_classes the
# width is inferred from the largest label present, so a sample that happens to
# lack the last class would produce fewer columns.
t_bin = to_categorical(t, num_classes=len(targets_list))

In [25]:
# Sanity check: one row per record, one column per class.
t_bin.shape

(100, 11)

In [41]:
# testing the pipeline
from train_data_preparation import MfccPipeline

ImportError: cannot import name 'MfccPipeline' from 'train_data_preparation' (C:\Users\alexc\Documents\GitHub\music-analysis\train_data_preparation.py)

In [39]:
# NOTE(review): `pipe` is not created in any cell visible here — presumably an
# MfccPipeline instance; confirm and define it above so the notebook runs top to bottom.
pipe.targets_list

['bass',
 'brass',
 'flute',
 'guitar',
 'keyboard',
 'mallet',
 'organ',
 'reed',
 'string',
 'synth_lead',
 'vocal']

In [31]:
# NOTE(review): the recorded run of this cell ended in "ValueError: 'synth' is not in list".
# pipe.targets_list contains 'synth_lead', so the label parsed from the file names
# presumably does not match that entry — confirm in train_data_preparation.
X_train, X_test, t_train, t_test = pipe.mfcc_pipeline(num_samples=1000)

ValueError: 'synth' is not in list