In [36]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [145]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', category=ImportWarning)

import functools
import os
import sys


import matplotlib.pyplot as plt
plt.style.use("ggplot")

import numpy as np
import seaborn as sns
import pandas as pd

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import edward2 as ed
from tensorflow.estimator.inputs import numpy_input_fn

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

tfd = tfp.distributions

## Global Defaults

In [187]:
WAV_SECONDS = 15
WAV_SAMPLE_RATE = 22050
WAV_SHAPE = [WAV_SECONDS * WAV_SAMPLE_RATE, 1]  # Time-steps X Features

LEARNING_RATE = 0.0005
BATCH_SIZE = 1
BASE_DEPTH = 12
LATENT_DIMENSIONS = 8
ACTIVATION = "leaky_relu"


MODEL_DIR="/data/tensorflow/vae_acoustic_guitar3"
DATA_DIR="{}/data".format(MODEL_DIR)
MAX_STEPS=601
VIZ_STEPS=100

## Helper Functions

In [188]:
def _softplus_inverse(x):
  """Helper which computes the function inverse of `tf.nn.softplus`."""
  return tf.log(tf.math.expm1(x))


## Encoders aka Inference Networks

In [189]:
def make_cnn_encoder(activation, latent_size, base_depth):
  """Creates the encoder function.
  Args:
    activation: Activation function in hidden layers.
    latent_size: The dimensionality of the encoding.
    base_depth: The lowest depth for a layer.
  Returns:
    encoder: A `callable` mapping a `Tensor` of images to a
      `tfd.Distribution` instance over encodings.
  """
  conv = functools.partial(
      tf.keras.layers.Conv1D, padding="CAUSAL", activation=activation)

  encoder_net = tf.keras.Sequential([
      conv(base_depth, 5, 1),
      conv(base_depth, 5, 2),
      conv(2 * base_depth, 5, 1),
      conv(2 * base_depth, 5, 2),
      conv(4 * latent_size, 3, padding="VALID"),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(2*latent_size, activation=None),
  ])

  def encoder(images):
    images = tf.reshape(images, (-1, WAV_SHAPE[0], 1))
    net = encoder_net(images)
    print("NET SHAPE: {}".format(net.shape))

    return tfd.MultivariateNormalDiag(
        loc=net[..., :latent_size],
        scale_diag=tf.nn.softplus(net[..., latent_size:] +
                                  _softplus_inverse(1.0)),
        name="code")
    """
    return tfd.Normal(
        loc=net[..., :latent_size],
        scale=tf.nn.softplus(net[..., latent_size:] + _softplus_inverse(1.0)),
        name="code")
    """

  return encoder

## Decoders aka Generative Models

In [190]:
def make_cnn_decoder(activation, latent_size, output_shape, base_depth):
  """Creates the decoder function.
  Args:
    activation: Activation function in hidden layers.
    latent_size: Dimensionality of the encoding.
    output_shape: The output image shape.
    base_depth: Smallest depth for a layer.
  Returns:
    decoder: A `callable` mapping a `Tensor` of encodings to a
      `tfd.Distribution` instance over images.
  """
  conv = functools.partial(
      tf.keras.layers.Conv1D, padding="CAUSAL", activation=activation)
    
  decoder_net = tf.keras.Sequential([
      conv(2 * base_depth, 7, padding="VALID"),
      tf.keras.layers.UpSampling1D(size=2),
      conv(2 * base_depth, 5),
      tf.keras.layers.UpSampling1D(size=2),
      conv(2 * base_depth, 5, 2),
      tf.keras.layers.UpSampling1D(size=2),
      conv(base_depth, 5),
      tf.keras.layers.UpSampling1D(size=2),
      conv(base_depth, 5, 2),
      tf.keras.layers.UpSampling1D(size=2),
      conv(base_depth, 5),
      tf.keras.layers.UpSampling1D(size=2),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(2*WAV_SHAPE[0], activation=None),
  ])

  def decoder(codes):
    original_shape = tf.shape(codes)
    codes = tf.reshape(codes, (-1, latent_size, 1))
    net = decoder_net(codes)
    print("DECODER NET SHAPE: {}".format(net.shape))
    """
    return tfd.Independent(tfd.Bernoulli(logits=logits),
                           reinterpreted_batch_ndims=len(output_shape),
                           name="image")
    """
    
    return tfd.Normal(
        loc=net[..., :WAV_SHAPE[0]],
        scale=tf.nn.softplus(net[..., WAV_SHAPE[0]:] + _softplus_inverse(1.0)),
        name="wav")

    """
    return tfd.MultivariateNormalDiag(
        loc=net[..., :WAV_SHAPE[0]],
        scale_diag=tf.nn.softplus(net[..., WAV_SHAPE[0]:] +
                                  _softplus_inverse(1.0)),
        name="wav")
    """
    

  return decoder

### Tensorflow Estimator model_fn

In [None]:
def cnn_model_fn(features, labels, mode, params, config):
  """Builds the model function for use in an estimator.
  Arguments:
    features: The input features for the estimator.
    labels: The labels, unused here.
    mode: Signifies whether it is train or test or predict.
    params: Some hyperparameters as a dictionary.
    config: The RunConfig, unused here.
  Returns:
    EstimatorSpec: A tf.estimator.EstimatorSpec instance.
  """


  encoder = make_cnn_encoder(params["activation"],
                             params["latent_size"],
                             params["base_depth"])
  decoder = make_cnn_decoder(params["activation"],
                             params["latent_size"],
                             WAV_SHAPE,
                             params["base_depth"])
  latent_prior = tfd.MultivariateNormalDiag(
        loc=tf.zeros([params["latent_size"]]),
        scale_identity_multiplier=1.0
  )

  approx_posterior = encoder(features)
  approx_posterior_sample = approx_posterior.sample(1)#params["n_samples"])
  decoder_likelihood = decoder(approx_posterior_sample)

  # `distortion` is just the negative log likelihood.
  distortion = -decoder_likelihood.log_prob(features)
  avg_distortion = tf.reduce_mean(distortion)
  tf.summary.scalar("distortion", avg_distortion)

  approx_posterior_log_prob = approx_posterior.log_prob(approx_posterior_sample)
  latent_prior_log_prob = latent_prior.log_prob(approx_posterior_sample)
  rate = (approx_posterior_log_prob -  latent_prior_log_prob)
  avg_rate = tf.reduce_mean(rate)
  tf.summary.scalar("rate", avg_rate)

  elbo_local = -(rate + distortion)
  elbo = tf.reduce_mean(elbo_local)
  #elbo = -(avg_rate + avg_distortion)
  loss = -elbo
  tf.summary.scalar("elbo", elbo)


  importance_weighted_elbo = tf.reduce_mean(
      tf.reduce_logsumexp(elbo_local, axis=0) -
      tf.log(tf.to_float(1)))
  tf.summary.scalar("elbo/importance_weighted", importance_weighted_elbo)

    
  # random_wav = decoder(latent_prior.sample(16))
  random_wav = decoder(approx_posterior.sample(16))
  tf.summary.audio("random/sample", random_wav.sample(), sample_rate=22050)
  tf.summary.audio("random/mean", random_wav.mean(), sample_rate=22050)

  # Perform variational inference by minimizing the -ELBO.
  global_step = tf.train.get_or_create_global_step()
  learning_rate = tf.train.cosine_decay(params["learning_rate"], global_step,
                                        params["max_steps"])
  tf.summary.scalar("learning_rate", learning_rate)
  optimizer = tf.train.AdamOptimizer(learning_rate)
  train_op = optimizer.minimize(loss, global_step=global_step)

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'encoded_sample': approx_posterior.sample(1), 
        'encoded_mean': approx_posterior.mean(), 
        'reconstructed_sample': decoder_likelihood.sample(1), 
        'reconstructed_mean': decoder_likelihood.mean(),
        'log_likelihood': -avg_distortion,
        'approx_posterior': approx_posterior,
        'decoder_likelihood': decoder_likelihood,
    }
  else:
    predictions = None

  return tf.estimator.EstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      eval_metric_ops={
          "elbo": tf.metrics.mean(elbo),
          "elbo/importance_weighted": tf.metrics.mean(importance_weighted_elbo),
          "rate": tf.metrics.mean(avg_rate),
          "distortion": tf.metrics.mean(avg_distortion),
      },
      predictions=predictions,
  )

## Train Model

In [192]:
def train_cnn(train_input_fn, eval_input_fn):

    params = {
        'learning_rate': LEARNING_RATE,
        'batch_size': BATCH_SIZE,
        'latent_size': LATENT_DIMENSIONS,
        'activation': ACTIVATION,
        'base_depth': BASE_DEPTH,
        'max_steps': MAX_STEPS,
    }
    params["activation"] = getattr(tf.nn, params["activation"])
    """
    if FLAGS.delete_existing and tf.gfile.Exists(MODEL_DIR):
        tf.logging.warn("Deleting old log directory at {}".format(MODEL_DIR))
        tf.gfile.DeleteRecursively(MODEL_DIR)
        tf.gfile.MakeDirs(MODEL_DIR)
    """
    
    estimator = tf.estimator.Estimator(
      cnn_model_fn,
      params=params,
      config=tf.estimator.RunConfig(
          model_dir=MODEL_DIR,
          save_checkpoints_steps=VIZ_STEPS,
      ),
    )
    for _ in range(MAX_STEPS // VIZ_STEPS):
        estimator.train(input_fn=train_input_fn, steps=VIZ_STEPS)
        estimator.evaluate(input_fn=eval_input_fn)
    return estimator

## Load Training Data

In [11]:
def load_preprocessed_data(file_name):
    import feather  # Super fast way to read/write tabular data
    training_data = feather.read_dataframe(file_name).set_index('index')
    return training_data

In [None]:
estimators = []

In [125]:
#for label in ['Acoustic_guitar', 'Clarinet', 'Flute', 'Applause', 'Laughter']:
for label in ['Clarinet', 'Flute', 'Applause', 'Laughter']:
    train_data = load_preprocessed_data('audio_train/padded_train_15s_{}_160.feather'.format(label))
    y_train = train_data['label']
    x_train = train_data.drop(['label', 'manually_verified'], axis=1)
    test_data = load_preprocessed_data('audio_train/padded_test_15s_{}.feather'.format(label))
    y_test = test_data['label']
    x_test = test_data.drop(['label', 'manually_verified'], axis=1)
    
    train_input_fn = numpy_input_fn(
        x_train.values.astype(np.float32), 
        shuffle=True, 
        batch_size=1
    )
    
    eval_input_fn = numpy_input_fn(
        x_test.values.astype(np.float32), 
        shuffle=True, 
        batch_size=1
    )
    
    estimator = train_cnn(train_input_fn, eval_input_fn)
    estimators.append((label, estimator))

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,330742,330743,330744,330745,330746,330747,330748,330749,label,manually_verified
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
de6ee8f7.wav,-0.004095,-0.005225,-0.003361913,-0.00287,-0.001796,-0.001761,-0.001772,-0.002821,-0.004164,-0.006202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,0
be7844f0.wav,-3e-05,-1.4e-05,-2.419148e-05,-4.6e-05,-4e-06,-2.3e-05,-1.8e-05,1e-05,2e-06,8e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,0
367ad7b1.wav,0.006932,0.030688,0.06090414,0.08822,0.117746,0.148811,0.173088,0.192097,0.202241,0.214423,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,1
c1316531.wav,-5e-06,7e-06,0.0002292423,-0.000129,-0.000548,2e-06,0.000246,-6.4e-05,-1.6e-05,0.000328,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,0
4a0a261e.wav,-3.2e-05,-1e-05,7.591314e-07,-1.7e-05,-4e-06,-3.3e-05,-3.2e-05,-2.6e-05,-3e-05,-3.4e-05,...,-0.018334,-0.003756,0.001596,0.002484,0.00611,0.00303,0.006669,0.0211,Acoustic_guitar,1


In [193]:
train_data = load_preprocessed_data('audio_train/padded_train_15s_Acoustic_guitar_160.feather')
y_train = train_data['label']
x_train = train_data.drop(['label', 'manually_verified'], axis=1)
train_data.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,330742,330743,330744,330745,330746,330747,330748,330749,label,manually_verified
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
de6ee8f7.wav,-0.004095,-0.005225,-0.003361913,-0.00287,-0.001796,-0.001761,-0.001772,-0.002821,-0.004164,-0.006202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,0
be7844f0.wav,-3e-05,-1.4e-05,-2.419148e-05,-4.6e-05,-4e-06,-2.3e-05,-1.8e-05,1e-05,2e-06,8e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,0
367ad7b1.wav,0.006932,0.030688,0.06090414,0.08822,0.117746,0.148811,0.173088,0.192097,0.202241,0.214423,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,1
c1316531.wav,-5e-06,7e-06,0.0002292423,-0.000129,-0.000548,2e-06,0.000246,-6.4e-05,-1.6e-05,0.000328,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,0
4a0a261e.wav,-3.2e-05,-1e-05,7.591314e-07,-1.7e-05,-4e-06,-3.3e-05,-3.2e-05,-2.6e-05,-3e-05,-3.4e-05,...,-0.018334,-0.003756,0.001596,0.002484,0.00611,0.00303,0.006669,0.0211,Acoustic_guitar,1


In [126]:
train_data.shape

(160, 330752)

In [127]:
test_data = load_preprocessed_data('audio_train/padded_test_15s_Acoustic_guitar.feather')
y_test = test_data['label']
x_test = test_data.drop(['label', 'manually_verified'], axis=1)
test_data.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,330742,330743,330744,330745,330746,330747,330748,330749,label,manually_verified
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bb54b5c9.wav,0.000886,0.000981,0.000141,-0.000155,7.5e-05,0.000193,0.000123,0.000847,0.001696,0.002418,...,-1.3e-05,-1.7e-05,-1.8e-05,-1.5e-05,-9e-06,1.9e-05,-1.3e-05,-6e-06,Acoustic_guitar,0
44fb3eeb.wav,0.040266,-0.104471,-0.224931,-0.281608,-0.24598,-0.281943,-0.306655,-0.196842,-0.08335,-0.034618,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,1
ed125cf8.wav,3e-06,-2.3e-05,-4.9e-05,-6e-06,-3.6e-05,-2.9e-05,2.9e-05,3.5e-05,-3.1e-05,-1.7e-05,...,-0.159538,-0.152375,-0.141169,-0.131866,-0.123631,-0.116339,-0.107984,-0.098099,Acoustic_guitar,1
d8a257e5.wav,-0.000527,-0.000586,-0.000373,-0.00059,-0.000463,-0.000592,-0.000681,-0.000682,-0.000362,-0.000446,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,1
75f988e6.wav,0.002974,0.003516,0.006435,0.011007,0.009738,0.01126,0.00424,0.007511,0.013776,0.019825,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Acoustic_guitar,1


In [None]:
#mport feather
#f = feather.read_dataframe('librosa_train.feather').set_index('index')
#f.loc[df['label'] == 'Acoustic_guitar']

In [128]:
test_data.shape

(20, 330752)

In [194]:
train_input_fn = numpy_input_fn(
    x_train.values.astype(np.float32), 
    shuffle=True, 
    batch_size=1
)

In [195]:
eval_input_fn = numpy_input_fn(
    x_test.values.astype(np.float32), 
    shuffle=True, 
    batch_size=1
)

In [196]:
estimator = train_cnn(train_input_fn, eval_input_fn)

INFO:tensorflow:Using config: {'_model_dir': '/data/tensorflow/vae_acoustic_guitar3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f578a2f2860>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (?, 661500)
INFO:tensorflow:Done calling model_fn.
INFO

KeyboardInterrupt: 

## Model Criticism

In [None]:
def predict_estimators(new_point, estimator_tuples):
    max_likelihood = None
    pred_label = None
    for label, estimator in estimator_tuples:
        pred = estimator.predict(new_point)
        likelihood = pred['log_likelihood']
        if max_likelihood is None or likelihood > max_likelihood:
            pred_label = label
            max_likelihood = likelihood
    return pred_label

In [84]:
predict_input_fn = numpy_input_fn(
    x_train.values.astype(np.float32), 
    shuffle=False, 
    batch_size=1
)

In [85]:
predictions = list(
    estimator.predict(input_fn=predict_input_fn, yield_single_examples=False)
)

INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (16, 661500)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tensorflow/vae3/model.ckpt-150
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [86]:
predictions[0]

{'encoded_sample': array([[[  2.603995 , -23.583975 ,  27.688742 , -16.355078 ,
            2.0609016,  36.320007 ,  26.086355 ,  -6.1149073]]],
       dtype=float32),
 'encoded_mean': array([[  2.6034892, -23.58398  ,  27.707104 , -16.35479  ,   6.4797626,
           8.231646 ,  26.08634  ,  -3.708294 ]], dtype=float32),
 'reconstructed_sample': array([[[-0.11386764, -0.18411264,  0.29934588, ...,  0.30444548,
          -0.58563066, -0.2591611 ]]], dtype=float32),
 'reconstructed_mean': array([[-0.02832055,  0.02835023, -0.03375824, ..., -0.01279413,
         -0.00240331, -0.00963796]], dtype=float32),
 'log_likelihood': -30853.375,
 'avg_distortion': 30853.375}

In [87]:
sample = x_train.iloc[0]

In [19]:
predicted_encodings = pd.DataFrame([predictions[i]['encoded_sample'][0][0] for i in range(len(predictions))])

In [20]:
y_df = y_train.to_frame()
y_df['label_cat'] = y_df['label'].astype('category')
y_df['label_int'] = y_df['label_cat'].cat.codes

In [22]:
lr = LogisticRegression()
lr.fit(predicted_encodings.values, y_df['label_int'].values)
lr.score(predicted_encodings.values, y_df['label_int'].values)

0.08658536585365853

Clearly, the latent representation is not linearly separable!

In [25]:
r = RandomForestClassifier()
r.fit(predicted_encodings.values, y_df['label_int'].values)
r.score(predicted_encodings.values, y_df['label_int'].values)

0.998780487804878

That's more like it!

### TODO repeat the tests on *OUT OF SAMPLE* data

## Data Preprocessing

In [33]:
import librosa

In [28]:
def audio_padding(descriptive_df, mode='train'):
    # '' using 15 second as our standard for padded audio files
    # '' this function only returns a list of np.arrays without any label association (for faster run time)
    names = []
    audio = []
    seconds = 15
    sample_rate = 22050
    max_len = seconds * sample_rate
    for row in descriptive_df.itertuples():
        file_path = 'audio_{}/audio_{}/{}'.format(mode, mode, row.Index)
        data = librosa.load(file_path)[0][:max_len]
        duration = data.shape[0]
        padding_len = max_len - duration
        padding = np.zeros(padding_len)
        data = np.append(data, padding)
        audio.append(data)
        names.append(row.Index)
    return pd.DataFrame(data=audio, index=names)

In [50]:
def get_df(label, count=None):
    data = descriptive_df.loc[descriptive_df.label == label]
    if count:
        data = data.sample(n=count, replace=False)
    return audio_padding(data)

In [30]:
descriptive_df = pd.read_csv('train_descriptive.csv', index_col=0).set_index('fname')

In [31]:
descriptive_df.head()

Unnamed: 0_level_0,label,manually_verified,length,mean,min,max,std,rms,skewness,kurtosis,...,mfcc_19_kurt,centroid_kurt,bandwidth_kurt,contrast_kurt,rolloff_kurt,flatness_kurt,zcr_kurt,nframes,frame_rate,duration
fname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00044347.wav,Hi-hat,0,617400.0,-2.8e-05,-0.567657,0.554474,0.036831,0.036831,-0.024415,58.395596,...,6.157727,0.15747,0.09813,0.071289,0.511719,2.018521,3.71163,617400,44100,14.0
001ca53d.wav,Saxophone,1,455112.0,-1.5e-05,-0.384308,0.479156,0.097552,0.097552,0.817412,1.961728,...,146.944106,0.021231,0.01238,0.008789,0.145996,4.477385,30.205433,455112,44100,10.32
002d256b.wav,Trumpet,0,19404.0,-3.5e-05,-0.019409,0.021149,0.002002,0.002003,0.008723,25.091358,...,10.77598,0.057926,0.025934,0.01416,0.133301,1.13151,1.722913,19404,44100,0.44
0033e230.wav,Glockenspiel,1,352800.0,-2.5e-05,-0.254669,0.24054,0.007908,0.007908,-0.019489,45.271655,...,-1.632461,0.21743,0.213069,0.047363,0.529785,0.53429,-1.671211,352800,44100,8.0
00353774.wav,Cello,1,199332.0,-1e-06,-0.39151,0.361877,0.09173,0.09173,-0.334202,1.349436,...,24.914697,0.035015,0.014186,0.0,0.154297,3.124701,28.114944,199332,44100,4.52


In [34]:
# Get a random 20 WAV files for each label
train_df = pd.concat([get_df(l, 20) for l in descriptive_df['label'].unique()])

In [35]:
train_df = train_df.join(descriptive_df[['label', 'manually_verified']])
train_df = train_df.reset_index()
train_df.columns = [str(col) for col in train_df.columns]
train_df.to_feather('padded_train_15_sample_v2.feather')

In [53]:
for label in descriptive_df['label'].unique():
    print("Prepping training set for label: {}".format(label))
    train_df = get_df(label)
    train_df = train_df.join(descriptive_df[['label', 'manually_verified']])
    train_df = train_df.reset_index()
    train_df.columns = [str(col) for col in train_df.columns]
    train_df.to_feather('audio_train/padded_train_15_{}.feather'.format(label))

Prepping training set for label: Hi-hat
Prepping training set for label: Saxophone
Prepping training set for label: Trumpet


KeyboardInterrupt: 