In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [4]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', category=ImportWarning)

import functools
import os
import sys


import matplotlib.pyplot as plt
plt.style.use("ggplot")

import numpy as np
import seaborn as sns
import pandas as pd

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import edward2 as ed
from tensorflow.estimator.inputs import numpy_input_fn

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

tfd = tfp.distributions

## Global Defaults

In [16]:
# --- Audio geometry ---------------------------------------------------------
# Fixed waveform length that the encoder reshapes its input to.
WAV_SAMPLE_RATE = 22050
WAV_SECONDS = 15
WAV_SHAPE = [WAV_SECONDS * WAV_SAMPLE_RATE, 1]  # Time-steps X Features

# --- Model hyper-parameters (packed into `params` by `train_cnn`) -----------
ACTIVATION = "leaky_relu"  # attribute name looked up on `tf.nn`
LATENT_DIMENSIONS = 8
BASE_DEPTH = 12
BATCH_SIZE = 1
LEARNING_RATE = 0.0005

# --- Output locations and training schedule ---------------------------------
MODEL_DIR = "/data/tensorflow/individual_vae"
DATA_DIR = "{}/data".format(MODEL_DIR)
MAX_STEPS = 40  # total training steps; also the cosine-decay horizon
VIZ_STEPS = 20  # steps per train() call; also the checkpoint interval

## Helper Functions

In [17]:
def _softplus_inverse(x):
  """Returns `y` such that `tf.nn.softplus(y) == x`, i.e. `log(exp(x) - 1)`."""
  exp_minus_one = tf.math.expm1(x)
  return tf.log(exp_minus_one)


## Encoders aka Inference Networks

In [18]:
def make_cnn_encoder(activation, latent_size, base_depth):
  """Creates the encoder (inference network) function.

  Args:
    activation: Activation function used by the convolutional layers.
    latent_size: The dimensionality of the encoding.
    base_depth: The lowest depth (filter count) for a layer.
  Returns:
    encoder: A `callable` mapping a `Tensor` of waveforms to a
      `tfd.MultivariateNormalDiag` instance over encodings.
  """
  causal_conv = functools.partial(
      tf.keras.layers.Conv1D, padding="CAUSAL", activation=activation)

  layers = [
      causal_conv(base_depth, 5, 1),
      causal_conv(base_depth, 5, 2),
      causal_conv(2 * base_depth, 5, 1),
      causal_conv(2 * base_depth, 5, 2),
      causal_conv(4 * latent_size, 3, padding="VALID"),
      tf.keras.layers.Flatten(),
      # One output per latent mean plus one per latent scale.
      tf.keras.layers.Dense(2*latent_size, activation=None),
  ]
  encoder_net = tf.keras.Sequential(layers)

  def encoder(images):
    # Force the input into [batch, time-steps, 1] before the Conv1D stack.
    images = tf.reshape(images, (-1, WAV_SHAPE[0], 1))
    net = encoder_net(images)
    print("NET SHAPE: {}".format(net.shape))

    # First half of the outputs is the mean; the second half is mapped
    # through softplus, shifted so that a raw output of 0 gives scale 1.
    loc = net[..., :latent_size]
    unconstrained_scale = net[..., latent_size:]
    scale_diag = tf.nn.softplus(unconstrained_scale + _softplus_inverse(1.0))
    return tfd.MultivariateNormalDiag(
        loc=loc,
        scale_diag=scale_diag,
        name="code")

  return encoder

## Decoders aka Generative Models

In [19]:
def make_cnn_decoder(activation, latent_size, output_shape, base_depth):
  """Creates the decoder (generative network) function.

  Args:
    activation: Activation function in hidden layers.
    latent_size: Dimensionality of the encoding.
    output_shape: The output shape. Currently unused: the output width is
      taken from the module-level `WAV_SHAPE` instead.
    base_depth: Smallest depth for a layer.
  Returns:
    decoder: A `callable` mapping a `Tensor` of encodings to a
      `tfd.Normal` instance over waveform samples.
  """
  conv = functools.partial(
      tf.keras.layers.Conv1D, padding="CAUSAL", activation=activation)

  decoder_net = tf.keras.Sequential([
      conv(2 * base_depth, 7, padding="VALID"),
      tf.keras.layers.UpSampling1D(size=2),
      conv(2 * base_depth, 5),
      tf.keras.layers.UpSampling1D(size=2),
      conv(2 * base_depth, 5, 2),
      tf.keras.layers.UpSampling1D(size=2),
      conv(base_depth, 5),
      tf.keras.layers.UpSampling1D(size=2),
      conv(base_depth, 5, 2),
      tf.keras.layers.UpSampling1D(size=2),
      conv(base_depth, 5),
      tf.keras.layers.UpSampling1D(size=2),
      tf.keras.layers.Flatten(),
      # One mean and one scale per output time-step.
      tf.keras.layers.Dense(2*WAV_SHAPE[0], activation=None),
  ])

  def decoder(codes):
    # All leading (sample / batch) dimensions are merged into one batch axis;
    # each code is fed to Conv1D as a length-`latent_size`, 1-channel sequence.
    codes = tf.reshape(codes, (-1, latent_size, 1))
    net = decoder_net(codes)
    print("DECODER NET SHAPE: {}".format(net.shape))

    # First half of the outputs is the mean; the second half is mapped
    # through softplus, shifted so that a raw output of 0 gives scale 1.
    n_steps = WAV_SHAPE[0]
    return tfd.Normal(
        loc=net[..., :n_steps],
        scale=tf.nn.softplus(net[..., n_steps:] + _softplus_inverse(1.0)),
        name="wav")

  return decoder

### TensorFlow Estimator `model_fn`

In [20]:
def cnn_model_fn(features, labels, mode, params, config):
  """Builds the VAE model function for use in an estimator.

  Arguments:
    features: The input features for the estimator; the encoder reshapes
      them to `[-1, WAV_SHAPE[0], 1]`.
    labels: The labels, unused here.
    mode: Signifies whether it is train or test or predict.
    params: Some hyperparameters as a dictionary. Keys read here are
      "activation", "latent_size", "base_depth", "learning_rate" and
      "max_steps".
    config: The RunConfig, unused here.
  Returns:
    EstimatorSpec: A tf.estimator.EstimatorSpec instance.
  """
  # Number of posterior samples drawn per example. Kept in one place because
  # the importance-weighted ELBO below must subtract log(n_samples).
  n_samples = 1

  encoder = make_cnn_encoder(params["activation"],
                             params["latent_size"],
                             params["base_depth"])
  decoder = make_cnn_decoder(params["activation"],
                             params["latent_size"],
                             WAV_SHAPE,
                             params["base_depth"])
  latent_prior = tfd.MultivariateNormalDiag(
      loc=tf.zeros([params["latent_size"]]),
      scale_identity_multiplier=1.0)

  approx_posterior = encoder(features)
  approx_posterior_sample = approx_posterior.sample(n_samples)
  decoder_likelihood = decoder(approx_posterior_sample)

  # `distortion` is just the negative log likelihood.
  distortion = -decoder_likelihood.log_prob(features)
  avg_distortion = tf.reduce_mean(distortion)
  tf.summary.scalar("distortion", avg_distortion)

  # `rate` is log q(z|x) - log p(z) evaluated at the sampled code.
  approx_posterior_log_prob = approx_posterior.log_prob(approx_posterior_sample)
  latent_prior_log_prob = latent_prior.log_prob(approx_posterior_sample)
  rate = approx_posterior_log_prob - latent_prior_log_prob
  avg_rate = tf.reduce_mean(rate)
  tf.summary.scalar("rate", avg_rate)

  # NOTE(review): `rate` is per (sample, example) while `distortion` comes from
  # an element-wise `tfd.Normal`, i.e. one value per time-step. The sum below
  # therefore relies on broadcasting and appears to be shape-compatible only
  # for a batch size of 1 -- confirm before raising BATCH_SIZE.
  elbo_local = -(rate + distortion)
  elbo = tf.reduce_mean(elbo_local)
  loss = -elbo
  tf.summary.scalar("elbo", elbo)

  importance_weighted_elbo = tf.reduce_mean(
      tf.reduce_logsumexp(elbo_local, axis=0) -
      tf.log(tf.to_float(n_samples)))
  tf.summary.scalar("elbo/importance_weighted", importance_weighted_elbo)

  # Decode 16 draws from the prior so generated audio can be inspected.
  random_wav = decoder(latent_prior.sample(16))
  tf.summary.audio("random/sample", random_wav.sample(),
                   sample_rate=WAV_SAMPLE_RATE)
  tf.summary.audio("random/mean", random_wav.mean(),
                   sample_rate=WAV_SAMPLE_RATE)

  # Perform variational inference by minimizing the -ELBO.
  global_step = tf.train.get_or_create_global_step()
  learning_rate = tf.train.cosine_decay(params["learning_rate"], global_step,
                                        params["max_steps"])
  tf.summary.scalar("learning_rate", learning_rate)
  optimizer = tf.train.AdamOptimizer(learning_rate)
  train_op = optimizer.minimize(loss, global_step=global_step)

  predictions = None
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        # A fresh posterior draw, independent of the one decoded above.
        'encoded_sample': approx_posterior.sample(1),
        'encoded_mean': approx_posterior.mean(),
        'reconstructed_sample': decoder_likelihood.sample(1),
        'reconstructed_mean': decoder_likelihood.mean(),
        # Mean log-likelihood over every element of `distortion`.
        'log_likelihood': -avg_distortion,
        'random_wav': random_wav.mean(),
    }

  return tf.estimator.EstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      eval_metric_ops={
          "elbo": tf.metrics.mean(elbo),
          "elbo/importance_weighted": tf.metrics.mean(importance_weighted_elbo),
          "rate": tf.metrics.mean(avg_rate),
          "distortion": tf.metrics.mean(avg_distortion),
      },
      predictions=predictions,
  )

## Train Model

In [21]:
def train_cnn(train_input_fn, eval_input_fn=None, model_dir=MODEL_DIR):
    """Trains a CNN VAE estimator for `MAX_STEPS` steps and returns it.

    Training runs in chunks of at most `VIZ_STEPS` steps; after each chunk the
    model is evaluated when `eval_input_fn` is given.

    Args:
        train_input_fn: Input function passed to `estimator.train`.
        eval_input_fn: Optional input function passed to `estimator.evaluate`
            after every training chunk.
        model_dir: Directory handed to the estimator's `RunConfig`.
    Returns:
        The `tf.estimator.Estimator` after training.
    Raises:
        ValueError: If `VIZ_STEPS` is not positive.
    """
    if VIZ_STEPS <= 0:
        raise ValueError("VIZ_STEPS must be positive, got {}".format(VIZ_STEPS))

    params = {
        'learning_rate': LEARNING_RATE,
        'batch_size': BATCH_SIZE,
        'latent_size': LATENT_DIMENSIONS,
        # The model function expects the callable, not its name.
        'activation': getattr(tf.nn, ACTIVATION),
        'base_depth': BASE_DEPTH,
        'max_steps': MAX_STEPS,
    }

    estimator = tf.estimator.Estimator(
      cnn_model_fn,
      params=params,
      config=tf.estimator.RunConfig(
          model_dir=model_dir,
          save_checkpoints_steps=VIZ_STEPS,
      ),
    )

    # Walk through MAX_STEPS in VIZ_STEPS-sized chunks. The final chunk is
    # shortened so that a MAX_STEPS that is not a multiple of VIZ_STEPS still
    # trains for the full budget instead of dropping the remainder.
    steps_left = MAX_STEPS
    while steps_left > 0:
        chunk = min(VIZ_STEPS, steps_left)
        estimator.train(input_fn=train_input_fn, steps=chunk)
        if eval_input_fn:
            estimator.evaluate(input_fn=eval_input_fn)
        steps_left -= chunk
    return estimator

## Load Training Data

In [11]:
def load_preprocessed_data(file_name):
    """Reads a feather file and returns it as a DataFrame indexed by its 'index' column."""
    import feather  # Super fast way to read/write tabular data
    frame = feather.read_dataframe(file_name)
    return frame.set_index('index')

In [12]:
import os

In [13]:
estimators = []

In [22]:
# Train one VAE per class; each estimator gets its own checkpoint directory.
for label in ['Acoustic_guitar', 'Clarinet', 'Flute', 'Applause', 'Laughter']:
    train_data = load_preprocessed_data('audio_train/padded_train_15s_{}_160.feather'.format(label))
    y_train = train_data['label']
    x_train = train_data.drop(['label', 'manually_verified'], axis=1)
    test_data = load_preprocessed_data('audio_train/padded_test_15s_{}.feather'.format(label))
    y_test = test_data['label']
    x_test = test_data.drop(['label', 'manually_verified'], axis=1)

    train_input_fn = numpy_input_fn(
        x_train.values.astype(np.float32),
        shuffle=True,
        batch_size=BATCH_SIZE
    )

    # Evaluation averages over every example, so shuffling adds nothing.
    eval_input_fn = numpy_input_fn(
        x_test.values.astype(np.float32),
        shuffle=False,
        batch_size=BATCH_SIZE
    )
    model_dir = os.path.join(MODEL_DIR, label)
    print("Model dir: {}".format(model_dir))

    estimator = train_cnn(train_input_fn, eval_input_fn, model_dir)
    estimators.append((label, estimator))

Model dir: /data/tensorflow/individual_vae/Acoustic_guitar
INFO:tensorflow:Using config: {'_model_dir': '/data/tensorflow/individual_vae/Acoustic_guitar', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 20, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa30af4ea20>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODE

INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (16, 661500)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /data/tensorflow/individual_vae/Flute/model.ckpt.
INFO:tensorflow:loss = 0.9150141, step = 1
INFO:tensorflow:Saving checkpoints for 20 into /data/tensorflow/individual_vae/Flute/model.ckpt.
INFO:tensorflow:Loss for final step: 11.559071.
INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (16, 661500)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-28-04:39:18
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tensorflow/individual_vae/Flute/model.ckpt-20
INFO:tensorflow:Running local_init_op.
INFO:tensorfl

NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (16, 661500)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-28-04:43:20
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tensorflow/individual_vae/Laughter/model.ckpt-20
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-11-28-04:43:25
INFO:tensorflow:Saving dict for global step 20: distortion = 0.907132, elbo = -1.2392744, elbo/importance_weighted = -1.2392744, global_step = 20, loss = 1.2392744, rate = 0.33210343
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 20: /data/tensorflow/individual_vae/Laughter/model.ckpt-20
INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (16, 661500)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
I

In [None]:
import feather

In [None]:
ag = feather.read_dataframe('padded_train_sample.feather').set_index('index')

In [None]:
acoustic = ag.loc[ag['label']=='Acoustic_guitar']

In [None]:
x_train = acoustic.drop(['label', 'manually_verified'], axis=1)

In [None]:
# Load the padded Acoustic_guitar training clips, then split the label column
# from the waveform columns.
train_data = load_preprocessed_data(
    'audio_train/padded_train_15s_Acoustic_guitar_160.feather')
x_train = train_data.drop(['label', 'manually_verified'], axis=1)
y_train = train_data['label']
train_data.head()

In [None]:
train_data.shape

In [None]:
# Load the padded Acoustic_guitar test clips, then split the label column
# from the waveform columns.
test_data = load_preprocessed_data(
    'audio_train/padded_test_15s_Acoustic_guitar.feather')
x_test = test_data.drop(['label', 'manually_verified'], axis=1)
y_test = test_data['label']
test_data.head()

In [None]:
#import feather
#f = feather.read_dataframe('librosa_train.feather').set_index('index')
#f.loc[f['label'] == 'Acoustic_guitar']

In [None]:
test_data.shape

In [None]:
# Shuffled, one-clip-at-a-time input pipeline over the training waveforms.
train_features = x_train.values.astype(np.float32)
train_input_fn = numpy_input_fn(train_features, shuffle=True, batch_size=1)

In [None]:
# Shuffled, one-clip-at-a-time input pipeline over the test waveforms.
eval_features = x_test.values.astype(np.float32)
eval_input_fn = numpy_input_fn(eval_features, shuffle=True, batch_size=1)

In [None]:
estimator = train_cnn(train_input_fn, None)#eval_input_fn)

## Model Criticism

In [35]:
import numpy as np

In [39]:
# Select one test clip as a (1, n_samples) float32 batch.
# NOTE(review): this cell used to index with a leftover variable `i` that no
# earlier cell defines, so it failed on a fresh kernel. Row 0 is an explicit
# stand-in -- change `example_row` to inspect a different clip.
example_row = 0
ex = x_test.iloc[[example_row]].values.astype(np.float32)

In [45]:
# Deterministic single-example input pipeline for prediction.
predict_input_fn = numpy_input_fn(ex, shuffle=False, batch_size=1)

In [48]:
list(estimators[0][1].predict(predict_input_fn, yield_single_examples=False))

INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (16, 661500)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tensorflow/individual_vae/Acoustic_guitar/model.ckpt-40
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'encoded_sample': array([[[-0.6095326 , -0.652659  , -0.69579613,  0.29838806,
            0.25552207,  0.09149103, -1.6592124 ,  0.29482937]]],
        dtype=float32),
  'encoded_mean': array([[-0.10565343, -0.07591414,  0.01060685,  0.03267853,  0.16968602,
           0.14862823, -0.07775313,  0.05574958]], dtype=float32),
  'reconstructed_sample': array([[[-0.37584585, -2.3290699 , -0.978498  , ..., -0.11236154,
           -0.7556611 ,  0.48820338]]], dtype=float32),
  'reconstructed_mean': array([[ 0.00229887, -0.00315949, -0.00090564, ...,  0.00077478,
          -0.00399355, -0.00254196]], dtype=float32),
  'log_likelihood': -0.86055076,
  'random_wav': array([[ 2.0726852e-03, -3.2971250e-03, -1.3523323e-03, ...,
           8.1298471e-04, -3.7084175e-03, -2.5208769e-03],
         [ 2.3172922e-03, -3.6662705e-03, -1.1740256e-03, ...,
           1.0616041e-03, -4.3817814e-03, -2.5132145e-03],
         [ 2.0424835e-03, -3.9937277e-03, -1.8210706e-03, ...,
           1.3002637e-03, 

In [44]:
x_test.shape

(20, 330750)

In [43]:
ex.shape

(1, 330750)

In [40]:
# Single-example, unshuffled input pipeline built from `ex`.
# (`false` is not a Python name; the boolean literal is `False`.)
eval_input_fn = numpy_input_fn(
    ex,
    shuffle=False,
    batch_size=1
)

In [51]:
def predict_estimators(new_point, estimator_tuples):
    """Returns the label whose estimator scores `new_point` highest.

    Each estimator's `predict` output is expected to contain a
    'log_likelihood' entry; the label with the largest value wins, and the
    first one wins on ties.

    Args:
        new_point: 1-D array-like with the feature values of one example.
        estimator_tuples: Iterable of `(label, estimator)` pairs.
    Returns:
        The winning label, or `None` when `estimator_tuples` is empty.
    """
    # Build the batch from the argument itself. Previously the function
    # ignored `new_point` and read the module-level `x_test` / `i` instead.
    ex = np.asarray(new_point, dtype=np.float32).reshape(1, -1)
    predict_input_fn = numpy_input_fn(
        ex,
        shuffle=False,
        batch_size=1
    )

    max_likelihood = None
    pred_label = None
    for label, estimator in estimator_tuples:
        pred = list(estimator.predict(predict_input_fn, yield_single_examples=False))[0]
        likelihood = pred['log_likelihood']
        if max_likelihood is None or likelihood > max_likelihood:
            pred_label = label
            max_likelihood = likelihood
    return pred_label

In [None]:
# Classify every held-out clip with the per-class estimators and keep a
# running tally of hits and misses.
pred_labels = []
correct = 0
incorrect = 0
for label in ['Acoustic_guitar', 'Clarinet', 'Flute', 'Applause', 'Laughter']:
    test_data = load_preprocessed_data('audio_train/padded_test_15s_{}.feather'.format(label))
    x_test = test_data.drop(['label', 'manually_verified'], axis=1)
    y_test = test_data['label']
    for i in range(x_test.shape[0]):
        pred_label = predict_estimators(x_test.iloc[i].values, estimators)
        pred_labels.append(pred_label)
        if pred_label != y_test.iloc[i]:
            incorrect += 1
        else:
            correct += 1
        print("CORRECT: {}, INCORRECT: {}, LABEL: {}, PRED_LABEL: {}".format(correct, incorrect, label, pred_label))

INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (16, 661500)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tensorflow/individual_vae/Acoustic_guitar/model.ckpt-40
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (16, 661500)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tensorflow/individual_vae/Clarinet/model.ckpt-40
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling model_fn.
NET SHAPE: (?, 16)
DECODER NET SHAPE: (?, 661500)
DECODER NET SHAPE: (16, 661500)
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tensorflo

In [55]:
pred_labels

['Flute']

In [None]:
# Deterministic, one-clip-at-a-time pipeline over the training waveforms.
predict_features = x_train.values.astype(np.float32)
predict_input_fn = numpy_input_fn(predict_features, shuffle=False, batch_size=1)

In [None]:
predictions = list(
    estimator.predict(input_fn=predict_input_fn, yield_single_examples=False)
)

In [None]:
predictions[0]['random_wav'][0].shape

In [None]:
import librosa

In [None]:
# Write the first prior-decoded mean waveform of every prediction to disk.
for i, prediction in enumerate(predictions):
    sample = prediction['random_wav'][0]
    librosa.output.write_wav('generated/{}.wav'.format(i), sample, WAV_SAMPLE_RATE)

In [None]:
sample = x_train.iloc[0]

In [None]:
# One row per prediction: the first sampled latent code of each example.
predicted_encodings = pd.DataFrame(
    [prediction['encoded_sample'][0][0] for prediction in predictions]
)

In [None]:
# Encode the string labels as integer category codes for scikit-learn.
y_df = y_train.to_frame()
label_categories = y_df['label'].astype('category')
y_df['label_cat'] = label_categories
y_df['label_int'] = label_categories.cat.codes

In [None]:
# Fit a linear classifier on the latent codes and score it on the same rows
# it was fitted to (in-sample accuracy).
encodings = predicted_encodings.values
targets = y_df['label_int'].values
lr = LogisticRegression()
lr.fit(encodings, targets)
lr.score(encodings, targets)

Clearly, the latent representation is not linearly separable!

In [None]:
# Same in-sample check with a non-linear classifier.
encodings = predicted_encodings.values
targets = y_df['label_int'].values
r = RandomForestClassifier()
r.fit(encodings, targets)
r.score(encodings, targets)

That's more like it!

### TODO repeat the tests on *OUT OF SAMPLE* data

## Data Preprocessing

In [None]:
import librosa

In [None]:
def audio_padding(descriptive_df, mode='train'):
    """Loads each listed clip and pads or truncates it to 15 seconds.

    Args:
        descriptive_df: DataFrame whose index holds the audio file names.
            Only the index is read here.
        mode: Dataset split; files are read from
            'audio_<mode>/audio_<mode>/<file name>'.
    Returns:
        A DataFrame with one row per clip (indexed by file name) and one
        column per audio sample. No label columns are attached.
    """
    seconds = 15
    sample_rate = 22050
    max_len = seconds * sample_rate

    names = []
    audio = []
    for row in descriptive_df.itertuples():
        file_path = 'audio_{}/audio_{}/{}'.format(mode, mode, row.Index)
        # Request the sample rate explicitly so the clip length in samples
        # always matches `max_len`, instead of relying on librosa's default.
        data = librosa.load(file_path, sr=sample_rate)[0][:max_len]
        # Right-pad with zeros up to `max_len` (no-op for truncated clips).
        padding = np.zeros(max_len - data.shape[0])
        audio.append(np.append(data, padding))
        names.append(row.Index)
    return pd.DataFrame(data=audio, index=names)

In [None]:
def get_df(label, count=None):
    """Returns padded audio for the clips of `label`.

    When `count` is truthy, that many clips are drawn at random without
    replacement; otherwise every clip of the label is used.
    """
    matches = descriptive_df[descriptive_df['label'] == label]
    if not count:
        return audio_padding(matches)
    return audio_padding(matches.sample(n=count, replace=False))

In [None]:
descriptive_df = pd.read_csv('train_descriptive.csv', index_col=0).set_index('fname')

In [None]:
descriptive_df.head()

In [None]:
# Draw 20 random WAV files per label and stack them into one frame.
train_df = pd.concat(
    [get_df(label_name, 20) for label_name in descriptive_df['label'].unique()]
)

In [None]:
# Attach the label columns, move the file names into a regular column and
# stringify the column labels before writing the frame to feather.
train_df = (
    train_df
    .join(descriptive_df[['label', 'manually_verified']])
    .reset_index()
)
train_df.columns = train_df.columns.map(str)
train_df.to_feather('padded_train_15_sample_v2.feather')

In [None]:
# Write one padded training set per label.
# NOTE(review): the training cells read 'padded_train_15s_<label>_160.feather',
# which is not the name written here -- confirm which files are intended.
for label in descriptive_df['label'].unique():
    print("Prepping training set for label: {}".format(label))
    train_df = (
        get_df(label)
        .join(descriptive_df[['label', 'manually_verified']])
        .reset_index()
    )
    train_df.columns = train_df.columns.map(str)
    train_df.to_feather('audio_train/padded_train_15_{}.feather'.format(label))