In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [5]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', category=ImportWarning)

import functools
import os
import sys


import matplotlib.pyplot as plt
plt.style.use("ggplot")

import numpy as np
import seaborn as sns
import pandas as pd

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import edward2 as ed
from tensorflow.estimator.inputs import numpy_input_fn

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

tfd = tfp.distributions

## Global Defaults

In [6]:
WAV_SECONDS = 15
WAV_SAMPLE_RATE = 22050
WAV_SHAPE = [WAV_SECONDS * WAV_SAMPLE_RATE, 1]  # Time-steps X Features

LEARNING_RATE = 0.0001
BATCH_SIZE = 1
BASE_DEPTH = 12
LATENT_DIMENSIONS = 8
ACTIVATION = "leaky_relu"


MODEL_DIR="/data/tensorflow/vae"
DATA_DIR="/data/tensorflow/vae/data"
MAX_STEPS=501
VIZ_STEPS=50

## Helper Functions

In [7]:
def _softplus_inverse(x):
  """Helper which computes the function inverse of `tf.nn.softplus`."""
  return tf.log(tf.math.expm1(x))


## Encoders aka Inference Networks

In [8]:
def make_cnn_encoder(activation, latent_size, base_depth):
  """Creates the encoder function.
  Args:
    activation: Activation function in hidden layers.
    latent_size: The dimensionality of the encoding.
    base_depth: The lowest depth for a layer.
  Returns:
    encoder: A `callable` mapping a `Tensor` of images to a
      `tfd.Distribution` instance over encodings.
  """
  conv = functools.partial(
      tf.keras.layers.Conv1D, padding="SAME", activation=activation)

  encoder_net = tf.keras.Sequential([
      conv(base_depth, 5, 1),
      conv(base_depth, 5, 2),
      conv(2 * base_depth, 5, 1),
      conv(2 * base_depth, 5, 2),
      conv(4 * latent_size, 3, padding="VALID"),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(2*latent_size, activation=None),
  ])

  def encoder(images):
    images = tf.reshape(images, (-1, WAV_SHAPE[0], 1))
    net = encoder_net(images)
    """
    return tfd.MultivariateNormalDiag(
        loc=net[..., :latent_size],
        scale_diag=tf.nn.softplus(net[..., latent_size:] +
                                  _softplus_inverse(1.0)),
        name="code")
    """
    return tfd.Normal(
        loc=net[..., :latent_size],
        scale=tf.nn.softplus(net[..., latent_size:] + _softplus_inverse(1.0)),
        name="code")

  return encoder

## Decoders aka Generative Models

In [9]:
def make_cnn_decoder(activation, latent_size, output_shape, base_depth):
  """Creates the decoder function.
  Args:
    activation: Activation function in hidden layers.
    latent_size: Dimensionality of the encoding.
    output_shape: The output image shape.
    base_depth: Smallest depth for a layer.
  Returns:
    decoder: A `callable` mapping a `Tensor` of encodings to a
      `tfd.Distribution` instance over images.
  """
  conv = functools.partial(
      tf.keras.layers.Conv1D, padding="SAME", activation=activation)
    
  decoder_net = tf.keras.Sequential([
      conv(2 * base_depth, 7, padding="VALID"),
      tf.keras.layers.UpSampling1D(size=2),
      conv(2 * base_depth, 5),
      tf.keras.layers.UpSampling1D(size=2),
      conv(2 * base_depth, 5, 2),
      tf.keras.layers.UpSampling1D(size=2),
      conv(base_depth, 5),
      tf.keras.layers.UpSampling1D(size=2),
      conv(base_depth, 5, 2),
      tf.keras.layers.UpSampling1D(size=2),
      conv(base_depth, 5),
      tf.keras.layers.UpSampling1D(size=2),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(2*WAV_SHAPE[0], activation=None),
  ])

  def decoder(codes):
    original_shape = tf.shape(codes)
    codes = tf.reshape(codes, (-1, latent_size, 1))
    net = decoder_net(codes)
    """
    return tfd.Independent(tfd.Bernoulli(logits=logits),
                           reinterpreted_batch_ndims=len(output_shape),
                           name="image")
    """
    return tfd.Normal(
        loc=net[..., :WAV_SHAPE[0]],
        scale=tf.nn.softplus(net[..., WAV_SHAPE[0]:] + _softplus_inverse(1.0)),
        name="wav")

  return decoder

### Tensorflow Estimator model_fn

In [10]:
def cnn_model_fn(features, labels, mode, params, config):
  """Builds the model function for use in an estimator.
  Arguments:
    features: The input features for the estimator.
    labels: The labels, unused here.
    mode: Signifies whether it is train or test or predict.
    params: Some hyperparameters as a dictionary.
    config: The RunConfig, unused here.
  Returns:
    EstimatorSpec: A tf.estimator.EstimatorSpec instance.
  """


  encoder = make_cnn_encoder(params["activation"],
                             params["latent_size"],
                             params["base_depth"])
  decoder = make_cnn_decoder(params["activation"],
                             params["latent_size"],
                             WAV_SHAPE,
                             params["base_depth"])
  latent_prior = tfd.MultivariateNormalDiag(
        loc=tf.zeros([params["latent_size"]]),
        scale_identity_multiplier=1.0
  )

  approx_posterior = encoder(features)
  approx_posterior_sample = approx_posterior.sample(1)#params["n_samples"])
  decoder_likelihood = decoder(approx_posterior_sample)

  # `distortion` is just the negative log likelihood.
  distortion = -decoder_likelihood.log_prob(features)
  avg_distortion = tf.reduce_mean(distortion)
  tf.summary.scalar("distortion", avg_distortion)

  rate = (approx_posterior.log_prob(approx_posterior_sample)
        - latent_prior.log_prob(approx_posterior_sample))
  avg_rate = tf.reduce_mean(rate)
  tf.summary.scalar("rate", avg_rate)

  #elbo_local = -(rate + distortion)

  #elbo = tf.reduce_mean(elbo_local)
  elbo = -(avg_rate + avg_distortion)
  loss = -elbo
  tf.summary.scalar("elbo", elbo)

  """
  importance_weighted_elbo = tf.reduce_mean(
      tf.reduce_logsumexp(elbo_local, axis=0) -
      tf.log(tf.to_float(params["n_samples"])))
  tf.summary.scalar("elbo/importance_weighted", importance_weighted_elbo)
  """
    
  random_wav = decoder(latent_prior.sample(16))
  tf.summary.audio("random/sample", random_wav.sample(), sample_rate=22050)
  tf.summary.audio("random/mean", random_wav.mean(), sample_rate=22050)

  # Perform variational inference by minimizing the -ELBO.
  global_step = tf.train.get_or_create_global_step()
  learning_rate = tf.train.cosine_decay(params["learning_rate"], global_step,
                                        params["max_steps"])
  tf.summary.scalar("learning_rate", learning_rate)
  optimizer = tf.train.AdamOptimizer(learning_rate)
  train_op = optimizer.minimize(loss, global_step=global_step)

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'encoded_sample': approx_posterior.sample(1), 
        'encoded_mean': approx_posterior.mean(), 
        'reconstructed_sample': decoder_likelihood.sample(1), 
        'reconstructed_mean': decoder_likelihood.mean(),
    }
  else:
    predictions = None

  return tf.estimator.EstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      eval_metric_ops={
          "elbo": tf.metrics.mean(elbo),
          #"elbo/importance_weighted": tf.metrics.mean(importance_weighted_elbo),
          "rate": tf.metrics.mean(avg_rate),
          "distortion": tf.metrics.mean(avg_distortion),
      },
      predictions=predictions,
  )

## Load Training Data

In [11]:
def load_preprocessed_data(file_name):
    import feather  # Super fast way to read/write tabular data
    training_data = feather.read_dataframe(file_name).set_index('index')
    return training_data

In [12]:
data = load_preprocessed_data('padded_train_15_sample.feather')
y_train = data['label']
x_train = data.drop(['label', 'manually_verified'], axis=1)
data.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,330742,330743,330744,330745,330746,330747,330748,330749,label,manually_verified
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30114da8.wav,-0.002574,0.006757,-0.00687,0.009043,-0.008641,0.013116,0.004403,-0.032082,0.020017,-0.009521,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hi-hat,1
288d0dff.wav,0.002158,-0.000712,0.004873,-0.001874,0.005291,0.001182,0.001999,0.003193,0.001588,0.006986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hi-hat,1
0ef9a602.wav,-0.074835,0.162536,-0.357083,0.246719,-0.096885,-0.116554,0.189366,-0.277244,0.122169,0.095103,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hi-hat,1
f6a82a6c.wav,-0.002069,0.003097,0.000513,0.005921,0.002216,0.001272,0.006814,0.011007,0.003781,0.004145,...,-0.007527,-0.006673,-0.008705,-0.012213,-0.013543,-0.010741,-0.006116,-0.002419,Hi-hat,0
f01d4739.wav,-3.4e-05,-1.9e-05,-1.8e-05,-8.4e-05,1.6e-05,-3.8e-05,0.000111,5.1e-05,-0.000261,-0.000125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hi-hat,0


In [16]:
train_input_fn = numpy_input_fn(
    x_train.values.astype(np.float32), 
    shuffle=True, 
    batch_size=1
)

## Train Model

In [14]:
def train_cnn(input_fn):

    params = {
        'learning_rate': LEARNING_RATE,
        'batch_size': BATCH_SIZE,
        'latent_size': LATENT_DIMENSIONS,
        'activation': ACTIVATION,
        'base_depth': BASE_DEPTH,
        'max_steps': MAX_STEPS,
    }
    params["activation"] = getattr(tf.nn, params["activation"])
    """
    if FLAGS.delete_existing and tf.gfile.Exists(MODEL_DIR):
        tf.logging.warn("Deleting old log directory at {}".format(MODEL_DIR))
        tf.gfile.DeleteRecursively(MODEL_DIR)
        tf.gfile.MakeDirs(MODEL_DIR)
    """
    
    estimator = tf.estimator.Estimator(
      cnn_model_fn,
      params=params,
      config=tf.estimator.RunConfig(
          model_dir=MODEL_DIR,
          save_checkpoints_steps=VIZ_STEPS,
      ),
    )
    for _ in range(MAX_STEPS // VIZ_STEPS):
        estimator.train(input_fn=input_fn, steps=VIZ_STEPS)
    return estimator

In [15]:
estimator = train_cnn(train_input_fn)

INFO:tensorflow:Using config: {'_model_dir': '/data/tensorflow/vae', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 50, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f58967b54a8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tenso

## Model Criticism

In [17]:
predict_input_fn = numpy_input_fn(
    x_train.values.astype(np.float32), 
    shuffle=False, 
    batch_size=1
)

In [18]:
predictions = list(
    estimator.predict(input_fn=predict_input_fn, yield_single_examples=False)
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /data/tensorflow/vae/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [19]:
predicted_encodings = pd.DataFrame([predictions[i]['encoded_sample'][0][0] for i in range(len(predictions))])

In [20]:
y_df = y_train.to_frame()
y_df['label_cat'] = y_df['label'].astype('category')
y_df['label_int'] = y_df['label_cat'].cat.codes

In [22]:
lr = LogisticRegression()
lr.fit(predicted_encodings.values, y_df['label_int'].values)
lr.score(predicted_encodings.values, y_df['label_int'].values)

0.08658536585365853

Clearly, the latent representation is not linearly separable!

In [25]:
r = RandomForestClassifier()
r.fit(predicted_encodings.values, y_df['label_int'].values)
r.score(predicted_encodings.values, y_df['label_int'].values)

0.998780487804878

That's more like it!

### TODO repeat the tests on *OUT OF SAMPLE* data

## Data Preprocessing

In [None]:
def audio_padding(descriptive_df, mode='train'):
    # '' using 15 second as our standard for padded audio files
    # '' this function only returns a list of np.arrays without any label association (for faster run time)
    names = []
    audio = []
    seconds = 15
    sample_rate = 22050
    max_len = seconds * sample_rate
    for row in descriptive_df.itertuples():
        file_path = 'audio_{}/audio_{}/{}'.format(mode, mode, row.Index)
        data = librosa.load(file_path)[0][:max_len]
        duration = data.shape[0]
        padding_len = max_len - duration
        padding = np.zeros(padding_len)
        data = np.append(data, padding)
        audio.append(data)
        names.append(row.Index)
    return pd.DataFrame(data=audio, index=names)

In [None]:
def get_df(label, count):
    return audio_padding(descriptive_df.loc[descriptive_df.label == label].sample(n=count, replace=False))

In [None]:
descriptive_df = pd.read_csv('train_descriptive.csv', index_col=0).set_index('fname')

In [None]:
descriptive_df.head()

In [None]:
# Get a random 20 WAV files for each label
train_df = pd.concat([get_df(l, 20) for l in descriptive_df['label'].unique()])

In [None]:
train_df = train_df.join(descriptive_df[['label', 'manually_verified']])
train_df = train_df.reset_index()
train_df.columns = [str(col) for col in train_df.columns]
train_df.to_feather('padded_train_15_sample.feather')