<a href="https://colab.research.google.com/github/mikashaw/ML-Projects/blob/main/MusicNet/Autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Conv2D, ReLU, BatchNormalization, Flatten, Dense, Reshape, Conv2DTranspose, Activation, Lambda
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import mse


import numpy as np
import os
import pickle

import soundfile as sf

In [2]:
tf.compat.v1.disable_eager_execution()

In [44]:
from tensorflow.python.keras.losses import MeanSquaredError
class Autoencoder:
  """
  Autoencoder represents a Deep Convolutional autoencoder architecture with mirrored encoder and decoder components
  """

  def __init__(self, 
               input_shape,
               conv_filters,
               conv_kernels,
               conv_strides,
               latent_space_dim):
    self.input_shape = input_shape # [28, 28, 1]
    self.conv_filters = conv_filters # [2,4,8]
    self.conv_kernels = conv_kernels # [3,5,3]
    self.conv_strides = conv_strides # [1,2,2]
    self.latent_space_dim = latent_space_dim # 2
    self._shape_before_bottleneck = None
    self.reconstruction_loss_weight = 1000000

    self.encoder = None
    self.decoder = None
    self.model = None
    self.mu = None 
    self.log_variance = None

    self._num_conv_layers = len(conv_filters)
    self._model_input = None

    self._build()

  def summary(self):
    self.encoder.summary()
    self.decoder.summary()
    self.model.summary()

  def compile(self, learning_rate=0.0001):
    optimizer = Adam(learning_rate= learning_rate)
    
    self.model.compile(optimizer=optimizer, 
                       loss = self._calculate_combined_loss, 
                       metrics = ['accuracy'])#self._calculate_reconstruction_loss])
                                 # self._calculate_kl_loss])

  def train(self, x_train, batch_size, num_epochs):
    self.model.fit(x_train, 
                   x_train,
                   batch_size=batch_size,
                   epochs = num_epochs,
                   shuffle=True)
    
    
  def save(self, save_folder=os.getcwd()):
    self._create_folder_if_it_doesnt_exist(save_folder)
    self._save_parameters(save_folder)
    self._save_weights(save_folder)

  def _create_folder_if_it_doesnt_exist(self, folder):
    if not os.path.exists(os.path.join(os.getcwd(),folder)):
      os.makedirs(folder)

  def _save_parameters(self, save_folder):
    parameters = [
      self.input_shape, # [28, 28, 1]
      self.conv_filters, # [2,4,8]
      self.conv_kernels, # [3,5,3]
      self.conv_strides,# [1,2,2]
      self.latent_space_dim# 2       
    ]
    save_path = os.path.join(save_folder, "parameters.pkl")
    with open(save_path, "wb") as f:
      pickle.dump(parameters, f)

  def _save_weights(self, save_folder):
    save_path = os.path.join(save_folder, "weights.h5")
    self.model.save_weights(save_path)

  def load_weights(self, weights_path):
    self.model.load_weights(weights_path)

  @classmethod 
  def load(cls, save_folder="."):
    parameters_path = os.path.join(save_folder, "parameters.pkl")
    with open(parameters_path, "rb") as f:
      parameters = pickle.load(f)
    autoencoder = Autoencoder(*parameters)
    weights_path = os.path.join(save_folder, "weights.h5")
    autoencoder.load_weights(weights_path)
    return autoencoder

  def _calculate_reconstruction_loss(self, y_target, y_predicted):
    error = y_target - y_predicted 
    reconstruction_loss = K.mean(K.square(error), axis=[1,2,3])
    return reconstruction_loss

  def _calculate_kl_loss(self, y_target, y_predicted):
    kl_loss = -0.5 * K.sum(1+self.log_variance - K.square(self.mu) - K.exp(self.log_variance), axis= 1)
    return kl_loss

  def _calculate_combined_loss(self, y_target, y_predicted):
    reconstruction_loss = self._calculate_reconstruction_loss(y_target, y_predicted)
    kl_loss = self._calculate_kl_loss(y_target, y_predicted)
    combined_loss = self.reconstruction_loss_weight * reconstruction_loss + kl_loss 
    return combined_loss 


  def _build(self):
    self._build_encoder()
    self._build_decoder()
    self._build_autoencoder()

  def _build_autoencoder(self):
    model_input = self._model_input
    model_output = self.decoder(self.encoder(model_input))
    self.model = Model(model_input, model_output, name="Autoencoder")

  def reconstruct(self, images):
        latent_representations = self.encoder.predict(images)
        reconstructed_images = self.decoder.predict(latent_representations)
        return reconstructed_images, latent_representations

  def _build_decoder(self):
    decoder_input = self._add_decoder_input()
    dense_layer = self._add_dense_layer(decoder_input)
    reshape_layer = self._add_reshape_layer(dense_layer)
    conv_transpose_layers = self._add_conv_transpose_layers(reshape_layer)
    decoder_output = self._add_decoder_output(conv_transpose_layers)
    self.decoder = Model(decoder_input, decoder_output, name ="decoder")

  def _add_decoder_input(self):
    return Input(shape=self.latent_space_dim, name = "decoder_input")

  def _add_dense_layer(self, decoder_input):
    num_neurons = np.prod(self._shape_before_bottleneck) #[1,2,4] -> 8
    dense_layer = Dense(num_neurons, name = "decoder_dense")(decoder_input)
    return dense_layer

  def _add_reshape_layer(self, dense_layer):
    reshape_layer = Reshape(self._shape_before_bottleneck)(dense_layer)
    return reshape_layer

  def _add_conv_transpose_layers(self, x):
    """Add conv transpose blocks"""
    # loop through all the conv later in reverse order and stop at the first layer
    for layer_index in reversed(range(1, self._num_conv_layers)):
      # [0,1,2] -> [2,1,0]
      x = self._add_conv_transpose_layer(layer_index, x)
    return x

  def _add_conv_transpose_layer(self, layer_index, x):
    layer_num = self._num_conv_layers - layer_index
    conv_transpose_layer = Conv2DTranspose(
        filters = self.conv_filters[layer_index],
        kernel_size = self.conv_kernels[layer_index],
        strides = self.conv_strides[layer_index],
        padding="same",
        name=f"decoder_conv_transpose_layer_{layer_num}"
    )

    x = conv_transpose_layer(x)
    x = ReLU(name=f"decoder_relu_{layer_num}")(x)
    x = BatchNormalization(name=f"decoder_bn_{layer_num}")(x)
    return x

  def _add_decoder_output(self, x):
    conv_transpose_layer = Conv2DTranspose(
        filters = 1,
        kernel_size = self.conv_kernels[0],
        strides = self.conv_strides[0],
        padding = "same",
        name=f"decoder_conv_transpose_layer_{self._num_conv_layers}"
    )
    x = conv_transpose_layer(x)
    output_layer = Activation("sigmoid", name = "sigmoid_layer")(x)
    return output_layer

  def _build_encoder(self):
    encoder_input = self._add_encoder_input()
    conv_layers = self._add_conv_layers(encoder_input)
    bottleneck = self._add_bottleneck(conv_layers)
    self._model_input = encoder_input
    self.encoder = Model(encoder_input, bottleneck, name = "encoder")

  def _add_encoder_input(self):
    return Input(shape = self.input_shape, name="encoder_input")

  def _add_conv_layers(self, encoder_input):
    """
    Creates all convolutionals block in encoder
    """
    x = encoder_input
    for layer_index in range(self._num_conv_layers):
      x = self._add_conv_layer(layer_index, x)
    return x

  def _add_conv_layer(self, layer_index, x):
    """
    Adds a convolutional block to a graph of layers, consisting of a conv 2d + ReLU + batch normalization.
    """
    layer_number = layer_index + 1
    conv_layer = Conv2D(
        filters = self.conv_filters[layer_index],
        kernel_size = self.conv_kernels[layer_index],
        strides = self.conv_strides[layer_index],
        padding= "same",
        name=f"encoder_conv_layer_{layer_number}"
    
    )
    x = conv_layer(x)
    x = ReLU(name = f"encoder_relu_{layer_number}")(x)
    x = BatchNormalization(name=f"encoder_bn_{layer_number}")(x)
    return x

  def _add_bottleneck(self, x):
    """Flatten data and add bottle neck with Gaussian sampling"""
    self._shape_before_bottleneck = K.int_shape(x)[1:] #[2,7,7,1]
    x = Flatten()(x)
    # find mu 
    self.mu = Dense(self.latent_space_dim, name="mu")(x)
    self.log_variance = Dense(self.latent_space_dim, name="log_variance")(x)

    def sample_point_from_normal_distribution(args):
      mu, log_variance = args
      epsilon = K.random_normal(shape=K.shape(self.mu), mean = 0., stddev=1.)
      sampled_point = mu + K.exp(log_variance / 2) * epsilon
      return sampled_point

    x = Lambda(sample_point_from_normal_distribution, name="encoder_output")([self.mu, self.log_variance])
    return x





In [5]:
autoencoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 28, 28, 1)]  0           []                               
                                                                                                  
 encoder_conv_layer_1 (Conv2D)  (None, 28, 28, 32)   320         ['encoder_input[0][0]']          
                                                                                                  
 encoder_relu_1 (ReLU)          (None, 28, 28, 32)   0           ['encoder_conv_layer_1[0][0]']   
                                                                                                  
 encoder_bn_1 (BatchNormalizati  (None, 28, 28, 32)  128         ['encoder_relu_1[0][0]']         
 on)                                                                                        

In [6]:
from tensorflow.keras.datasets import mnist

In [7]:
LEARNING_RATE = 0.0005
BATCH_SIZE = 64
EPOCHS = 150

def load_mnist():
  (x_train, y_train), (x_test, y_test) = mnist.load_data()
  x_train = x_train.astype("float32") / 255
  x_train = x_train.reshape(x_train.shape+ (1,))
  x_test = x_test.astype("float32") / 255
  x_test = x_test.reshape(x_test.shape+ (1,))

  return x_train, y_train, x_test, y_test


 


#curr_path = os.getcwd()
#path = os.path.join(curr_path, )
#x_train, _, _, _ = load_mnist()
#a#utoencoder = train(x_train[:500], LEARNING_RATE, BATCH_SIZE, EPOCHS)
#autoencoder.save("model")
#autoencoder2 = Autoencoder.load("model")
#autoencoder2.summary()

#Preprocessing Pipeline 

In [8]:
import librosa

In [9]:
class Loader:
  """Loader is responsible for loading an audio file"""

  def __init__(self, sample_rate, duration, mono):
    self.sample_rate = sample_rate
    self.duration = duration
    self.mono = mono

  def load(self, file_path):
    signal = librosa.load(file_path, 
                          sr = self.sample_rate,
                          duration= self.duration, 
                          mono = self.mono)[0]
    return signal


In [10]:
class Padder:
  """Padder is responsible to apply padding to an array."""

  def __init__(self, mode="constant"):
    self.mode = mode 

  def left_pad(self, array, num_missing_items):
    padded_array = np.pad(array,
                          (num_missing_items, 0),
                          mode = self.mode)
    return padded_array

  def right_pad(self, array, num_missing_items):
    padded_array = np.pad(array,
                          (num_missing_items,0),
                           mode = self.mode)
    return padded_array



In [11]:
class LogSpectrogramExtractor:
  """LogSpectrogramExtractor extracts log spectrograms (in dB) 
  from a time-series signal"""

  def __init__(self, frame_size, hop_length):
    self.frame_size = frame_size
    self.hop_length = hop_length

  def extract(self, signal):
    stft = librosa.stft(signal,
                        n_fft = self.frame_size,
                        hop_length = self.hop_length)[:-1] # (1+grame_size / 2, num_grames)
    spectrogram = np.abs(stft)
    log_spectrogram = librosa.amplitude_to_db(spectrogram)
    return log_spectrogram 
    
    


In [12]:
class MinMaxNormaliser:
  """MinMaxNormaliser applies min max normalization to an array"""

  def __init__(self, min_val, max_val):
    self.min = min_val
    self.max = max_val 

  def normalise(self, array):
    norm_array = (array - array.min()) / (array.max() - array.min()) # ->
    norm_array = norm_array * (self.max - self.min) + self.min
    return norm_array
  
  def denormalise(self, norm_array, original_min, original_max):
    array = (norm_array - self.min) / (self.max - self.min)
    array = array * (original_max - original_min) + original_min
    return array

In [13]:
class PreprocessingPipeline:
  """PreprocessingPipeline processes audio files in a directory , applying the 
  following steps to each file:
  1- Load a file
  1- Pad the sifnal
  3- Extract log spectrogram
  4- Normalize
  5- Save the normalized spectrogram
  """

  def __init__(self):
    self._loader = None 
    self.padder = None
    self.extractor = None 
    self.normaliser = None
    self.saver = None 
    self.min_max_values = {}
    self._num_expected_samples = None 

  @property 
  def loader(self):
    return self._loader 

  @loader.setter
  def loader(self, loader):
    self._loader = loader 
    self._num_expected_samples = int(loader.sample_rate * loader.duration)



  def process(self, audio_files_dir):
    for root, _, files in os.walk(audio_files_dir):
      for file in files:
        file_path = os.path.join(root, file)
        self._process_file(file_path)
        print("Processed file {file_path}")
    self.saver.save_min_max_values(self.min_max_values)

  def _process_file(self, file_path):
    signal = self.loader.load(file_path)
    if self._is_padding_necessary(signal):
      signal = self._apply_padding(signal)
    feature = self.extractor.extract(signal)
    norm_feature = self.normaliser.normalise(feature)
    save_path  = self.saver.save_feature(norm_feature, file_path)
    self._store_min_max_value(save_path, feature.min(), feature.max())

  def _is_padding_necessary(self, signal):
    
    if len(signal) < self._num_expected_samples:
      return True
    return False

  def _apply_padding(self, signal):
    num_missing_samples = self._num_expected_samples - len(signal)
    padded_signal = self.padder.right_pad(signal, num_missing_samples)
    return padded_signal 

  def _store_min_max_value(self, save_path, min_val, max_val):
    self.min_max_values[save_path] = {
        "min": min_val,
        "max": max_val,

    }



In [14]:
class Saver:
  """saver is responsible to save featrues, and the min max values"""

  def __init__(self, feature_save_dir, min_max_values_save_dir):
    self.feature_save_dir = feature_save_dir
    self.min_max_values_save_dir = min_max_values_save_dir

  def save_feature(self, feature, file_path):
    save_path = self._generate_save_path(file_path)
    np.save(save_path, feature)
    return save_path

  def save_min_max_values(self, min_max_values):
    save_path = os.path.join(self.min_max_values_save_dir, "min_max_values.pkl")
    self._save(min_max_values, save_path)

  @staticmethod
  def _save(data, save_path):
    with open(save_path, "wb") as f:
      pickle.dump(data, f)


  def _generate_save_path(self, file_path):
    file_name = os.path.split(file_path)[1]
    save_path = os.path.join(self.feature_save_dir, file_name + ".npy")
    return save_path





In [15]:
!mkdir spectrograms
!mkdir min_max_values

In [16]:
# constants

FRAME_SIZE = 512
HOP_LENGTH = 256
DURATION = 0.74 #get a nice number of frames
SAMPLE_RATE = 22050 
MONO = True

SPECTROGRAMS_SAVE_DIR = "/content/spectrograms"
MIN_MAX_VALUES_SAVE_DIR = "/content/min_max_values"
FILES_DIR = "/content/free-spoken-digit-dataset/recordings" # change later


In [17]:
# instantiate all objects

loader = Loader(SAMPLE_RATE, DURATION, MONO)
padder = Padder()
log_spectrogram_extractor = LogSpectrogramExtractor(FRAME_SIZE, HOP_LENGTH)
min_max_normalizer = MinMaxNormaliser(0,1)
saver = Saver(SPECTROGRAMS_SAVE_DIR, MIN_MAX_VALUES_SAVE_DIR)

preprocessing_pipeline = PreprocessingPipeline()
preprocessing_pipeline.loader = loader 
preprocessing_pipeline.padder = padder
preprocessing_pipeline.extractor = log_spectrogram_extractor
preprocessing_pipeline.normaliser = min_max_normalizer
preprocessing_pipeline.saver = saver


In [18]:
!git clone https://github.com/Jakobovski/free-spoken-digit-dataset.git 

Cloning into 'free-spoken-digit-dataset'...
remote: Enumerating objects: 4234, done.[K
remote: Counting objects: 100% (562/562), done.[K
remote: Compressing objects: 100% (550/550), done.[K
remote: Total 4234 (delta 31), reused 524 (delta 12), pack-reused 3672[K
Receiving objects: 100% (4234/4234), 30.45 MiB | 24.04 MiB/s, done.
Resolving deltas: 100% (108/108), done.


In [19]:
preprocessing_pipeline.process(FILES_DIR)

Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
Processed file {file_path}
P

In [29]:
LEARNING_RATE = 0.0005 
BATCH_SIZE = 32 
EPOCHS = 100

SPECTROGRAMS_PATH = '/content/spectrograms'

def load_fsdd(spectrograms_path):
  x_train = []
  for root, _, file_names in os.walk(spectrograms_path):
    for file_name in file_names: 
      file_path = os.path.join(root, file_name)
      spectrogram = np.load(file_path) #(n_bins, n_frames, 1)
      x_train.append(spectrogram)

  x_train = np.array(x_train)
  x_train = x_train[..., np.newaxis] # -> (3000, 256, 64, 1)
  return x_train 

def train(x_train, learning_rate, batch_size, epochs):
  autoencoder = Autoencoder(
    input_shape = (256,64,1),
    conv_filters = (512,256,128,64,32),
    conv_kernels = (3,3,3,3,3),
    conv_strides = (2,2,2,2, (2,1)),
    latent_space_dim=128
  )
  autoencoder.summary()
  autoencoder.compile(learning_rate)
  autoencoder.train(x_train, batch_size, epochs)
  return autoencoder

x_train = load_fsdd(SPECTROGRAMS_PATH)
 
autoencoder = train(x_train, LEARNING_RATE, BATCH_SIZE, EPOCHS)
autoencoder.save("model")


Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 256, 64, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 encoder_conv_layer_1 (Conv2D)  (None, 128, 32, 512  5120        ['encoder_input[0][0]']          
                                )                                                                 
                                                                                                  
 encoder_relu_1 (ReLU)          (None, 128, 32, 512  0           ['encoder_conv_layer_1[0][0]']   
                                )                                                           

Running Inference

In [61]:
class SoundGenerator:

  def __init__(self, vae, hop_length):
    self.vae = vae
    self.hop_length = hop_length
    self._min_max_normaliser = MinMaxNormaliser(0, 1)

  def generate(self, spectrograms, min_max_values):
    
    generated_spectrograms, latent_representations = self.vae.reconstruct(spectrograms)
    signals = self.convert_spectrograms_to_audio(generated_spectrograms, min_max_values)
    return signals, latent_representations

  def convert_spectrograms_to_audio(self, spectrograms, min_max_values):
    signals = []
    for spectrogram, min_max_value in zip(spectrograms, min_max_values):
    # reshape the log spectrogram
      log_spectrogram = spectrogram[:,:,0]
    # apply denormalisation
      denorm_log_spec = self._min_max_normaliser.denormalise(
        log_spectrogram,
        min_max_value["min"],
        min_max_value["max"]
    )
    # log spectrogram -> spectrogram
      spec = librosa.db_to_amplitude(denorm_log_spec)
    # apply Griffin-Lim
      signal = librosa.istft(spec, hop_length = self.hop_length)
      signals.append(signal)
    return signals


  

In [25]:
!mkdir original 
!mkdir generated

In [50]:
HOP_LENGTH = 256
SAVE_DIR_ORIGINAL = "/content/original/"
SAVE_DIR_GENERATED = "/content/generated/"
MIN_MAX_VALUES_PATH = "/content/min_max_values/min_max_values.pkl"

In [32]:
# utility functions

def select_spectrograms(spectrograms,
                        file_paths,
                        min_max_values,
                        num_spectrograms=2):
    sampled_indexes = np.random.choice(range(len(spectrograms)), num_spectrograms)
    sampled_spectrogrmas = spectrograms[sampled_indexes]
    file_paths = [file_paths[index] for index in sampled_indexes]
    sampled_min_max_values = [min_max_values[file_path] for file_path in
                           file_paths]
    print(file_paths)
    print(sampled_min_max_values)
    return sampled_spectrogrmas, sampled_min_max_values


def save_signals(signals, save_dir, sample_rate=22050):
    for i, signal in enumerate(signals):
        save_path = os.path.join(save_dir, str(i) + ".wav")
        sf.write(save_path, signal, sample_rate)


In [6]:
def main(autoencoder):
  # initialize the sound generator
  vae = autoencoder.load("model")
  sound_generator = SoundGenerator(vae, HOP_LENGTH)

  # load spectrograms + min max vals
  with open(MIN_MAX_VALUES_PATH, "rb") as f:
    min_max_values = pickle.load(f)

  specs, file_paths = load_fsdd(SPECTROGRAMS_PATH)

  # sample spectrigrams + min_max_values
  sampled_specs, sampled_min_max_values = select_spectrograms(specs,
                                                              file_paths,
                                                              min_max_values,
                                                              5)
  # generate audio for sampled specs
  signals, _ = sound_generator.generate(sampled_specs,
                                        sampled_min_max_values)
  #convert spectrogram sampled to audio
  original_signals = sound_generator.convert_spectrograms_to_audio(
      sampled_specs, sampled_min_max_values
  )

  save_signals(signals, SAVE_DIR_GENERATED)
  save_signals(orignal_signals, SAVE_DIR_ORIGINAL)

In [42]:
def load_fsdd(spectrograms_path):
  x_train = []
  file_paths = []
  for root, _, file_names in os.walk(spectrograms_path):
    for file_name in file_names: 
      file_path = os.path.join(root, file_name)
      spectrogram = np.load(file_path) #(n_bins, n_frames, 1)
      file_paths.append(file_path)
      x_train.append(spectrogram)

  x_train = np.array(x_train)
  x_train = x_train[..., np.newaxis] # -> (3000, 256, 64, 1)
  return x_train, file_paths

In [46]:
autoencoder = Autoencoder(
    input_shape = (256,64,1),
    conv_filters = (512,256,128,64,32),
    conv_kernels = (3,3,3,3,3),
    conv_strides = (2,2,2,2, (2,1)),
    latent_space_dim=128
  )

In [62]:
# initialize the sound generator
vae = autoencoder.load("model")
sound_generator = SoundGenerator(vae, HOP_LENGTH)

  # load spectrograms + min max vals
with open(MIN_MAX_VALUES_PATH, "rb") as f:
  min_max_values = pickle.load(f)

specs, file_paths = load_fsdd(SPECTROGRAMS_PATH)

  # sample spectrigrams + min_max_values
sampled_specs, sampled_min_max_values = select_spectrograms(specs,
                                                              file_paths,
                                                              min_max_values,
                                                              5)
  # generate audio for sampled specs
signals, _ = sound_generator.generate(sampled_specs,
                                        sampled_min_max_values)
  #convert spectrogram sampled to audio
original_signals = sound_generator.convert_spectrograms_to_audio(
      sampled_specs, sampled_min_max_values
  )

save_signals(signals, SAVE_DIR_GENERATED)
save_signals(original_signals, SAVE_DIR_ORIGINAL)

['/content/spectrograms/4_theo_29.wav.npy', '/content/spectrograms/3_yweweler_5.wav.npy', '/content/spectrograms/5_jackson_49.wav.npy', '/content/spectrograms/2_lucas_26.wav.npy', '/content/spectrograms/1_lucas_20.wav.npy']
[{'min': -64.27804, 'max': 15.721961}, {'min': -70.025925, 'max': 9.974078}, {'min': -54.105003, 'max': 25.894997}, {'min': -56.426495, 'max': 23.573505}, {'min': -57.38911, 'max': 22.61089}]


  updates=self.state_updates,
