In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import librosa
import logging
import os
from scipy.io import wavfile
import vae

# Configure the logging settings
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [4]:
# Check available GPUs
physical_devices = tf.config.experimental.list_physical_devices('GPU')

In [3]:
if len(physical_devices) > 0:
    # Ensure TensorFlow uses the first GPU
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    with tf.device('/GPU:0'):
        
else:
    print("No GPU devices found.")

IndentationError: expected an indented block (185336717.py, line 8)

In [6]:

# Step 1: Load, preprocess, and pad audio data
def preprocess_audio(audio_file, target_length=300):
    #logging.info(f'Processing audio file: {audio_file}')
    audio, _ = librosa.load(audio_file, sr=22050)
    audio_mfcc = librosa.feature.mfcc(y=audio, sr=22050)

    # Calculate the current length of audio data
    current_length = audio_mfcc.shape[1]

    if current_length < target_length:
        # If the audio is shorter than the target length, pad it
        pad_width = target_length - current_length
        audio_mfcc = np.pad(audio_mfcc, ((0, 0), (0, pad_width)), mode='constant')
    else:
        # If the audio is longer, truncate it
        audio_mfcc = audio_mfcc[:, :target_length]

    return audio_mfcc

# Generator function for lazy loading of audio data
def audio_data_generator(audio_files):
    for audio_file in audio_files:
        yield preprocess_audio(audio_file)

# Load a list of audio files
audio_files_df = pd.read_csv('data_files_summary.csv')
audio_files = audio_files_df['full_path'].to_list()

In [21]:
# Create a generator to load and preprocess audio data on-the-fly
data_generator = audio_data_generator(audio_files)
data_iterator = iter(data_generator)  # Convert the generator to an iterator

# To train an autoencoder, you need target data, which is the same as the input data
# So, use the same data for both input and target
X_train = np.array(list(data_iterator))

In [7]:
import tensorflow as tf

# Step 2: Define the autoencoder architecture
input_audio = tf.keras.layers.Input(shape=(20, 300, 1))  # Input shape with one channel
encoded = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same')(input_audio)
encoded = tf.keras.layers.MaxPooling2D((2, 2), padding='same')(encoded)
encoded = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same')(encoded)
encoded = tf.keras.layers.MaxPooling2D((2, 2), padding='same')(encoded)
# Flatten the encoded layer before using it in the LSTM
encoded = tf.keras.layers.Flatten()(encoded)
encoded = tf.keras.layers.Reshape((20, 75 * 8))(encoded)
decoded = tf.keras.layers.LSTM(300, return_sequences=True)(encoded)

In [8]:
# Reshape the decoded layer to match the original input shape
decoded = tf.keras.layers.Reshape((20, 300, 1))(decoded)
autoencoder = tf.keras.models.Model(input_audio, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [9]:
autoencoder = vae.VAE(
        input_shape=(20, 300, 1),
        conv_filters=(32, 64, 64, 64),
        conv_kernels=(3, 3, 3, 3),
        conv_strides=(1, 2, 2, 1),
        latent_space_dim=2
    )
autoencoder.compile()

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


AttributeError: 'method' object has no attribute '_from_serialized'

In [63]:
logging.info('Autoencoder model built.')
autoencoder.summary()

2023-10-19 20:10:20,873 - root - INFO - Autoencoder model built.


Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 20, 300, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 encoder_conv_layer_1 (Conv2D)  (None, 20, 300, 32)  320         ['encoder_input[0][0]']          
                                                                                                  
 encoder_relu_1 (ReLU)          (None, 20, 300, 32)  0           ['encoder_conv_layer_1[0][0]']   
                                                                                                  
 encoder_bn_1 (BatchNormalizati  (None, 20, 300, 32)  128        ['encoder_relu_1[0][0]']   

In [61]:
# Step 3: Train the autoencoder using audio data
logging.info('Training the autoencoder...')
autoencoder.train(X_train, batch_size=32, num_epochs=300)
logging.info('Training complete.')

2023-10-19 20:09:34,048 - root - INFO - Training the autoencoder...


AttributeError: 'method' object has no attribute '_from_serialized'

In [26]:
from scipy.io import wavfile
# Step 4: Stream audio files through the trained autoencoder and save the encoded audio as WAV
def stream_audio_through_autoencoder(audio_file, autoencoder, output_path):
    input_audio = preprocess_audio(audio_file)
    encoded_audio = autoencoder.predict(np.expand_dims(input_audio, axis=0))
    
    # Inverse transform the encoded audio back to the waveform
    decoded_audio = librosa.feature.inverse.mfcc_to_audio(encoded_audio[0], sr=22050)
    
    # Scale the audio data to the appropriate range
    decoded_audio = (decoded_audio * np.iinfo(np.int16).max).astype(np.int16)
    
    # Save the decoded audio as a WAV file
    wavfile.write(output_path, 22050, decoded_audio)

# Specify the output path for the saved WAV file
output_path = "encoded_audio.wav"

# Process an audio file and save the encoded audio as a WAV
stream_audio_through_autoencoder("Recording (42).m4a", autoencoder, output_path)



  audio, _ = librosa.load(audio_file, sr=22050)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


