In [8]:
import tensorflow as tf
import numpy as np
import pandas as pd
import librosa
import logging
import os
# NOTE(review): this assignment runs after tensorflow, numpy, pandas and librosa
# have already been imported above. If it is meant to influence a native runtime
# that reads the variable when those libraries load, it probably has to move
# ahead of the imports — TODO confirm. Also confirm that "FALSE" is the intended
# value.
os.environ["KMP_DUPLICATE_LIB_OK"]="FALSE"

from scipy.io import wavfile


# Emit INFO-and-above records using a timestamp / logger / level / message layout
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [9]:

# Step 1: Load, preprocess, and pad audio data
def preprocess_audio(audio_file, target_length=300, sr=22050, n_mfcc=20):
    """Load an audio file and return a fixed-width MFCC matrix.

    The file is loaded with ``librosa.load`` at ``sr`` Hz, converted to MFCCs
    with ``librosa.feature.mfcc`` and then zero-padded or truncated along the
    second axis so that it has exactly ``target_length`` columns.

    Args:
        audio_file: Path of the audio file to load.
        target_length: Number of columns in the returned matrix.
        sr: Sample rate used both for loading and for the MFCC computation.
        n_mfcc: Number of MFCC coefficients requested from librosa. The
            default of 20 matches the (20, 300) model input used in this file.

    Returns:
        The MFCC array with its second axis of size ``target_length``.

    Raises:
        ValueError: If ``target_length`` is negative. A negative value would
            otherwise be treated as a slice offset and silently drop columns
            from the end.
    """
    if target_length < 0:
        raise ValueError(f"target_length must be non-negative, got {target_length}")

    audio, _ = librosa.load(audio_file, sr=sr)
    audio_mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Number of columns still missing; zero or negative means there is
    # nothing to pad and the matrix only needs trimming.
    missing = target_length - audio_mfcc.shape[1]
    if missing > 0:
        # Pad only at the end of the second axis, leaving the rows untouched.
        return np.pad(audio_mfcc, ((0, 0), (0, missing)), mode='constant')

    return audio_mfcc[:, :target_length]

# Generator function for lazy loading of audio data
def audio_data_generator(audio_files):
    """Lazily yield the preprocessed features of each path in ``audio_files``.

    Nothing is loaded until the generator is advanced; every step calls
    ``preprocess_audio`` on the next path and yields its result.
    """
    yield from (preprocess_audio(path) for path in audio_files)

# Read the file manifest and extract the path of every audio clip it lists
audio_files_df = pd.read_csv('data_files_summary.csv')
audio_files = audio_files_df.loc[:, 'full_path'].tolist()

In [10]:
# Run every listed file through the preprocessing generator. The generator
# itself is lazy, but it is drained completely below, so the whole dataset
# ends up in memory as a single array.
data_generator = audio_data_generator(audio_files)
data_iterator = iter(data_generator)  # a generator is already its own iterator

# An autoencoder is trained to reproduce its input, so this one array serves
# as both the features and the targets.
X_train = np.array([*data_iterator])

In [11]:
# Step 2: Define the autoencoder architecture
# Width of the bottleneck layer
encoding_dim = 1024

# Fixed-size input: one (20, 300) feature matrix per sample
input_audio = tf.keras.layers.Input(shape=(20, 300))
# Collapse each matrix into a single vector before the dense encoder
flattened_input = tf.keras.layers.Flatten()(input_audio)
encoded = tf.keras.layers.Dense(
    units=encoding_dim,
    activation='relu',
)(flattened_input)
# Define the decoder with the final layer

In [12]:
# Decoder: expand the bottleneck back to 20 * 300 values, then restore the
# (20, 300) layout so the output has the same shape as the input
decoded = tf.keras.layers.Dense(
    units=20 * 300,
    activation='linear',
)(encoded)
decoded = tf.keras.layers.Reshape(target_shape=(20, 300))(decoded)

In [13]:
# Tie the input and its reconstruction into one model, trained with Adam to
# minimise the mean squared error between the two
autoencoder = tf.keras.models.Model(inputs=input_audio, outputs=decoded)
autoencoder.compile(loss='mse', optimizer='adam')
logging.info('Autoencoder model built.')
autoencoder.summary()

2023-10-19 18:42:17,274 - root - INFO - Autoencoder model built.


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20, 300)]         0         
                                                                 
 flatten_1 (Flatten)         (None, 6000)              0         
                                                                 
 dense_2 (Dense)             (None, 1024)              6145024   
                                                                 
 dense_3 (Dense)             (None, 6000)              6150000   
                                                                 
 reshape_1 (Reshape)         (None, 20, 300)           0         
                                                                 
Total params: 12,295,024
Trainable params: 12,295,024
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Step 3: Fit the autoencoder; the input array doubles as the target
logging.info('Training the autoencoder...')
autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=32,
    epochs=300,
)
logging.info('Training complete.')

2023-10-19 18:42:17,300 - root - INFO - Training the autoencoder...


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
 33/197 [====>.........................] - ETA: 5s - loss: 96.7588

In [None]:


# Step 4: Run an audio file through the trained autoencoder and save the reconstructed audio as WAV
def stream_audio_through_autoencoder(audio_file, autoencoder, output_path):
    """Reconstruct one audio file with the autoencoder and save it as 16-bit WAV.

    The file is turned into a fixed-size MFCC matrix by ``preprocess_audio``,
    passed through ``autoencoder.predict`` as a batch of one, inverted back to
    a waveform with ``librosa.feature.inverse.mfcc_to_audio`` and written to
    ``output_path`` with ``scipy.io.wavfile.write``.

    Args:
        audio_file: Path of the audio file to process.
        autoencoder: Object exposing ``predict(batch)``; it is called with an
            array that has a leading batch axis of size 1.
        output_path: Destination of the WAV file.

    Returns:
        None. The result is written to ``output_path``.
    """
    # Must stay equal to the rate preprocess_audio loads the file at.
    sample_rate = 22050
    int16_max = np.iinfo(np.int16).max

    mfcc = preprocess_audio(audio_file)
    # predict() works on batches: add a batch axis, then take the only item.
    reconstructed_mfcc = autoencoder.predict(np.expand_dims(mfcc, axis=0))[0]

    # Invert the reconstructed MFCCs back to a time-domain waveform.
    waveform = librosa.feature.inverse.mfcc_to_audio(reconstructed_mfcc, sr=sample_rate)

    # Nothing here bounds the inverted signal to [-1, 1]. Scaling an
    # out-of-range sample by the int16 maximum and casting would overflow,
    # so rescale by the peak whenever it exceeds full scale. Signals already
    # within range are left exactly as before.
    peak = float(np.max(np.abs(waveform))) if np.size(waveform) else 0.0
    if peak > 1.0:
        waveform = waveform / peak

    pcm = (waveform * int16_max).astype(np.int16)
    wavfile.write(output_path, sample_rate, pcm)

# Destination of the WAV file produced from the model output
output_path = "encoded_audio.wav"

# Run one sample clip through the trained model and write the result to disk
stream_audio_through_autoencoder(
    audio_file="sample-3s.wav",
    autoencoder=autoencoder,
    output_path=output_path,
)