# Data Loading and Preprocessing

This script is for loading and preprocessing audio data for a deep learning-based audio compression project.
It includes functions for loading audio files, preprocessing them into suitable formats for training, and
splitting the data into training, validation, and test sets.

Instructions:
1. Ensure you have downloaded the dataset and set the correct paths for the training and testing directories.
2. Adjust the number of files to be loaded as per your requirements.
3. Install the required dependencies listed in the `requirements.txt` file.

Dependencies: os, numpy, librosa, random, scikit-learn (imported as `sklearn`), tensorflow, matplotlib, IPython, scikit-image (imported as `skimage`), time, ipywidgets


In [1]:
import os
import numpy as np
import librosa
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from IPython.display import Audio, display
from skimage.metrics import structural_similarity as ssim
import time
import ipywidgets as widgets

# Helper function to load and preprocess audio data
def load_and_preprocess_data(directory, num_files, target_length=16000):
    """Load a random subset of ``.wav`` files as fixed-length waveforms.

    Each selected file is read with ``librosa.load(..., sr=None)``, i.e. at the
    file's own sample rate (no resampling), and is then truncated or
    zero-padded at the end so that it holds exactly ``target_length`` samples.

    Args:
        directory: Folder scanned (non-recursively) for files ending in
            ``.wav``.
        num_files: Number of files to draw at random without replacement. If
            the folder holds fewer matching files, all of them are used and a
            warning is issued instead of failing inside ``random.sample``.
        target_length: Number of samples in every returned waveform.

    Returns:
        A NumPy array of shape ``(n_loaded, target_length)``. When nothing is
        loaded the array is empty with shape ``(0, target_length)``.

    Raises:
        ValueError: If ``num_files`` is negative (raised by ``random.sample``).
    """
    import warnings

    # os.listdir returns entries in arbitrary order; sorting makes the random
    # draw below reproducible whenever the caller seeds the `random` module.
    files = sorted(f for f in os.listdir(directory) if f.endswith('.wav'))

    if num_files > len(files):
        warnings.warn(
            f"Requested {num_files} files but only {len(files)} '.wav' files "
            f"were found in {directory!r}; loading all of them.",
            stacklevel=2,
        )
        num_files = len(files)
    selected_files = random.sample(files, num_files)

    data = []
    for file in selected_files:
        file_path = os.path.join(directory, file)
        signal, _ = librosa.load(file_path, sr=None)
        # Truncate first (a no-op for short signals), then zero-pad the tail
        # of anything still shorter than the target length.
        signal = signal[:target_length]
        if len(signal) < target_length:
            signal = np.pad(signal, (0, target_length - len(signal)), 'constant')
        data.append(signal)

    if not data:
        # Keep the result 2-D even when empty so downstream shape handling
        # does not need a special case (float32 is librosa.load's default).
        return np.empty((0, target_length), dtype=np.float32)
    return np.array(data)

# Helper function to preprocess the dataset into TensorFlow format
def preprocess_dataset(data, batch_size=16):
    """Turn an array of waveforms into a batched autoencoder dataset.

    Every element is emitted as an ``(input, target)`` pair in which both
    members come from the same slice of ``data``. Both are converted to
    float32 tensors and given a trailing axis of size 1 before batching.

    Args:
        data: Array-like sliced along its first axis; presumably shaped
            ``(num_examples, num_samples)`` -- confirm against the caller.
        batch_size: Number of examples per emitted batch.

    Returns:
        A ``tf.data.Dataset`` of batched ``(input, target)`` pairs with
        prefetching enabled.
    """
    def _as_float32(inputs, targets):
        return (
            tf.convert_to_tensor(inputs, dtype=tf.float32),
            tf.convert_to_tensor(targets, dtype=tf.float32),
        )

    def _add_channel_axis(inputs, targets):
        return tf.expand_dims(inputs, -1), tf.expand_dims(targets, -1)

    # The same array is used for inputs and targets: (inputs, targets)
    pairs = tf.data.Dataset.from_tensor_slices((data, data))
    return (
        pairs.map(_as_float32)
        .map(_add_channel_axis)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

# Load and preprocess data
num_train_files = 23075
num_test_files = 1100

# Seed Python's RNG so the files drawn by random.sample() inside
# load_and_preprocess_data() are the same on every run. Without this, the
# fixed random_state given to train_test_split below cannot make the
# train/validation split reproducible, because its input changes each run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Load training and testing data
clean_speech_train = load_and_preprocess_data('C:/Users/anand/Downloads/Dataset/clean_train', num_train_files)
clean_speech_test = load_and_preprocess_data('C:/Users/anand/Downloads/Dataset/clean_test', num_test_files)

# Split the training data into training (80%) and validation (20%) sets
clean_speech_train, clean_speech_val = train_test_split(
    clean_speech_train, test_size=0.2, random_state=RANDOM_SEED
)

# Convert to TensorFlow datasets of (input, target) batches
train_dataset = preprocess_dataset(clean_speech_train)
val_dataset = preprocess_dataset(clean_speech_val)
test_dataset = preprocess_dataset(clean_speech_test)

# Print the number of files loaded in each dataset
print(f"Training data files loaded: {len(clean_speech_train)}")
print(f"Validation data files loaded: {len(clean_speech_val)}")
print(f"Test data files loaded: {len(clean_speech_test)}")

# Print the shapes of the datasets
print("\nShapes of datasets:")
print(f"Training dataset shape: {clean_speech_train.shape}")
print(f"Validation dataset shape: {clean_speech_val.shape}")
print(f"Test dataset shape: {clean_speech_test.shape}")

print("\nData loading and preprocessing is complete.")

  "class": algorithms.Blowfish,


Training data files loaded: 18460
Validation data files loaded: 4615
Test data files loaded: 1100

Shapes of datasets:
Training dataset shape: (18460, 16000)
Validation dataset shape: (4615, 16000)
Test dataset shape: (1100, 16000)

Data loading and preprocessing is complete.


# Model Building 

This script defines a deep learning-based autoencoder model for audio compression. The encoder is built from ResNet blocks followed by an LSTM and a dense layer; the decoder mirrors it with a dense layer, an LSTM, and transposed convolutions. The autoencoder compresses 1-second audio samples into a latent representation and then reconstructs the audio from this compressed representation.

The script consists of:
1. ResNet block definition
2. Encoder model definition
3. Decoder model definition
4. Autoencoder model definition
5. Building and compiling the autoencoder model
6. Displaying model summaries
7. Example input and output shapes

Ensure you have TensorFlow and other dependencies installed before running this script.

In [2]:
# Define the ResNet block
def resnet_block(inputs, filters, kernel_size, strides):
    """Apply a two-convolution 1-D residual block with a projected shortcut.

    Args:
        inputs: Tensor fed to both the main path and the shortcut.
        filters: Number of output channels of every convolution in the block.
        kernel_size: Kernel size of the two main-path convolutions.
        strides: Stride of the first main-path convolution and of the
            shortcut convolution.

    Returns:
        The ReLU-activated sum of the main path and the shortcut.
    """
    layers = tf.keras.layers

    # Main path, stage 1: strided convolution -> batch norm -> ReLU
    main = layers.Conv1D(filters, kernel_size, strides=strides, padding='same')(inputs)
    main = layers.BatchNormalization()(main)
    main = layers.Activation('relu')(main)

    # Main path, stage 2: stride-1 convolution -> batch norm (no activation yet)
    main = layers.Conv1D(filters, kernel_size, strides=1, padding='same')(main)
    main = layers.BatchNormalization()(main)

    # Shortcut: kernel-size-1 convolution with the same stride and filter
    # count, so it can be added element-wise to the main path
    skip = layers.Conv1D(filters, 1, strides=strides, padding='same')(inputs)

    merged = layers.Add()([main, skip])
    return layers.Activation('relu')(merged)

# Define the encoder
def build_encoder(input_shape):
    """Build the encoder half of the autoencoder.

    The stack is a stride-2 convolution, four stride-2 ResNet blocks, an LSTM
    that returns the full sequence, and a per-timestep dense layer. With five
    stride-2 stages the time axis is reduced by a factor of 32.

    Args:
        input_shape: Shape of one input example, excluding the batch axis.

    Returns:
        A ``tf.keras.Model`` named ``'encoder'`` whose output has 128 features
        per timestep.
    """
    layers = tf.keras.layers
    num_res_blocks = 4

    inputs = tf.keras.Input(shape=input_shape)

    # Stem: first strided convolution followed by ReLU
    features = layers.Conv1D(64, 3, strides=2, padding='same')(inputs)
    features = layers.Activation('relu')(features)

    # Stack of downsampling residual blocks
    for _ in range(num_res_blocks):
        features = resnet_block(features, 64, 3, 2)

    # Sequence modelling and projection to the latent features
    features = layers.LSTM(128, return_sequences=True)(features)
    latent = layers.Dense(128, activation='relu')(features)

    return tf.keras.Model(inputs, latent, name='encoder')

# Define the decoder
def build_decoder(latent_dim, output_shape):
    """Build the decoder half of the autoencoder.

    The stack is a per-timestep dense layer, an LSTM that returns the full
    sequence, four stride-2 transposed convolutions (each followed by batch
    norm and ReLU), and a final stride-2 transposed convolution with a single
    output channel and tanh activation. The five stride-2 stages lengthen the
    time axis by a factor of 32.

    Args:
        latent_dim: Number of features per timestep of the latent input; the
            time axis is left unspecified.
        output_shape: Accepted for interface symmetry but not used by the
            current implementation.

    Returns:
        A ``tf.keras.Model`` named ``'decoder'``.
    """
    layers = tf.keras.layers
    num_upsampling_stages = 4

    latent = tf.keras.Input(shape=(None, latent_dim))

    # Projection and sequence modelling
    hidden = layers.Dense(128, activation='relu')(latent)
    hidden = layers.LSTM(128, return_sequences=True)(hidden)

    # Upsampling stages: transposed convolution -> batch norm -> ReLU
    for _ in range(num_upsampling_stages):
        hidden = layers.Conv1DTranspose(64, 3, strides=2, padding='same')(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation('relu')(hidden)

    # Output stage: back to a single channel, squashed by tanh
    hidden = layers.Conv1DTranspose(1, 3, strides=2, padding='same')(hidden)
    waveform = layers.Activation('tanh')(hidden)

    return tf.keras.Model(latent, waveform, name='decoder')

# Define the autoencoder
def build_autoencoder(encoder, decoder):
    """Chain an encoder and a decoder into one end-to-end model.

    Args:
        encoder: Model whose input becomes the autoencoder's input.
        decoder: Model applied to the encoder's output.

    Returns:
        A ``tf.keras.Model`` named ``'autoencoder'`` mapping the encoder's
        input to the decoder's output.
    """
    model_input = encoder.input
    reconstruction = decoder(encoder(model_input))
    return tf.keras.Model(model_input, reconstruction, name='autoencoder')

# Build and compile the autoencoder
input_shape = (16000, 1)  # Example input shape for 1 second of audio at 16kHz
latent_dim = 128
output_shape = (16000, 1)

encoder = build_encoder(input_shape)
decoder = build_decoder(latent_dim, output_shape)
autoencoder = build_autoencoder(encoder, decoder)

# Reconstruction objective: mean squared error, optimised with default Adam
autoencoder.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
)

# Print summaries (same heading text and order for each model)
for _heading, _model in (
    ("Encoder Summary:", encoder),
    ("\nDecoder Summary:", decoder),
    ("\nAutoencoder Summary:", autoencoder),
):
    print(_heading)
    _model.summary()

# Example input and output shapes: push one random example through both halves
example_input = tf.random.normal((1, 16000, 1))  # Batch size of 1, input length of 16000, 1 channel
encoded_output = encoder(example_input)
decoded_output = decoder(encoded_output)

print("\nExample Input Shape:", example_input.shape)
print("Encoded Output Shape:", encoded_output.shape)
print("Decoded Output Shape:", decoded_output.shape)

print("\nModel building is complete.")

Encoder Summary:



Decoder Summary:



Autoencoder Summary:



Example Input Shape: (1, 16000, 1)
Encoded Output Shape: (1, 500, 128)
Decoded Output Shape: (1, 16000, 1)

Model building is complete.


# Model Training

This script continues from the data preprocessing phase and focuses on training the autoencoder model for audio compression. It includes:
1. Printing the shapes of the training, validation, and test datasets.
2. Setting up an early stopping callback to prevent overfitting.
3. Training the autoencoder model with the early stopping callback.
4. Printing the training history.
5. Plotting the training and validation loss over epochs.
6. Saving the trained model.
7. Displaying the final autoencoder model summary.

Ensure you have already run the data preprocessing script (to prepare the datasets) and the model building script (to define and compile `autoencoder`) before executing this script.

In [None]:
# Print the shapes of the datasets
print(f"Training data shape: {clean_speech_train.shape}")
print(f"Validation data shape: {clean_speech_val.shape}")
print(f"Test data shape: {clean_speech_test.shape}")

# Resolve the output location up front and make sure its folder exists, so a
# missing directory is reported before training starts instead of failing at
# autoencoder.save() after every epoch has already run.
model_path = 'E:/Mini Project/Dataset/Git/f_resnet_autoencoder_model.h5'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Early stopping callback to prevent overfitting: stop after 5 epochs without
# a val_loss improvement and roll back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
history = autoencoder.fit(
    train_dataset,
    epochs=20,  # Upper bound; early stopping may end training sooner
    validation_data=val_dataset,
    callbacks=[early_stopping]
)

# Print training history (per-epoch metric values recorded by fit)
print("\nTraining history:")
print(history.history)

# Plot training loss
plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'], label='Training Loss', color='blue')
plt.title('Training Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot validation loss
plt.figure(figsize=(10, 5))
plt.plot(history.history['val_loss'], label='Validation Loss', color='orange')
plt.title('Validation Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Save the trained model
autoencoder.save(model_path)
print('Saved model in path: ' + model_path)

# Final model summary
print("\nAutoencoder Model Summary:")
autoencoder.summary()

print("Model training is complete.")