In [1]:
%load_ext autoreload
%autoreload 2
import tensorflow as tf
import numpy as np
import pandas as pd
#import librosa
import logging
import os
from scipy.io import wavfile

#os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
# Root logger: emit INFO and above, prefixed with timestamp, logger name and level
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [18]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

def audio_to_spectrogram(audio_file_path, max_length=6*22500, sr=22500):
    """Load an audio file and return its STFT magnitude in decibels.

    The waveform is read with ``librosa.load`` at sample rate ``sr`` and then
    forced to exactly ``max_length`` samples: anything beyond that is dropped
    from the end, and a shorter signal is zero-padded at the end.

    Args:
        audio_file_path: Path handed to ``librosa.load``.
        max_length: Number of samples the signal is cut or padded to.
        sr: Sample rate requested from ``librosa.load``.

    Returns:
        ``librosa.amplitude_to_db`` of the absolute ``librosa.stft`` of the
        fixed-length signal, referenced to its own maximum (``ref=np.max``).
    """
    samples, _ = librosa.load(audio_file_path, sr=sr)

    # Slicing is a no-op for clips that are already short enough, so only the
    # zero-padding needs to be conditional.
    samples = samples[:max_length]
    shortfall = max_length - len(samples)
    if shortfall > 0:
        samples = np.pad(samples, (0, shortfall), mode='constant')

    magnitude = np.abs(librosa.stft(samples))
    return librosa.amplitude_to_db(magnitude, ref=np.max)


In [7]:
# Lazy loader: spectrograms are computed one at a time, only when requested
def audio_data_generator(audio_files):
    """Yield ``audio_to_spectrogram(path)`` for each path in ``audio_files``.

    Args:
        audio_files: Iterable of audio file paths.

    Yields:
        The spectrogram of each file, in input order.
    """
    yield from (audio_to_spectrogram(path) for path in audio_files)

# Read the file index and collect the 'full_path' column as a plain list
audio_files_df = pd.read_csv('data_files_summary.csv')
audio_files = audio_files_df.loc[:, 'full_path'].tolist()

In [11]:
# Sanity check: run the preprocessing on the first two files and print the resulting arrays
for audio_file in audio_files[:2]:
    print(audio_to_spectrogram(audio_file))

[[-61.096313 -55.76928  -54.597748 ... -80.       -80.       -80.      ]
 [-62.884403 -60.52305  -60.369137 ... -80.       -80.       -80.      ]
 [-68.10806  -78.98971  -80.       ... -80.       -80.       -80.      ]
 ...
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]]
[[-61.27263  -56.059315 -55.41803  ... -80.       -80.       -80.      ]
 [-63.211212 -61.038136 -60.47317  ... -80.       -80.       -80.      ]
 [-68.78768  -80.       -73.29738  ... -80.       -80.       -80.      ]
 ...
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]]


In [21]:
%%time
# Materialise the spectrograms of the first 4500 files as a single array.
# An autoencoder learns to reproduce its input, so this one array is used as
# both the network input and the training target.
data_generator = audio_data_generator(audio_files[:4500])
data_iterator = iter(data_generator)  # iter() on a generator returns the generator itself
X_train = np.array([spectrogram_db for spectrogram_db in data_iterator])

CPU times: total: 47.4 s
Wall time: 48 s


In [22]:
# Show the array dimensions: (files, STFT frequency bins, STFT frames)
X_train.shape

(4500, 1025, 264)

In [23]:
# Append the single channel axis that the Conv2D layers expect.
# The target shape is derived from the array itself instead of hard-coding
# (4500, 1025, 264, 1), so the cell also works when fewer files were loaded.
# Taking shape[:3] keeps it safe to re-run: an array that already has the
# channel axis is reshaped to the shape it already has.
X_train = X_train.reshape(X_train.shape[:3] + (1,))

In [24]:
# Rescale the dB values linearly so that -80 dB maps to 0 and 0 dB maps to 1.
# amplitude_to_db(ref=np.max) with librosa's default top_db=80 yields values in
# [-80, 0], so the result lies in [0, 1], matching the sigmoid output layer.
X_train_scaled = ((X_train + 80)/80)

In [27]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, Conv2DTranspose, MaxPooling2D, UpSampling2D, Normalization, Reshape, Dense, Flatten, Cropping2D
from tensorflow.keras.models import Model

# Define the input shape: (frequency bins, time frames, channels)
input_shape = (1025, 264, 1)  # You can adjust the input shape based on your data

# Define the encoder: two convolution + 2x2 max-pooling stages
input_layer = Input(shape=input_shape)
x = input_layer

x = Conv2D(128, kernel_size = (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
encoded = MaxPooling2D((2, 2), padding='same')(x)

# Define the decoder: mirror of the encoder, using 2x upsampling
x = Conv2D(32, (3, 3), activation='relu', padding='same')(encoded)
x = UpSampling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)

# 'same' max-pooling rounds odd sizes up by padding at the bottom/right edge,
# so after upsampling the feature map is larger than the input (1028 rows vs
# 1025 here). The surplus therefore sits at the bottom/right and is cropped
# from there; cropping from the top would shift the reconstruction by three
# frequency bins relative to its target. The amounts are computed from the
# shapes so that changing input_shape does not require editing the crop.
extra_rows = int(x.shape[1]) - input_shape[0]
extra_cols = int(x.shape[2]) - input_shape[1]
x = Cropping2D(cropping=((0, extra_rows), (0, extra_cols)))(x)

# Sigmoid keeps the output in [0, 1]
decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

autoencoder = Model(input_layer, decoded)

# Compile the autoencoder
autoencoder.compile(optimizer='adam', loss='mse')
# Print the model summary
autoencoder.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1025, 264, 1)]    0         
                                                                 
 conv2d_5 (Conv2D)           (None, 1025, 264, 128)    1280      
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 513, 132, 128)     0         
 g2D)                                                            
                                                                 
 conv2d_6 (Conv2D)           (None, 513, 132, 32)      36896     
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 257, 66, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_7 (Conv2D)           (None, 257, 66, 32)       9248

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop once the training loss has not improved for 5 epochs and restore the best weights
early_stopping = EarlyStopping(
    monitor='loss',
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

# Train the autoencoder to reproduce its own (scaled) input
logging.info('Training the autoencoder...')
autoencoder.fit(
    x=X_train_scaled,
    y=X_train_scaled,
    batch_size=32,
    epochs=300,
    callbacks=[early_stopping],
)
logging.info('Training complete.')

2023-10-20 00:55:13,570 - root - INFO - Training the autoencoder...


Epoch 1/300
  1/141 [..............................] - ETA: 3:17:23 - loss: 0.1916

In [None]:
# Persist the trained model under the path 'autoencoder_model'
autoencoder.save('autoencoder_model')

In [None]:
import soundfile as sf  # Soundfile library for saving the WAV file
def spectrogram_to_audio(spectrogram, sr=22500, max_length=6*22500):
    """Convert a dB-scaled magnitude spectrogram back into a waveform.

    Args:
        spectrogram: Array of dB values laid out like the output of
            ``librosa.stft`` followed by ``librosa.amplitude_to_db``.
        sr: Not used by the conversion; kept so existing calls keep working.
        max_length: Not used by the conversion; kept so existing calls keep
            working.

    Returns:
        The time-domain signal returned by ``librosa.istft``.
    """
    # Convert the argument that was passed in. The previous version read an
    # undefined name ``D`` here, which raised NameError (or silently used a
    # stale global left over in the kernel).
    # db_to_amplitude uses its default reference of 1.0, so amplitudes come
    # out relative to that reference rather than to the original signal level.
    magnitude = librosa.db_to_amplitude(spectrogram)

    # Only magnitudes are available (no phase), so the inverse STFT is applied
    # to the magnitude matrix directly.
    y_reconstructed = librosa.istft(magnitude)

    return y_reconstructed

# Example usage: pass one clip through the autoencoder and write the result to disk.
# The clip is loaded at 22500 Hz, the rate used for the training spectrograms
# and for the output file below (it was previously loaded at 48000 Hz, which
# does not match either).
spectrogram = audio_to_spectrogram("converted.wav", sr=22500).reshape((1, 1025, 264, 1))
spectrogram = (spectrogram+80)/80  # same dB -> [0, 1] scaling as the training data
out_spectre = autoencoder.predict(spectrogram)
out_spectre = (out_spectre*80)-80  # undo the scaling: back to dB
# The model output has the same (1, 1025, 264, 1) shape as its input; drop the
# batch and channel axes. The former reshape to (1025, 308) could not hold
# 1025*264 values and raised a ValueError.
out_spectre = out_spectre.reshape(spectrogram.shape[1:3])
reconstructed_audio = spectrogram_to_audio(out_spectre, 22500) * 1e3
sf.write("reconstructed_audio.wav", np.ravel(reconstructed_audio), 22500)  # Adjust the sample rate as needed


In [None]:
# Print the layer-by-layer summary of the autoencoder
autoencoder.summary()

In [None]:
# librosa.load takes `duration` in seconds, not samples. The previous value
# 112500 is a sample count (5 s at 22500 Hz), so it is converted to seconds.
y, sr = librosa.load('converted.wav', sr=22500, duration=112500 / 22500)


In [None]:
# Plot the waveform loaded from 'converted.wav'
plt.plot(y)

In [None]:
# Plot the waveform reconstructed from the autoencoder output
plt.plot(reconstructed_audio)

In [None]:
# Show the autoencoder's output spectrogram (in dB) as an image
plt.imshow(out_spectre)

In [None]:
# Show the scaled input spectrogram; indexing with [0] drops the batch axis
plt.imshow(spectrogram[0])