In [55]:
import os
import librosa
import matplotlib.pyplot as plt
import numpy as np
from pydub import AudioSegment
import h5py
import soundfile as sf
import torch

In [64]:
# Notebook-level placeholders for a sample log-mel spectrogram and its
# sampling rate; later cells read both when plotting and when inverting the
# spectrogram back to audio. Both start out unset.
example_log_spec = example_sr = None

def plot_mfcc_spectrogram(log_mel_spectrogram, sr, hop_length=256):
    """Plot a log-mel spectrogram with a dB colour bar and print its shape.

    Despite the function name, the figure is drawn on a mel frequency axis
    and titled 'Log Mel Spectrogram'; no MFCCs are computed here.

    Args:
        log_mel_spectrogram: Spectrogram array to display, e.g. the output of
            ``librosa.power_to_db`` applied to a mel spectrogram.
        sr: Sampling rate in Hz of the audio the spectrogram was computed from.
        hop_length: Hop length (in samples) used when the spectrogram was
            computed; it determines the time axis. Defaults to 256, the value
            used by ``audio_to_spectrogram`` in this notebook.

    Raises:
        ValueError: If ``log_mel_spectrogram`` or ``sr`` is ``None``.
    """
    # Fail early with a readable message instead of an obscure error from
    # inside the plotting call when the example globals were never filled in.
    if log_mel_spectrogram is None or sr is None:
        raise ValueError("log_mel_spectrogram and sr must both be provided (got None)")

    # The notebook only does `import librosa`; older librosa releases do not
    # expose the `display` submodule unless it is imported explicitly.
    import librosa.display

    plt.figure(figsize=(10, 5))
    librosa.display.specshow(
        log_mel_spectrogram,
        sr=sr,
        hop_length=hop_length,
        x_axis='time',
        y_axis='mel',
    )
    print(log_mel_spectrogram.shape)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Log Mel Spectrogram')
    plt.show()

def audio_to_spectrogram(src_dir, dest_dir, sr=48000, segment_seconds=20,
                         n_fft=1024, hop_length=256, n_mels=80):
    """Convert every FLAC/MP3 file under ``src_dir`` into log-mel spectrogram segments.

    Each audio file is loaded at ``sr``, cut into consecutive non-overlapping
    segments of ``segment_seconds`` seconds (a trailing remainder shorter than
    one segment is dropped), and each segment is turned into a log-mel
    spectrogram. All segments are stacked into one array and written to the
    HDF5 file ``dest_dir`` under the dataset name ``"audio"``.

    Resolution trade-off: a larger ``n_fft`` gives finer frequency resolution,
    a smaller ``hop_length`` gives finer time resolution.

    Args:
        src_dir: Directory that is walked recursively for ``.flac``/``.mp3``
            files (extension match is case-insensitive).
        dest_dir: Path of the HDF5 file to create. Despite the name this is a
            file path, not a directory. An existing file is overwritten.
        sr: Target sampling rate passed to ``librosa.load``. Keep it fixed so
            that every segment has the same shape and can be stacked.
        segment_seconds: Length of each segment in seconds.
        n_fft: FFT window size for the mel spectrogram.
        hop_length: Hop length (samples) between successive frames.
        n_mels: Number of mel bands.

    Raises:
        ValueError: If ``segment_seconds`` is not positive.
    """
    if segment_seconds <= 0:
        raise ValueError("segment_seconds must be positive")

    segments = []
    print(src_dir)
    for root, _, files in os.walk(src_dir, topdown=False):
        print('going')
        for file_name in files:
            if not file_name.lower().endswith(('.flac', '.mp3')):
                continue
            try:
                input_file = os.path.join(root, file_name)
                audio, file_sr = librosa.load(input_file, sr=sr)
                print(file_sr)
                segment_len = int(file_sr * segment_seconds)

                # One spectrogram per full-length segment of the file.
                for i in range(audio.shape[0] // segment_len):
                    batch = audio[i * segment_len:(i + 1) * segment_len]
                    mel_spectrogram = librosa.feature.melspectrogram(
                        y=batch, sr=file_sr, n_fft=n_fft,
                        hop_length=hop_length, n_mels=n_mels,
                    )
                    # ref=np.max scales each segment relative to its own peak.
                    segments.append(librosa.power_to_db(mel_spectrogram, ref=np.max))
            except Exception as e:
                # Best effort: one unreadable or corrupt file must not abort
                # the whole batch. Report it and move on to the next file.
                print(f"Error processing file: {file_name} - {e}")
                continue

    h5_arr = np.array(segments)
    print(h5_arr.shape)

    # Open the output only once all data is ready, so a failure during
    # processing does not truncate a previously written file.
    with h5py.File(dest_dir, 'w') as hf:
        hf.create_dataset("audio", data=h5_arr)


In [None]:
## windows
# Build one HDF5 spectrogram dataset per artist from folders located relative
# to the parent of the current working directory.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
audio_to_spectrogram(os.path.join(parent_dir, 'code', 'iu'), 'iu3.h5')
audio_to_spectrogram(os.path.join(parent_dir, 'bruno_mars'), 'bruno3.h5')

# audio_to_spectrogram does not assign the example_* globals, so read the first
# stored spectrogram back from the file that was just written; otherwise the
# plot below (and the reconstruction cells) would receive None.
with h5py.File('bruno3.h5', 'r') as hf:
    stored_spectrograms = hf['audio']
    if stored_spectrograms.shape[0] == 0:
        raise RuntimeError("bruno3.h5 contains no spectrograms; check the source directory")
    example_log_spec = stored_spectrograms[0]
example_sr = 48000  # audio_to_spectrogram loads audio at 48 kHz by default

plot_mfcc_spectrogram(example_log_spec, example_sr)

In [None]:
# Show the dimensions of the example spectrogram before inverting it.
print(example_log_spec.shape)

# Convert dB values back to power, then invert the mel spectrogram to audio
# using the same STFT settings that were used to create it.
waveform = librosa.feature.inverse.mel_to_audio(
    librosa.db_to_power(example_log_spec),
    sr=example_sr,
    n_fft=1024,
    hop_length=256,
)

# Write the reconstructed waveform to a .wav file with soundfile.
output_path = 'reconstructed_audio.wav'
sf.write(file=output_path, data=waveform, samplerate=example_sr)

In [70]:
def spectrogram_to_wav(spectrogram, sr=None, output_path='reconstructed_audio.wav'):
    """Invert a log-mel spectrogram to a waveform, save it as a .wav file, and return it.

    Args:
        spectrogram: Log-mel spectrogram in dB (as produced with
            ``librosa.power_to_db``) to reconstruct.
        sr: Sampling rate in Hz. When omitted, the notebook-level
            ``example_sr`` global is used.
        output_path: Destination .wav path. Defaults to
            'reconstructed_audio.wav'.

    Returns:
        The reconstructed waveform returned by
        ``librosa.feature.inverse.mel_to_audio``.

    Raises:
        ValueError: If ``spectrogram`` is ``None`` or no sampling rate is
            available.
    """
    if spectrogram is None:
        raise ValueError("spectrogram must not be None")
    if sr is None:
        # Fall back to the notebook-level example sampling rate if it is set.
        sr = globals().get('example_sr')
    if sr is None:
        raise ValueError("sr was not given and example_sr is not set")

    # Use the spectrogram that was passed in (not the example global), with
    # the same STFT settings the notebook uses to build its spectrograms.
    waveform = librosa.feature.inverse.mel_to_audio(
        librosa.db_to_power(spectrogram),
        sr=sr,
        n_fft=1024,
        hop_length=256,
    )

    # Save the reconstructed waveform as a .wav file
    sf.write(output_path, waveform, sr)
    return waveform

In [None]:
# For debugging: draw a random matrix as if it were a spectrogram.
num_frames = 100
num_bins = 128
spectrogram = np.random.rand(num_bins, num_frames)  # Replace this with your actual spectrogram data

# Render the matrix with the lowest bin at the bottom of the image.
fig, ax = plt.subplots(figsize=(10, 6))
image = ax.imshow(spectrogram, aspect='auto', origin='lower', cmap='jet')

# Axis labels and title
ax.set_xlabel('Frame')
ax.set_ylabel('Frequency Bin')
ax.set_title('Dummy Spectrogram')

# Colour bar for the plotted image
fig.colorbar(image, ax=ax, format='%+2.0f dB')

plt.show()