In [55]:
import os
import librosa
import matplotlib.pyplot as plt
import numpy as np
from pydub import AudioSegment
import h5py
import torch

In [4]:
# Module-level placeholders, both initially unset.
# NOTE(review): presumably meant to hold one example spectrogram and its
# sampling rate for plotting — nothing in this cell assigns them; confirm.
example_log_spec = example_sr = None

def plot_mfcc_spectrogram(log_mel_spectrogram, sr, hop_length=256):
    """Display a log-mel spectrogram with time and mel-frequency axes.

    Args:
        log_mel_spectrogram: 2-D array handed straight to
            ``librosa.display.specshow`` (presumably shaped
            ``(n_mels, n_frames)`` — TODO confirm against callers).
        sr: Sampling rate of the audio the spectrogram was computed from.
        hop_length: Hop length (in samples) that was used to compute the
            spectrogram. It has to match the analysis hop, otherwise the time
            axis is mislabeled. Defaults to 256, the value this function
            previously hard-coded; spectrograms produced with a different hop
            (e.g. 512) must pass it explicitly.

    Returns:
        None. The figure is shown with ``plt.show()`` and the spectrogram
        shape is printed as a side effect.
    """
    # `librosa.display` is a submodule; older librosa releases reportedly do
    # not load it on a bare `import librosa`, so import it explicitly here.
    import librosa.display

    plt.figure(figsize=(10, 5))
    librosa.display.specshow(
        log_mel_spectrogram,
        sr=sr,
        hop_length=hop_length,
        x_axis='time',
        y_axis='mel',
    )
    print(log_mel_spectrogram.shape)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Log Mel Spectrogram')
    plt.show()

def audio_to_spectrogram(src_dir, dest_dir):
    """Convert every .flac/.mp3 under ``src_dir`` to a log-mel spectrogram and
    store the results in a new HDF5 file.

    Each audio file is loaded at 22050 Hz and turned into an 80-band mel
    spectrogram (``n_fft=4096``, ``hop_length=512``) converted to dB relative
    to its own maximum. Clips shorter than 64 frames are dropped, because a
    training sample consists of 64 randomly cropped frames.

    Layout of the written file:
        ``audio/<index>``  one dataset per kept clip (clips have different
                           lengths, so they cannot share a single rectangular
                           dataset); the attribute ``source`` holds the path
                           of the originating audio file.
        ``mel_mean``       per-mel-band mean over all kept frames, shape (80, 1).
        ``mel_std``        per-mel-band standard deviation (+1e-9), shape (80, 1).
    A clip is normalised at load time with ``(mel - mel_mean) / mel_std``.

    Args:
        src_dir: Directory that is walked recursively for audio files.
        dest_dir: Path of the HDF5 file to create. Despite the name this is a
            file path, not a directory. An existing file is overwritten, but
            only after all spectrograms were computed successfully.

    Raises:
        ValueError: If no audio file under ``src_dir`` yields a spectrogram of
            at least 64 frames.
    """
    min_frames = 64  # training sample consists of 64 randomly cropped frames

    log_mels = []
    sources = []
    print(src_dir)
    for root, _, files in os.walk(src_dir, topdown=False):
        for file_name in files:
            if not file_name.endswith(('.flac', '.mp3')):
                continue
            input_file = os.path.join(root, file_name)
            try:
                audio, sr = librosa.load(input_file, sr=22050)
                # A larger n_fft gives finer frequency resolution; a smaller
                # hop_length gives finer time resolution.
                mel_spectrogram = librosa.feature.melspectrogram(
                    y=audio, sr=sr, n_fft=4096, hop_length=512, n_mels=80
                )
                log_mel = librosa.power_to_db(mel_spectrogram, ref=np.max)
            except AttributeError as e:
                print(f"Error processing file: {file_name} - {e}")
                continue
            if log_mel.shape[-1] >= min_frames:
                log_mels.append(log_mel)
                sources.append(input_file)

    if not log_mels:
        raise ValueError(
            f"No .flac/.mp3 file under {src_dir!r} produced a spectrogram "
            f"with at least {min_frames} frames"
        )

    # Statistics are taken per mel band across the frames of all kept clips.
    mel_concatenated = np.concatenate(log_mels, axis=1)
    mel_mean = np.mean(mel_concatenated, axis=1, keepdims=True)
    mel_std = np.std(mel_concatenated, axis=1, keepdims=True) + 1e-9

    # Open the output only now, and write while the handle is still open: the
    # previous version called create_dataset after the `with` block had closed
    # the file, and tried to store the ragged list as one dataset.
    with h5py.File(dest_dir, 'w') as hf:
        audio_group = hf.create_group("audio")
        for index, (log_mel, source) in enumerate(zip(log_mels, sources)):
            dataset = audio_group.create_dataset(f"{index:06d}", data=log_mel)
            dataset.attrs["source"] = source
        hf.create_dataset("mel_mean", data=mel_mean)
        hf.create_dataset("mel_std", data=mel_std)

In [None]:
# Build one spectrogram file per singer directory, in the listed order.
for source_dir, output_path in (
    ('../iu', 'iu_test.h5'),
    ('../bruno_mars', 'bruno_test.h5'),
):
    audio_to_spectrogram(source_dir, output_path)

In [9]:
# Fixed sampling rate, in Hz, for audio handled in this cell.
SAMPLING_RATE = 22_050



def normalize_mel(src_dir, dest_dir):
    """Compute MelGAN-style mel spectrograms for every .flac/.mp3 under
    ``src_dir`` and normalise them per mel band.

    Each file is loaded as mono audio at ``SAMPLING_RATE`` and passed through
    the vocoder object returned by
    ``torch.hub.load("descriptinc/melgan-neurips", 'load_melgan')``.
    NOTE(review): calling that object on a ``(batch, samples)`` tensor is
    assumed to return the log-mel spectrogram shaped
    ``(batch, n_mels, frames)`` — confirm against the melgan-neurips sources.
    Clips shorter than 64 frames are dropped, because a training sample
    consists of 64 randomly cropped frames.

    The results are also written to a new HDF5 file:
        ``mel_normalized/<index>``  one dataset per kept clip.
        ``mel_mean`` / ``mel_std``  the statistics used for normalisation.

    Args:
        src_dir: Directory that is walked recursively for audio files.
        dest_dir: Path of the HDF5 file to create (a file path, despite the
            name). It is only opened after processing succeeded.

    Returns:
        A tuple ``(mel_normalized, mel_mean, mel_std)``: the list of
        normalised spectrograms, and the per-band mean and standard deviation
        (each with ``keepdims=True``, std offset by 1e-9).

    Raises:
        ValueError: If ``src_dir`` contains no .flac/.mp3 file, or none of
            them yields a spectrogram of at least 64 frames.
    """
    min_frames = 64  # training sample consists of 64 randomly cropped frames

    audio_paths = [
        os.path.join(root, file_name)
        for root, _, files in os.walk(src_dir, topdown=False)
        for file_name in files
        if file_name.endswith(('.flac', '.mp3'))
    ]
    if not audio_paths:
        raise ValueError(f"No .flac or .mp3 files found under {src_dir!r}")

    # Load the model only once we know there is something to process; this
    # may download weights from the network.
    vocoder = torch.hub.load("descriptinc/melgan-neurips", 'load_melgan')

    # Accumulate across ALL files (the previous version re-created this list
    # for every file, keeping at most the last clip).
    mel_list = []
    for input_file in audio_paths:
        print(input_file)
        try:
            audio, _ = librosa.load(input_file, sr=SAMPLING_RATE, mono=True)
            with torch.no_grad():
                spec = vocoder(torch.from_numpy(audio).unsqueeze(0))
            # Tolerate an interface that returns (spectrogram, extra).
            if isinstance(spec, (tuple, list)):
                spec = spec[0]
            mel = spec.detach().cpu().numpy()
            if mel.ndim == 3:
                mel = mel[0]  # drop the batch dimension of size one
        except AttributeError as e:
            print(f"Error processing file: {os.path.basename(input_file)} - {e}")
            continue

        print(mel.shape)
        if mel.shape[-1] >= min_frames:
            mel_list.append(mel)

    if not mel_list:
        raise ValueError(
            f"No audio file under {src_dir!r} produced a spectrogram "
            f"with at least {min_frames} frames"
        )

    # Statistics are taken per mel band across the frames of all kept clips.
    mel_concatenated = np.concatenate(mel_list, axis=1)
    mel_mean = np.mean(mel_concatenated, axis=1, keepdims=True)
    mel_std = np.std(mel_concatenated, axis=1, keepdims=True) + 1e-9

    mel_normalized = [(mel - mel_mean) / mel_std for mel in mel_list]

    with h5py.File(dest_dir, 'w') as hf:
        normalized_group = hf.create_group("mel_normalized")
        for index, mel in enumerate(mel_normalized):
            normalized_group.create_dataset(f"{index:06d}", data=mel)
        hf.create_dataset("mel_mean", data=mel_mean)
        hf.create_dataset("mel_std", data=mel_std)

    return mel_normalized, mel_mean, mel_std

In [None]:
# Normalise the spectrograms of the '../iu' recordings; output file is 'aaaa'.
normalize_mel(src_dir='../iu', dest_dir='aaaa')