In [1]:
import os
import librosa
import numpy as np
import soundfile as sf
from PIL import Image

In [2]:
def scale_to_uint8(array):
    array = array.astype(float)
    if np.max(array) == np.min(array):
        return np.zeros(array.shape, dtype=np.uint8)
    scaled_array = (array - np.min(array)) / (np.max(array) - np.min(array))
    return (scaled_array * 255).astype(np.uint8)

def spect_create_224(file_path, output_folder):
    """
    Creates 512 square spectrograms, resizes them to an intermediate
    dimension, then crops the edges to reach the 224 final dimension.
    """

    # Parameters for Image Processing Pipeline
    HI_RES_DIM = 512
    INTERMEDIATE_DIM = 228
    CROP_PIXELS = 2 # pixels to crop from each side
    FINAL_DIM = INTERMEDIATE_DIM - (CROP_PIXELS * 2)

    # Parameters for Audio Processing
    HOP_LENGTH = 512
    N_FFT = 2048
    N_MELS = HI_RES_DIM
    CHUNK_DURATION_SEC = (HI_RES_DIM * HOP_LENGTH) / 16000.0
    OVERLAP_SEC = CHUNK_DURATION_SEC / 4
    HOP_SEC = CHUNK_DURATION_SEC - OVERLAP_SEC
    TOP_DB = None


    print("Starting spectrogram generation...")
    print(f" - Pipeline: {HI_RES_DIM}x{HI_RES_DIM} -> resize to {INTERMEDIATE_DIM}x{INTERMEDIATE_DIM} -> crop to {FINAL_DIM}x{FINAL_DIM}")

    os.makedirs(output_folder, exist_ok=True)

    try:
        with sf.SoundFile(file_path, 'r') as audio_file:
            sr = audio_file.samplerate
            if sr != 16000:
                print(f"Warning: Sample rate is {sr}Hz, but calculations assume 16000Hz.")

            chunk_size_frames = int(CHUNK_DURATION_SEC * sr)
            hop_size_frames = int(HOP_SEC * sr)
            total_frames = audio_file.frames

            total_chunks = int(np.ceil((total_frames - chunk_size_frames) / hop_size_frames)) + 1

            print(f"Audio detected at {sr} Hz. Total chunks to process: {total_chunks}")

            chunk_num = 0
            while True:
                start_frame = chunk_num * hop_size_frames
                if start_frame + chunk_size_frames > total_frames:
                    break

                audio_file.seek(start_frame)
                y_chunk = audio_file.read(chunk_size_frames)

                if len(y_chunk) == 0:
                    break

                if (chunk_num + 1) % 20 == 0 or chunk_num == 0:
                    print(f"Processing chunk {chunk_num + 1}/{total_chunks}...")

                peak_amplitude = np.max(np.abs(y_chunk))
                y_normalized = y_chunk / peak_amplitude if peak_amplitude > 0 else y_chunk

                S = librosa.feature.melspectrogram(
                    y=y_normalized, sr=sr, n_fft=N_FFT,
                    hop_length=HOP_LENGTH, n_mels=N_MELS
                )

                S_dB = librosa.power_to_db(S, ref=np.max, top_db=TOP_DB)
                img_array = scale_to_uint8(S_dB)
                img_array = 255 - img_array # invert colors
                img = Image.fromarray(img_array, 'L')

                # Resize the high-resolution image down to the intermediate dimension
                img_intermediate = img.resize((INTERMEDIATE_DIM, INTERMEDIATE_DIM), Image.Resampling.LANCZOS)

                # Crop the specified number of pixels from each side
                crop_box = (
                    CROP_PIXELS,
                    CROP_PIXELS,
                    INTERMEDIATE_DIM - CROP_PIXELS,
                    INTERMEDIATE_DIM - CROP_PIXELS
                )
                final_img = img_intermediate.crop(crop_box)

                output_filename = os.path.join(output_folder, f"spectrogram_{chunk_num:04d}.png")
                final_img.save(output_filename)

                chunk_num += 1

                if chunk_num == 100:
                    break

        print(f"\nProcessing complete. {chunk_num} spectrograms saved to '{output_folder}'.")

    except Exception as e:
        print(f"An error occurred: {e}")



In [3]:
if __name__ == '__main__':
    AUDIO_FILE = 'MARS-20230921T000000Z-16kHz.flac'
    OUTPUT_SPECTROGRAM_FOLDER = 'spect_test_224'
    spect_create_224(AUDIO_FILE, OUTPUT_SPECTROGRAM_FOLDER)

Starting spectrogram generation...
 - Pipeline: 512x512 -> resize to 228x228 -> crop to 224x224
Audio detected at 16000 Hz. Total chunks to process: 7031
Processing chunk 1/7031...
Processing chunk 20/7031...
Processing chunk 40/7031...
Processing chunk 60/7031...
Processing chunk 80/7031...
Processing chunk 100/7031...

Processing complete. 100 spectrograms saved to 'spect_test_224'.


# 512 script

In [4]:
def spect_create_512(file_path, output_folder):
    """
    Creates 516 square spectrograms then crops the edges to reach the 512 final dimension.
    """

    # Parameters for Image Processing Pipeline
    HI_RES_DIM = 516
    CROP_PIXELS = 2 # pixels to crop from each side
    FINAL_DIM = HI_RES_DIM - (CROP_PIXELS * 2)

    # Parameters for Audio Processing
    HOP_LENGTH = 516
    N_FFT = 2048
    N_MELS = HI_RES_DIM
    CHUNK_DURATION_SEC = (HI_RES_DIM * HOP_LENGTH) / 16000.0
    OVERLAP_SEC = CHUNK_DURATION_SEC / 4
    HOP_SEC = CHUNK_DURATION_SEC - OVERLAP_SEC
    TOP_DB = None

    print("Starting spectrogram generation...")
    print(f" - Pipeline: {HI_RES_DIM}x{HI_RES_DIM} -> crop to {FINAL_DIM}x{FINAL_DIM}")

    os.makedirs(output_folder, exist_ok=True)

    try:
        with sf.SoundFile(file_path, 'r') as audio_file:
            sr = audio_file.samplerate
            if sr != 16000:
                print(f"Warning: Sample rate is {sr}Hz, but calculations assume 16000Hz.")

            chunk_size_frames = int(CHUNK_DURATION_SEC * sr)
            hop_size_frames = int(HOP_SEC * sr)
            total_frames = audio_file.frames

            total_chunks = int(np.ceil((total_frames - chunk_size_frames) / hop_size_frames)) + 1

            print(f"Audio detected at {sr} Hz. Total chunks to process: {total_chunks}")

            chunk_num = 0
            while True:
                start_frame = chunk_num * hop_size_frames
                if start_frame + chunk_size_frames > total_frames:
                    break

                audio_file.seek(start_frame)
                y_chunk = audio_file.read(chunk_size_frames)

                if len(y_chunk) == 0:
                    break

                if (chunk_num + 1) % 20 == 0 or chunk_num == 0:
                    print(f"Processing chunk {chunk_num + 1}/{total_chunks}...")

                peak_amplitude = np.max(np.abs(y_chunk))
                y_normalized = y_chunk / peak_amplitude if peak_amplitude > 0 else y_chunk

                S = librosa.feature.melspectrogram(
                    y=y_normalized, sr=sr, n_fft=N_FFT,
                    hop_length=HOP_LENGTH, n_mels=N_MELS
                )

                S_dB = librosa.power_to_db(S, ref=np.max, top_db=TOP_DB)
                img_array = scale_to_uint8(S_dB)
                img_array = 255 - img_array
                img = Image.fromarray(img_array, 'L')

                # Crop the specified number of pixels from each side
                crop_box = (
                    CROP_PIXELS,
                    CROP_PIXELS,
                    HI_RES_DIM - CROP_PIXELS,
                    HI_RES_DIM - CROP_PIXELS
                )
                final_img = img.crop(crop_box)

                output_filename = os.path.join(output_folder, f"spectrogram_{chunk_num:04d}.png")
                final_img.save(output_filename)

                chunk_num += 1

                if chunk_num == 100:
                    break

        print(f"\nProcessing complete. {chunk_num} spectrograms saved to '{output_folder}'.")

    except Exception as e:
        print(f"An error occurred: {e}")



In [5]:
if __name__ == '__main__':
    AUDIO_FILE = 'MARS-20230921T000000Z-16kHz.flac'
    OUTPUT_SPECTROGRAM_FOLDER = 'spect_test_512'
    spect_create_512(AUDIO_FILE, OUTPUT_SPECTROGRAM_FOLDER)

Starting spectrogram generation...
 - Pipeline: 516x516 -> crop to 512x512
Audio detected at 16000 Hz. Total chunks to process: 6923
Processing chunk 1/6923...
Processing chunk 20/6923...
Processing chunk 40/6923...
Processing chunk 60/6923...
Processing chunk 80/6923...
Processing chunk 100/6923...

Processing complete. 100 spectrograms saved to 'spect_test_512'.
