In [2]:
import os
import sys
import pandas as pd

Keep only audio in 'audio' folder

In [5]:
# write code to access the raw/audio folder, move the json files to a new folder 'Json' and delete the jpg files
def move_json_files():
    # get the current working directory
    cwd = os.getcwd()
    # get the path to the raw/audio folder
    path = os.path.join(cwd, 'raw', 'audio')
    # get the list of files in the raw/audio folder
    files = os.listdir(path)
    # create a new folder 'Json' in the raw/audio folder
    new_folder = os.path.join(path, 'Json')
    os.makedirs(new_folder, exist_ok=True)
    # iterate through the files in the raw/audio folder
    for file in files:
        # check if the file is a json file
        if file.endswith('.json'):
            # move the json file to the new folder 'Json'
            os.rename(os.path.join(path, file), os.path.join(new_folder, file))
        # check if the file is a jpg file
        if file.endswith('.jpg'):
            # delete the jpg file
            os.remove(os.path.join(path, file))

 

In [6]:
# run move_json_files function
#move_json_files()

Transoform audio to spectograms

In [None]:
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
#import traceback  # Import for detailed error handling

def mp3_to_spectrogram(mp3_file, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    try:
        # Load the MP3 file using librosa
        y, sr = librosa.load(mp3_file, sr=None) # y signal, sr sample rate

        # Check if the file is empty or too short
        if len(y) == 0:
            print(f"File {mp3_file} is empty or too short.")
            return

        print(f"Loaded {mp3_file}, Sample Rate: {sr}, Audio Shape: {y.shape}")
        
        # Compute the Short-Time Fourier Transform (STFT)
        stft = librosa.stft(y)
        specto = np.abs(stft)
        D = librosa.amplitude_to_db(specto, ref=np.max)
        
        # Normalize values to 0-255 range and convert to uint8
        #S = cv2.normalize(S, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
        # Convert to RGB and save as PNG
        #S = cv2.cvtColor(S, cv2.COLOR_GRAY2RGB)
        #cv2.imwrite(save_path, S)
        
        # Generate the spectrogram plot
        plt.figure(figsize=(10, 6))
        librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log', cmap='inferno')
        plt.colorbar(format='%+2.0f dB')
        plt.title(f'Spectrogram of {os.path.basename(mp3_file)}')
        
        # Save the spectrogram image
        output_image_path = os.path.join(output_folder, f"{os.path.splitext(os.path.basename(mp3_file))[0]}.png")
        plt.savefig(output_image_path)
        plt.close()
        
        print(f"Spectrogram saved to {output_image_path}")
        return output_image_path

    except Exception as e:
        # Print full stack trace to understand the exact error
        print(f"Error processing {mp3_file}:")
        #traceback.print_exc()  # This prints the full error traceback

def process_all_mp3_in_folder(folder_path, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Loop through all MP3 files in the folder
    for file in os.listdir(folder_path):
        if file.endswith(".mp3"):
            mp3_file = os.path.join(folder_path, file)
            print(f"Processing file: {mp3_file}")
            mp3_to_spectrogram(mp3_file, output_folder)




# Example usage for processing multiple MP3 files
folder_path = "raw/audio"  # Folder containing MP3 files
output_folder = "raw/spectrograms"  # Folder to save spectrogram images
process_all_mp3_in_folder(folder_path, output_folder)


SEGMENT SPECTOGRAM


Transform audio and segment spectograms


In [None]:
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from pydub import AudioSegment  # To save segments as MP3 files

def mp3_to_spectrogram(mp3_file, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    try:
        # Load the MP3 file using librosa
        y, sr = librosa.load(mp3_file, sr=None)  # y: signal, sr: sample rate

        # Check if the file is empty or too short
        if len(y) == 0:
            print(f"File {mp3_file} is empty or too short.")
            return

        print(f"Loaded {mp3_file}, Sample Rate: {sr}, Audio Shape: {y.shape}")
        
        # Compute the Short-Time Fourier Transform (STFT)
        stft = librosa.stft(y)
        specto = np.abs(stft)
        D = librosa.amplitude_to_db(specto, ref=np.max)
        
        # Calculate duration of the audio in seconds
        duration = librosa.get_duration(y=y, sr=sr)
        
        # Split the spectrogram into 3 equal segments
        split_duration = duration / 3  # Length of each segment in seconds

        for i in range(3):
            start_time = i * split_duration
            end_time = (i + 1) * split_duration

            # Convert start and end time to frames
            start_frame = librosa.time_to_frames(start_time, sr=sr)
            end_frame = librosa.time_to_frames(end_time, sr=sr)

            # Extract the spectrogram segment
            D_segment = D[:, start_frame:end_frame]

            # Plot and save the spectrogram segment
            plt.figure(figsize=(10, 6))
            librosa.display.specshow(D_segment, sr=sr, x_axis='time', y_axis='log', cmap='inferno')
            plt.colorbar(format='%+2.0f dB')
            plt.title(f'Spectrogram Segment {i + 1} of {os.path.basename(mp3_file)}')
            
            # Save the spectrogram image for this segment
            output_image_path = os.path.join(output_folder, f"{os.path.splitext(os.path.basename(mp3_file))[0]}_segment_{i+1}.png")
            plt.savefig(output_image_path)
            plt.close()

            print(f"Spectrogram Segment {i+1} saved to {output_image_path}")

            # Save the corresponding audio segment as MP3
            start_ms = int(start_time * 1000)  # Convert start time to milliseconds
            end_ms = int(end_time * 1000)      # Convert end time to milliseconds
            audio_segment = AudioSegment.from_file(mp3_file)[start_ms:end_ms]

            output_audio_path = os.path.join(output_folder, f"{os.path.splitext(os.path.basename(mp3_file))[0]}_segment_{i+1}.mp3")
            audio_segment.export(output_audio_path, format="mp3")
            print(f"Audio Segment {i+1} saved to {output_audio_path}")

    except Exception as e:
        print(f"Error processing {mp3_file}: {e}")

def process_all_mp3_in_folder(folder_path, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Loop through all MP3 files in the folder
    for file in os.listdir(folder_path):
        if file.endswith(".mp3"):
            mp3_file = os.path.join(folder_path, file)
            print(f"Processing file: {mp3_file}")
            mp3_to_spectrogram(mp3_file, output_folder)

# Example usage for processing multiple MP3 files
folder_path = "raw/audio"  # Folder containing MP3 files
output_folder = "raw/spectograms_segmented"  # Folder to save spectrogram images and segments
process_all_mp3_in_folder(folder_path, output_folder)


Transform audio and segment spectograms
(without legends and mp3 files)

In [None]:
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

def mp3_to_spectrogram(mp3_file, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    try:
        # Load the MP3 file using librosa
        y, sr = librosa.load(mp3_file, sr=None)  # y: signal, sr: sample rate

        # Check if the file is empty or too short
        if len(y) == 0:
            print(f"File {mp3_file} is empty or too short.")
            return

        print(f"Loaded {mp3_file}, Sample Rate: {sr}, Audio Shape: {y.shape}")
        
        # Compute the Short-Time Fourier Transform (STFT)
        stft = librosa.stft(y)
        specto = np.abs(stft)
        D = librosa.amplitude_to_db(specto, ref=np.max)
        
        # Calculate duration of the audio in seconds
        duration = librosa.get_duration(y=y, sr=sr)
        
        # Split the spectrogram into 3 equal segments
        split_duration = duration / 3  # Length of each segment in seconds

        for i in range(3):
            start_time = i * split_duration
            end_time = (i + 1) * split_duration

            # Convert start and end time to frames
            start_frame = librosa.time_to_frames(start_time, sr=sr)
            end_frame = librosa.time_to_frames(end_time, sr=sr)

            # Extract the spectrogram segment
            D_segment = D[:, start_frame:end_frame]

            # Plot and save the spectrogram segment without extra elements
            plt.figure(figsize=(10, 6))
            librosa.display.specshow(D_segment, sr=sr, cmap='inferno')
            plt.axis('off')  # Disable axis
            plt.savefig(os.path.join(output_folder, f"{os.path.splitext(os.path.basename(mp3_file))[0]}_segment_{i+1}.png"),
                        bbox_inches='tight', pad_inches=0)  # Save without padding
            plt.close()

            print(f"Spectrogram Segment {i+1} saved.")

    except Exception as e:
        print(f"Error processing {mp3_file}: {e}")

def process_all_mp3_in_folder(folder_path, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Loop through all MP3 files in the folder
    for file in os.listdir(folder_path):
        if file.endswith(".mp3"):
            mp3_file = os.path.join(folder_path, file)
            print(f"Processing file: {mp3_file}")
            mp3_to_spectrogram(mp3_file, output_folder)

# Example usage for processing multiple MP3 files
folder_path = "raw/audio"  # Folder containing MP3 files
output_folder = "raw/spectrograms_csegm"  # Folder to save spectrogram images and segments
process_all_mp3_in_folder(folder_path, output_folder)


Processing file: raw/audio\000CC8EParg64OmTxVnZ0p.mp3


  y, sr = librosa.load(mp3_file, sr=None)  # y: signal, sr: sample rate
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded raw/audio\000CC8EParg64OmTxVnZ0p.mp3, Sample Rate: 44100, Audio Shape: (7567360,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\000RDCYioLteXcutOjeweY.mp3


  y, sr = librosa.load(mp3_file, sr=None)  # y: signal, sr: sample rate
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded raw/audio\000RDCYioLteXcutOjeweY.mp3, Sample Rate: 44100, Audio Shape: (9453568,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\001pyq8FLNSL1C8orNLI0b.mp3
Loaded raw/audio\001pyq8FLNSL1C8orNLI0b.mp3, Sample Rate: 44100, Audio Shape: (9496576,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\001YQlnDSduXd5LgBd66gT.mp3
Loaded raw/audio\001YQlnDSduXd5LgBd66gT.mp3, Sample Rate: 44100, Audio Shape: (768,)
Error processing raw/audio\001YQlnDSduXd5LgBd66gT.mp3: index -1 is out of bounds for axis 0 with size 0
Processing file: raw/audio\003vvx7Niy0yvhvHt4a68B.mp3




Loaded raw/audio\003vvx7Niy0yvhvHt4a68B.mp3, Sample Rate: 44100, Audio Shape: (10037248,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\0068lzo1xXa9ED8ThypHU1.mp3
Loaded raw/audio\0068lzo1xXa9ED8ThypHU1.mp3, Sample Rate: 44100, Audio Shape: (6815744,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\006Bi4j0yzwOc3y69GOlYV.mp3
Loaded raw/audio\006Bi4j0yzwOc3y69GOlYV.mp3, Sample Rate: 44100, Audio Shape: (8522544,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\006rHBBNLJMpQs8fRC2GDe.mp3
Loaded raw/audio\006rHBBNLJMpQs8fRC2GDe.mp3, Sample Rate: 44100, Audio Shape: (11589632,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\006tmNZLXEXPqdb23wwSN1.mp3
Loaded raw/audio\006tmNZLXEXPqdb23wwSN1.mp3, Sample Rate: 44100, Audio Shape: 



Loaded raw/audio\00hTMcTeaaMtjBCV30yAm9.mp3, Sample Rate: 44100, Audio Shape: (11180848,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\00hTw7P6jPUio5Qgojw38w.mp3
Loaded raw/audio\00hTw7P6jPUio5Qgojw38w.mp3, Sample Rate: 44100, Audio Shape: (8847360,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\00I41xsW6SunZDJ5fB8KAd.mp3
Loaded raw/audio\00I41xsW6SunZDJ5fB8KAd.mp3, Sample Rate: 44100, Audio Shape: (7329792,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\00iwQgjIgoGO94BRBDyClo.mp3
Loaded raw/audio\00iwQgjIgoGO94BRBDyClo.mp3, Sample Rate: 44100, Audio Shape: (8607744,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\00j7NgyKytDYlBEpIv4QPQ.mp3
Loaded raw/audio\00j7NgyKytDYlBEpIv4QPQ.mp3, Sample Rate: 44100, Audio Shape: (



Loaded raw/audio\018zT8xThRbMH2QeUoMlKQ.mp3, Sample Rate: 44100, Audio Shape: (12660736,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\01AE1MgL6L86WQzh4KGDUa.mp3
Loaded raw/audio\01AE1MgL6L86WQzh4KGDUa.mp3, Sample Rate: 44100, Audio Shape: (11898880,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\01AwohzqShtjlhyjYoWKHM.mp3
Loaded raw/audio\01AwohzqShtjlhyjYoWKHM.mp3, Sample Rate: 44100, Audio Shape: (9299968,)
Spectrogram Segment 1 saved.
Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\01Bjn3lj74LedUImE5ceFe.mp3
Loaded raw/audio\01Bjn3lj74LedUImE5ceFe.mp3, Sample Rate: 48000, Audio Shape: (768,)
Error processing raw/audio\01Bjn3lj74LedUImE5ceFe.mp3: index -1 is out of bounds for axis 0 with size 0
Processing file: raw/audio\01bMpqmvH031R417l3AQTA.mp3
Loaded raw/audio\01bMpqmvH031R417l3AQTA.mp3, Sample Rate: 44100, 

  plt.figure(figsize=(10, 6))


Loaded raw/audio\01WbKbstEeqV9OY2JMFQe4.mp3, Sample Rate: 44100, Audio Shape: (9019392,)
Error processing raw/audio\01WbKbstEeqV9OY2JMFQe4.mp3: Unable to allocate 68.9 MiB for an array with shape (1025, 17617) and data type float32
Processing file: raw/audio\01XbMBVX5A3DIsGHuUsBSe.mp3
Loaded raw/audio\01XbMBVX5A3DIsGHuUsBSe.mp3, Sample Rate: 44100, Audio Shape: (7471104,)
Error processing raw/audio\01XbMBVX5A3DIsGHuUsBSe.mp3: Unable to allocate 57.1 MiB for an array with shape (1025, 14593) and data type float32
Processing file: raw/audio\01xdiGuSuIf5qZulm1uqkY.mp3
Loaded raw/audio\01xdiGuSuIf5qZulm1uqkY.mp3, Sample Rate: 44100, Audio Shape: (1920,)
Spectrogram Segment 1 saved.




Spectrogram Segment 2 saved.
Spectrogram Segment 3 saved.
Processing file: raw/audio\01XFgRZfZI7oBagNf1Loml.mp3
Loaded raw/audio\01XFgRZfZI7oBagNf1Loml.mp3, Sample Rate: 44100, Audio Shape: (8609792,)
Error processing raw/audio\01XFgRZfZI7oBagNf1Loml.mp3: Unable to allocate 65.8 MiB for an array with shape (1025, 16817) and data type float32
Processing file: raw/audio\01XyRFxUKPLVOWglamSV2K.mp3
Loaded raw/audio\01XyRFxUKPLVOWglamSV2K.mp3, Sample Rate: 44100, Audio Shape: (12187648,)
Error processing raw/audio\01XyRFxUKPLVOWglamSV2K.mp3: Unable to allocate 186. MiB for an array with shape (1025, 23805) and data type complex64
Processing file: raw/audio\01Y6Ty0YJVGXWiJOK4zi0q.mp3
Loaded raw/audio\01Y6Ty0YJVGXWiJOK4zi0q.mp3, Sample Rate: 44100, Audio Shape: (10768384,)
Error processing raw/audio\01Y6Ty0YJVGXWiJOK4zi0q.mp3: Unable to allocate 82.2 MiB for an array with shape (1025, 21033) and data type float32
Processing file: raw/audio\01yDjImCcOaHDT4uy6VOPw.mp3
Loaded raw/audio\01yDjImCc

Transform spectogram to audio

In [None]:
import numpy as np
import librosa
import librosa.display
from pydub import AudioSegment

# Function to convert spectrogram to audio and save it as MP3
def spectrogram_to_mp3(spec, sr, filename="output.mp3"):
    # Step 1: Invert the spectrogram to a time-domain audio signal
    # If the spectrogram is in dB, first convert back to amplitude
    if np.max(spec) < 0:  # Checking if it's in dB scale
        spec = librosa.db_to_amplitude(spec)
    
    # If the spectrogram is a magnitude spectrogram, use librosa's inverse STFT
    audio_signal = librosa.istft(spec)
    
    # Step 2: Save the audio signal as a WAV file
    librosa.output.write_wav("temp.wav", audio_signal, sr)
    
    # Step 3: Convert WAV to MP3 using pydub
    sound = AudioSegment.from_wav("temp.wav")
    sound.export(filename, format="mp3")
    
    print(f"Saved MP3 as {filename}")

# Example usage
if __name__ == "__main__":
    # Load an example audio file to create a spectrogram
    y, sr = librosa.load(librosa.example('trumpet'))
    
    # Generate a spectrogram from the audio signal
    spec = librosa.stft(y)
    spec_db = librosa.amplitude_to_db(np.abs(spec), ref=np.max)
    
    # Convert the spectrogram back to MP3
    spectrogram_to_mp3(spec_db, sr, filename="output.mp3")


CHECK FROM SPECTOGRAM TO MP3

In [18]:
image_path = "raw/spectrograms/0A0RBBTrgfq9eClnw6ZXT7.png"


In [None]:
import numpy as np
import librosa
from PIL import Image
from pydub import AudioSegment
import soundfile as sf  # For saving audio files

def image_to_spectrogram(image_path):
    """
    Convert a spectrogram image to a numerical array representing the spectrogram.
    Assumes the spectrogram is grayscale, where pixel intensity represents dB values.
    """
    # Load the image
    img = Image.open(image_path).convert("L")  # Convert to grayscale ('L' mode)
    
    # Convert image to numpy array
    img_array = np.array(img)
    
    # Normalize the image to [0, 1] range (0 is minimum, 255 is maximum)
    img_array = img_array / 255.0

    # Convert the image intensities to dB scale (assuming they represent dB)
    # Rescale from [0, 1] to [-80 dB, 0 dB], as typical dB range for spectrograms is -80 to 0
    img_db = img_array * -80
    
    return img_db

def spectrogram_to_audio(spectrogram_db, sr=22050, hop_length=512):
    """
    Invert a dB-scaled spectrogram back to an audio waveform.
    """
    # Convert dB to amplitude
    spectrogram_amplitude = librosa.db_to_amplitude(spectrogram_db)
    
    # Perform the inverse Short-Time Fourier Transform (ISTFT) to recover the audio signal
    audio_signal = librosa.griffinlim(spectrogram_amplitude, hop_length=hop_length)
    
    return audio_signal

def save_as_mp3(audio_signal, sr, filename="output.mp3"):
    """
    Save the audio signal as an MP3 file using pydub.
    """
    # Save the audio signal as a temporary WAV file using soundfile
    sf.write("temp.wav", audio_signal, sr)  # Use soundfile to save as WAV
    
    # Convert the WAV file to MP3 using pydub
    sound = AudioSegment.from_wav("temp.wav")
    sound.export(filename, format="mp3")
    print(f"Saved MP3 as {filename}")

# Example usage
if __name__ == "__main__":
    # Path to the spectrogram image (PNG)
    image_path = "raw/spectrograms/0A0RBBTrgfq9eClnw6ZXT7.png"
    
    # Step 1: Convert the image to a spectrogram
    spectrogram_db = image_to_spectrogram(image_path)
    
    # Step 2: Convert the spectrogram to an audio signal
    audio_signal = spectrogram_to_audio(spectrogram_db, sr=22050)
    
    # Step 3: Save the audio signal as MP3
    save_as_mp3(audio_signal, sr=22050, filename="output.mp3")


In [1]:
import torch
torch.cuda.empty_cache()


audio shape ???
sample rate ???

Download everything in wav? or find a solution in another way ?