# Utilities

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Utility functions
import os
import shutil
import zipfile
!pip install pydub
from pydub import AudioSegment
import math
import soundfile as sf

# Utility function to create zip folders
def zip_folder(folder_path, output_zip_path):
    """Zips the contents of a folder into a zip file.

    Args:
        folder_path: The path to the folder to be zipped.
        output_zip_path: The path to the output zip file.
    """

    output_dir = os.path.dirname(output_zip_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with zipfile.ZipFile(output_zip_path, "w", zipfile.ZIP_DEFLATED) as zip_file:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zip_file.write(file_path)

# Utility function to delete directories
def delete_directory(directory):
    """Deletes a directory and all its contents.

    Args:
        directory (str): The path to the directory to be deleted.
    """
    try:
        shutil.rmtree(directory)
        print(f"Directory '{directory}' and all its contents have been successfully deleted.")
    except Exception as e:
        print(f"An error occurred while deleting the directory '{directory}': {e}")

# function to keep only 1st level of subdirs inside a given dir and copy all the files from sub_subdirs in the subdirs
def merge_subdirectories(main_path):
    # Iterate through all subdirectories of the main path
    for root, dirs, files in os.walk(main_path):
        for subdir in dirs:
            subdir_path = os.path.join(root, subdir)
            # Check if the subdirectory has a subdirectory itself
            sub_subdirs = [d for d in os.listdir(subdir_path) if os.path.isdir(os.path.join(subdir_path, d))]
            if sub_subdirs:
                for sub_subdir in sub_subdirs:
                  # Iterate through the contents of the sub-subdirectory
                  print("subdir : " + subdir_path + ", sub subdirs : " + str(sub_subdirs))
                  sub_subdir = os.path.join(subdir_path, sub_subdir)
                  for item in os.listdir(sub_subdir):
                      item_path = os.path.join(sub_subdir, item)
                      # Copy each item to the subdirectory of the main directory
                      destination_path = os.path.join(subdir_path, item)
                      print("moving item : " + item_path + " to " + destination_path)
                      shutil.move(item_path, destination_path)
                  # Delete the sub-subdirectory
                  os.rmdir(sub_subdir)

# function to convert mp3 file to wav and halving the sampling rate
def convert_mp3_to_wav(input_folder, output_folder):
    # Iterate through all subdirectories of the input folder
    for root, dirs, files in os.walk(input_folder):
        # Create corresponding subdirectories in the output folder
        relative_path = os.path.relpath(root, input_folder)
        output_subdir = os.path.join(output_folder, relative_path)
        os.makedirs(output_subdir, exist_ok=True)

        # Iterate through all files in the subdirectory
        for file in files:
            if file.endswith('.mp3'):
                # Load MP3 file
                mp3_path = os.path.join(root, file)
                sound = AudioSegment.from_mp3(mp3_path)

                # Resample to 22.05 kHz
                sound = sound.set_frame_rate(22050)

                # Save as WAV file
                wav_path = os.path.join(output_subdir, os.path.splitext(file)[0] + '.wav')
                sound.export(wav_path, format="wav")

# split a wav file in 10s pieces
def split_wav_file(input_file, output_folder):
    # Read the input WAV file
    data, samplerate = sf.read(input_file)
    total_samples = len(data)

    # Calculate the duration of the audio in seconds
    duration_seconds = total_samples / samplerate

    # Calculate the number of samples for each 10-second segment
    segment_samples = samplerate * 10

    # Calculate the total number of segments required
    total_segments = math.floor(duration_seconds / 10)

    # Iterate over the segments and save each as a separate WAV file
    for i in range(total_segments):
        start_sample = i * segment_samples
        end_sample = min((i + 1) * segment_samples, total_samples)
        output_file = os.path.join(output_folder, f"{os.path.splitext(os.path.basename(input_file))[0]}-{i}.wav")
        sf.write(output_file, data[start_sample:end_sample], samplerate)


# function to split every wav file inside a directory and to create a new directory with all the splitted files
def process_folder(folder_path, output_path):
    # Iterate through all files and subfolders in the given folder
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # Check if the file is a WAV file
            if file.endswith('.wav') and 'jazz.00054' not in file: # jazz.00054 is corrupted so ignore it
                input_file = os.path.join(root, file)
                # Create the corresponding output folder structure
                relative_path = os.path.relpath(root, folder_path)
                output_folder = os.path.join(output_path, relative_path)
                os.makedirs(output_folder, exist_ok=True)
                # Split the WAV file and save the segments
                split_wav_file(input_file, output_folder)



# Preprocess GTZAN

In [9]:
!unzip /content/drive/MyDrive/Università/NAML_project/datasets/gtzan.zip

Archive:  /content/drive/MyDrive/Università/NAML_project/datasets/gtzan.zip
   creating: genres_original/
   creating: genres_original/blues/
  inflating: genres_original/blues/blues.00000.wav  
  inflating: genres_original/blues/blues.00001.wav  
  inflating: genres_original/blues/blues.00002.wav  
  inflating: genres_original/blues/blues.00003.wav  
  inflating: genres_original/blues/blues.00004.wav  
  inflating: genres_original/blues/blues.00005.wav  
  inflating: genres_original/blues/blues.00006.wav  
  inflating: genres_original/blues/blues.00007.wav  
  inflating: genres_original/blues/blues.00008.wav  
  inflating: genres_original/blues/blues.00009.wav  
  inflating: genres_original/blues/blues.00010.wav  
  inflating: genres_original/blues/blues.00011.wav  
  inflating: genres_original/blues/blues.00012.wav  
  inflating: genres_original/blues/blues.00013.wav  
  inflating: genres_original/blues/blues.00014.wav  
  inflating: genres_original/blues/blues.00015.wav  
  inflati

In [10]:
input_folder = "/content/genres_original"
output_folder = "/content/gtzan"

process_folder(input_folder, output_folder)

In [11]:
zip_path = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/gtzan.zip'

zip_folder(output_folder, zip_path)

# Preprocess HOMBURG

In [4]:
!unzip /content/drive/MyDrive/Università/NAML_project/datasets/HOMBURG.zip

Archive:  /content/drive/MyDrive/Università/NAML_project/datasets/HOMBURG.zip
   creating: HOMBURG/
   creating: HOMBURG/alternative/
  inflating: HOMBURG/alternative/50_Minutes-Colours.mp3  
  inflating: HOMBURG/alternative/50_Minutes-Nightwalk.mp3  
  inflating: HOMBURG/alternative/50_Minutes-The_Great_Wall.mp3  
  inflating: HOMBURG/alternative/Abigail_Lapell-Dress_Rehearsal.mp3  
  inflating: HOMBURG/alternative/Albino_Catfish-Ice_Cream.mp3  
  inflating: HOMBURG/alternative/Alex_Niedt-But_I_Love_You___.mp3  
  inflating: HOMBURG/alternative/Anders_Tengdahl-I_came_here_to_see_you.mp3  
  inflating: HOMBURG/alternative/Anders_Tengdahl-When_You_Cry.mp3  
  inflating: HOMBURG/alternative/Back_Bone_Shiver-Give_It_A_Name.mp3  
  inflating: HOMBURG/alternative/Back_Bone_Shiver-Insect.mp3  
  inflating: HOMBURG/alternative/Barton_Gill-Reach_Out__acoustic_.mp3  
  inflating: HOMBURG/alternative/bluestarbaby-all_i_need.mp3  
  inflating: HOMBURG/alternative/Bondo-Just_Wanna_Get_Horny.mp3  

In [12]:
main_directory = "/content/HOMBURG"
merge_subdirectories(main_directory)

In [13]:
input_directory = "/content/HOMBURG"
output_directory = "/content/homburg"

convert_mp3_to_wav(input_directory, output_directory)

In [14]:
zip_path = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/homburg.zip'

zip_folder(output_directory, zip_path)

# ISMIR2004 preprocessing

In [15]:
# directory_to_delete = "/content/training"
# delete_directory(directory_to_delete)


!unzip /content/drive/MyDrive/Università/NAML_project/datasets/ismir2004_training.zip

Archive:  /content/drive/MyDrive/Università/NAML_project/datasets/ismir2004_training.zip
   creating: training/
   creating: training/classical/
   creating: training/classical/american_baroque/
   creating: training/classical/american_baroque/dances_and_suites_of_rameau_an/
  inflating: training/classical/american_baroque/dances_and_suites_of_rameau_an/16-air_suite_from_les_fetes_d_he.mp3  
  inflating: training/classical/american_baroque/dances_and_suites_of_rameau_an/21-les_fauvetes_plaintives_xivem.mp3  
  inflating: training/classical/american_baroque/dances_and_suites_of_rameau_an/26-l'oracle_suite_in_d_from_les.mp3  
  inflating: training/classical/american_baroque/dances_and_suites_of_rameau_an/27-pour_le_genie_de_mars_suite_i.mp3  
  inflating: training/classical/american_baroque/dances_and_suites_of_rameau_an/28-la_victoire_suite_in_d_from_l.mp3  
  inflating: training/classical/american_baroque/dances_and_suites_of_rameau_an/30-air_vif_de_lacdemoniens_suite.mp3  
  inflatin

In [21]:
input_directory = "/content/training"
merge_subdirectories(input_directory)

subdir : /content/training/world, sub subdirs : ['.ipynb_checkpoints']
subdir : /content/training/pop, sub subdirs : ['erase', 'nocturne']
moving item : /content/training/pop/erase/6-order.mp3 to /content/training/pop/6-order.mp3
moving item : /content/training/pop/erase/9-tied.mp3 to /content/training/pop/9-tied.mp3
moving item : /content/training/pop/erase/7-dawn.mp3 to /content/training/pop/7-dawn.mp3
subdir : /content/training/pop, sub subdirs : ['erase', 'nocturne']
moving item : /content/training/pop/nocturne/2-artificial.mp3 to /content/training/pop/2-artificial.mp3
moving item : /content/training/pop/nocturne/1-nocturne.mp3 to /content/training/pop/1-nocturne.mp3
moving item : /content/training/pop/nocturne/3-every_time.mp3 to /content/training/pop/3-every_time.mp3
subdir : /content/training/punk, sub subdirs : ['sick_songs', 'listen_up_baby', 'conquers_the_world!', 'party_patrol', 'the_time_is_now!', 'demo-lition_2004']
moving item : /content/training/punk/sick_songs/5-born_wi

In [22]:
input_folder = "/content/training"
output_folder = "/content/ismir2004_wav"

convert_mp3_to_wav(input_folder, output_folder)

input_folder = output_folder
output_folder = "/content/ismir2004"

process_folder(input_folder, output_folder)

In [23]:
zip_path = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/ismir2004.zip'

zip_folder(output_folder, zip_path)

# Clear memory after saving processed zipped dataset to drive

In [None]:
directory_to_delete = "/content/training"
delete_directory(directory_to_delete)

directory_to_delete = "/content/HOMBURG"
delete_directory(directory_to_delete)

directory_to_delete = "/content/HOMBURG_wav"
delete_directory(directory_to_delete)

directory_to_delete = "/content/genres_original"
delete_directory(directory_to_delete)

directory_to_delete = "/content/gtzan"
delete_directory(directory_to_delete)

directory_to_delete = "/content/ismir2004_wav"
delete_directory(directory_to_delete)

directory_to_delete = "/content/ismir2004"
delete_directory(directory_to_delete)

An error occurred while deleting the directory '/content/training': [Errno 2] No such file or directory: '/content/training'
An error occurred while deleting the directory '/content/HOMBURG': [Errno 2] No such file or directory: '/content/HOMBURG'
An error occurred while deleting the directory '/content/HOMBURG_wav': [Errno 2] No such file or directory: '/content/HOMBURG_wav'
An error occurred while deleting the directory '/content/genres_original': [Errno 2] No such file or directory: '/content/genres_original'
An error occurred while deleting the directory '/content/gtzan_10s': [Errno 2] No such file or directory: '/content/gtzan_10s'
An error occurred while deleting the directory '/content/ismir2004_wav': [Errno 2] No such file or directory: '/content/ismir2004_wav'
Directory '/content/ismir2004_wav_10s' and all its contents have been successfully deleted.
