### Outline of the steps to build the dataset using the AcousticBrainz API



1.   Gather list of subgenres.
2.   Query the MusicBrainz API for each subgenre to generate a Python dictionary for each recording, which includes info on the MBID, artist, title, and tags.
3.   Using the MBID, query the AcousticBrainz API to obtain low_level and high_level data.
4.   Join the MusicBrainz info with the low_level and high_level data.  The MusicBrainz info contains the labels.
5.   Export each dictionary for each song as a .json file.  


**NOTE**: It's recommended that you also combine the dictionaries into one large dictionary and export it as a single large JSON file.  This will save time later, since you won't have to read each JSON file individually.  This option hasn't been implemented yet, but shouldn't be difficult to do.




In [1]:
# Mount Google Drive at /content/drive so this notebook can access files stored on the Drive.
from google.colab import drive
drive.mount('/content/drive')

import os
import requests
import json

Mounted at /content/drive


In [4]:
## This quick function counts the files in a folder (including subfolders) and totals their sizes in bytes.  Google Drive does not natively provide this information.

def get_folder_size(folder_path):
    '''
    Count the files under a folder and add up their sizes.
    Args:
        folder_path (str): The folder to walk; subfolders are included.
    Returns:
        tuple: (number of files, total size in bytes).
    '''
    # One entry per file found anywhere below folder_path.
    file_sizes = [
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, names in os.walk(folder_path)
        for name in names
    ]
    return len(file_sizes), sum(file_sizes)

In [None]:
import time
from urllib.parse import quote

def get_range_musicbrainz_recordings(genre,start,stop):
    '''
    This function queries the MusicBrainz API to retrieve a range of recordings for a given genre.
    Results are fetched one page (25 recordings) at a time, beginning at offset `start`, until
    `stop` is reached, the API runs out of results, or a request fails.
    Args:
        genre (str): The genre to search for (matched against MusicBrainz tags).
        start (int): The index to start at.
        stop (int): The index to stop at (exclusive).
    Returns:
        list: A list of recording dictionaries, at most stop - start long.  If a request
              fails, the recordings gathered up to that point are returned.
    '''
    limit = 25
    recordings = []
    if stop <= start:
        # Empty range: nothing to fetch, so don't spend a request on it.
        return recordings

    # The genre is wrapped in double quotes so multi-word tags are searched as one phrase.
    encoded_genre = quote(f'"{genre}"')
    offset = start

    while offset < stop:
        url = f'https://musicbrainz.org/ws/2/recording?query=tag:{encoded_genre}&fmt=json&limit={limit}&offset={offset}'
        # A timeout keeps a stalled connection from hanging a long download indefinitely.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Error fetching MusicBrainz data: {response.status_code}")
            break

        batch = response.json().get('recordings', [])
        if not batch:
            break  # no more results for this tag

        # Trim the final page so the result never runs past `stop`.
        recordings.extend(batch[:stop - offset])
        if len(recordings) % 100 == 0:
            print(round(100* len(recordings) / (stop-start),2),'%', end=' ') # this is to show progress
            if len(recordings) % 1000 == 0:
                print()

        if len(batch) < limit:
            break  # a short page is the last page

        offset += limit
        if offset < stop:
            time.sleep(1) # to avoid 104 error

    return recordings

In [None]:
from urllib.parse import quote

def get_total_recordings_count(genre):
    '''
    Ask the MusicBrainz API how many recordings carry a given genre tag.
    Args:
        genre (str): The genre to search for.
    Returns:
        int: The 'count' reported by the API, or 0 if the request fails or
             the response has no 'count' field.
    '''
    tag_query = quote(f'"{genre}"')
    endpoint = 'https://musicbrainz.org/ws/2/recording'

    # Only the count is needed, so ask for a single result.
    response = requests.get(f'{endpoint}?query=tag:{tag_query}&fmt=json&limit=1')
    if response.status_code != 200:
        print(f"Error fetching MusicBrainz data: {response.status_code}")
        return 0

    return response.json().get('count', 0)

In [None]:
import requests

def get_acousticbrainz_data(mbid,level='low'):
    '''
    Fetch the AcousticBrainz document for one recording.
      Args:
        mbid (str): The MusicBrainz ID of the recording.
        level (str): Which document to request: 'low' (default) or 'high'.
      Returns:
        dict: The parsed JSON response, or None when the request does not
              return HTTP 200.
    '''
    response = requests.get(f'https://acousticbrainz.org/api/v1/{mbid}/{level}-level')
    # Some MBIDs are not in the AcousticBrainz data set; any non-200 reply is reported as None.
    return response.json() if response.status_code == 200 else None

In [None]:
from urllib.parse import quote
import json
import os
import time

def create_acousticbrainz_dataset(genre, folder_path, start=0, stop=-1):
    '''
    Build an on-disk AcousticBrainz dataset for one genre.
    The MusicBrainz recordings tagged with the genre are listed first; for each one the
    AcousticBrainz low-level data is downloaded, the MusicBrainz recording dictionary is
    attached under the 'mbdata' key, and the result is written to <folder_path>/<mbid>.json.
    Recordings with no AcousticBrainz data are skipped and counted as missing.
    Args:
        genre (str): The genre to create the dataset for.
        folder_path (str): The folder the json files are written to (created if needed).
        start (int): The starting index of the recordings to retrieve. Default is 0.
        stop (int): The stopping index of the recordings to retrieve. Default is -1,
                    which means the total count reported by MusicBrainz.
    Returns:
        None
    '''
    os.makedirs(folder_path, exist_ok=True)
    time.sleep(5) # give the new folder time to appear (mainly an issue w/ Google Drive)

    if stop == -1:
        stop = get_total_recordings_count(genre)
    span = stop - start

    print(f'Number of MusicBrainz recordings: {span}')
    recordings = get_range_musicbrainz_recordings(genre, start, stop)
    print('MBIDs sucessfully gathered.')
    print('Gathering AcousticBrainz data...')

    n_saved = 0
    n_missing = 0
    for entry in recordings:
        time.sleep(1) # pause between AcousticBrainz requests
        mbid = entry['id']
        payload = get_acousticbrainz_data(mbid, level='low')
        if payload is None:
            n_missing += 1
            continue

        # Keep the MusicBrainz recording dictionary alongside the acoustic data.
        payload['mbdata'] = entry
        target = os.path.join(folder_path, mbid + '.json')
        with open(target, 'w') as out:
            json.dump(payload, out, indent=4)
        n_saved += 1

        # Progress is reported after every 10th saved file, with a line break every 100.
        if n_saved % 10 != 0:
            continue
        print(round(100 * ((n_saved + n_missing) / span), 2), '%', end=' ')
        if n_saved % 100 == 0:
            print()

    print()
    print('Data set creation complete!')
    print(f'Total MusicBrainz recordings analyzed: {n_missing + n_saved}')
    print(f'\nMissing: {n_missing}\nSaved: {n_saved}')





Subgenre 'house' has completed downloading, obtaining 10219 recordings after querying MusicBrainz 'house' tags from 0 to 20000.

Subgenre 'trance' has completed downloading, obtaining 10387 recordings after querying MusicBrainz 'trance' tags from 0 to 19000.

Subgenre 'drum and bass' has completed downloading, obtaining 9412 recordings after querying MusicBrainz 'drum and bass' tags from 0 to 16003 (all).