# Music preparation
This notebook generates 10-second samples from a local music library and helps the user generate labels for each song.

This notebook requires a manual step (labeling the data); please read the comments in each cell to see how it works.

Also, ffmpeg must be installed and available in the $PATH.

In [None]:
# install the third-party package that provides the `slugify` import used below
!pip install python-slugify

In [4]:
import os
import random
from slugify import slugify
import json

In [5]:
# set config variables here

# the directory of the music library. the notebook expects every direct
# subdirectory to belong to one specific genre; which subdirectory belongs to
# which genre is defined in one json file per genre (see the next cell).
# if your music library is already sorted by genre, use the last cell in this
# notebook instead.
# NOTE: keep this without a trailing slash. the notebook joins it with '/' to
# build and to strip paths, and the resulting doubled separator ('//') does
# not match the paths that GNU find prints.
DIRECTORY = './music_library'

# the genres in the music library; for each entry a <genre>.json file is read
# and a <genre> output directory is created
GENRES = ['electro', 'rock']
# file extensions (lower case, without the dot) that are treated as music
MUSIC_FILE_TYPES = ['mp3', 'wma', 'ogg', 'flac', 'm4a']

# number of samples to generate per genre
SONGS_PER_GENRE = 400

In [6]:
# this cell writes out an initial artists.json file containing all direct
# subdirectories of DIRECTORY defined above.
# copy this file once for each of your genres in the music directory
# (e.g. electro.json, rock.json) and edit each copy so that it only contains
# the directories of the respective genre.
#
# the listing is done with os.scandir so that the entries are plain directory
# names (relative to DIRECTORY), independent of how DIRECTORY is written and
# of the platform's `find` output format. symlinks are not followed.
with os.scandir(DIRECTORY) as entries:
    directories = sorted(
        entry.name for entry in entries if entry.is_dir(follow_symlinks=False)
    )
with open('artists.json', 'w') as f:
    json.dump(directories, f, indent=2)

In [7]:
from collections import defaultdict

def ffmpeg_convert(source, dest):
    """Cut a 10 second mono sample out of ``source`` and write it to ``dest``.

    ffmpeg is asked for 10 seconds of audio starting 60 seconds into the
    input, downmixed to one channel and resampled to 44.1 kHz. The output
    format is chosen by ffmpeg from the extension of ``dest``.

    Args:
        source: path of the input audio file.
        dest: path of the sample to create. An existing file is never
            overwritten; the conversion fails instead.

    Returns:
        True if ffmpeg succeeded and the result is at least as large as
        10 seconds of audio, False otherwise (ffmpeg could not be started,
        ffmpeg reported an error, no output was written, or the song was too
        short, in which case the truncated output is deleted again).
    """
    # local import: the notebook's import cell does not provide subprocess
    import subprocess

    # an argument list (no shell) keeps file names containing quotes, '$' or
    # backticks from being interpreted as shell syntax.
    command = [
        'ffmpeg',
        '-nostdin',  # never wait for interactive input
        '-n',        # fail instead of asking whether to overwrite dest
        '-i', source,
        '-ss', '60', '-t', '10', '-ac', '1', '-ar', '44100',
        dest,
    ]
    try:
        ret = subprocess.run(command).returncode
    except OSError:
        # ffmpeg itself could not be launched (e.g. it is not on the $PATH)
        return False
    if ret != 0:
        return False
    try:
        size = os.path.getsize(dest)
    except OSError:
        return False
    # sample rate * seconds * sample size (2 bytes per sample; assumes
    # ffmpeg's default 16-bit PCM for .wav output)
    if size < 44100 * 10 * 2:
        os.unlink(dest)
        return False
    return True

def convert(files, artists, out_directory, count):
    """Create up to ``count`` samples from randomly chosen songs of ``artists``.

    Songs are drawn without replacement: first an artist is picked uniformly
    at random, then one of that artist's remaining songs, so that artists
    with many songs do not dominate the result. Songs for which the
    conversion fails are skipped and do not count.

    Args:
        files: song paths relative to DIRECTORY; the first path component is
            taken as the artist directory.
        artists: names of the artist directories to draw songs from.
        out_directory: directory the ``.wav`` samples are written to.
        count: number of samples to create.

    Returns:
        The number of samples actually created. This is less than ``count``
        if the candidate songs ran out first.
    """
    wanted = set(artists)
    genre_files = defaultdict(list)
    for f in files:
        artist = f.split('/')[0]
        if artist in wanted:
            genre_files[artist].append(f)

    success_count = 0
    # stop when enough samples exist or when no candidate song is left
    while success_count < count and genre_files:
        artist, songs = random.choice(list(genre_files.items()))
        song = songs.pop(random.randrange(len(songs)))
        if not songs:
            # an exhausted artist must not be drawn again
            del genre_files[artist]
        source = os.path.join(DIRECTORY, song)
        dest = f'{out_directory}/{slugify(song)}.wav'
        if ffmpeg_convert(source, dest):
            success_count += 1
    return success_count

def filter_file_type(f):
    """Return True if the name ``f`` ends in one of MUSIC_FILE_TYPES.

    The extension is compared case-insensitively and must be preceded by a
    dot.
    """
    return any(
        f.lower().endswith(f'.{extension}') for extension in MUSIC_FILE_TYPES
    )

In [8]:
files = !find $DIRECTORY -type f
files = [f.replace(f'{DIRECTORY}/', '') for f in files][1:]
files = list(filter(filter_file_type, files))

for genre in GENRES:
    !rm -rf $genre
    !mkdir $genre
    with open(f'{genre}.json') as f:
        artists = json.load(f)
    convert(files, artists, genre, SONGS_PER_GENRE)

In [10]:
# similar code but for a library where music is already sorted by genre
!rm -rf classic
!mkdir classic

CLASSIC_DIRECTORY = './classic_music/'
files = !find "$CLASSIC_DIRECTORY" -type f
files = [f.replace(f'{CLASSIC_DIRECTORY}/', '') for f in files][1:]
random.shuffle(files)
success_count = 0
i = 0
while True:
    song = files[i]
    i += 1
    source = f'{CLASSIC_DIRECTORY}/{song}'
    dest = f'classic/{slugify(song)}.wav'
    ret = ffmpeg_convert(source, dest)
    if ret:
        success_count += 1
        if success_count == SONGS_PER_GENRE:
            break