In [18]:
import glob
import os
import pandas as pd
import pretty_midi
from IPython.display import clear_output

In [20]:
DATA_PATH = 'data'
SEARCH_CRITERION = '**/*.mid'
# list all midi files
midi_files = glob.glob(os.path.join(DATA_PATH, SEARCH_CRITERION),
                       recursive=True)
print("For example, midi_files[0] correpsonds to :\n{}".format(midi_files[0]))

For example, midi_files[0] correpsonds to :
data/lmd_aligned/S/S/S/TRSSSJW128F146D5DC/5775cbedfbd4a93993bd333e33527a42.mid


# Simplifying the dataset

To begin with, let's list all the instruments present in the midi files by count, so we can take a decision on where to start.
We will want to only focus on training melodies for one set for now, so it makes sense to pick an instrument (program code), that is present in large quantities, and is likely to have long melodies to train on.

Initial guesses would be a Piano or Guitar program.

Some songs might have duplicates - we'll find a way to deal with this later if necessary.

[See here](https://soundprogramming.net/file-formats/general-midi-instrument-list/) for a list of instrument programs and their names

In [21]:
# Save the filepath for a second pass loading the piano 
# rolls for each instrument we will use

limit = -1

instrument_ary = [[]]
instrument_ary.append(['program', 'is_drum', 'name', 'filepath'])
# This might take a while...
for index, file in enumerate(midi_files[:limit]):
    clear_output(wait=True)
    print("file {}/{}: Loading and parsing {}".format(
        index, len(midi_files), os.path.basename(file)))
    try:
        pm = pretty_midi.PrettyMIDI(file)
        instruments = pm.instruments

        for instrument in instruments:
            instrument_ary.append([instrument.program, instrument.is_drum, instrument.name.replace(';',''), file])
    except:
        # For now, just ignore files we can't load.
        continue

116187/116189: Loading and parsing 66552a5273a615bdc470171492d80a94.mid.


In [24]:
df = pd.DataFrame(data=instrument_ary, columns=["program", "is_drum", "name", "filepath"])
df = df.dropna()
df.head()

Unnamed: 0,program,is_drum,name,filepath
1,program,is_drum,name,filepath
2,30,False,Track 1,data/lmd_aligned/S/S/S/TRSSSJW128F146D5DC/5775...
3,30,False,Track 2,data/lmd_aligned/S/S/S/TRSSSJW128F146D5DC/5775...
4,34,False,Track 3,data/lmd_aligned/S/S/S/TRSSSJW128F146D5DC/5775...
5,34,False,Track 3,data/lmd_aligned/S/S/S/TRSSSJW128F146D5DC/5775...


# Save the loaded instruments for further processing

In [25]:
file_name = 'instruments.csv'
df.to_csv(file_name, sep=';', encoding='utf-8')

## Show the instruments that occur most often in different files

We want to get an intuition of which songs might have the most attractive instruments to work on

In [46]:
# Show the instrument with the most unique filepath values
frequent_track = df.groupby('program').nunique().sort_values('filepath', ascending=False)
frequent_track.head()

Unnamed: 0_level_0,is_drum,name,filepath
program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,14923,92919
33,2,4475,34051
25,2,5008,32062
48,2,4545,32041
35,2,2772,27388


In [44]:
print("The most used program is present in {} midi files over {} documents.".format(
   frequent_track.iloc[0]['filepath'], len(midi_files)))

The most used program is present in 92919 midi files over 116189 documents.
