<a href="https://colab.research.google.com/github/asigalov61/Tegridy-MIDI-Dataset/blob/master/Simple_MIDI_Reducer_Parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple MIDI Reducer/Parser (Ver. 3.0)

***

### Project Los Angeles

### Tegridy Code 2020

***

## Setup Environment and Dependencies

In [None]:
#@title Install Pretty MIDI
!pip install pretty_midi

In [None]:
#@title Import Modules and Create IO dirs

import pretty_midi
import glob
import os
import random
import tqdm.auto
import IPython.display as ipd
from google.colab import drive
import numpy as np

#@title Choose names for Source and Destination folders for conversion
source_dir = "/content/S" #@param {type:"string"}
dest_dir = "/content/D" #@param {type:"string"}
source_path = source_dir
dest_path = dest_dir
# Create both working folders up front. exist_ok=True makes re-running this
# cell harmless and avoids the check-then-create race of os.path.exists().
os.makedirs(source_path, exist_ok=True)
os.makedirs(dest_path, exist_ok=True)

In [None]:
#@title (Optional) Download and Unzip Alex MIDI Dataset (~400 MIDIs)
# Work inside the source folder so the archive is downloaded and unpacked there.
# NOTE(review): this path is hard-coded and only matches the default source_dir value.
%cd /content/S
!wget 'https://github.com/asigalov61/AlexMIDIDataSet/raw/master/AlexMIDIDataSet-CC-BY-NC-SA.zip'
# -j junks the directory paths stored in the archive, so every file is
# extracted directly into the current folder.
!unzip -j 'AlexMIDIDataSet-CC-BY-NC-SA.zip'
# The archive itself is not needed once its contents are extracted.
!rm 'AlexMIDIDataSet-CC-BY-NC-SA.zip'
%cd /content/

In [None]:
#@title (Optional) Download and Unzip Tegridy Special MIDI Dataset (Piano/Violin only)
# Work inside the source folder so the archive is downloaded and unpacked there.
# NOTE(review): this path is hard-coded and only matches the default source_dir value.
%cd /content/S
!wget 'https://github.com/asigalov61/Tegridy-MIDI-Dataset/raw/master/Tegridy-MIDI-Dataset-CC-BY-NC-SA.zip'
# -j junks the directory paths stored in the archive, so every file is
# extracted directly into the current folder.
!unzip -j 'Tegridy-MIDI-Dataset-CC-BY-NC-SA.zip'
# The archive itself is not needed once its contents are extracted.
!rm 'Tegridy-MIDI-Dataset-CC-BY-NC-SA.zip'
%cd /content/

## Reducer-Parser-Filter Loop/Main Function

In [None]:
#@title Run this to reduce/filter all files. Choose Extensions, Name Tag, MIDI Patch Instrument Name (refer to the list at the end of the colab), and drums/debug options.
input_files_extension = "*.mid" #@param ["*.mid", "*.midi", "*.kar", "*.*"]
output_extension = ".mid" #@param {type:"string"}
output_files_names_tag = "_melody_only.mid" #@param {type:"string"}
desired_instrument_to_reduce_to = "Acoustic Grand Piano" #@param {type:"string"}
is_it_drums = False #@param {type:"boolean"}
debug = False #@param {type:"boolean"}

# Make sure the destination folder exists; exist_ok=True keeps re-runs safe
# without a separate (race-prone) existence check.
os.makedirs(dest_path, exist_ok=True)

# Collect the source files that match the selected extension pattern.
# Sorting makes the processing order reproducible (glob order is arbitrary).
source_files = sorted(glob.glob(os.path.join(source_path, input_files_extension)))
if debug:
    print(source_files)

# Define reducer/parser function and input desired instrument to reduce to { run: "auto", display-mode: "form" }

def filter_instrument(fname):
    """Merge every non-drum track of a MIDI file into a single instrument.

    The file is loaded with pretty_midi, cleaned with
    ``remove_invalid_notes()``, and the notes of all non-drum tracks whose
    pitch is in the range 1..127 are collected into one instrument. That
    instrument is configured from the notebook form values
    ``desired_instrument_to_reduce_to`` and ``is_it_drums``.

    Nothing is written to disk here; the caller decides whether the result
    is worth saving.

    Args:
        fname: Path of the MIDI file to reduce.

    Returns:
        A tuple ``(pm, out_midi, added)``: ``pm`` is the loaded source
        ``PrettyMIDI`` object, ``out_midi`` is a new ``PrettyMIDI`` object
        containing only the merged instrument, and ``added`` is the number
        of notes copied into it.
    """
    pm = pretty_midi.PrettyMIDI(fname)
    pm.remove_invalid_notes()

    if debug:
        # Empirical estimate of the global tempo.
        print(pm.estimate_tempo())
        # Relative amount of each semitone across the entire song,
        # a proxy for key.
        chroma = pm.get_chroma()
        total_velocity = sum(sum(chroma))
        print([sum(semitone) / total_velocity for semitone in chroma])

    merged = pretty_midi.Instrument(
        program=pretty_midi.instrument_name_to_program(desired_instrument_to_reduce_to),
        is_drum=is_it_drums,
        name=desired_instrument_to_reduce_to,
    )
    for track in pm.instruments:
        # Drum tracks are left out of the reduced file.
        if track.is_drum:
            continue
        merged.notes.extend(note for note in track.notes if 0 < note.pitch < 128)

    out_midi = pretty_midi.PrettyMIDI()
    out_midi.instruments.append(merged)
    return pm, out_midi, len(merged.notes)


print('Starting up...')
print('Reducing to', desired_instrument_to_reduce_to)
for fname in tqdm.auto.tqdm(source_files):
    if debug:
        print("Reading:", fname)
    try:
        pm, outmid, added = filter_instrument(fname)
        # Files that yield no notes are skipped: an empty MIDI is of no use
        # to the later analysis step.
        if added > 0:
            # Output name is "<original file name><tag>" inside the
            # destination folder.
            outname = os.path.join(dest_path, os.path.basename(fname) + output_files_names_tag)
            outmid.write(outname)
            if debug:
                print('Written:', outname)
    except KeyboardInterrupt:
        break
    except Exception as err:
        # Best effort: report the unreadable/corrupt file and keep going
        # instead of hiding the failure.
        print('Skipping', fname, '-', err)
        continue


In [None]:
#@title Analysis of the IO MIDI Datasets
MIDI_DIR = "/content/D/*.mid" #@param ["/content/S/*.mid", "/content/D/*.mid"]
### https://github.com/brennan2602/FYP

# This cell reads the MIDI files matched by MIDI_DIR and converts each one to a string representation.
# From that string representation it then gathers some statistics about the structure of the song.

def get_piano_roll(midifile):
    """Return the piano roll of the first instrument in a MIDI file.

    Only ``instruments[0]`` is used; any other tracks in the file are
    ignored. The roll is requested with ``fs=20``.

    Args:
        midifile: Path of the MIDI file to load with pretty_midi.

    Returns:
        The array produced by ``Instrument.get_piano_roll(fs=20)``.
    """
    first_track = pretty_midi.PrettyMIDI(midifile).instruments[0]
    return first_track.get_piano_roll(fs=20)

#uses split encoding scheme (here only encoding the note values)
#works by looping through time increments of the piano roll array and writing the notes being played
#at a given time sample as a number on the corresponding line of a string # is written when no notes played for that
#sample
def encode(arr):
    """Encode a time-major note array as one text line per time sample.

    Each row of ``arr`` is one time sample (the analysis loop passes a
    transposed piano roll). For every row, the column indices of the
    non-zero entries are written in ascending order, each followed by a
    single space. A row with no non-zero entry is written as ``#``. Every
    line, including the last, ends with a newline.

    Args:
        arr: 2-D array-like, indexed as ``arr[time][note]``.

    Returns:
        The encoded string; an input with no rows gives an empty string.
    """
    encoded_lines = []
    for sample in arr:
        # Indices of the entries that are sounding in this sample.
        active = np.flatnonzero(sample)
        if active.size == 0:
            encoded_lines.append("#\n")
        else:
            encoded_lines.append("".join(f"{note} " for note in active) + "\n")
    # A single join avoids rebuilding the whole string on every append.
    return "".join(encoded_lines)


def getSilences(test):
    """Count silent samples in an encoded song.

    The last character of ``test`` is dropped (the encoded text ends with a
    newline, which would otherwise produce a blank final line) and the rest
    is split into lines. A line equal to ``#`` is a silent sample.

    Args:
        test: Encoded song text, one sample per line.

    Returns:
        A tuple ``(longest_streak, total)``: the length of the longest run
        of consecutive silent samples and the total number of silent samples.
    """
    samples = test[:-1].split("\n")
    longest_streak = 0
    current_streak = 0
    total_silent = 0
    for sample in samples:
        if sample == "#":
            current_streak += 1
            total_silent += 1
            longest_streak = max(longest_streak, current_streak)
        else:
            # Any sounding sample ends the current run of silence.
            current_streak = 0
    return longest_streak, total_silent


#by looking at the length of the song and the number of silent samples this returns the fraction of silence (0.0-1.0, not a percentage)
def getPercentSilence(gen,silences):
    """Return the share of samples that are silent.

    Despite the name, the result is a fraction (``silences`` divided by the
    number of lines), not a value scaled to 0-100.

    Args:
        gen: Encoded song text, one sample per line; its last character is
            dropped before the lines are counted.
        silences: Number of silent samples in ``gen``.

    Returns:
        ``silences`` divided by the number of lines in ``gen``.
    """
    sample_count = len(gen[:-1].split("\n"))
    return silences / sample_count


def getStatsNotes(test):
    """Compute note statistics for an encoded song.

    The last character of ``test`` is dropped (the encoded text ends with a
    newline) and the rest is split into one line per sample. Each line holds
    space-separated note numbers, or ``#`` for a silent sample. Note number
    0 is not counted as a played note.

    Note values are compared numerically; comparing the raw text tokens
    would order them lexicographically (e.g. "100" before "60").

    Args:
        test: Encoded song text, one sample per line.

    Returns:
        A tuple ``(rangeNotes, maxVal, minVal, maxNotes, avgNotes, adjNotes)``:
        the spread between the highest and lowest note, the highest note,
        the lowest note, the largest number of notes in one sample, the
        average notes per sample, and the average notes per non-silent
        sample. When no note is played at all, every value is zero.
    """
    samples = test[:-1].split("\n")

    notesPlayed = 0
    silenceSamp = 0
    maxNotes = 0
    maxVal = 0
    minVal = 127

    for line in samples:
        # Parse the note numbers, skipping the silence marker and note 0.
        pitches = [int(token) for token in line.split() if token != "#"]
        pitches = [pitch for pitch in pitches if pitch != 0]
        if not pitches:
            silenceSamp += 1
            continue
        notesPlayed += len(pitches)
        maxNotes = max(maxNotes, len(pitches))
        minVal = min(minVal, min(pitches))
        maxVal = max(maxVal, max(pitches))

    if notesPlayed == 0:
        # Entirely silent input: there is no range to report and no
        # non-silent sample to average over.
        return 0, 0, 0, 0, 0.0, 0.0

    rangeNotes = maxVal - minVal  # spread of notes
    avgNotes = notesPlayed / len(samples)  # average notes per sample
    # Average notes per sample, ignoring the silent samples.
    adjNotes = notesPlayed / (len(samples) - silenceSamp)
    return rangeNotes, maxVal, minVal, maxNotes, avgNotes, adjNotes


files=glob.glob(MIDI_DIR)#point towards directory with midi files (here same folder)
print(files)

for f in files:
    print(f)
    try:
        pr = get_piano_roll(f) #gets piano roll representation of the midi file
        outString = encode(pr.T) #gets a string representation of the midi file (one line per time sample)
        #by passing in the encoded string get longest silence and the total number of samples which are silent
        maxsilences, silences = getSilences(outString)
        #getting some stats by looping through encoded data
        noteRange, maxVal, minVal, maxNotes, avgNotes, adjAvg = getStatsNotes(outString)
        #fraction of silent samples (silences / number of samples)
        percentSilence = getPercentSilence(outString, silences)
    except Exception as err:
        #one unreadable or empty file should not abort the analysis of the rest
        print("could not analyse", f, "-", err)
        print("\n")
        continue

    #printing out to the user
    print("longest silence is ",maxsilences,"samples long")
    #getPercentSilence returns a fraction, so scale it to match the "%" label
    print("silence covers:",round(percentSilence*100,2),"%")
    print("notes span range:",noteRange)
    print("max note value:",maxVal)
    print("min note value:",minVal)
    print("average number of notes per sample:",round(avgNotes,4))
    print("average number of notes per sample (adjusted to remove silence samples):",round(adjAvg,4))
    print("max number of notes played in a sample:",maxNotes)
    print("\n")

#NOTE some minor discrepancies vs reading in from generated file directly
#However this does provide a uniform check to use for songs generated by both encoding schemes
#Can also be used to evaluate training file
#uses split encoding to get the text representation for ease of development

### Zip everything (root/content directory) for download :)

In [None]:
#@title Zip Root
%cd /content/
# Recursively archive everything under /content into a single zip file for download.
!zip -r /content/Simple-MIDI-Reducer-Parser-Output.zip /content

In [None]:
#@title Mount Google Drive (Standard GD Connect Code)
# Mounts Google Drive at /content/drive; `drive` comes from the google.colab
# import in the setup cell, so that cell must have been run first.
drive.mount('/content/drive')

In [None]:
#@title Dismount Google Drive if you need to zip the root
# Drive is mounted at /content/drive, i.e. inside the /content tree that the
# "Zip Root" cell archives recursively; presumably unmounting first keeps the
# Drive contents out of that archive.
# Imported again here so this cell also works without the setup cell.
from google.colab import drive
drive.flush_and_unmount()

***
# General MIDI patch numbers

https://www.midi.org/specifications-old/item/gm-level-1-sound-set

***

## General MIDI Level 1 Instrument Families

### The General MIDI Level 1 instrument sounds are grouped by families. In each family are 8 specific instruments.

***

## PC #	Family Name

1-8	Piano

9-16	Chromatic Percussion

17-24	Organ

25-32	Guitar

33-40	Bass

41-48	Strings

49-56	Ensemble

57-64	Brass

65-72	Reed

73-80	Pipe

81-88	Synth Lead

89-96	Synth Pad

97-104	Synth Effects

105-112	Ethnic

113-120	Percussive

121-128	Sound Effects

***

Note: While GM1 does not define the actual characteristics of any sounds, the names in parentheses after each of the synth leads, pads, and sound effects are, in particular, intended only as guides.

***

### PC #	Instrument Name
1.	Acoustic Grand Piano
2.	Bright Acoustic Piano
3.	Electric Grand Piano
4.	Honky-tonk Piano
5.	Electric Piano 1
6.	Electric Piano 2
7.	Harpsichord
8.	Clavi
9.	Celesta
10.	Glockenspiel
11.	Music Box
12.	Vibraphone
13.	Marimba
14.	Xylophone
15.	Tubular Bells
16.	Dulcimer
17.	Drawbar Organ
18.	Percussive Organ
19.	Rock Organ
20.	Church Organ
21.	Reed Organ
22.	Accordion
23.	Harmonica
24.	Tango Accordion
25.	Acoustic Guitar (nylon)
26.	Acoustic Guitar (steel)
27.	Electric Guitar (jazz)
28.	Electric Guitar (clean)
29.	Electric Guitar (muted)
30.	Overdriven Guitar
31.	Distortion Guitar
32.	Guitar harmonics
33.	Acoustic Bass
34.	Electric Bass (finger)
35.	Electric Bass (pick)
36.	Fretless Bass
37.	Slap Bass 1
38.	Slap Bass 2
39.	Synth Bass 1
40.	Synth Bass 2
41.	Violin
42.	Viola
43.	Cello
44.	Contrabass
45.	Tremolo Strings
46.	Pizzicato Strings
47.	Orchestral Harp
48.	Timpani
49.	String Ensemble 1
50.	String Ensemble 2
51.	SynthStrings 1
52.	SynthStrings 2
53.	Choir Aahs
54.	Voice Oohs
55.	Synth Voice
56.	Orchestra Hit
57.	Trumpet
58.	Trombone
59.	Tuba
60.	Muted Trumpet
61.	French Horn
62.	Brass Section
63.	SynthBrass 1
64.	SynthBrass 2
65.	Soprano Sax
66.	Alto Sax
67.	Tenor Sax
68.	Baritone Sax
69.	Oboe
70.	English Horn
71.	Bassoon
72.	Clarinet
73.	Piccolo
74.	Flute
75.	Recorder
76.	Pan Flute
77.	Blown Bottle
78.	Shakuhachi
79.	Whistle
80.	Ocarina
81.	Lead 1 (square)
82.	Lead 2 (sawtooth)
83.	Lead 3 (calliope)
84.	Lead 4 (chiff)
85.	Lead 5 (charang)
86.	Lead 6 (voice)
87.	Lead 7 (fifths)
88.	Lead 8 (bass + lead)
89.	Pad 1 (new age)
90.	Pad 2 (warm)
91.	Pad 3 (polysynth)
92.	Pad 4 (choir)
93.	Pad 5 (bowed)
94.	Pad 6 (metallic)
95.	Pad 7 (halo)
96.	Pad 8 (sweep)
97.	FX 1 (rain)
98.	FX 2 (soundtrack)
99.	FX 3 (crystal)
100.	FX 4 (atmosphere)
101.	FX 5 (brightness)
102.	FX 6 (goblins)
103.	FX 7 (echoes)
104.	FX 8 (sci-fi)
105.	Sitar
106.	Banjo
107.	Shamisen
108.	Koto
109.	Kalimba
110.	Bag pipe
111.	Fiddle
112.	Shanai
113.	Tinkle Bell
114.	Agogo
115.	Steel Drums
116.	Woodblock
117.	Taiko Drum
118.	Melodic Tom
119.	Synth Drum
120.	Reverse Cymbal
121.	Guitar Fret Noise
122.	Breath Noise
123.	Seashore
124.	Bird Tweet
125.	Telephone Ring
126.	Helicopter
127.	Applause
128.	Gunshot


