In [2]:
import pandas
import mido
import re
import shutil
from multiprocessing import Pool
import ntpath
import os
import glob
import pretty_midi

In [3]:
# Use this to strip titles and instrument data of non-alphabetic characters.
# The regex matches every run of one or more characters outside a-z / A-Z.
pattern = "[^a-zA-Z]+"

In [4]:
# Show the working directory; the data folders below are built relative to
# its parent (os.pardir).
os.getcwd()

'/home/branchedelac/code/MarkBerkovics/BandIt/notebooks'

## Testing

In [99]:
# Get raw midi files from..
# Collects .mid files directly inside the test folder and one level below it.
raw_data_folder = os.path.join(os.pardir, "test_data")
midi_file_paths = glob.glob(f"{raw_data_folder}/*.mid") + glob.glob(
    f"{raw_data_folder}/*/*.mid"
)

# Save all midi guitar and drum files to...
# (output_folder must be assigned before anything reads it, otherwise the
# cell raises NameError on a fresh kernel.)
output_folder = os.path.join(os.pardir, "test_output")
print(output_folder)

# Last expression of the cell: displays the type of output_folder.
type(output_folder)


../test_output


str

In [66]:
# Load one sample song from the raw data folder for manual inspection.
midi_file = pretty_midi.PrettyMIDI(f"{raw_data_folder}/Zombie_4/Zombie_4.mid")
# Program numbers 24-31: the values this notebook treats as guitars.
guitar = list(range(24, 32))
# Last expression of the cell: displays the song's time signature changes.
midi_file.time_signature_changes


[TimeSignature(numerator=4, denominator=4, time=0.0)]

In [20]:
# Report, for each track of the sample song, whether it is a drum track or
# uses one of the guitar program numbers; other tracks are skipped silently.
for instrument in midi_file.instruments:
    if instrument.is_drum:
        print("This is a drum!")
        continue
    if instrument.program in guitar:
        print(f"This ({instrument.program}) is a guitar!")

This (27) is a guitar!
This (29) is a guitar!
This (29) is a guitar!
This is a drum!
This (28) is a guitar!


In [97]:
# NOTE(review): leftover debugging probe. `midi_path` is not defined at
# notebook scope, so this cell raises NameError (see the output below) and
# breaks Restart & Run All -- consider deleting the cell.
type(midi_path)

NameError: name 'midi_path' is not defined

## Class/methods

In [102]:
class FileProcessor:
    '''
    Filters MIDI files: copies those that contain at least one guitar track
    and one drum track, and that are in 4/4 throughout, to an output folder.

    Attributes:
        saved_songs: destination paths of the files copied by this instance.
        errors: number of files that could not be parsed.
        num_processed: number of files handed to parse_file.
        output_path: folder that matching files are copied to.
        guitar_codes: MIDI program numbers counted as guitars (24-31).

    Note: when parse_file is dispatched through multiprocessing.Pool, every
    worker operates on its own pickled copy of the instance, so the counters
    of the object held by the parent process are not updated.
    '''

    def __init__(self, output_path):
        self.saved_songs = []
        self.errors = 0
        self.num_processed = 0
        self.output_path = output_path
        self.guitar_codes = list(range(24, 32))

    def parse_file(self, midi_path: str) -> None:
        '''
        Given the path of a .mid file, checks if the MIDI object has guitar and
        drum, and a time signature of 4/4. If these conditions are fulfilled,
        copies the .mid file to the output folder (created if missing).

        Files that cannot be parsed are reported on stdout together with
        their path, counted in self.errors, and skipped.
        '''
        self.num_processed += 1
        try:
            midi_file = pretty_midi.PrettyMIDI(midi_path)
            is_match = self.has_drum_and_guitar(
                midi_file.instruments
            ) and self.is_four_by_four(midi_file.time_signature_changes)
        # OSError / ValueError / EOFError are presumably what the parser raises
        # for truncated or non-MIDI files -- TODO confirm against the
        # pretty_midi / mido versions in use.
        except (KeyError, AttributeError, OSError, ValueError, EOFError) as error:
            # Include the path: a bare KeyError only prints the missing key.
            print(f"{midi_path}: {error!r}")
            self.errors += 1
            return

        if not is_match:
            return

        # ntpath splits on both "/" and "\\", so this works for either style.
        filename = ntpath.basename(midi_path)
        os.makedirs(self.output_path, exist_ok=True)
        destination = os.path.join(self.output_path, filename)
        shutil.copyfile(midi_path, destination)
        self.saved_songs.append(destination)

        # If you want to write a new file instead:
        # midi_file.write(f"{self.output_path}/{filename}")

    def has_drum_and_guitar(self, instruments: list) -> bool:
        '''
        Given a list of PrettyMIDI instruments, returns True if at least one
        of the instruments is a drum and at least one is a guitar.

        Drum tracks are excluded from the guitar check: their program number
        selects a drum kit, so a value in the guitar range does not make
        them a guitar.
        '''
        has_drum = any(i.is_drum for i in instruments)
        has_guitar = any(
            not i.is_drum and i.program in self.guitar_codes for i in instruments
        )
        return has_drum and has_guitar

    def is_four_by_four(self, time_signatures: list) -> bool:
        '''
        Given a list of PrettyMIDI time signature changes, returns True if there
        is only one time signature and it is 4/4.
        '''
        if len(time_signatures) != 1:
            return False
        only_signature = time_signatures[0]
        return only_signature.numerator == 4 and only_signature.denominator == 4

## Running

In [7]:
# Input: the unzipped raw MIDI collection, located next to this notebook's
# folder. Swap in the commented line to run against the small test set.
raw_data_folder = os.path.join(os.pardir, "raw_data", "Unzipped")
# raw_data_folder = os.path.join(os.pardir, "test_data")

# Gather .mid files lying directly in the folder and one level below it.
top_level_midis = glob.glob(f"{raw_data_folder}/*.mid")
nested_midis = glob.glob(f"{raw_data_folder}/*/*.mid")
midi_file_paths = top_level_midis + nested_midis

# Output: where the guitar-and-drum files are saved.
output_folder = os.path.join(os.pardir, "data")
print(output_folder)

../data


In [103]:
%time
file_processor = FileProcessor(output_folder)

# Identify and save files with guitar and drum
print(f"Processing {len(midi_file_paths)} files in {raw_data_folder}...")
print(f"Will save to {output_folder}...")
pool = Pool(5)
result = pool.map(file_processor.parse_file, midi_file_paths)

# Remove duplicates
print(f"Removing duplicates from {output_folder}...")
title_pattern = "[^a-zA-Z]+"
already_seen = []
saved_files = glob.glob(f"{output_folder}/*.mid")
for midi_file in saved_files:
    title = re.sub(title_pattern, "", midi_file).lower()
    if title in already_seen:
        os.remove(midi_file)
    else:
        already_seen.append(title)

print("Total number of non-duplicate songs with drums and guitar saved:", len(already_seen))


CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 8.58 µs
Processing 5 files in ../test_data...
Will save to ../test_output...
Removing duplicates from ../test_output...
Total number of non-duplicate songs with drums and guitar saved: 3
