In [None]:
import pandas
import mido
import re
import shutil
from multiprocessing import Pool
import ntpath
import os
import glob
import pretty_midi

In [None]:
os.getcwd()

## Testing

## Class/methods

In [None]:
class FileProcessor:
    def __init__(self, output_folder):
        self.saved_songs = []
        self.errors = 0
        self.num_processed = 0
        self.output_folder = output_folder
        self.guitar_codes = list(range(24, 32))

    def parse_file(self, midi_path: str) -> None:
        '''
        Given the path of a .mid file, checks if the MIDI object has guitar and
        drum, and a time signature of 4/4. If these conditions are fulfilled,
        copies the .mid file to a given output folder.
        '''
        try:
            midi_file = pretty_midi.PrettyMIDI(midi_path)

            if self.has_drum_and_guitar(midi_file.instruments) and self.is_four_by_four(
                midi_file.time_signature_changes
            ):
                filename = ntpath.split(midi_path)[1]
                shutil.copyfile(midi_path, f'{self.output_folder}/{filename}')

                # If you wanT to write a new file instead:
                # midi_file.write(f"{self.output_folder}/{filename}")

        except KeyError as ke:
            print(f"Error for {midi_path}: {ke}")
            self.errors += 1
        except AttributeError as ae:
            print(f"Error for {midi_path}: {ae}")
            self.errors += 1
        except Exception as ee:
            print(f"Error for {midi_path}: {ee}")
            self.errors += 1

    def has_drum_and_guitar(self, instruments: list) -> bool:
        '''
        Given a list of PrettyMIDI instruments, returns True if at least one
        of the instruments is a drum and at least one is a guitar.
        '''
        programs = set(i.program for i in instruments)
        is_drum = [i.is_drum for i in instruments]
        return programs.intersection(self.guitar_codes) and any(is_drum)

    def is_four_by_four(self, time_signatures: list) -> bool:
        '''
        Given a list of PrettyMIDI time signature changes, returns True if there
        is only one time signature and it is 4/4.
        '''
        return len(time_signatures) == 1 and (
            (time_signatures[0].numerator == 4)
            and (time_signatures[0].denominator == 4)
        )

    def remove_duplicates(self) -> None:
        '''
        Loops through output folder, normalizes filenamds to alphabetic only,
        and removes subseqent files with the same normalized filename as a
        previously seen one.
        '''
        title_pattern = "[^a-zA-Z]+"
        already_seen = []
        saved_files = glob.glob(f"{self.output_folder}/*.mid")
        for midi_file in saved_files:
            title = re.sub(title_pattern, "", midi_file).lower()
            if title in already_seen:
                os.remove(midi_file)
            else:
                already_seen.append(title)
        print("Total number of non-duplicate songs with drums and guitar saved:", len(already_seen))


## Running

In [None]:
# Get raw midi files from..
raw_data_folder = os.path.join(os.pardir, "raw_data", "Unzipped")
# raw_data_folder = os.path.join(os.pardir, "test_data")
midi_file_paths = glob.glob(f"{raw_data_folder}/*.mid") + glob.glob(
    f"{raw_data_folder}/*/*.mid"
)

# Save all midi guitar and drum files to...
output_folder = os.path.join(os.pardir, "data")
print(output_folder)

In [123]:
%time
file_processor = FileProcessor(output_folder)

# Identify and save files with guitar and drum
print(f"Processing {len(midi_file_paths)} files in {raw_data_folder}...")
print(f"Will save to {output_folder}...")
pool = Pool(5)
result = pool.map(file_processor.parse_file, midi_file_paths)

# Remove duplicates
print(f"Removing duplicates from {output_folder}...")
file_processor.remove_duplicates()
print("\nDone!")

Error for ../raw_data/Unzipped/Rage_Against_the_Machine_-_Bullet_in_the_Head.mid: [Errno 2] No such file or directory: '../data/Rage_Against_the_Machine_-_Bullet_in_the_Head.mid'
Error for ../raw_data/Unzipped/Arlo_Guthrie_-_Alices_Restaurant.mid: [Errno 2] No such file or directory: '../data/Arlo_Guthrie_-_Alices_Restaurant.mid'
Error for ../raw_data/Unzipped/Eric_Clapton_-_Wonderful_Tonight.mid: [Errno 2] No such file or directory: '../data/Eric_Clapton_-_Wonderful_Tonight.mid'
Error for ../raw_data/Unzipped/Nine_Inch_Nails_-_Sin.mid: [Errno 2] No such file or directory: '../data/Nine_Inch_Nails_-_Sin.mid'
Error for ../raw_data/Unzipped/julian_casablancas_-_hard_to_explain.mid: [Errno 2] No such file or directory: '../data/julian_casablancas_-_hard_to_explain.mid'
Error for ../raw_data/Unzipped/Bryan_Adams_-_This_Time.mid: [Errno 2] No such file or directory: '../data/Bryan_Adams_-_This_Time.mid'
Error for ../raw_data/Unzipped/Whitesnake_-_Here_I_Go_Again.mid: [Errno 2] No such file 

In [None]:
print(len(glob.glob(f"{output_folder}/*.mid")))