# Generate MIDI List and Clean
Generate a list of MIDI files in a directory and clean it by removing invalid files and duplicate files.

## Settings

In [1]:
# Root directory that is walked recursively (os.walk) to collect files.
data_dir = '../../data/merged-dataset'
# File-name endings to keep: a file is skipped unless its name ends with one of
# these strings. None or an empty tuple disables the filter.
suffixes = ('mid', )
# Forwarded unchanged as the midi_checker argument of midi_utils.load_midi.
midi_checker = 'default'
# If True, files for which reading/loading raises an exception are left out.
remove_invalid = True
# If True, a file whose MD5 digest was already seen is skipped.
remove_duplicated = True
# Destination passed to data_utils.dump_list for the resulting list.
save_path = '../../processed_data/info_note/merged-dataset.clean.deduplicated.txt'

## Preprocessing

In [2]:
import os
from midiprocessor import midi_utils, data_utils

In [3]:
# Normalize Windows-style separators to a single forward slash, matching the
# '/'-based path splitting done in the processing step. Doubling the slash
# would turn a drive path such as 'C:\data' into 'C://data', which the
# URL check below would then wrongly reject.
data_dir = data_dir.replace('\\', '/')
# Only local filesystem paths are supported; reject URL-like values.
assert '://' not in data_dir, 'data_dir must be a local path: %s' % data_dir
# Drop one trailing slash. endswith() is used instead of indexing so that an
# empty data_dir does not raise IndexError.
if data_dir.endswith('/'):
    data_dir = data_dir[:-1]
print('Data Dir: %s' % data_dir)

Data Dir: ../../data/merged-dataset


## Processing

In [5]:
# MD5 digests of the file contents seen so far (used for duplicate detection).
processed = set()
# Paths, relative to data_dir, of the files that are kept.
file_list = []

# Number of '/'-separated components in data_dir; dropping that many leading
# components from a walked path yields the path relative to data_dir.
len_root = len(data_dir.split('/'))

# str.endswith accepts a tuple of candidates, so the suffix filter is a single
# call. None means "no filtering" (suffixes is None or empty).
suffix_filter = tuple(suffixes) if suffixes else None

for root_dir, dirs, files in os.walk(data_dir):
    for file_name in files:
        if suffix_filter is not None and not file_name.endswith(suffix_filter):
            continue

        file_path = os.path.join(root_dir, file_name).replace('\\', '/')
        relative_path = '/'.join(file_path.split('/')[len_root:])

        try:
            with open(file_path, 'rb') as f:
                md5 = data_utils.get_md5_sum(file_obj=f)
                if md5 in processed and remove_duplicated:
                    # Same content was already handled; skip this copy.
                    continue
                processed.add(md5)

                # Rewind before parsing (get_md5_sum presumably advanced the
                # file position while hashing).
                f.seek(0)
                # The return value is not needed; the call only serves as a
                # validity check that raises on files that cannot be loaded.
                midi_utils.load_midi(file=f, midi_checker=midi_checker)
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit still stop the scan instead of
            # being recorded as an "invalid file".
            if not remove_invalid:
                file_list.append(relative_path)
        else:
            file_list.append(relative_path)

## Save

In [None]:
# Hand the collected relative paths to data_utils.dump_list together with the
# configured save_path (presumably writes the list to that file; the output
# format is defined by midiprocessor, not visible here).
data_utils.dump_list(file_list, save_path)