In [2]:
!unzip -q Composer_Dataset.zip

replace __MACOSX/._Composer_Dataset? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [8]:
!pip install pretty_midi
!pip install mido librosa
import pretty_midi



In [9]:
import os
import numpy as np
import pretty_midi
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

 ## Function to Read MIDI Files as Piano Roll

 ## get_fixed_piano_roll():
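This function reads a single MIDI file and converts it into a fixed-size piano-roll matrix (128 pitches × `max_time_steps` time steps, sampled at 100 frames per second). Pieces shorter than `max_time_steps` are zero-padded on the right and longer ones are truncated; if a file cannot be read, a message is printed and `None` is returned. Fixed-size matrices like this are a common input format for deep learning models such as CNNs.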

In [10]:
def get_fixed_piano_roll(midi_path, max_time_steps=500, fs=100):
    """Load a MIDI file as a piano roll with exactly ``max_time_steps`` columns.

    Args:
        midi_path: Path of the MIDI file to read.
        max_time_steps: Number of time columns in the returned matrix. Shorter
            rolls are zero-padded on the right; longer rolls are truncated.
        fs: Sampling frequency handed to ``PrettyMIDI.get_piano_roll``
            (columns per second of music). Defaults to 100, the value that
            used to be hard-coded, so existing callers are unaffected.

    Returns:
        A 2-D array with at most 128 rows (pitches) and ``max_time_steps``
        columns, or ``None`` if the file could not be processed.
    """
    try:
        midi = pretty_midi.PrettyMIDI(midi_path)
        roll = midi.get_piano_roll(fs=fs)[:128, :]  # only 128 pitches
        missing = max_time_steps - roll.shape[1]
        if missing > 0:
            # Pad only the time axis, and only at the end of the piece.
            roll = np.pad(roll, ((0, 0), (0, missing)), mode='constant')
        else:
            roll = roll[:, :max_time_steps]
        return roll
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a whole
        # dataset conversion, so report it and let the caller skip it.
        print(f"Skipped {midi_path}: {e}")
        return None

## process_and_save_all()
This function performs the following steps:
- Read all .mid/.midi files in folders (organized by composer),

- Convert them into fixed-size piano roll matrices (via get_fixed_piano_roll()),

- Save each piano roll to disk as a .npy file,

- Create a CSV index file listing, for every saved piano roll:

  - the .npy file path (`file_path` column)

  - the label, i.e. the composer name (`label` column)



In [11]:
def process_and_save_all(mid_root, save_dir, csv_path):
    """Convert every MIDI file under ``mid_root`` to a saved piano roll and index them.

    ``mid_root`` is scanned for sub-directories; each sub-directory name is used
    as the label (composer) for the ``.mid``/``.midi`` files directly inside it.
    Every file that ``get_fixed_piano_roll`` can read is stored as
    ``<composer>_<file stem>.npy`` in ``save_dir``; unreadable files are skipped.

    Relies on ``get_fixed_piano_roll`` and on pandas being imported as ``pd``
    in the notebook namespace at call time.

    Args:
        mid_root: Directory containing one sub-directory per composer.
        save_dir: Directory that receives the ``.npy`` files (created if missing).
        csv_path: Destination of the index CSV with columns ``file_path`` and
            ``label``; its parent directory is created if missing.

    Returns:
        None. Results are written to disk and a summary line is printed.
    """
    os.makedirs(save_dir, exist_ok=True)
    csv_dir = os.path.dirname(csv_path)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    rows = []
    for composer in sorted(os.listdir(mid_root)):
        comp_dir = os.path.join(mid_root, composer)
        if not os.path.isdir(comp_dir):
            continue
        # Sorted so the index rows come out in the same order on every run.
        for fname in sorted(os.listdir(comp_dir)):
            # splitext strips exactly the final extension, whatever its case.
            # Chained str.replace turned "song.midi" into "songi" and left
            # upper-case extensions such as ".MID" in the saved name.
            stem, ext = os.path.splitext(fname)
            if ext.lower() not in ('.mid', '.midi'):
                continue
            full_path = os.path.join(comp_dir, fname)
            piano_roll = get_fixed_piano_roll(full_path)
            if piano_roll is None:
                continue  # unreadable file, already reported by the reader
            save_path = os.path.join(save_dir, f"{composer}_{stem}.npy")
            np.save(save_path, piano_roll)
            rows.append([save_path, composer])

    df = pd.DataFrame(rows, columns=['file_path', 'label'])
    df.to_csv(csv_path, index=False)
    print(f"Saved {len(df)} files to {save_dir} and index to {csv_path}")

In [7]:
import pandas as pd  # process_and_save_all builds its CSV index through the global name `pd`
# Convert the MIDI files under the train folder (one sub-folder per composer)
# into .npy piano rolls and write the file_path/label index CSV next to them.
process_and_save_all(
    mid_root="Composer_Dataset/NN_midi_files_extended/train",
    save_dir="saved_pianorolls/train",
    csv_path="saved_pianorolls/train_index.csv"
)

Saved 369 files to saved_pianorolls/train and index to saved_pianorolls/train_index.csv
