In [1]:
import numpy as np
np.int = int
import pretty_midi
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import pickle
import os
from pathlib import Path

# Composers whose sub-folders are scanned (one folder per composer under the dataset root)
target_compsers = ['Bach', 'Beethoven', 'Chopin', 'Mozart']

# Folder containing the dataset
dataset_folder = Path('../data/midi/archive/midiclassics')

# File extensions treated as MIDI; compared case-insensitively below
midi_suffixes = {'.mid', '.midi'}

# Collected (file path, composer) pairs
dataset = []

# File names collected so far, used to skip duplicate file names
seen_filenames = set()

# Loop through each target composer folder
for composer in target_compsers:
    composer_folder = dataset_folder / composer

    # Walk the composer's folder recursively. sorted() makes the traversal order
    # deterministic, so when two files share a name the same one is kept on every
    # run and on every operating system.
    for file_path in sorted(composer_folder.rglob('*')):
        # Skip directories and anything that is not a MIDI file
        if not file_path.is_file() or file_path.suffix.lower() not in midi_suffixes:
            continue

        # Skip file names that were already collected (the check spans all composers)
        if file_path.name in seen_filenames:
            continue

        seen_filenames.add(file_path.name)
        dataset.append((str(file_path), composer))

# Check total files found
print(f"Total MIDI files found: {len(dataset)}")

# Display a sample of 50 entries (indices 1500-1549) taken from late in the list;
# the slice is simply empty when fewer than 1501 files were found
for path, composer in dataset[1500:1550]:
    print(f"Composer: {composer}, File Path: {path}")


Total MIDI files found: 1630
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n02 K280.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n03 K281.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n04 K282.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n05 K283.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n06 K284.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n07 K309.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n08 K311.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n09 K310.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano S

# Let's go back to the seq=70 idea, with a few minor changes
- Boost Beethoven & Mozart.
- Keep transposition, but set aug_per_chunk=1.
- Turn timing jitter off (or use p=0.25 at most); it can blur stylistic cues.
- 64 -> 96 for a bit more capacity.
- Class weights (nudge Beethoven & Mozart).
- ReduceLROnPlateau.

# Cell 1 — Parse MIDI → relative features


In [None]:
# --- Cell 1: Parse target composers' MIDI files (search recursively) ---
from pathlib import Path
from tqdm import tqdm
import pretty_midi
import numpy as np


# Re-create the `np.int` alias when this NumPy build does not expose it.
# NOTE(review): presumably needed by a dependency that still uses `np.int`
# (pretty_midi?) — confirm before removing.
if getattr(np, 'int', None) is None:
    setattr(np, 'int', int)

# Composer sub-folders to parse
target_composers = ['Bach', 'Beethoven', 'Chopin', 'Mozart']

def midi_to_feature_array(midi_path):
    """Convert one MIDI file into a per-note matrix of relative features.

    Notes of all non-drum instruments are merged into a single stream ordered by
    (start time, pitch). Notes whose duration is at most 1e-4 are dropped.

    Args:
        midi_path: Path of the MIDI file; it is passed to pretty_midi as a string.

    Returns:
        A float32 array of shape (n_notes, 6) with the columns
        [dt, interval, pc, dur_ratio, velocity, program], where
          * dt        = log1p of the onset gap to the previous note (0 for the first),
          * interval  = pitch difference to the previous note, clipped to [-24, 24],
          * pc        = pitch modulo 12,
          * dur_ratio = duration / time until the next onset, clipped to [0, 3],
        or None when the file cannot be read or contains no usable note.
    """
    try:
        midi = pretty_midi.PrettyMIDI(str(midi_path))
    except Exception as err:
        # Unreadable files are reported and skipped instead of aborting the whole run.
        print(f"Error reading {midi_path}: {err}")
        return None

    # One [start, end, pitch, velocity, program] record per non-drum note.
    note_rows = [
        [note.start, note.end, note.pitch, note.velocity, track.program]
        for track in midi.instruments
        if not track.is_drum
        for note in track.notes
    ]
    if len(note_rows) == 0:
        return None

    # Order by onset, ties broken by pitch, then split into columns.
    note_rows.sort(key=lambda row: (row[0], row[2]))
    table = np.array(note_rows, dtype=np.float32)
    onset, offset, pitch, velocity, program = table.T
    length = offset - onset

    # Discard (near) zero-length notes.
    valid = length > 1e-4
    if not valid.any():
        return None
    onset = onset[valid]
    length = length[valid]
    pitch = pitch[valid]
    velocity = velocity[valid]
    program = program[valid]

    # Relative timing and pitch movement; prepending the first element makes the
    # first difference 0 and keeps the arrays the same length as the note list.
    onset_gap = np.log1p(np.diff(onset, prepend=onset[0]))
    melodic_step = np.clip(np.diff(pitch, prepend=pitch[0]), -24, 24)
    pitch_class = np.mod(pitch, 12)

    # Time until the next onset; the last note uses its own end time instead.
    # The 1e-3 floor avoids division by ~0 for simultaneous onsets, which in turn
    # drives the ratio to the upper clip of 3.
    next_onset = np.r_[onset[1:], onset[-1] + length[-1]]
    gap_to_next = np.clip(next_onset - onset, 1e-3, None)
    overlap = np.clip(length / gap_to_next, 0., 3.)

    columns = [onset_gap, melodic_step, pitch_class, overlap, velocity, program]
    return np.stack(columns, axis=1).astype(np.float32)

def load_midi_dataset(root_dir, target_composers):
    """Parse every MIDI file found under each composer's folder.

    Args:
        root_dir: Directory that contains one sub-folder per composer.
        target_composers: Iterable of composer (sub-folder) names to load.

    Returns:
        dict mapping composer name -> list of feature arrays, one per file for
        which `midi_to_feature_array` returned a result. Composers whose folder
        does not exist are reported with a warning and left out of the dict.
    """
    midi_suffixes = {".mid", ".midi"}
    data_dict = {}
    root_dir = Path(root_dir)
    for composer in target_composers:
        composer_folder = root_dir / composer
        if not composer_folder.exists():
            print(f"Warning: Folder not found for {composer}")
            continue

        # Match the extension case-insensitively: a "*.mid" glob pattern misses
        # "FILE.MID" on case-sensitive file systems. Sorting gives a stable
        # processing order, so the resulting piece list is reproducible.
        midi_files = sorted(
            candidate
            for candidate in composer_folder.rglob("*")
            if candidate.is_file() and candidate.suffix.lower() in midi_suffixes
        )

        pieces = []
        for file_path in tqdm(midi_files, desc=f"Processing {composer}"):
            arr = midi_to_feature_array(file_path)
            # None means the file was unreadable or had no usable notes; skip it.
            if arr is not None:
                pieces.append(arr)
        data_dict[composer] = pieces
        print(f"Loaded {len(pieces)} pieces for {composer}")
    return data_dict

# Root of the MIDI corpus (one sub-folder per composer), then parse it into
# {composer: [feature array per piece]}.
dataset_folder = Path("../data/midi/archive") / "midiclassics"
parsed_data = load_midi_dataset(
    root_dir=dataset_folder,
    target_composers=target_composers,
)


# Cell 2 — Chunk (+ transpose augmentation; timing jitter is off), balance, save



In [None]:
import pickle
import random
from collections import defaultdict
import numpy as np


# Augmentation helpers
def transpose_chunk(chunk, semitones):
    """Return a copy of `chunk` with its pitch-class column shifted by `semitones`.

    Only column 2 (pitch class) is changed; the result is wrapped into [0, 12).
    The interval column is left alone because pitch differences do not change
    under transposition.

    Rows that are entirely zero are treated as padding and left untouched, so a
    padded chunk still ends in all-zero rows after augmentation. Without this,
    padding rows would get pitch class `semitones % 12`.

    Args:
        chunk: 2-D array of note features with the pitch class in column 2.
        semitones: Number of semitones to shift by (may be negative).

    Returns:
        A new array of the same shape and dtype; the input is not modified.
    """
    out = chunk.copy()
    real_rows = np.any(chunk != 0, axis=1)  # False for all-zero padding rows
    out[real_rows, 2] = np.mod(out[real_rows, 2] + semitones, 12)  # pitch class only
    return out

def maybe_augment_chunks(chunks, label,
                         aug_per_chunk=1,
                         semis_choices=(-4, -2, 2, 4),
                         targets=("Beethoven", "Mozart")):
    """Add randomly transposed copies of each chunk for the targeted labels.

    Args:
        chunks: List of chunk arrays belonging to one piece.
        label: Composer label of the piece.
        aug_per_chunk: Number of transposed copies to create per chunk.
        semis_choices: Semitone shifts to draw from (one random draw per copy).
        targets: Labels that receive augmentation.

    Returns:
        `chunks` itself when `label` is not targeted or `aug_per_chunk <= 0`;
        otherwise a new list in which every original chunk is immediately
        followed by its `aug_per_chunk` transposed copies.
    """
    # Guard clauses: nothing to do for non-target labels or a disabled setting.
    if label not in targets:
        return chunks
    if aug_per_chunk <= 0:
        return chunks

    augmented = []
    for original in chunks:
        transposed = [
            transpose_chunk(original, random.choice(semis_choices))
            for _ in range(aug_per_chunk)
        ]
        augmented.extend([original, *transposed])
    return augmented

def create_balanced_chunks(data_dict,
                           seq_len=70,
                           stride=35,
                           min_real_notes=50,
                           max_chunks_per_piece=20,
                           aug_per_chunk=1,
                           aug_semis=(-4, -2, 2, 4),
                           aug_targets=("Beethoven", "Mozart"),
                           seed=None):
    """Cut pieces into fixed-length chunks, augment, and balance across composers.

    Per piece:
      * pieces shorter than `seq_len` are zero-padded to `seq_len` and kept only
        if they have at least `min_real_notes` notes;
      * longer pieces are cut into windows of `seq_len` notes every `stride`
        notes, keeping windows with at least `min_real_notes` non-zero rows;
      * `maybe_augment_chunks` then adds transposed copies for `aug_targets`;
      * at most `max_chunks_per_piece` chunks (originals and copies together)
        are kept, chosen at random.
    Finally every composer is randomly down-sampled to the size of the smallest
    composer and the combined list is shuffled.

    Args:
        data_dict: Mapping composer -> list of 2-D feature arrays (one per piece).
        seq_len: Number of notes per chunk.
        stride: Step between the starts of consecutive windows.
        min_real_notes: Minimum number of real (non-padding) notes per chunk.
        max_chunks_per_piece: Cap on chunks contributed by a single piece.
        aug_per_chunk: Transposed copies per chunk for targeted composers.
        aug_semis: Semitone shifts the augmentation draws from.
        aug_targets: Composers that receive augmentation.
        seed: Optional seed. When given, the global `random` generator is seeded
            so that augmentation, sampling and shuffling are reproducible.
            `None` (the default) leaves the generator state untouched.

    Returns:
        A shuffled list of (chunk, composer) tuples with the same number of
        chunks per composer. An empty list is returned when no pieces are given.
        Note that if any composer with pieces yields zero chunks, the balanced
        result is empty as well.
    """
    if seed is not None:
        random.seed(seed)

    all_chunks = defaultdict(list)
    for composer, pieces in data_dict.items():
        for piece in pieces:
            n_notes = len(piece)
            chunks_for_piece = []
            if n_notes < seq_len:
                if n_notes >= min_real_notes:
                    # Pad with zeros of the piece's own dtype so the chunk is not
                    # silently upcast (float32 piece + float64 zeros -> float64).
                    padding = np.zeros((seq_len - n_notes, piece.shape[1]), dtype=piece.dtype)
                    chunks_for_piece.append(np.vstack([piece, padding]))
            else:
                for start in range(0, n_notes - seq_len + 1, stride):
                    chunk = piece[start:start + seq_len]
                    # Count rows that contain at least one non-zero value.
                    if np.count_nonzero(np.any(chunk != 0, axis=1)) >= min_real_notes:
                        chunks_for_piece.append(chunk)
            chunks_for_piece = maybe_augment_chunks(
                chunks_for_piece, composer,
                aug_per_chunk=aug_per_chunk,
                semis_choices=aug_semis,
                targets=aug_targets
            )
            # The cap is applied after augmentation, so originals and transposed
            # copies compete for the same slots.
            if len(chunks_for_piece) > max_chunks_per_piece:
                chunks_for_piece = random.sample(chunks_for_piece, max_chunks_per_piece)
            all_chunks[composer].extend(chunks_for_piece)

    # No composer contributed any piece: min() below would raise on an empty sequence.
    if not all_chunks:
        return []

    min_count = min(len(chunks) for chunks in all_chunks.values())
    balanced_chunks = []
    for composer, chunks in all_chunks.items():
        sampled = random.sample(chunks, min_count)
        balanced_chunks.extend((chunk, composer) for chunk in sampled)
    random.shuffle(balanced_chunks)
    return balanced_chunks

# usage

# Build the class-balanced chunk list from the parsed pieces; transposition
# augmentation is applied to Beethoven and Mozart only.
balanced_dataset = create_balanced_chunks(
    data_dict=parsed_data,
    seq_len=70,
    stride=35,
    min_real_notes=50,
    max_chunks_per_piece=20,
    aug_per_chunk=1,
    aug_semis=(-4, -2, 2, 4),
    aug_targets=("Beethoven", "Mozart"),
)

# Persist the list of (chunk, composer) tuples for the later cells.
pkl_name = "../models/RNN/balanced_chunks_seq70_minorthings.pkl"
with open(pkl_name, mode="wb") as f:
    pickle.dump(balanced_dataset, f)
print(f"Balanced dataset saved to {pkl_name} with {len(balanced_dataset)} chunks.")

# checking file and shape

In [3]:
import pickle
import numpy as np
from sklearn.preprocessing import LabelEncoder


# Read back the pickled list of (chunk_array, composer_label) tuples.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("../models/RNN/balanced_chunks_seq70_minorthings.pkl", mode="rb") as f:
    balanced_dataset = pickle.load(f)

# Example of the structure (chunks were cut with seq_len=70):
# [(np.array shape (70, 6), "Bach"), (np.array shape (70, 6), "Mozart"), ...]

# Split the tuples into a feature tensor X and a label vector y
X = np.array([pair[0] for pair in balanced_dataset], dtype=np.float32)
y = np.array([pair[1] for pair in balanced_dataset])

# Map composer names to integer class ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(X.shape)          # (num_samples, 70, 6)
print(y_encoded.shape)  # (num_samples,)
print(le.classes_)      # ['Bach' 'Beethoven' 'Chopin' 'Mozart']

(10152, 70, 6)
(10152,)
['Bach' 'Beethoven' 'Chopin' 'Mozart']


# Feature distributions (sanity check)

In [4]:
# Peek at distributions (sanity check)
import numpy as np

# Flatten all chunks into one (N*L, 6) table: one row per note slot, padding included.
stacked = np.vstack([chunk for chunk, _label in balanced_dataset])
names = ["dt","interval","pc","dur_ratio","velocity","program"]

# Print min / median / 95th percentile / max of every feature column,
# ignoring non-finite entries.
for idx in range(len(names)):
    column = stacked[:, idx]
    finite = column[np.isfinite(column)]
    lowest = finite.min()
    median = np.median(finite)
    p95 = np.percentile(finite, 95)
    highest = finite.max()
    print(f"{names[idx]:<9} min={lowest:.4f} p50={median:.4f} p95={p95:.4f} max={highest:.4f}")

dt        min=0.0000 p50=0.0038 p95=0.3483 max=2.6626
interval  min=-24.0000 p50=3.0000 p95=21.0000 max=24.0000
pc        min=0.0000 p50=5.0000 p95=11.0000 max=11.0000
dur_ratio min=0.0010 p50=3.0000 p95=3.0000 max=3.0000
velocity  min=1.0000 p50=75.0000 p95=116.0000 max=127.0000
program   min=0.0000 p50=0.0000 p95=68.0000 max=120.0000


# Count the number of chunks per composer

In [5]:
import numpy as np

# Count the number of chunks per composer
# Tally how many chunks carry each composer label (np.unique returns labels sorted)
unique_labels, counts = np.unique(y, return_counts=True)

for idx in range(len(unique_labels)):
    print(f"{unique_labels[idx]}: {counts[idx]} chunks")


Bach: 2538 chunks
Beethoven: 2538 chunks
Chopin: 2538 chunks
Mozart: 2538 chunks
