In [50]:
import numpy as np
np.int = int
import pretty_midi
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import pickle
import os
from pathlib import Path

# List of target composers
target_compsers = ['Bach', 'Beethoven', 'Chopin', 'Mozart']

# Folder containing the dataset
dataset_folder = Path('../data/midi/archive/midiclassics')

# Storing paths of MIDI files for each composer
dataset = []

# Avoid dublicate filenames
seen_filenames = set()

#Looping through each target composer folder
for composer in target_compsers:
    composer_folder = dataset_folder / composer

    # Recursively find all MIDI files in all subfolders in the composer's folder
    for file_path in composer_folder.rglob('*'): # Iterate over this subtree and yield all existing files
        if file_path.suffix.lower() in ['.mid', '.midi']: # Only consider MIDI files
            
            # Check if the filename has already been seen
            if file_path.name not in seen_filenames:
                seen_filenames.add(file_path.name)
                dataset.append((str(file_path), composer)) # Add the file path to the dataset

# Check total files found 
print(f"Total MIDI files found: {len(dataset)}")

# Display the first 10 file paths
for path, composer in dataset[1500:1550]:
    print(f"Composer: {composer}, File Path: {path}")


Total MIDI files found: 1630
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n02 K280.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n03 K281.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n04 K282.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n05 K283.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n06 K284.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n07 K309.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n08 K311.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano Sonata n09 K310.mid
Composer: Mozart, File Path: ..\data\midi\archive\midiclassics\Mozart\Piano Sonatas\Piano S

In [51]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 398445658473397961
xla_global_id: -1
]


In [None]:
import pretty_midi
import numpy as np
import pickle
from tqdm import tqdm

# -----------------------------------
# Function to parse one MIDI file (returns 6 features per note)
def parse_midi_file(file_path):
    try:
        midi = pretty_midi.PrettyMIDI(file_path)
        notes = []

        # Get global tempo (default to 120 if not found)
        tempo_changes = midi.get_tempo_changes()[1]
        global_tempo = round(float(tempo_changes[0]) if len(tempo_changes) > 0 else 120.0, 2)

        # Get instrument notes (skip drums)
        for instrument in midi.instruments:
            if not instrument.is_drum:
                instrument_notes = instrument.notes
                break
        else:
            return []

        # Sort notes by start time
        instrument_notes.sort(key=lambda n: n.start)
        prev_start = 0.0

        for note in instrument_notes:
            pitch = note.pitch  # MIDI number (0–127)
            duration = round(note.end - note.start, 4)  # in seconds
            delta_time = round(note.start - prev_start, 4)  # time from previous note
            velocity = note.velocity  # note strength

            # Is part of a chord? (same start time as another note)
            is_chord = int(any(
                abs(note.start - other.start) < 0.01 and note.pitch != other.pitch
                for other in instrument_notes
            ))

            prev_start = note.start
            notes.append([pitch, duration, delta_time, velocity, global_tempo, is_chord])

        return notes

    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        return []

# -----------------------------------
# Step 2: Parse all files in dataset
sequence_length = 300  # Use 300-note (I used sequences =  100 at first try) 
parsed_sequences_300 = []
labels_300 = []

for path, composer in tqdm(dataset, desc="Parsing MIDI files (300 tokens)"):
    sequence = parse_midi_file(path)

    # Pad or truncate to 300 notes per song
    if len(sequence) > sequence_length:
        sequence = sequence[:sequence_length]
    else:
        pad_size = sequence_length - len(sequence)
        sequence += [[0, 0.0, 0.0, 0, 120.0, 0]] * pad_size  # padding values

    parsed_sequences_300.append(sequence)
    labels_300.append(composer)

# -----------------------------------
# Step 3: Convert to arrays
X_300 = np.array(parsed_sequences_300, dtype=np.float32)
y_300 = np.array(labels_300)

# Step 4: Print shapes and preview
print("Final dataset shapes (300 tokens, 6 features):")
print(f"X_300 shape (songs, notes, features): {X_300.shape}")
print(f"y_300 shape (labels): {y_300.shape}")
print("Sample row (first song, first 5 notes):")
print(X_300[0][:5])
print(f" Label: {y_300[0]}")

# -----------------------------------
# Step 5: Save for training
with open("parsed_midi_lstm_ready_300tokens_6features.pkl", "wb") as f:
    pickle.dump({'X': X_300, 'y': y_300}, f)

print("Saved parsed data to 'parsed_midi_lstm_ready_300tokens_6features.pkl'")


In [79]:
import pickle

# Load the parsed MIDI dataset
with open("parsed_midi_lstm_ready_300tokens_6features.pkl", "rb") as f:
    data = pickle.load(f)

X_300 = data['X']
y_300 = data['y']

print(f"Loaded shapes: X_300 = {X_300.shape}, y_300 = {y_300.shape}")


Loaded shapes: X_300 = (1630, 300, 6), y_300 = (1630,)


In [80]:
# Normalize the input X (shape: num_samples, 100, 6)
X_norm = np.copy(X_300)  

# Feature 0: pitch (0–127) → scale to [0,1]
X_norm[:, :, 0] = X_norm[:, :, 0] / 127.0

# Feature 1: duration (seconds) → z-score normalization
dur_mean = X_norm[:, :, 1].mean()
dur_std = X_norm[:, :, 1].std()
X_norm[:, :, 1] = (X_norm[:, :, 1] - dur_mean) / (dur_std + 1e-6)

# Feature 2: delta_time → z-score normalization
delta_mean = X_norm[:, :, 2].mean()
delta_std = X_norm[:, :, 2].std()
X_norm[:, :, 2] = (X_norm[:, :, 2] - delta_mean) / (delta_std + 1e-6)

# Feature 3: velocity (0–127) → scale to [0,1]
X_norm[:, :, 3] = X_norm[:, :, 3] / 127.0

# Feature 4: tempo (common range: ~60–240 bpm) → scale to [0,1] using 240 as max
X_norm[:, :, 4] = X_norm[:, :, 4] / 240.0

# Feature 5: is_chord → already 0 or 1, keep unchanged
# No action needed

print("Normalization complete.")
print("Sample normalized input (first row):")
print(X_norm[0][:5])


Normalization complete.
Sample normalized input (first row):
[[ 0.41732284 -0.31523398 -0.13586776  0.23622048  0.5         0.        ]
 [ 0.4488189  -0.31523398 -0.0850317   0.23622048  0.5         0.        ]
 [ 0.47244096 -0.31523398 -0.0850317   0.23622048  0.5         0.        ]
 [ 0.511811   -0.31523398 -0.0850317   0.23622048  0.5         0.        ]
 [ 0.54330707 -0.31523398 -0.0850317   0.23622048  0.5         0.        ]]


### Ignoring balacing data for now to find a proper way

In [43]:
# from collections import defaultdict
# import numpy as np
# import pickle

# # Set your token budget (e.g., 13000 tokens per composer)
# MAX_TOKENS_PER_COMPOSER = 13000

# # Group normalized sequences by composer
# composer_to_sequences = defaultdict(list)
# for seq, composer in zip(X_norm, y_300):  # Use normalized data and correct labels
#     composer_to_sequences[composer].append(seq)

# # ⚖ Select sequences to match max token count per composer
# selected_sequences = []
# selected_labels = []

# for composer, sequences in composer_to_sequences.items():
#     np.random.shuffle(sequences)  # Shuffle for randomness
#     token_count = 0
#     for seq in sequences:
#         token_count += len(seq)  # Each seq has 300 tokens (notes)
#         if token_count > MAX_TOKENS_PER_COMPOSER:
#             break
#         selected_sequences.append(seq)
#         selected_labels.append(composer)

# # Convert to NumPy arrays
# X_balanced = np.array(selected_sequences, dtype=np.float32)
# y_balanced = np.array(selected_labels)

# # Print result summary
# print("After token-based balancing:")
# for composer in np.unique(y_balanced):
#     count = np.sum(y_balanced == composer)
#     print(f"{composer}: {count} sequences (≈ {count * 300} tokens)")

# print(f"X_balanced shape: {X_balanced.shape}")
# print(f"y_balanced shape: {y_balanced.shape}")

# #  Save the balanced dataset
# with open("parsed_midi_lstm_balanced_300tokens_norm.pkl", "wb") as f:
#     pickle.dump({'X': X_balanced, 'y': y_balanced}, f)

# print(" Saved balanced dataset to 'parsed_midi_lstm_balanced_300tokens_norm.pkl'")


In [81]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

#Encode the balanced labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_300)  # Convert to integer labels: [0,1,2,3]
y_onehot = to_categorical(y_encoded)  # Convert to one-hot format: [[1,0,0,0], ...]

# See label mapping
print("Label Mapping:")
for i, name in enumerate(label_encoder.classes_):
    print(f"{i}: {name}")


Label Mapping:
0: Bach
1: Beethoven
2: Chopin
3: Mozart


In [82]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Masking, Bidirectional
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(Input(shape=(300, 6))) 
model.add(Masking(mask_value=0.0))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
# model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])



In [83]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1)


In [84]:
# from sklearn.model_selection import train_test_split

# X_train, X_val, y_train, y_val = train_test_split(X, y_onehot, test_size=0.2, random_state=42, stratify=y_encoded)

# history = model.fit(X_train, y_train,
#                     validation_data=(X_val, y_val),
#                     epochs=50,
#                     batch_size=32)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# ------------------------------------------
# Step 1: Encode the balanced labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_300)  # Convert to integer labels: [0,1,2,3]
y_onehot = to_categorical(y_encoded)  # Convert to one-hot format: [[1,0,0,0], ...]

# Optional: See label mapping
print("Label Mapping:")
for i, name in enumerate(label_encoder.classes_):
    print(f"{i}: {name}")

# ------------------------------------------
# Step 2: Train/Validation Split
X_train, X_val, y_train, y_val = train_test_split(
    X_300, y_onehot,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded  # stratify based on integer labels
)

print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")

# ------------------------------------------
# Step 3: Train your LSTM model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop]  # Use early stopping to prevent overfitting
)


Label Mapping:
0: Bach
1: Beethoven
2: Chopin
3: Mozart
Train shape: (1304, 300, 6), Validation shape: (326, 300, 6)
Epoch 1/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 143ms/step - accuracy: 0.5958 - loss: 1.0242 - val_accuracy: 0.6411 - val_loss: 0.8588
Epoch 2/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 137ms/step - accuracy: 0.6633 - loss: 0.8788 - val_accuracy: 0.6779 - val_loss: 0.8474
Epoch 3/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 137ms/step - accuracy: 0.6486 - loss: 0.8857 - val_accuracy: 0.6748 - val_loss: 0.8065
Epoch 4/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 141ms/step - accuracy: 0.6424 - loss: 0.8511 - val_accuracy: 0.6871 - val_loss: 0.8047
Epoch 5/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 150ms/step - accuracy: 0.7013 - loss: 0.7776 - val_accuracy: 0.6748 - val_loss: 0.8119
Epoch 6/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [85]:
# Predict on validation set
y_pred_probs = model.predict(X_val)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_val, axis=1)


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step


In [86]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Accuracy
acc = accuracy_score(y_true, y_pred)

# Precision & Recall (macro for balanced view across all classes)
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')

print(f"Accuracy : {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")


Accuracy : 0.7147
Precision: 0.5045
Recall   : 0.4853
