## Assignment 2
### Collaborators: Aldrin Ilagan, Lindsey Rappaport, Yvanna Cardenas

In [2]:
# Import packages (same from Workbook 3)
import glob
import random
from typing import List
from collections import defaultdict

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from symusic import Score
from miditok import REMI, TokenizerConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
random.seed(42)

### Symbolic,  unconditioned generation

In [4]:
# Parse data
file_path = './maestro-v3.0.0/'
test_files = []
# Set train files to years 2004-2011
train_files = glob.glob(file_path + '2004/*.midi')
# train_files += glob.glob(file_path + '2006/*.midi')
# train_files += glob.glob(file_path + '2008/*.midi')
# train_files += glob.glob(file_path + '2009/*.midi')       # including the rest makes training take 2 min / epoch (very slow!)
# train_files += glob.glob(file_path + '2011/*.midi')

# Set test files to years 2013-2018
test_files = glob.glob(file_path + '2013/*.midi')
# test_files += glob.glob(file_path + '2014/*.midi')
# test_files += glob.glob(file_path + '2015/*.midi')
# test_files += glob.glob(file_path + '2017/*.midi')
# test_files += glob.glob(file_path + '2018/*.midi')

print(len(train_files))
print(len(test_files))


132
127


In [5]:
# Build Markov chain from training data
from miditok import REMI
from miditoolkit import MidiFile
from collections import defaultdict, Counter
import glob

# Initialize REMI tokenizer
tokenizer = REMI()

# Get all training MIDI file paths from MAESTRO v3.0.0 (years 2004–2011)
train_files = glob.glob('./maestro-v3.0.0/**/**/*.midi', recursive=True)
train_files = [f for f in train_files if any(str(y) in f for y in range(2004, 2012))]

# Initialize transition counter: token -> next_token -> count
transitions = defaultdict(Counter)

# Iterate over MIDI files and count token transitions
for file in train_files:
    encodings = tokenizer.encode(file)  # returns list of Encoding objects
    for encoding in encodings:
        ids = encoding.ids
        for i in range(len(ids) - 1):
            transitions[ids[i]][ids[i + 1]] += 1

# Normalize transition counts into probabilities
markov_model = {
    k: {kk: vv / sum(v.values()) for kk, vv in v.items()}
    for k, v in transitions.items()
}

print(f"Built Markov model with {len(markov_model)} unique starting tokens.")


Built Markov model with 217 unique starting tokens.


In [6]:
def generate_markov_sequence(start_token, length=200):
    sequence = [start_token]
    for _ in range(length - 1):
        curr = sequence[-1]
        next_tokens = markov_model.get(curr, {})
        if not next_tokens:
            break
        next_token = random.choices(
            list(next_tokens.keys()), weights=list(next_tokens.values()), k=1
        )[0]
        sequence.append(next_token)
    return sequence

In [7]:
from miditok import TokSequence

# Generate a sequence using the Markov chain
start = random.choice(list(markov_model.keys()))
generated_ids = generate_markov_sequence(start)

# Wrap in a TokSequence and decode to ScoreTick
seq = TokSequence(ids=generated_ids)

try:
    score = tokenizer.decode([seq])  # returns symusic ScoreTick
    score.dump_midi("symbolic_unconditioned.mid")  # save directly to MIDI
    print("Saved symbolic_unconditioned.mid ✅")
except KeyError as e:
    print("❌ Decode failed due to invalid token ID:", e)

Saved symbolic_unconditioned.mid ✅


In [8]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

tokenizer = REMI()  # using defaults parameters (constants.py)
train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)


### Define RNN model (LSTM)

In [9]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(MusicRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch_size, seq_length)
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # out: (batch_size, seq_length, hidden_dim)
        out = self.fc(out)  # (batch_size, seq_length, vocab_size)
        return out, hidden

### Define training function

In [10]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=10, lr=0.001, device='mps'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)

            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)

                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2

    model = MusicRNN(vocab_size, embedding_dim, hidden_dim, num_layers)
    train(model, train_loader, test_loader, vocab_size)

Epoch 1/10 | Train Loss: 3.1528 | Val Loss: 2.6854
Epoch 2/10 | Train Loss: 2.5979 | Val Loss: 2.5191
Epoch 3/10 | Train Loss: 2.4555 | Val Loss: 2.4162
Epoch 4/10 | Train Loss: 2.3355 | Val Loss: 2.3572
Epoch 5/10 | Train Loss: 2.2258 | Val Loss: 2.3232
Epoch 6/10 | Train Loss: 2.1148 | Val Loss: 2.3184
Epoch 7/10 | Train Loss: 1.9908 | Val Loss: 2.3237
Epoch 8/10 | Train Loss: 1.8549 | Val Loss: 2.3573
Epoch 9/10 | Train Loss: 1.7168 | Val Loss: 2.4284
Epoch 10/10 | Train Loss: 1.5718 | Val Loss: 2.5037


### Define sampling function

In [11]:
def sample(model, start_token, max_length=100, temperature=1.0, device='mps'):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)

    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        output = output[:, -1, :]  # take the last output
        output = output / temperature  # adjust randomness

        probs = F.softmax(output, dim=-1)  # (1, vocab_size)
        next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        if next_token == 2 or next_token == 0: # reach end of sequence
          break

        input_token = torch.tensor([[next_token]], device=device)

    return generated

start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=1024)

print("Generated token sequence:")
print(generated_sequence)

Generated token sequence:
[1, 4, 205, 63, 110, 126, 207, 62, 111, 126, 208, 63, 113, 126, 39, 112, 125, 27, 111, 125, 48, 111, 125, 218, 65, 112, 127, 220, 58, 111, 126, 27, 112, 126, 39, 110, 126, 34, 109, 125, 4, 198, 60, 113, 125, 199, 58, 111, 125, 49, 111, 125, 53, 111, 125, 201, 56, 109, 125, 202, 55, 108, 126, 204, 56, 112, 125, 48, 111, 125, 212, 60, 110, 125, 60, 111, 125, 51, 104, 125, 213, 63, 110, 125, 214, 58, 110, 125, 46, 104, 125, 216, 49, 107, 126, 56, 111, 126, 217, 55, 111, 126, 218, 56, 111, 126, 219, 51, 110, 129, 36, 104, 147, 4, 190, 60, 109, 126, 192, 58, 110, 126, 193, 58, 108, 125, 194, 60, 110, 125, 195, 56, 107, 125, 197, 56, 110, 125, 198, 55, 109, 126, 200, 56, 109, 125, 201, 29, 104, 137, 55, 107, 125, 203, 56, 107, 126, 205, 60, 107, 126, 207, 58, 108, 126, 208, 56, 109, 125, 210, 54, 109, 126, 212, 55, 109, 126, 213, 56, 110, 125, 215, 53, 112, 126, 216, 41, 104, 125, 56, 110, 125, 218, 58, 112, 126, 220, 60, 112, 126, 4, 189, 60, 112, 127, 191, 58, 111

In [13]:
from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display
fs = FluidSynth("FluidR3Mono_GM.sf3") # Initialize FluidSynth

In [14]:

output_score = tokenizer.tokens_to_midi([generated_sequence])
output_score.dump_midi(f"rnn.mid")
fs.midi_to_audio("rnn.mid", "rnn.wav")
display(Audio("rnn.wav"))

  output_score = tokenizer.tokens_to_midi([generated_sequence])


FluidSynth runtime version 2.4.5
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

Rendering audio to file 'rnn.wav'..


### Symbolic, conditioned generation (harmonization)

In [17]:
# Define simple harmonization model with LSTM
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class HarmonizationDataset(Dataset):
    def __init__(self, file_paths, tokenizer):
        self.data = []
        for path in file_paths[:100]:  # limit to 100 for speed
            try:
                encodings = tokenizer.encode(path)
                ids = []
                for encoding in encodings:
                    ids.extend(encoding.ids)  # flatten all token segments
                for i in range(len(ids) - 16):
                    melody = ids[i:i+8]
                    chords = ids[i+8:i+16]
                    self.data.append((melody, chords))
            except Exception as e:
                print(f"Skipping file {path} due to error: {e}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        melody, chords = self.data[idx]
        return torch.tensor(melody), torch.tensor(chords)

class HarmonizationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        output, _ = self.lstm(x)
        logits = self.fc(output)
        return logits

In [23]:
# Prepare training dataset and dataloader
train_files = glob.glob('maestro-v3.0.0/*/*.mid*')
train_files = train_files[:20] 
dataset = HarmonizationDataset(train_files, tokenizer)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize model, loss, optimizer
model = HarmonizationModel(tokenizer.vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# Simple training loop (no val loss tracking)
for epoch in range(10):
    model.train()
    train_losses = []
    for melody, chords in loader:
        preds = model(melody)
        loss = loss_fn(preds.view(-1, preds.shape[-1]), chords.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    avg_train_loss = sum(train_losses) / len(train_losses)
    print(f"Epoch {epoch+1}/10 | Train Loss: {avg_train_loss:.4f}")

Epoch 1/10 | Train Loss: 3.7513
Epoch 2/10 | Train Loss: 3.5983
Epoch 3/10 | Train Loss: 3.5000
Epoch 4/10 | Train Loss: 3.4238
Epoch 5/10 | Train Loss: 3.3658
Epoch 6/10 | Train Loss: 3.3205
Epoch 7/10 | Train Loss: 3.2849
Epoch 8/10 | Train Loss: 3.2560
Epoch 9/10 | Train Loss: 3.2317
Epoch 10/10 | Train Loss: 3.2125


In [44]:
from miditok import TokSequence

# 1) Combine melody + predicted chords into one list of IDs
combined = melody.tolist() + pred_ids

# 2) Wrap in a TokSequence
seq = TokSequence(ids=combined)

# 3) Decode into a ScoreTick (no output_path here)
score = tokenizer.decode([seq])  # returns ScoreTick

# 4) Dump the ScoreTick to a .mid file (must not collide with an existing folder)
score.dump_midi("symbolic_conditioned.mid")
print("✅ Saved symbolic_conditioned.mid")

✅ Saved symbolic_conditioned.mid
