# Install and load required libraries

In [6]:
import glob
import random
from typing import List
from collections import defaultdict

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from symusic import Score
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator

In [7]:
train_files = glob.glob("./data/train/*.mid")
test_files = glob.glob("./data/test/*.mid")

# RNN for MIDI generation

## A New Dataset for batch inputs

In [12]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

tokenizer = REMI()  # using defaults parameters (constants.py)
train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)

In [13]:
len(train_loader), len(test_loader)

(49, 3)

## RNN

In [14]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(MusicRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch_size, seq_length)
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # out: (batch_size, seq_length, hidden_dim)
        out = self.fc(out)  # (batch_size, seq_length, vocab_size)
        return out, hidden

### Training

In [17]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=20, lr=0.001, device='cpu'):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)

            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)

                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2

    model = MusicRNN(vocab_size, embedding_dim, hidden_dim, num_layers)
    train(model, train_loader, test_loader, vocab_size)

Epoch 1/20 | Train Loss: 3.8777 | Val Loss: 2.6451
Epoch 2/20 | Train Loss: 2.7362 | Val Loss: 2.1661
Epoch 3/20 | Train Loss: 2.4909 | Val Loss: 2.0267
Epoch 4/20 | Train Loss: 2.3528 | Val Loss: 1.9302
Epoch 5/20 | Train Loss: 2.2377 | Val Loss: 1.8431
Epoch 6/20 | Train Loss: 2.1123 | Val Loss: 1.6932
Epoch 7/20 | Train Loss: 1.9645 | Val Loss: 1.5815
Epoch 8/20 | Train Loss: 1.8526 | Val Loss: 1.5218
Epoch 9/20 | Train Loss: 1.7708 | Val Loss: 1.4791
Epoch 10/20 | Train Loss: 1.6985 | Val Loss: 1.4198
Epoch 11/20 | Train Loss: 1.6344 | Val Loss: 1.3965
Epoch 12/20 | Train Loss: 1.5806 | Val Loss: 1.3807
Epoch 13/20 | Train Loss: 1.5106 | Val Loss: 1.3099
Epoch 14/20 | Train Loss: 1.4379 | Val Loss: 1.2888
Epoch 15/20 | Train Loss: 1.3872 | Val Loss: 1.2589
Epoch 16/20 | Train Loss: 1.3240 | Val Loss: 1.2357
Epoch 17/20 | Train Loss: 1.2792 | Val Loss: 1.2254
Epoch 18/20 | Train Loss: 1.2526 | Val Loss: 1.2113
Epoch 19/20 | Train Loss: 1.1841 | Val Loss: 1.2201
Epoch 20/20 | Train L

### Sampling

In [19]:
def sample(model, start_token, max_length=100, temperature=1.0, device='cpu'):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)

    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        output = output[:, -1, :]  # take the last output
        output = output / temperature  # adjust randomness

        probs = F.softmax(output, dim=-1)  # (1, vocab_size)
        next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        if next_token == 2 or next_token == 0: # reach end of sequence
          break

        input_token = torch.tensor([[next_token]], device=device)

    return generated

start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=1024)

print("Generated token sequence:")
print(generated_sequence)

Generated token sequence:
[1, 4, 189, 49, 113, 128, 53, 113, 128, 41, 113, 128, 193, 53, 111, 128, 58, 111, 128, 197, 58, 113, 128, 61, 113, 128, 58, 113, 128, 201, 46, 112, 128, 53, 112, 128, 205, 56, 113, 126, 52, 112, 128, 207, 61, 114, 125, 209, 53, 112, 128, 63, 112, 128, 211, 56, 110, 126, 213, 54, 111, 128, 56, 111, 128, 217, 51, 110, 126, 56, 110, 126, 219, 52, 110, 126, 4, 189, 56, 110, 140, 51, 111, 126, 191, 49, 109, 126, 193, 54, 112, 126, 195, 54, 111, 126, 197, 51, 110, 126, 42, 110, 128, 199, 56, 111, 126, 201, 56, 111, 126, 46, 111, 128, 203, 56, 111, 126, 205, 56, 114, 126, 61, 114, 126, 207, 61, 112, 126, 209, 58, 111, 126, 39, 111, 128, 211, 65, 113, 126, 213, 63, 114, 126, 217, 63, 112, 126, 51, 112, 128, 219, 56, 110, 126, 4, 189, 68, 110, 128, 63, 110, 128, 193, 58, 110, 126, 195, 56, 110, 126, 197, 53, 110, 132, 58, 110, 132, 37, 110, 126, 199, 39, 112, 126, 201, 41, 110, 126, 203, 41, 111, 126, 205, 51, 112, 126, 68, 112, 126, 35, 111, 128, 207, 58, 110, 126, 20

In [26]:
from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display
fs = FluidSynth("FluidR3Mono_GM.sf3") # Initialize FluidSynth

output_score = tokenizer.decode([generated_sequence])
output_score.dump_midi(f"rnn.mid")
fs.midi_to_audio("rnn.mid", "rnn.wav")
display(Audio("rnn.wav"))

RuntimeError: File not found file (error:13): rnn.mid