In [12]:
#!pip install miditok
#!pip install symusic
#!pip install glob
#!pip install torch

In [13]:
import glob
import random
from typing import List
from collections import defaultdict

import os
import pandas as pd

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from symusic import Score
from miditok import REMI, TokenizerConfig

from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display

from pretty_midi import PrettyMIDI

In [14]:
# Uses 'cuda' if a gpu is detected. Otherwise uses cpu
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Can also set manually
#DEVICE = 'cpu'
#DEVICE = 'cuda'

print(DEVICE)

cuda


In [15]:
ROOT = "maestro-v3.0.0"            # change if you unpacked elsewhere
meta = pd.read_csv(os.path.join(ROOT, "maestro-v3.0.0.csv"))

def list_midi_files(split):
    paths = meta.loc[meta["split"] == split, "midi_filename"]
    return [os.path.join(ROOT, p) for p in paths]

train_files = list_midi_files("train")        # 962 MIDI files
val_files   = list_midi_files("validation")   # 137
test_files  = list_midi_files("test")         # 177


In [16]:
type(train_files[0])
train_files[0].encode('utf-8').decode('utf-8')
print(train_files[0].encode('utf-8'))
str.encode(train_files[0], 'utf-8')

b'maestro-v3.0.0\\2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi'


b'maestro-v3.0.0\\2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi'

Tokenizer

In [17]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

tokenizer = REMI()  # using defaults parameters (constants.py)
train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)

In [18]:
len(train_loader), len(test_loader)

(241, 45)

### RNN

In [19]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(MusicRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch_size, seq_length)
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # out: (batch_size, seq_length, hidden_dim)
        out = self.fc(out)  # (batch_size, seq_length, vocab_size)
        return out, hidden

Training

In [20]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=20, lr=0.001, device=DEVICE):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)

            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)

                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2

    model = MusicRNN(vocab_size, embedding_dim, hidden_dim, num_layers)
    train(model, train_loader, test_loader, vocab_size)

Epoch 1/20 | Train Loss: 3.2964 | Val Loss: 2.8257
Epoch 2/20 | Train Loss: 2.6973 | Val Loss: 2.6007
Epoch 3/20 | Train Loss: 2.5396 | Val Loss: 2.5278
Epoch 4/20 | Train Loss: 2.4392 | Val Loss: 2.4577
Epoch 5/20 | Train Loss: 2.3476 | Val Loss: 2.4080
Epoch 6/20 | Train Loss: 2.2708 | Val Loss: 2.3961
Epoch 7/20 | Train Loss: 2.2010 | Val Loss: 2.3785
Epoch 8/20 | Train Loss: 2.1327 | Val Loss: 2.3695
Epoch 9/20 | Train Loss: 2.0663 | Val Loss: 2.3833
Epoch 10/20 | Train Loss: 1.9955 | Val Loss: 2.4033
Epoch 11/20 | Train Loss: 1.9265 | Val Loss: 2.4172
Epoch 12/20 | Train Loss: 1.8580 | Val Loss: 2.4365
Epoch 13/20 | Train Loss: 1.7882 | Val Loss: 2.4719
Epoch 14/20 | Train Loss: 1.7257 | Val Loss: 2.5060
Epoch 15/20 | Train Loss: 1.6581 | Val Loss: 2.5525
Epoch 16/20 | Train Loss: 1.5921 | Val Loss: 2.6076
Epoch 17/20 | Train Loss: 1.5318 | Val Loss: 2.6554
Epoch 18/20 | Train Loss: 1.4685 | Val Loss: 2.7075
Epoch 19/20 | Train Loss: 1.4081 | Val Loss: 2.7746
Epoch 20/20 | Train L

Sampling

In [43]:
def sample(model, start_token, max_length=100, temperature=1.0, device=DEVICE):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)

    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        output = output[:, -1, :]  # take the last output
        output = output / temperature  # adjust randomness

        probs = F.softmax(output, dim=-1)  # (1, vocab_size)
        next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        if next_token == 2 or next_token == 0: # reach end of sequence
          break

        input_token = torch.tensor([[next_token]], device=device)

    return generated

In [50]:
start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=2048)

print("Generated token sequence:")
print(generated_sequence)

Generated token sequence:
[1, 4, 189, 46, 104, 126, 193, 32, 107, 125, 51, 106, 125, 194, 20, 104, 125, 195, 48, 105, 125, 196, 32, 103, 126, 44, 106, 126, 199, 20, 103, 125, 200, 29, 105, 125, 202, 39, 105, 125, 203, 37, 105, 125, 204, 32, 104, 125, 205, 37, 106, 125, 206, 32, 105, 125, 44, 108, 125, 208, 39, 109, 125, 32, 105, 125, 37, 108, 125, 209, 37, 109, 125, 210, 32, 111, 125, 210, 37, 110, 125, 212, 44, 110, 125, 213, 32, 110, 126, 39, 108, 125, 215, 36, 108, 125, 216, 37, 113, 126, 39, 113, 126, 217, 51, 114, 126, 218, 20, 111, 125, 32, 110, 125, 220, 37, 109, 125, 49, 113, 125, 4, 189, 41, 110, 125, 190, 37, 111, 126, 32, 112, 125, 44, 112, 125, 191, 41, 112, 126, 192, 29, 101, 125, 49, 109, 125, 193, 37, 112, 125, 51, 112, 125, 194, 44, 110, 125, 196, 32, 106, 161, 49, 113, 126, 197, 41, 103, 146, 199, 37, 109, 125, 200, 41, 108, 125, 202, 37, 106, 125, 204, 41, 106, 125, 205, 44, 107, 125, 206, 41, 108, 125, 207, 37, 109, 126, 208, 32, 109, 125, 209, 37, 109, 125, 210, 39,

Convert Midi to Wav

In [51]:
fs = FluidSynth("FluidR3Mono_GM.sf3") # Initialize FluidSynth

output_score = tokenizer.tokens_to_midi([generated_sequence])
output_score.dump_midi(f"rnn.mid")

  output_score = tokenizer.tokens_to_midi([generated_sequence])


In [52]:
pretty_midi = PrettyMIDI("rnn.mid")
print("Duration (seconds):", pretty_midi.get_end_time())
for i, instrument in enumerate(pretty_midi.instruments):
    print(f"{instrument.name or 'Unnamed'}:", len(instrument.notes), "notes")

Duration (seconds): 36.1875
Acoustic Grand Piano: 545 notes
