In [1]:
#!pip install miditok
#!pip install symusic
#!pip install glob
#!pip install torch

In [2]:
import glob
import random
from typing import List
from collections import defaultdict

import os
import pandas as pd

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from symusic import Score
from miditok import REMI, TokenizerConfig

from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display

from pretty_midi import PrettyMIDI

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Uses 'cuda' if a gpu is detected. Otherwise uses cpu
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Can also set manually
#DEVICE = 'cpu'
#DEVICE = 'cuda'

print(DEVICE)

cuda


In [4]:
ROOT = "maestro-v3.0.0"            # change if you unpacked elsewhere
meta = pd.read_csv(os.path.join(ROOT, "maestro-v3.0.0.csv"))

def list_midi_files(split):
    paths = meta.loc[meta["split"] == split, "midi_filename"]
    return [os.path.join(ROOT, p) for p in paths]

train_files = list_midi_files("train")        # 962 MIDI files
val_files   = list_midi_files("validation")   # 137
test_files  = list_midi_files("test")         # 177


In [5]:
type(train_files[0])
train_files[0].encode('utf-8').decode('utf-8')
print(train_files[0].encode('utf-8'))
str.encode(train_files[0], 'utf-8')

b'maestro-v3.0.0\\2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi'


b'maestro-v3.0.0\\2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi'

Tokenizer

In [6]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

tokenizer = REMI()  # using defaults parameters (constants.py)
train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)

In [7]:
len(train_loader), len(test_loader)

(241, 45)

### Transformer

In [8]:
class MusicTransformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim=256, num_heads=8, num_layers=6, dropout=0.1, max_seq_len=1024):
        super(MusicTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = nn.Parameter(self._generate_positional_encoding(max_seq_len, embedding_dim), requires_grad=False)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            dim_feedforward=embedding_dim * 4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc_out = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        # x: (batch_size, seq_len)
        x = self.embedding(x) + self.pos_encoder[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        return self.fc_out(x)

    def _generate_positional_encoding(self, max_len, d_model):
        """Creates sinusoidal positional encoding matrix"""
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)  # shape: (1, max_len, d_model)


Training

In [9]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=20, lr=0.001, device=DEVICE):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)

            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)

                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

In [10]:
vocab_size = tokenizer.vocab_size
embedding_dim = 256
hidden_dim = 512
num_layers = 2

model = MusicTransformer(vocab_size, embedding_dim=256, num_heads=8, num_layers=6)
train(model, train_loader, test_loader, vocab_size)

Epoch 1/20 | Train Loss: 4.5337 | Val Loss: 4.4663
Epoch 2/20 | Train Loss: 4.5027 | Val Loss: 4.4593
Epoch 3/20 | Train Loss: 4.4962 | Val Loss: 4.4514
Epoch 4/20 | Train Loss: 4.4362 | Val Loss: 4.2630
Epoch 5/20 | Train Loss: 4.3251 | Val Loss: 4.4517
Epoch 6/20 | Train Loss: 4.4915 | Val Loss: 4.4505
Epoch 7/20 | Train Loss: 4.4898 | Val Loss: 4.4470
Epoch 8/20 | Train Loss: 4.4873 | Val Loss: 4.4457
Epoch 9/20 | Train Loss: 4.4874 | Val Loss: 4.4484
Epoch 10/20 | Train Loss: 4.4864 | Val Loss: 4.4470
Epoch 11/20 | Train Loss: 4.4864 | Val Loss: 4.4459
Epoch 12/20 | Train Loss: 4.4859 | Val Loss: 4.4449
Epoch 13/20 | Train Loss: 4.4860 | Val Loss: 4.4458
Epoch 14/20 | Train Loss: 4.4863 | Val Loss: 4.4440
Epoch 15/20 | Train Loss: 4.4850 | Val Loss: 4.4450
Epoch 16/20 | Train Loss: 4.4851 | Val Loss: 4.4451
Epoch 17/20 | Train Loss: 4.4856 | Val Loss: 4.4490
Epoch 18/20 | Train Loss: 4.4847 | Val Loss: 4.4462
Epoch 19/20 | Train Loss: 4.4839 | Val Loss: 4.4457
Epoch 20/20 | Train L

Sampling

In [75]:
def sample(model, start_token, tokenizer, max_length=512, temperature=1.0, device=DEVICE):
    model.eval()

    # Build ID → string mapping
    if hasattr(tokenizer, 'vocab') and isinstance(tokenizer.vocab, dict):
        id_to_token = {v: k for k, v in tokenizer.vocab.items()}
    elif hasattr(tokenizer, '_vocab'):
        id_to_token = {i: tok for i, tok in enumerate(tokenizer._vocab)}
    else:
        raise RuntimeError("Tokenizer vocab not found")

    generated = [start_token]
    input_seq = torch.tensor([generated], dtype=torch.long, device=device)

    while len(generated) < max_length:
        with torch.no_grad():
            logits = model(input_seq)
            next_logits = logits[0, -1] / temperature
            probs = F.softmax(next_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()

        token_str = id_to_token.get(next_token, "")

        # Always add bar/position/timeshift
        if token_str.startswith(("Bar", "TimeShift", "Position")):
            generated.append(next_token)

        # If it's a pitch, follow with velocity and duration
        elif token_str.startswith("Pitch_"):
            generated.append(next_token)

            # Sample a Velocity token
            velocity_ids = [i for i, tok in id_to_token.items() if tok.startswith("Velocity_")]
            generated.append(random.choice(velocity_ids))

            # Sample a Duration token
            duration_ids = [i for i, tok in id_to_token.items() if tok.startswith("Duration_")]
            generated.append(random.choice(duration_ids))

        # Stop on EOS or PAD
        if token_str in ("EOS_None"):
            break

        input_seq = torch.tensor([generated], dtype=torch.long, device=device)

    return generated


In [79]:
start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, tokenizer, max_length=1024)
#generated_sequence2 = sample(model, generated_sequence1[-1], tokenizer, max_length=1024)

#generated_sequence = generated_sequence1 + generated_sequence2

print("Generated token sequence:")
print(generated_sequence)
print("num tokens:", len(generated_sequence))

Generated token sequence:
[1, 203, 54, 102, 164, 47, 96, 137, 197, 4, 194, 58, 114, 131, 211, 29, 102, 126, 218, 218, 63, 116, 179, 41, 96, 178, 51, 96, 183, 208, 203, 22, 115, 174, 21, 102, 187, 27, 100, 132, 195, 77, 101, 146, 36, 102, 133, 52, 116, 139, 190, 190, 48, 102, 180, 63, 106, 175, 192, 193, 206, 195, 197, 217, 211, 49, 104, 132, 45, 98, 143, 83, 97, 185, 45, 98, 135, 200, 215, 49, 110, 153, 195, 40, 101, 156, 53, 108, 150, 199, 51, 110, 185, 52, 109, 140, 45, 96, 171, 46, 111, 132, 192, 50, 106, 183, 197, 220, 48, 102, 174, 39, 111, 150, 208, 208, 214, 4, 51, 117, 148, 212, 42, 96, 179, 60, 98, 133, 220, 16, 110, 128, 62, 124, 127, 59, 106, 127, 41, 100, 180, 35, 113, 141, 34, 107, 163, 70, 107, 147, 4, 41, 114, 139, 39, 120, 166, 50, 124, 135, 39, 96, 131, 55, 115, 134, 51, 107, 132, 204, 51, 99, 160, 51, 112, 169, 215, 194, 38, 123, 133, 62, 100, 171, 44, 124, 139, 50, 117, 180, 4, 39, 106, 174, 198, 206, 194, 4, 194, 52, 102, 166, 204, 58, 106, 177, 36, 101, 172, 200, 2

In [80]:
output_score = tokenizer.decode([generated_sequence])
output_score.dump_midi(f"transformer.mid")

In [81]:
pretty_midi = PrettyMIDI("transformer.mid")
print("Duration (seconds):", pretty_midi.get_end_time())
for i, instrument in enumerate(pretty_midi.instruments):
    print(f"{instrument.name or 'Unnamed'}:", len(instrument.notes), "notes")

Duration (seconds): 44.625
Acoustic Grand Piano: 269 notes


In [None]:
shortest_time = 1000
for i in range(100):
    start_token = tokenizer.special_tokens_ids[1]
    generated_sequence = sample(model, start_token, tokenizer, max_length=1024)

    output_score = tokenizer.decode([generated_sequence])
    output_score.dump_midi(f"transformer_temp.mid")

    pretty_midi_temp = PrettyMIDI("transformer_temp.mid")
    if(pretty_midi_temp.get_end_time() < shortest_time):
        output_score = tokenizer.decode([generated_sequence])
        output_score.dump_midi(f"transformer.mid")
        shortest_time = pretty_midi.get_end_time()

pretty_midi = PrettyMIDI("transformer.mid")
print("Duration (seconds):", pretty_midi.get_end_time())
for i, instrument in enumerate(pretty_midi.instruments):
    print(f"{instrument.name or 'Unnamed'}:", len(instrument.notes), "notes")
