In [39]:
#!pip install miditok
#!pip install symusic
#!pip install glob
#!pip install torch
!pip install midi2audio


Collecting midi2audio
  Downloading midi2audio-0.1.1-py2.py3-none-any.whl.metadata (5.7 kB)
Downloading midi2audio-0.1.1-py2.py3-none-any.whl (8.7 kB)
Installing collected packages: midi2audio
Successfully installed midi2audio-0.1.1



[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: C:\Users\enriq\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [40]:
import glob
import random
from typing import List
from collections import defaultdict
import midi2audio

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from symusic import Score
from miditok import REMI, TokenizerConfig

In [26]:
# Use 'cuda' to run on a gpu
DEVICE = 'cpu'
#DEVICE = 'cuda'

In [None]:
import zipfile

with zipfile.ZipFile("POP909.zip", "r") as zip_ref:
    zip_ref.extractall("./data")
print("Unzipped POP909.zip to ./data")


Unzipped POP909.zip to ./data


In [None]:
# train_files = glob.glob("./data/POP909/*/*.mid")
# test_files = glob.glob("./data/POP909/test/*.mid")
# print(f"Found {len(train_files)} training files and {len(test_files)} test files.")

Found 909 training files and 0 test files.


In [28]:
import random

all_files = glob.glob("./data/POP909/*/*.mid")
random.shuffle(all_files)

split_idx = int(0.8 * len(all_files))
train_files = all_files[:split_idx]
test_files = all_files[split_idx:]

print(f"Training files: {len(train_files)}, Testing files: {len(test_files)}")

Training files: 727, Testing files: 182


In [None]:
#train_files = glob.glob("./data/POP909/*.mid")
#test_files = glob.glob("./data/test/*.mid")


Found 0 training files and 0 test files.


In [29]:
type(train_files[0])
train_files[0].encode('utf-8').decode('utf-8')
print(train_files[0].encode('utf-8'))
str.encode(train_files[0], 'utf-8')

b'./data/POP909\\011\\011.mid'


b'./data/POP909\\011\\011.mid'

In [30]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

tokenizer = REMI()  # using defaults parameters (constants.py)
train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)

In [31]:
len(train_loader), len(test_loader)

(182, 46)

### RNN

In [34]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(MusicRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch_size, seq_length)
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # out: (batch_size, seq_length, hidden_dim)
        out = self.fc(out)  # (batch_size, seq_length, vocab_size)
        return out, hidden

Training

In [35]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=20, lr=0.001, device=DEVICE):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)

            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)

                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2

    model = MusicRNN(vocab_size, embedding_dim, hidden_dim, num_layers)
    train(model, train_loader, test_loader, vocab_size)

Epoch 1/20 | Train Loss: 3.2900 | Val Loss: 2.6637
Epoch 2/20 | Train Loss: 2.5365 | Val Loss: 2.4047
Epoch 3/20 | Train Loss: 2.2119 | Val Loss: 2.0299
Epoch 4/20 | Train Loss: 1.9321 | Val Loss: 1.8693
Epoch 5/20 | Train Loss: 1.7318 | Val Loss: 1.6806
Epoch 6/20 | Train Loss: 1.6205 | Val Loss: 1.6309
Epoch 7/20 | Train Loss: 1.5397 | Val Loss: 1.5455
Epoch 8/20 | Train Loss: 1.4702 | Val Loss: 1.5235
Epoch 9/20 | Train Loss: 1.4297 | Val Loss: 1.5075
Epoch 10/20 | Train Loss: 1.3973 | Val Loss: 1.5015
Epoch 11/20 | Train Loss: 1.3680 | Val Loss: 1.4992
Epoch 12/20 | Train Loss: 1.3377 | Val Loss: 1.4925
Epoch 13/20 | Train Loss: 1.3085 | Val Loss: 1.4918
Epoch 14/20 | Train Loss: 1.2779 | Val Loss: 1.4982
Epoch 15/20 | Train Loss: 1.2474 | Val Loss: 1.5097
Epoch 16/20 | Train Loss: 1.2111 | Val Loss: 1.5195
Epoch 17/20 | Train Loss: 1.1772 | Val Loss: 1.5282
Epoch 18/20 | Train Loss: 1.1367 | Val Loss: 1.5489
Epoch 19/20 | Train Loss: 1.0982 | Val Loss: 1.5755
Epoch 20/20 | Train L

Sampling

In [36]:
def sample(model, start_token, max_length=100, temperature=1.0, device=DEVICE):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)

    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        output = output[:, -1, :]  # take the last output
        output = output / temperature  # adjust randomness

        probs = F.softmax(output, dim=-1)  # (1, vocab_size)
        next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        if next_token == 2 or next_token == 0: # reach end of sequence
          break

        input_token = torch.tensor([[next_token]], device=device)

    return generated

start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=1024)

print("Generated token sequence:")
print(generated_sequence)

Generated token sequence:
[1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 196, 56, 112, 130, 204, 59, 116, 130, 212, 61, 114, 134, 4, 192, 56, 113, 127, 196, 59, 115, 126, 200, 56, 116, 127, 204, 52, 115, 128, 208, 56, 113, 143, 4, 200, 47, 115, 127, 204, 52, 114, 127, 208, 52, 114, 127, 212, 54, 114, 127, 216, 61, 116, 137, 4, 204, 59, 116, 129, 210, 61, 114, 125, 212, 61, 116, 130, 220, 59, 115, 126, 4, 192, 59, 116, 126, 196, 61, 116, 127, 200, 66, 114, 132, 208, 54, 115, 137, 4, 192, 52, 116, 127, 196, 54, 114, 127, 200, 54, 115, 128, 204, 54, 114, 127, 208, 54, 114, 128, 212, 54, 114, 127, 216, 52, 115, 134, 4, 196, 52, 114, 125, 198, 54, 114, 127, 202, 54, 114, 133, 212, 52, 113, 128, 216, 47, 113, 128, 220, 52, 114, 134, 4, 200, 59, 114, 125, 202, 59, 114, 125, 204, 54, 114, 125, 206, 56, 115, 125, 208, 54, 114, 127, 212, 52, 114, 126, 216, 51, 114, 162, 4, 4, 204, 59, 115, 125, 206, 59, 116, 126, 208, 61, 116, 125, 210, 57, 114, 125, 212, 54, 115, 127, 216, 59, 115, 127, 220, 57, 114, 127, 4

Convert Midi to Wav

In [52]:
from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display
fs = FluidSynth("FluidR3_GM.sf2") # Initialize FluidSynth

output_score = tokenizer.tokens_to_midi([generated_sequence])
output_score.dump_midi(f"rnn.mid")
fs.midi_to_audio("rnn.mid", "rnn.wav")
display(Audio("rnn.wav"))



  output_score = tokenizer.tokens_to_midi([generated_sequence])


FileNotFoundError: [WinError 2] The system cannot find the file specified