In [714]:
import pandas as pd
import torch

data = {
    "canonical_composer": [
        "Johann Sebastian Bach",
        "Ludwig van Beethoven",
        "Frédéric Chopin",
        "Wolfgang Amadeus Mozart",
        "Claude Debussy",
    ],
    "canonical_title": [
        "Prelude in C Major, BWV 846",
        "Piano Sonata No. 14 in C-sharp minor, Op. 27 No. 2 (Moonlight Sonata)",
        "Nocturne in E-flat major, Op. 9 No. 2",
        "Piano Sonata No. 11 in A major, K. 331 (Rondo Alla Turca)",
        "Clair de Lune, Suite bergamasque, L. 75",
    ],
    "split": ["train", "train", "train", "test", "test"],
    "year": [2018, 2018, 2018, 2019, 2019],
    "text_annotation": [
        "A soothing and calming piece by Bach, featuring a serene melody.",
        "A dramatic and emotional piano sonata by Beethoven, known for its contrasting movements.",
        "A gentle and lyrical nocturne by Chopin, featuring a beautiful melody.",
        "A playful and energetic piano sonata by Mozart, known for its catchy Rondo Alla Turca.",
        "A dreamy and evocative piano piece by Debussy, capturing the beauty of moonlight.",
    ],
    "midi_path": [
        "midi_files/bach_prelude_in_c_major.midi",
        "midi_files/beethoven_moonlight_sonata.midi",
        "midi_files/chopin_nocturne_op9_no2.midi",
        "midi_files/mozart_piano_sonata_k331.midi",
        "midi_files/debussy_clair_de_lune.midi",
    ],
}

df = pd.DataFrame(data)


In [715]:
df

Unnamed: 0,canonical_composer,canonical_title,split,year,text_annotation,midi_path
0,Johann Sebastian Bach,"Prelude in C Major, BWV 846",train,2018,"A soothing and calming piece by Bach, featurin...",midi_files/bach_prelude_in_c_major.midi
1,Ludwig van Beethoven,"Piano Sonata No. 14 in C-sharp minor, Op. 27 N...",train,2018,A dramatic and emotional piano sonata by Beeth...,midi_files/beethoven_moonlight_sonata.midi
2,Frédéric Chopin,"Nocturne in E-flat major, Op. 9 No. 2",train,2018,"A gentle and lyrical nocturne by Chopin, featu...",midi_files/chopin_nocturne_op9_no2.midi
3,Wolfgang Amadeus Mozart,"Piano Sonata No. 11 in A major, K. 331 (Rondo ...",test,2019,A playful and energetic piano sonata by Mozart...,midi_files/mozart_piano_sonata_k331.midi
4,Claude Debussy,"Clair de Lune, Suite bergamasque, L. 75",test,2019,"A dreamy and evocative piano piece by Debussy,...",midi_files/debussy_clair_de_lune.midi


In [716]:
import numpy as np
import pretty_midi


def extract_piano_roll(midi_path, fs=100):
    # Load the MIDI file
    midi_data = pretty_midi.PrettyMIDI(midi_path)

    # Extract the piano roll
    piano_roll = midi_data.get_piano_roll(fs=fs)

    # Transpose the piano roll to have time steps on axis 1
    piano_roll = piano_roll.T

    return piano_roll

train_midi_paths = train_df["midi_path"].tolist()
test_midi_paths = test_df["midi_path"].tolist()

train_piano_rolls = [extract_piano_roll(midi_path) for midi_path in train_midi_paths]
test_piano_rolls = [extract_piano_roll(midi_path) for midi_path in test_midi_paths]


In [717]:
import math

def pad_piano_roll(piano_roll, max_length):
    padded_piano_roll = np.pad(piano_roll, ((0, 0), (0, max_length - piano_roll.shape[1])), mode='constant')
    return padded_piano_roll

# Find the maximum number of time steps across all piano rolls
max_length = max([piano_roll.shape[1] for piano_roll in train_piano_rolls + test_piano_rolls])

# Pad the piano rolls to have the same number of time steps
train_piano_rolls_padded = [pad_piano_roll(piano_roll, max_length) for piano_roll in train_piano_rolls]
test_piano_rolls_padded = [pad_piano_roll(piano_roll, max_length) for piano_roll in test_piano_rolls]

# Set the target shape to have the maximum number of pitches and a fixed number of time steps
num_time_steps = 100
num_pitches = 128
max_time_steps = math.floor(max_length/num_pitches)
target_shape = (num_pitches, min(max_time_steps, num_time_steps))

# Reshape the piano rolls to have the target shape
train_midi_features = [np.reshape(piano_roll[:, :target_shape[1]*num_pitches].T, target_shape) for piano_roll in train_piano_rolls_padded]
test_midi_features = [np.reshape(piano_roll[:, :target_shape[1]*num_pitches].T, target_shape) for piano_roll in test_piano_rolls_padded]

# Convert to tensors
train_midi_features = torch.tensor(train_midi_features, dtype=torch.float)
test_midi_features = torch.tensor(test_midi_features, dtype=torch.float)


ValueError: cannot reshape array of size 12403456 into shape (128,1)

### Data Preparation

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import PreTrainedTokenizerFast
import os

# Load or train the tokenizer
tokenizer = ByteLevelBPETokenizer(vocab="tokenizer/vocab.json", merges="tokenizer/merges.txt")

# Define the tokenizer components
special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]"]
vocab_size = 30000

# Save text data to a file
with open("text_data.txt", "w") as f:
    for line in data['text_annotation']:
        f.write(line + "\n")

# Train the tokenizer using the text data file
tokenizer.train(files=["text_data.txt"], vocab_size=vocab_size, min_frequency=2, special_tokens=special_tokens)

# Add the BertProcessing component to the tokenizer
tokenizer.post_processor = BertProcessing(
    ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ("[CLS]", tokenizer.token_to_id("[CLS]")),
)

# Save the tokenizer files (optional)
if not os.path.exists("tokenizer"):
    os.makedirs("tokenizer")
tokenizer.save("tokenizer/tokenizer.json")

tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer/tokenizer.json")
tokenizer.pad_token = "[PAD]"

In [None]:

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

# Tokenize the text annotations
def encode_text(text, tokenizer):
    return tokenizer.encode(text, add_special_tokens=False)

class TextMIDIDataset(Dataset):
    def __init__(self, texts, midi_features, tokenizer):
        self.texts = [torch.tensor(encode_text(text, tokenizer), dtype=torch.long) for text in texts]
        self.midi_features = midi_features

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        midi_feature = self.midi_features[idx]
        attention_mask = torch.ones_like(text)
        return midi_feature, text, attention_mask






# Prepare your data
train_df = df[df["split"] == "train"]
test_df = df[df["split"] == "test"]
# Load and process your MIDI files here
# For demonstration purposes, we'll create dummy data as placeholders for the actual MIDI features
# TODO: Replace this with your own MIDI feature extraction code - prettymidi or music21

train_texts = train_df["text_annotation"].tolist()
test_texts = test_df["text_annotation"].tolist()

train_dataset = TextMIDIDataset(train_texts, train_midi_features, tokenizer)
test_dataset = TextMIDIDataset(test_texts, test_midi_features, tokenizer)

In [None]:
for idx, piano_roll in enumerate(train_piano_rolls):
    non_zero_count = np.count_nonzero(piano_roll)
    total_elements = piano_roll.shape[0] * piano_roll.shape[1]
    sparsity = 1 - non_zero_count / total_elements
    print(f"Piano roll {idx}: sparsity = {sparsity * 100:.2f}%")


In [None]:
import matplotlib.pyplot as plt

def visualize_piano_roll(piano_roll, title):
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.imshow(piano_roll, aspect="auto", cmap="binary_r", origin="lower")
    ax.set_title(title)
    ax.set_xlabel("Time Steps")
    ax.set_ylabel("Pitch")
    plt.show()

# Visualize the first piano roll in the train set
visualize_piano_roll(train_piano_rolls[0], "Piano Roll for Train MIDI 0")


In [None]:
import torch
import torch.nn as nn
import numpy as np

"""
class TextToMIDIModel(nn.Module):
    def __init__(self, text_vocab_size, midi_feature_size, d_model=512, nhead=8, num_layers=6):
        super(TextToMIDIModel, self).__init__()
        self.embedding = nn.Embedding(text_vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers)
        self.fc = nn.Linear(d_model, midi_feature_size)

    def forward(self, input_ids, attention_mask):
        # Clip the input_ids to be within the range of the embedding layer
        input_ids = input_ids.clamp(max=self.embedding.num_embeddings - 1)

        # Convert attention_mask to boolean tensor and transpose it
        attention_mask = (attention_mask != 0).transpose(0, 1)

        # Add a dimension of size 1 to the attention_mask tensor
        attention_mask = attention_mask.unsqueeze(2)

        # Use the input_ids for the embedding layer
        x = self.embedding(input_ids.long()) * np.sqrt(input_ids.size(1))

        # Apply the transformer
        output = self.transformer(x, x, src_key_padding_mask=attention_mask, tgt_key_padding_mask=attention_mask)

        # Apply the linear layer and remove any extra dimensions
        output = self.fc(output).squeeze(dim=0)

        # Return the output
        return output

"""

import torch
import torch.nn as nn
import numpy as np

class TextToMIDIModel(nn.Module):
    def __init__(self, text_vocab_size, midi_feature_size, d_model=512, nhead=8, num_layers=6):
        super(TextToMIDIModel, self).__init__()
        self.embedding = nn.Embedding(text_vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers)
        self.fc = nn.Linear(d_model, midi_feature_size)

    def forward(self, input_ids, attention_mask):
        # Clip the input_ids to be within the range of the embedding layer
        input_ids = input_ids.clamp(max=self.embedding.num_embeddings - 1)

        # Convert attention_mask to boolean tensor and transpose it
        attention_mask = (attention_mask != 0).transpose(0, 1)

        # Use the input_ids for the embedding layer
        x = self.embedding(input_ids.long()) * np.sqrt(input_ids.size(1))

        # Add a dimension of size 1 to the attention_mask tensor
        attention_mask = attention_mask.unsqueeze(2)

        # Apply the transformer
        output = self.transformer(x.permute(1, 0, 2), attention_mask=attention_mask)

        # Transpose the output back to the original shape
        output = output.permute(1, 0, 2)

        # Apply the linear layer and return the output
        return self.fc(output)





# Set your text vocabulary size and MIDI feature size
text_vocab_size = 100  # For example, if you have 10,000 unique words in your text data
midi_feature_size = 100  # For example, if you're using a piano roll with 128 notes

# Instantiate the model
model = TextToMIDIModel(text_vocab_size, midi_feature_size)


In [None]:
import torch.optim as optim

# Loss function and optimizer
criterion = nn.MSELoss()
#criterion = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training configurations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
num_epochs = 100


In [None]:
torch.device("cuda" if torch.cuda.is_available() else "cpu")
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    epoch_loss = 0

    for input_ids, attention_mask, midi in dataloader:
        print(input_ids.shape)
        input_ids, attention_mask, midi = input_ids.to(device), attention_mask.to(device), midi.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Check dimensions of tensors
        print(f"outputs.shape: {outputs.shape}")
        print(f"midi.shape: {midi.shape}")

        # Compute loss
        loss = criterion(outputs, midi)

        # Backward pass
        loss.backward()

        # Update the weights
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


In [None]:
def text_data_iterator(text_data):
    for line in text_data:
        yield line

# Example text data
"""
text_data = [
    "A deeply emotional and introspective piece...",
    "A lively and energetic composition...",
    # ... more lines of text data ...
]
"""

In [None]:
import torch.nn.utils.rnn as rnn_utils

def collate_fn(batch):
    midi_features, texts, attention_masks = zip(*batch)
    midi_features = torch.stack(midi_features)
    texts = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)
    return midi_features, texts, attention_masks




In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [None]:
print(train_midi_paths)
midi_data = [extract_piano_roll(midi_path) for midi_path in train_midi_paths]


In [None]:

print(midi_data)

In [None]:
def eval(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0

    for batch_idx, (midi, input_ids, attention_mask) in enumerate(dataloader):
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        midi = midi.to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Compute loss
        loss = criterion(outputs, midi)

        # Track loss
        running_loss += loss.item()

    return running_loss / (batch_idx + 1)


In [None]:
for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_dataloader, criterion, optimizer, device)
    print(f"Epoch: {epoch + 1}/{num_epochs}, Loss: {train_loss:.6f}")

    # Evaluate the model on the validation set after each epoch
    with torch.no_grad():
        model.eval()
        val_loss = eval(model, test_dataloader, criterion, device)
        print(f"Validation Loss: {val_loss:.6f}")

    # Save the model checkpoint after each epoch
    checkpoint_path = f"model_checkpoint_epoch_{epoch + 1}.pt"
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Model saved to {checkpoint_path}")