In [1]:
import os
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, random_split
from models.tests import build_scale
import numpy as np
import torch.optim as optim
from torch.utils.data import Subset
import torch.nn.functional as F
from torch.distributions import Categorical
import copy
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data

The dataset used for this project is 6 years old and was downloaded from Kaggle user Shivam Raj:\
https://www.kaggle.com/datasets/raj5287/abc-notation-of-tunes

The data is a `.txt` file that contains songs that have been transcribed into ABC notation, where each entry is separated by 2 newlines. The first set of lines contains the metadata fields:\
`X`: ID\
`T`: Title\
`M`: Meter\
`L`: Unit Note Length\
`B`: Background Information\
`N`: Notes\
`Z`: Transcription\
`K`: Key

These headers are then followed by the transcribed melody. Non-note symbols are used to indicate modifications to note durations, accidentals, and ornamentation.

In [2]:
data_dir = os.path.join(os.getcwd(), 'data')
output_dir = os.path.join(os.getcwd(), 'output')

with open(os.path.join(data_dir, 'raw', 'abc_notations.txt' ), 'r') as f:
    raw_lines = f.readlines()
    for line in raw_lines[:20]:
        print(line.strip())
    print("...")

X: 1
T: The Enchanted Valley
M: 2/4
L: 1/16
B: "O'Neill's 1"
N: "Very slow" "collected by J. O'Neill"
N:
Z: "Transcribed by Norbert Paap, norbertp@bdu.uva.nl"
Z:
K:Gm
G3-A (Bcd=e) | f4 (g2dB) | ({d}c3-B) G2-E2 | F4 (D2=E^F) |
G3-A (Bcd=e) | f4 d2-f2 | (g2a2 b2).g2 | {b}(a2g2 f2).d2 |
(d2{ed}c2) B2B2 | (A2G2 {AG}F2).D2 | (GABc) (d2{ed}c>A) | G2G2 G2z ||
G | B2c2 (dcAB) | G2G2 G3G | B2d2 (gfdc) | d2g2 (g3ga) |
(bagf) (gd)d>c | (B2AG) F-D.D2 | (GABc) d2d2 | (bgfd) cA.F2 |
G2A2 (B2{cB}AG) | A3-G F2-D2 | (GABc) (d2{ed}c>A) | G2G2 G2z2 ||


X: 2
T: Fare You Well
...


The raw `abc_notations.txt` is standardized then parsed into a json file named `songs_dict.json`. Certain fields that were not considered to be musically significant, such as background information, were dropped from the data.

In [3]:
with(open(os.path.join(data_dir, 'lookup_tables', 'songs_dict.json'), 'r')) as f:
    songs_dict = json.load(f)
    entry = songs_dict['1']
    for key, value in entry.items():
        if key =='melody':
            for line in value:
                print(line)
        else:
            print(f"{key}: {value}")


id: 1
title: The Enchanted Valley
time_signature: 2/4
note_length: 1/16
book: "O'Neill's 1"
notes: "Very slow" "collected by J. O'Neill"
transcription: "Transcribed by Norbert Paap, norbertp@bdu.uva.nl"
key: Gm
G3-A Bcde|f4 g2dB|dc3-B G2-E2|F4 D2EF|
G3-A Bcde|f4 d2-f2|g2a2 b2.g2|ba2g2 f2.d2|
d2edc2 B2B2|A2G2 AGF2.D2|GABc d2edcA|G2G2 G2z||
G|B2c2 dcAB|G2G2 G3G|B2d2 gfdc|d2g2 g3ga|
bagf gddc|B2AG F-D.D2|GABc d2d2|bgfd cA.F2|
G2A2 B2cBAG|A3-G F2-D2|GABc d2edcA|G2G2 G2z2||


# Dataset

## Vocabulary

In [4]:
"""
Vocabulary adapted from:
    https://abcnotation.com/wiki/abc:standard:v2.2
"""
NOTES = "ABCDEFGabcdefg"
ACCIDENTALS = "^_="
DECORATIONS = "~HLMOPSTuv"
DOT = "."
RESTS = "z"
BARS = ['|', '||', '|:', ':|', ':||']
TIE = '-'
SLUR = '()'
STRUCTURAL = ['K:', 'L:', 'M:']
DURATIONS = "23468"
SPECIAL = {
    'PAD': '[PAD]',
    'UNK': '[UNK]',
    'START': '[START]',
    'END': '[END]'
}

PREFIXED = [f"{p}{note}" for note in (NOTES + RESTS) for p in ACCIDENTALS + DOT + DECORATIONS]
SUFFIXED = [f"{note}{duration}" for note in (NOTES + RESTS) for duration in DURATIONS + DOT]

TRIPLET_PREFIX = [f"3{a}{b}{c}" for a in NOTES for b in NOTES for c in NOTES]

OCTAVE_NOTES = [f"{note}," for note in (NOTES + RESTS)]
OCTAVE_NOTES += [f"{note}'" for note in (NOTES + RESTS)]

FRACTIONAL_NOTES = [f"{note}/{dur}" for note in (NOTES + ''.join(SUFFIXED)) for dur in DURATIONS]
FRACTIONAL_RESTS = [f"z/{dur}" for dur in DURATIONS]

VOCAB = list(SPECIAL.values()) + list(NOTES) + PREFIXED + SUFFIXED + STRUCTURAL + BARS +\
        [TIE, '(', ')'] + TRIPLET_PREFIX + OCTAVE_NOTES + FRACTIONAL_RESTS

VOCAB_SIZE = len(VOCAB)
print(f"Vocabulary size: {VOCAB_SIZE}")

Vocabulary size: 3108


A custom tokenizer is used to break up input sequences into meaningful tokens, which is often more than just individual notes. For example, triplets, accidentals, and note durations are treated as single tokens if they match known patterns in the vocabulary.

Each entry from the JSON file is processed field by field. Important metadata such as title, time signature, note length, and key are inserted at the start of the sequence. The melody is then tokenized line-by-line, and the full token sequence is converted into a tensor of indices using the predefined vocabulary.

This representation allows the model to learn from both musical structure and contextual metadata.
For example, the average Seq2Seq accuracy of the model outputs increased from around 20% to 52% by introducing composer and rythym metadata into the model, along with other improvements.

In [5]:
from models.dataset import tok2ind

PAD = SPECIAL['PAD']
UNK = SPECIAL['UNK']
START = SPECIAL['START']
END = SPECIAL['END']
EXCLUDED_ENTRIES = {'1850'}

def tokenize_melody(melody):
    all_tokens = []

    for line in melody:
        line = line.strip().replace('x', 'z')   # normalize 'x' to 'z'
        if line.startswith('C:'):   # save composer information where available
            composer = line[2:].strip()
            all_tokens.append(f"C:{composer} ") if composer not in all_tokens else None
            continue
        if line.startswith('R:'):   # save rhythm information where available
            rhythm = line[2:].strip()
            all_tokens.append(f"R:{rhythm} ") if rhythm not in all_tokens else None
            continue

        line_tokens = tokenize_line(line)
        all_tokens.extend(line_tokens)

    return all_tokens


def tokenize_line(line):
    tokens = []
    i = 0

    vocab_sort = sorted(VOCAB, key=len, reverse=True)
    while i < len(line):
        match = None

        if i + 3 <= len(line) and line[i:i+3] in TRIPLET_PREFIX:
            match = line[i:i+3]
            tokens.append(match.strip())
            i += 3
            continue

        for token in vocab_sort:
            if line.startswith(token, i):
                match = token
                break

        if match:
            tokens.append(match.strip())
            i += len(match)
        else:
            if i + 2 < len(line) and line[i + 1] == '/' and line[i + 2].isdigit():
                # fractional duration
                note = line[i]
                fraction = line[i:i + 3]
                tokens.append(note)
                tokens.append(fraction)
                i += 3
            else:
                tokens.append(UNK)
                i += 1

    return tokens


def entry2tensor(entry):
    tokens = []
    metadata = {}

    if 'title' in entry:
        val = entry['title']
        tokens.append(f"N:{val} ")
        metadata['N:'] = val

    if 'time_signature' in entry:
        val = entry['time_signature']
        tokens.append(f"T:{val} ")
        metadata['T:'] = val

    if 'note_length' in entry:
        val = entry['note_length']
        tokens.append(f"L:{val} ")
        metadata['L:'] = val

    if 'key' in entry:
        val = entry['key']
        tokens.append(f"K:{val} ")
        metadata['K:'] = val
        
    tokens.append(f'{START} ')

    melody = entry.get('melody', '')
    melody_tokens = tokenize_melody(melody)
    tokens.extend(melody_tokens)

    tokens.append(END)

    # print(f"Tokens:\n{(''.join(tokens))[:125] + ' ...'}")

    indices = [tok2ind(tok) for tok in tokens]

    return torch.tensor(indices, dtype=torch.long), metadata

tensor, metadata = entry2tensor(entry)
print(f"Converted Tokens:\n{tensor}")


Converted Tokens:
tensor([  1,   1,   1,   1,   1, 265, 326,   4,   1,   5,  13,  14,  15, 321,
        302,   1, 306,  14,   5, 321,  14, 283, 326,   5,   1, 264, 326, 252,
        321, 260,   1, 246,   8,   9, 321, 265, 326,   4,   1,   5,  13,  14,
         15, 321, 302,   1, 288, 326, 300, 321, 306, 270,   1, 276, 203,   1,
        321,  12, 270, 306,   1, 300, 161,   1, 321, 288,  15,  14, 282,   1,
        234, 234, 321, 228, 264,   1,   4,  10, 258,  63,   1, 321,  10,   4,
          5,  13,   1, 288,  15,  14,  13,   4, 321, 264, 264,   1, 264,   1,
        322,  10, 321, 234, 282,   1,  14,  13,   4,   5, 321, 264, 264,   1,
        265,  10, 321, 234, 288,   1,  17,  16,  14,  13, 321, 288, 306,   1,
        307,  17,  11, 321,  12,  11,  17,  16,   1,  17,  14,  14,  13, 321,
        234,   4,  10,   1,   9, 326, 251, 246, 321,  10,   4,   5,  13,   1,
        288, 288, 321,  12,  17,  16,  14,   1,  13, 233, 258, 321, 264, 228,
          1, 234,  13,   5,   4,  10, 321, 229

```
Tokens:
N:The Enchanted Valley T:2/4 L:1/16 K:Gm [START] G3-A[UNK]Bcde|f4[UNK]g2dB|dc3-B[UNK]G2-E2|F4[UNK]D2EF|G3-A[UNK]Bcde|f4[UNK]d ...
```

A PyTorch Dataset module is then used to load the entries from the JSON file and:\
     - tokenize it\
     - optionally augment it by shifting the key\
     - split it into training, validation, and test subsets\
     - generate input/target tensors for to feed into the models

In [6]:
class ABCDataset(Dataset):
    def __init__(self, json_file, augment_data=False, transpose_range=(-2, 3)):
        with open(json_file, 'r') as f:
            self.data = json.load(f)

        self.file_name = json_file
        self.sequences = []
        self.entry_indices = list(self.data.keys())
        self.metadata = []
        self.vocab_size = VOCAB_SIZE
        self.vocab = VOCAB
        self.pad_token = PAD
        self.pad_idx = VOCAB.index(PAD)
        self.entries = []

        self.idx2char_dict = {i: ch for i, ch in enumerate(VOCAB)}

        self.augment_data = augment_data
        self.transpose_range = transpose_range

        for idx in self.entry_indices:
            if str(idx) in EXCLUDED_ENTRIES:
                continue
            entry = self.data[idx]
            self.entries.append(entry)
            entry_t, metadata = entry2tensor(entry)
            self.sequences.append(entry_t)
            self.metadata.append(metadata)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence_tensor = self.sequences[idx]
        entry = self.entries[idx]
        key = entry.get('K:', 'C')
        if self.augment_data:
            shift = np.random.randint(*self.transpose_range)
            sequence_tensor = self.transpose_tensor(sequence_tensor, shift, key)

        input_tensor = sequence_tensor[:-1]
        target_tensor = sequence_tensor[1:]
        metadata = self.metadata[idx]

        return input_tensor, target_tensor, metadata

    def transpose_tensor(self, tensor, steps, key):
        transposed = tensor.clone()
        for i in range(tensor.size(0)):
            idx = torch.argmax(tensor[i]).item()
            char = self.vocab[idx]
            if char in NOTES:
                shifted = self.transpose_note(char, steps, key)
                transposed[i] = 0
                transposed[i][tok2ind(shifted)] = 1
        return transposed

    def transpose_note(self, note, shift, key):
        scale = build_scale(key)
        is_upper = note.isupper()
        base = note.upper()

        if base not in scale:
            return note

        i = scale.index(base)
        new_i = (i + shift) % len(scale)
        transposed = scale[new_i]
        return transposed if is_upper else transposed.lower()

    def get_pad_idx(self):
        return self.pad_idx

    def get_vocab_size(self):
        return self.vocab_size

    def split_dataset(self, train_ratio=0.8, val_ratio=0.1):
        train_size = int(train_ratio * len(self))
        val_size = int(val_ratio * len(self))
        test_size = len(self) - train_size - val_size
        return random_split(self, [train_size, val_size, test_size])


# Models

## Hyperparameters

Hidden and embedding dimensions were chosen to be equal, with a value of 144 resulting in the best balance between output complexity and structural correctness. Additionally, a temperature of 1.1 was set to increase the creativity of model outputs, as outlined in: https://medium.com/@weidagang/demystifying-temperature-in-machine-learning-ef6828ad4e2d

The number of layers for RNN and GRU models worked best with 4, while LSTM overfit quickly with any value greater than 2. 
A dropout of 0.5 was the sweet spot for our model's ability to decrease training and validation loss consistently and evenly.

Batch size was set to 32 for efficient training and a suitable output length is 256. An upper bound of 200 epochs allowed the model to learn with a learning rate of 0.0006 and save the best model through early stopping.


In [7]:
class HP:
    hidden_dim = 144 
    embed_dim = 144
    n_layers = 4    # set to 2 for LSTM
    # n_layers = 2
    dropout = 0.5

    batch_size = 32
    num_epochs = 200    # with early stopping, i.e. upper bound
    lr = 0.0006 
    output_len = 256
    
    # > 1 for more creative < 1 for more rule-following
    temp = 1.1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## RNN

In [8]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, pad_idx, dropout):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        # print("Input shape:", x.shape)
        # print("Input dtype:", x.dtype)
        output, hidden = self.rnn(x, hidden)
        logits = self.fc(output)  # [batch_size, seq_len, vocab_size]
        return logits, hidden

## LSTM

In [9]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, pad_idx, dropout):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        # print("Input shape:", x.shape)
        # print("Input dtype:", x.dtype)
        output, hidden = self.lstm(x, hidden)
        logits = self.fc(output)  # [batch_size, seq_len, vocab_size]
        return logits, hidden

## GRU

In [10]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, pad_idx, dropout):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        # print("Input shape:", x.shape)
        # print("Input dtype:", x.dtype)
        output, hidden = self.gru(x, hidden)
        logits = self.fc(output)  # [batch_size, seq_len, vocab_size]
        return logits, hidden


# Training

The training and validation data is used to train and save models in `train_model()`, then the test data is used to ensure the model did not overfit in `eval_model()`. The trained models are then saved to the output directory.

In [12]:
def train_model(dataloader, val_loader, num_epochs=3, batch_size=32, learning_rate=0.0005, mtype='rnn'):
    device = HP.device
    dataset = dataloader.dataset

    if isinstance(dataset, Subset):
        dataset = dataset.dataset

    input_size = dataset.get_vocab_size()
    hidden_size = HP.hidden_dim
    output_size = input_size

    model = None
    if mtype == 'rnn':
        model = RNNModel(input_size, HP.embed_dim, hidden_size, HP.n_layers, dataset.get_pad_idx(), HP.dropout).to(device)
    elif mtype == 'lstm':
        model = LSTMModel(input_size, HP.embed_dim, hidden_size, HP.n_layers, dataset.get_pad_idx(), HP.dropout).to(device)
    elif mtype == 'gru':
        model = GRUModel(input_size, HP.embed_dim, hidden_size, HP.n_layers, dataset.get_pad_idx(), HP.dropout).to(device)

    PAD_IDX = dataset.get_pad_idx()
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    best_loss = float('inf')
    best_model = None
    epochs_no_improvement = 0
    records = []
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        total_loss = 0
        model.train()

        for input_tensor, target_tensor, _ in dataloader:
            input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)

            optimizer.zero_grad()

            output, _ = model(input_tensor)

            output = output.reshape(-1, output_size)
            target_flat = target_tensor.reshape(-1)

            loss = criterion(output, target_flat)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(dataloader)
        train_losses.append(avg_train_loss)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for input_tensor, target_tensor, _ in val_loader:
                input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)

                output, _ = model(input_tensor)

                output = output.reshape(-1, output_size)
                target_flat = target_tensor.reshape(-1)

                loss = criterion(output, target_flat)
                val_loss += loss.item()

            avg_val_loss = val_loss / len(val_loader)
            val_losses.append(avg_val_loss)
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
        records.append({'epoch': epoch + 1, 'train_loss': avg_train_loss, 'val_loss': avg_val_loss})

        # early stopping
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            epochs_no_improvement = 0
            best_model = copy.deepcopy(model.state_dict())
            print(f"Model saved @ epoch {epoch + 1}")
        else:
            epochs_no_improvement += 1
            if epochs_no_improvement >= 5:
                print("Early stopping triggered.")
                break

    if best_model is not None:
        torch.save(model.state_dict(), f"output/{mtype}_model.pth")

    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Training Loss', color='red')
    plt.plot(val_losses, label='Validation Loss', color='blue')
    plt.xlabel('Epoch')
    plt.ylabel('Average Loss')
    plt.title(f"{mtype.upper()} Training and Validation Loss")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"output/figs/{mtype}_loss.png")
    plt.close()

    records_df = pd.DataFrame(records)
    records_df.to_csv(f"output/{mtype}_loss.csv", index=False)
    print("Training complete. Model saved.")

    return model


def eval_model(test_loader, output_dir, model_type='rnn'):
    dataset = test_loader.dataset
    if isinstance(dataset, Subset):
        dataset = dataset.dataset

    input_size = dataset.get_vocab_size()
    hidden_size = HP.hidden_dim
    output_size = input_size
    device = HP.device

    model = None
    if model_type == 'rnn':
        model = RNNModel(input_size, HP.embed_dim, hidden_size, HP.n_layers, dataset.get_pad_idx(), HP.dropout).to(device)
    if model_type == 'lstm':
        model = LSTMModel(input_size, HP.embed_dim, hidden_size, HP.n_layers, dataset.get_pad_idx(), HP.dropout).to(device)
    if model_type == 'gru':
        model = GRUModel(input_size, HP.embed_dim, hidden_size, HP.n_layers, dataset.get_pad_idx(), HP.dropout).to(device)

    model.load_state_dict(torch.load(f"output/{model_type}_model.pth"))
    model.to(device)
    model.eval()

    PAD_IDX = dataset.get_pad_idx()
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

    test_loss = 0
    correct = 0
    total = 0

    losses = []
    accuracies = []
    with torch.no_grad():
        for input_tensor, target_tensor, metadata in test_loader:
            input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)
            output, _ = model(input_tensor)

            output_flat = output.reshape(-1, output_size)
            target_flat = target_tensor.reshape(-1)

            loss = criterion(output_flat, target_flat)
            test_loss += loss.item()
            losses.append(loss.item())

            preds = output_flat.argmax(dim=1)
            mask = (target_flat != PAD_IDX)
            correct_batch = (preds[mask] == target_flat[mask]).sum().item()
            total_batch = mask.sum().item()
            acc = 100.0 * correct_batch / total_batch
            accuracies.append(acc)
            total += total_batch
            correct += correct_batch

    test_loss /= len(test_loader)
    accuracy = 100.0 * correct / total
    print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{total} ({accuracy:.2f}%)")
    return model


## Sampling

A sampling function was added to take a random input sequence and use it to sample the trained model's output by converting it back into tokens. This allowed evaluation metrics other than loss and accuracy to be used. 

In [13]:
def sample(model, dataset):
    model.eval()
    device = next(model.parameters()).device

    if isinstance(dataset, Subset):
        dataset = dataset.dataset
    idx2char_dict = dataset.idx2char_dict

    rand_idx = np.random.randint(len(dataset) - 1)
    input_seq, _, input_metadata = dataset[rand_idx]

    T = input_metadata.get('T:', "")
    L = input_metadata.get('L:', "")
    K = input_metadata.get('K:', "")

    SPECIAL = {'[PAD]', '[UNK]', '[START]', '[END]'}

    metadata_tokens = [f"T:{T}", f"L:{L}", f"K:{K}"]
    metadata_indices = [tok2ind(tok) for tok in metadata_tokens]
    input_seq = input_seq.unsqueeze(0).to(device)
    metadata_tensor = torch.tensor(metadata_indices, dtype=torch.long).unsqueeze(0).to(device)
    input_seq = torch.cat([metadata_tensor, input_seq], dim=1)

    hidden_state = None

    generated = metadata_tokens.copy()
    with torch.no_grad():
        output, hidden_state = model(input_seq, hidden_state)

        last_output = output[:, -1, :]
        vals = last_output / HP.temp
        probs = F.softmax(vals, dim=1)

        dist = Categorical(probs)
        next_token = dist.sample()

        current_input = next_token.unsqueeze(0)

        for _ in range(HP.output_len):
            output, hidden_state = model(current_input, hidden_state)
            last_output = output[:, -1, :]

            vals = last_output / HP.temp
            probs = F.softmax(vals, dim=1)

            dist = Categorical(probs)
            next_token = dist.sample()

            token_idx = next_token.item()
            if token_idx in idx2char_dict:
                tok = idx2char_dict[token_idx]
                if tok not in SPECIAL:
                    generated.append(tok)

            current_input = next_token.unsqueeze(0)

    result = ' '.join(generated)

    if K and L and T:
        key_score = evaluate_key(result, K)
        time_sig_score = evaluate_time_signature(result, L, T)
        # print(f"Key: {K}, Score: {key_score:.2%}")
        # print(f"Time Signature: {T}, Length: {L}, Score: {time_sig_score}")
        return result, key_score, time_sig_score

    elif K:
        key_score = evaluate_key(result, K)
        # print(f"Key: {K}, Score: {key_score:.2f%}")
        return result, key_score, 0.0

    elif L and T:
        time_sig_score = evaluate_time_signature(result, L, T)
        # print(f"Time Signature: {T:.2f%}, Length: {L:.2f%}, Score: {time_sig_score:.2f%}")
        return result, 0.0, time_sig_score

    return result, 0.0, 0.0

## Tests

The `build_scale()` function in tests is used for evaluating whether notes in a melody are in-key, but also for transposing the data in the dataset.

The `evaluate_key()` functions works by viewing the notes in a melody and checking if they match the notes in the associated key and scale.

`evaluate_time_signature()` calculates the duration of the notes in each bar and checks if it is aligned with a song's time signature.

## RNN

In [14]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, pad_idx, dropout):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        # print("Input shape:", x.shape)
        # print("Input dtype:", x.dtype)
        output, hidden = self.rnn(x, hidden)
        logits = self.fc(output)  # [batch_size, seq_len, vocab_size]
        return logits, hidden

In [15]:
import re

CHROMATIC = [
    'C', 'C#', 'D', 'D#', 'E', 'F',
    'F#', 'G', 'G#', 'A', 'A#', 'B'
]

NOTES = "ABCDEFGabcdefg"

TRIPLET_PREFIX = [f"3{a}{b}{c}" for a in NOTES for b in NOTES for c in NOTES]

EQUIVALENTS = {
    'B#': 'C', 'E#': 'F',
    'Cb': 'B', 'Fb': 'E',
    'Db': 'C#', 'Eb': 'D#', 'Gb': 'F#', 'Ab': 'G#', 'Bb': 'A#',
    'C#': 'C#', 'D#': 'D#', 'F#': 'F#', 'G#': 'G#', 'A#': 'A#'
}

MAJOR_STEPS = [2, 2, 1, 2, 2, 2, 1]
MINOR_STEPS = [2, 1, 2, 2, 1, 2, 2]

DURATIONS = "23468"
DOT = "."
SUFFIXED = [f"{note}{duration}" for note in (NOTES) for duration in DURATIONS + DOT]


def eq_note(note):
    return EQUIVALENTS.get(note, note)


def normalize_note(token):

    token = token.strip()

    match = re.match(r'([_=^]*)([A-Ga-g])', token)
    if not match:
        return None

    accidental, letter = match.groups()
    letter = letter.upper()

    if accidental == '^':
        note = f"{letter}#"
    elif accidental == '_':
        note = f"{letter}b"
    else:
        note = letter

    return eq_note(note)


def build_scale(key):
    is_minor = key.endswith('m')
    root = key[:-1] if is_minor else key
    root = eq_note(root)

    pattern = MINOR_STEPS if is_minor else MAJOR_STEPS

    try:
        idx = CHROMATIC.index(root)
    except ValueError:
        # print(f"Unknown root: {root}")
        return []

    scale = [CHROMATIC[idx]]
    for step in pattern:
        idx = (idx + step) % len(CHROMATIC)
        scale.append(CHROMATIC[idx])
    return [eq_note(n) for n in scale]


def evaluate_key(sample, key):
    scale = build_scale(key)
    if not scale:
        return 0.0

    in_key = {}
    out_key = {}
    total = 0

    for raw in sample.split():
        note = normalize_note(raw)
        if note is None:
            continue
        total += 1

        if note in scale:
            in_key[note] = in_key.get(note, 0) + 1
        else:
            out_key[note] = out_key.get(note, 0) + 1


    # output frequencies of in/out key notes

    # print(f"\nKEY: {key}\nSCALE: {scale}")
    # if in_key:
    #     # print("\nValid note frequencies:")
    #     for note, count in sorted(in_key.items()):
    #         print(f"{note}: {count}")

    # if out_key:
    #     # print("\nInvalid note frequencies:")
    #     for note, count in sorted(out_key.items()):
    #         print(f"{note}: {count}")

    score = sum(in_key.values()) / total if total > 0 else 0.0

    return score


def evaluate_time_signature(sample, length, time_sig):
    # print(f"Evaluating time signature: {time_sig}, length: {length}")
    
    # Parse base note length (e.g., "1/8" -> 0.125)
    length = eval(length)  # Convert string fraction to float
    
    # Convert time signature to total beats per measure
    if time_sig == "C":
        beats_per_bar = 4/4  # 4/4
    elif time_sig == "C|":
        beats_per_bar = 2/2  # 2/2
    else:
        beats_per_bar = eval(time_sig)  # e.g., "3/4" -> 0.75
    
    bars = [bar.strip() for bar in sample.split('|') if bar.strip()]
    correct = 0
    
    for i, bar in enumerate(bars):
        # print(f"\n{i} Bar: {bar}")
        total_duration = 0.0

        #initial two are definiting time signature and length
        i = 2
        
        while i < len(bar):
            # Handle triplets (e.g., "3DEF")
            if i + 3 <= len(bar) and bar[i] == '3' and bar[i+1] in NOTES and bar[i+2] in NOTES:
                total_duration += 2 * length  # 3 notes in time of 2
                i += 3
            
            # Handle dotted notes (e.g., ".D")
            elif i + 1 < len(bar) and bar[i+1] in NOTES and bar[i] == DOT:
                total_duration += 1.5 * length  # Original + half
                i += 2
            
            # Handle regular notes with duration (e.g., "C2")
            elif i + 1 < len(bar) and bar[i] in NOTES and bar[i+1] in DURATIONS:
                dur = int(bar[i+1])
                total_duration += length * (dur)  # e.g., "C2" in 1/8 base = 4/2=2 units
                i += 2
            
            # Handle simple notes (e.g., "C")
            elif bar[i] in NOTES:
                total_duration += length
                i += 1
            
            # Skip rests/other characters
            else:
                i += 1
        
        # Compare with tolerance for floating-point precision
        if abs(total_duration - beats_per_bar) < 0.001:
            correct += 1
            # print(f"✓ Correct duration: {total_duration}")
        # else:
        #     print(f"✗ Incorrect duration: {total_duration} (expected {beats_per_bar})")
    
    return correct / len(bars) if bars else 0.0

In [16]:
dataset = ABCDataset(os.path.join(data_dir, 'lookup_tables', 'songs_dict.json'), augment_data=True)
PAD_IDX = dataset.get_pad_idx()
train_dataset, val_dataset, test_dataset = dataset.split_dataset(train_ratio=0.8, val_ratio=0.1)

In [None]:
import random
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## RNN Sample

In [18]:
HP.n_layers = 4
rnn = RNNModel(VOCAB_SIZE, HP.embed_dim, HP.hidden_dim, HP.n_layers, PAD_IDX, HP.dropout)
rnn.load_state_dict(torch.load(os.path.join(output_dir, 'rnn_model.pth'), map_location=torch.device('cpu')))
result, key_score, time_sig_score = sample(rnn, test_dataset)
print(result)
print(f"Key Score: {key_score:.2%}")
print(f"Time Signature Score: {time_sig_score:.2%}")

T:9/8 L:1/8 K:A g B c3 - A | G2 E E D D | B, F G c A A | d B G A G E | B G A G A G | G A B A F G | D D D D E F | D E D G2 |: | d e f a2 f | g f e g e g | B f B A F A F | G2 G G A G | A B A A B c | a e d c d G G | e A A G F d c d B | A F D F A B c | B g B g2 g | g c d e d c | d2 c f g f | g f. e d f f | a b g e f d | B G B d d B | c A F E2 A | B c d A B2 G F G A | c d B g2 d B | c B c2 c d || e2 | e f d B | c d e f | B2 e g | a g f f e e d
Key Score: 57.23%
Time Signature Score: 3.12%


## GRU Sample

In [19]:
HP.n_layers = 4
gru = GRUModel(VOCAB_SIZE, HP.embed_dim, HP.hidden_dim, HP.n_layers, PAD_IDX, HP.dropout)
gru.load_state_dict(torch.load(os.path.join(output_dir, 'gru_model.pth'), map_location=torch.device('cpu')))
result, key_score, time_sig_score = sample(gru, test_dataset)
print(result)
print(f"Key Score: {key_score:.2%}")
print(f"Time Signature Score: {time_sig_score:.2%}")


T:6/8 L:1/8 K:Em f e d B B d | g b b a g f | g f g f g a | f a f g2 a | b a f d d f | 3ABA B G E | E2 E G2 :| | c B E E F G | B A B c d e | f g f e f g f | e d c B2 A | B g e d B A | B B A B2 A | B d g e2 d | g e B A2 :| | c A F E c A | B d e g f e | d B d g2 f | g f e d e c | e2 d f d e | f d c B c d | 3efg f d e B A | B d d A2 b f | e f e d B d | c A F G2 :| | d g a 3gag f a D. b2 | b2 f g f g | a b a g b2 | g e d f e d | ~A F E D2 :| | B A2 d | e f g a b g | d
Key Score: 82.94%
Time Signature Score: 6.06%


## LSTM Sample

In [20]:
HP.n_layers = 2

lstm = LSTMModel(VOCAB_SIZE, HP.embed_dim, HP.hidden_dim, HP.n_layers, PAD_IDX, HP.dropout)
lstm.load_state_dict(torch.load(os.path.join(output_dir, 'lstm_model.pth'), map_location=torch.device('cpu')))
result, key_score, time_sig_score = sample(lstm, test_dataset)
print(result)
print(f"Key Score: {key_score:.2%}")
print(f"Time Signature Score: {time_sig_score:.2%}")

T:6/8 L:1/8 K:G | G B c d B G | G F G E F G | A3 B d c | d e d c A B | c B A F D F E | D E A d c d | e d c f2 g | a g f e2 a g | d c A B d c | A G E G2 :| A | B G E G B d | f e f g2 f | e d c d2 B d | c B A G F E | G E D G2 A | A G G A2 G | B G G G2 :| d | g2 g g a g | f d f e c A | d2 B A B c | d g g g g a | b e g f g f | | e f g f2 f | f d c B G g | g e c d2 :| | G c d e f g | f g a b g e | f d B g e c | d d d 3gag f | e d A D A F :| :| g a g g e f | g f g f e d | c
Key Score: 85.47%
Time Signature Score: 10.81%
