# **Imports and Installations**

In [3]:
!sudo apt install -y fluidsynth
!pip install --upgrade pyfluidsynth
!pip install pretty_midi
!pip install MIDIUtil

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fluid-soundfont-gm libevdev2 libfluidsynth3 libgudev-1.0-0 libinput-bin
  libinput10 libinstpatch-1.0-2 libmd4c0 libmtdev1 libqt5core5a libqt5dbus5
  libqt5gui5 libqt5network5 libqt5svg5 libqt5widgets5 libwacom-bin
  libwacom-common libwacom9 libxcb-icccm4 libxcb-image0 libxcb-keysyms1
  libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1
  libxkbcommon-x11-0 qsynth qt5-gtk-platformtheme qttranslations5-l10n
  timgm6mb-soundfont
Suggested packages:
  fluid-soundfont-gs qt5-image-formats-plugins qtwayland5 jackd
The following NEW packages will be installed:
  fluid-soundfont-gm fluidsynth libevdev2 libfluidsynth3 libgudev-1.0-0
  libinput-bin libinput10 libinstpatch-1.0-2 libmd4c0 libmtdev1 libqt5core5a
  libqt5dbus5 libqt5gui5 libqt5network5 libqt5svg5 libqt5widgets5 libwacom-bin
  libwacom-common libwacom9 libx

In [27]:
import pandas as pd
import numpy as np
import glob
import json
import os
import pathlib
import shutil
import time
import random

import pretty_midi
from midiutil import MIDIFile
import fluidsynth
from IPython import display

import torch
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model, Model

import warnings
warnings.filterwarnings('ignore')

In [9]:
data_dir = pathlib.Path('data/maestro-v3.0.0')
if not data_dir.exists():
  tf.keras.utils.get_file(
      'maestro-v3.0.0-midi.zip',
      origin='https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip',
      extract=True,
      cache_dir='.', cache_subdir='data',
  )

Downloading data from https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip
[1m58416533/58416533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


# **Preprocessing**

In [11]:
#composer_list = ["Wolfgang Amadeus Mozart", "Franz Schubert", "Frédéric Chopin", "Ludwig van Beethoven", "Johann Sebastian Bach"]
composer_list = ["Wolfgang Amadeus Mozart"]

maestro_metadata = pd.read_csv("/content/data/maestro-v3.0.0/maestro-v3.0.0.csv")
maestro_metadata = maestro_metadata[maestro_metadata['canonical_composer'].isin(composer_list)]

for composer in composer_list:
    metadata = maestro_metadata[maestro_metadata['canonical_composer']==composer]
    os.makedirs(f"/content/extracted_midis/{composer}/midis", exist_ok=True)

    for row in metadata.iterrows():
        idx = row[0]
        row = row[1]

        file_path = "/content/data/maestro-v3.0.0/" + row["midi_filename"]
        shutil.copy(file_path, f"/content/extracted_midis/{composer}/midis/{idx}.mid")

In [15]:
filenames = glob.glob("/content/extracted_midis/**/midis/*.mid")

for i, filename in enumerate(filenames):
    print(f"{i} - Processing {filename}")
    _id = filename.split("/")[-1].split(".")[0]

    pm = pretty_midi.PrettyMIDI(filename)
    instrument = pm.instruments[0]

    columns = ["start(sec)", "note"]
    metadata = pd.DataFrame({col: [None] * (len(instrument.notes)-1) for col in columns})

    for idx, note in enumerate(instrument.notes):
        metadata["start(sec)"][idx] = note.start.round(3)
        metadata["note"][idx] = pretty_midi.note_number_to_name(note.pitch)

    metadata.sort_values("start(sec)", ascending=True, inplace=True)

    os.makedirs("/content/csvs", exist_ok=True)
    metadata.to_csv(f"/content/csvs/{_id}.csv", index=False)

Number of files: 38


In [20]:
sample_metadata = pd.read_csv("/content/csvs/1238.csv")
print(f"Total number of rows: {len(sample_metadata)}\n")
sample_metadata.head(20)

Total number of rows: 2289


In [18]:
threshold = 0.05

for i, filename in enumerate(os.listdir("/content/csvs")):
  print(f"{i} - Processing {filename}")
  metadata = pd.read_csv(f"/content/csvs/{filename}")

  metadata['step'] = metadata['start(sec)'].diff()
  metadata['group'] = (metadata['step'] > threshold).cumsum()

  metadata = metadata.drop(columns=['step'])
  grouped = metadata.groupby('group')

  columns = ["start(sec)", "chord"]
  modified_metadata = pd.DataFrame({col: [None] * (len(grouped)) for col in columns})

  for idx, (group_name, group_metadata) in enumerate(grouped):
      modified_metadata["start(sec)"][idx] = min(group_metadata["start(sec)"].values)
      modified_metadata["chord"][idx] = "_".join(sorted(set(group_metadata["note"].values)))

  modified_metadata.sort_values("start(sec)", ascending=True, inplace=True)
  modified_metadata.to_csv(f"/content/csvs/{filename}", index=False)

In [23]:
sample_metadata = pd.read_csv("/content/csvs/1238.csv")
print(f"Total number of rows: {len(sample_metadata)}\n")
sample_metadata.head(20)

Total number of rows: 2289



Unnamed: 0,start(sec),chord
0,1.007,C2_C3_C4
1,4.021,D#2_D#3_D#4
2,4.723,F#2_F#3_F#4
3,5.405,G2_G3_G4
4,6.148,G#2_G#3_G#4
5,7.003,C2_C3_C4
6,8.06,B1_B2_B3
7,10.615,C5_D#5_F#4
8,11.533,B4_D5_G4
9,12.57,C5_F#5_G#4


# **Train-Validation-Test Split**

In [29]:
csv_directory = "/content/csvs"
csv_files = [filename for filename in os.listdir(csv_directory) if filename.endswith('.csv')]
random.shuffle(csv_files)

total_files = len(csv_files)
train_count = int(total_files * 0.85)
val_count = int(total_files * 0.10)
test_count = total_files - train_count - val_count

os.makedirs("/content/train", exist_ok=True)
os.makedirs("/content/validation", exist_ok=True)
os.makedirs("/content/test", exist_ok=True)

for filename in csv_files[:train_count]:
    name = filename.split(".")[0]
    shutil.copy(csv_directory + f"/{filename}", f"/content/train/{name}.csv")

for filename in csv_files[train_count:train_count + val_count]:
    name = filename.split(".")[0]
    shutil.copy(csv_directory + f"/{filename}", f"/content/validation/{name}.csv")

for filename in csv_files[train_count + val_count:]:
    name = filename.split(".")[0]
    shutil.copy(csv_directory + f"/{filename}", f"/content/test/{name}.csv")

# **Corpus Creation**

In [49]:
def create_corpus(_type: str):
    directory = f"/content/{_type}"
    corpus = []

    for idx, filename in enumerate(os.listdir(directory)):
        if os.path.isfile(os.path.join(directory, filename)) and filename.endswith(".csv"):
            metadata = pd.read_csv(os.path.join(directory, filename)).sort_values(by='start(sec)')
            corpus.append(" ".join(list(metadata["chord"])))

    print(corpus)
    return corpus

# **Tokenizer Functions**

In [37]:
def detokenizer(tokenized_piece: list, tokenizer) -> list:
    notes = [tokenizer.index_word[index] for index in tokenized_piece]
    return notes

def create_tokenizer(corpus: list):
    padding_token = "<pad>"
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token=padding_token)
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index)

    return tokenizer, vocab_size

def create_filtered_tokenizer(corpus: list, min_count=1):
    padding_token = "<pad>"
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token=padding_token)
    tokenizer.fit_on_texts(corpus)

    filtered_words = [word for word, count in tokenizer.word_counts.items() if count >= min_count]

    filtered_tokenizer = Tokenizer(filters='', oov_token=padding_token, num_words=len(filtered_words))
    filtered_tokenizer.fit_on_texts(filtered_words)

    vocab_size = len(filtered_tokenizer.word_index)
    return filtered_tokenizer, vocab_size

# **Data Preparation**

In [41]:
def prepare_training_data(max_seq_length: int, input_seq_size: int, output_seq_size: int, corpus: list, tokenizer, stride: int=1):
    vocab_size = len(tokenizer.word_index)
    window_size = (input_seq_size+output_seq_size)

    input_sequences = []
    output_sequences = []

    for piece in corpus:
        tokenized_piece = tokenizer.texts_to_sequences([piece])[0]
        for step in range(0, (len(tokenized_piece) - window_size), stride):
            seq = tokenized_piece[step:step+window_size]

            input_seq = seq[:input_seq_size]
            output_seq = seq[input_seq_size:]
            output_seq = list(np.squeeze(tf.keras.preprocessing.sequence.pad_sequences([output_seq], value=tokenizer.word_index["<pad>"], maxlen=max_seq_length, padding='post')))
            output_seq = [int(value) for value in output_seq]

            input_sequences.append(input_seq)
            output_sequences.append(output_seq)

    return (input_sequences, output_sequences), vocab_size

# **Prediction Functions**

In [43]:
def predict_notes_default(input_sequence: str, prediction_size: int, prediction_count: int, model, tokenizer):
    output_sequence = input_sequence.split(" ")

    for i in range(prediction_count):
        tokenized_input = tokenizer.texts_to_sequences(input_sequence.split(" "))
        input_batch = np.array([tokenized_input for _ in range(64)])

        predictions = model.predict(input_batch, verbose=0)
        prediction_indices = list(np.argmax(predictions[0], axis=-1))
        detokenized_prediction = detokenizer(prediction_indices, tokenizer)

        input_sequence = input_sequence.split()
        for i in range(prediction_size):
          input_sequence.append(detokenized_prediction[i].lower())
          output_sequence.append(detokenized_prediction[i].lower())

        input_sequence = " ".join(input_sequence[prediction_size:])

    return output_sequence

def predict_notes_probability_distribution(input_sequence: list, prediction_size: int, prediction_count: int, model, tokenizer):
    output_sequence = input_sequence.split(" ")

    for i in range(prediction_count):
        tokenized_input = tokenizer.texts_to_sequences(input_sequence.split(" "))
        input_batch = np.array([tokenized_input for _ in range(64)])

        logits = torch.tensor(model.predict(input_batch, verbose=0))
        sampled_token_indices = torch.multinomial(logits[0], 1).squeeze()
        detokenized_prediction = detokenizer(sampled_token_indices.tolist(), tokenizer)

        input_sequence = input_sequence.split()
        for i in range(prediction_size):
          input_sequence.append(detokenized_prediction[i].upper())
          output_sequence.append(detokenized_prediction[i].upper())
        input_sequence = " ".join(input_sequence[prediction_size:])

    return output_sequence

def predict_notes_thresholded_probability_distribution(input_sequence: list, prediction_size: int, prediction_count: int, model, tokenizer, threshold=0.0):
    output_sequence = input_sequence.split(" ")

    for i in range(prediction_count):
        tokenized_input = tokenizer.texts_to_sequences(input_sequence.split(" "))
        input_batch = np.array([tokenized_input for _ in range(64)])

        logits = torch.tensor(model.predict(input_batch, verbose=0))
        max_logit = torch.max(logits[0][:prediction_size])

        threshold = max_logit * threshold

        filtered_logits = logits.clone()
        filtered_logits[filtered_logits < threshold] = float('-inf')

        sampled_token_indices = torch.multinomial(torch.exp(filtered_logits)[0], 1).squeeze()
        detokenized_prediction = detokenizer(sampled_token_indices.tolist(), tokenizer)

        input_sequence = input_sequence.split()
        for i in range(prediction_size):
            input_sequence.append(detokenized_prediction[i].upper())
            output_sequence.append(detokenized_prediction[i].upper())
        input_sequence = " ".join(input_sequence[prediction_size:])

    return output_sequence

# **MIDI Functions**

In [44]:
def display_audio(pm: pretty_midi.PrettyMIDI, seconds=30):
    _SAMPLING_RATE = 16000
    waveform = pm.fluidsynth(fs=_SAMPLING_RATE)
    waveform_short = waveform[:seconds*_SAMPLING_RATE]
    return display.Audio(waveform_short, rate=_SAMPLING_RATE)

def add_note_to_midi(midi, channel, pitch, time, duration, volume, instrument):
    """
    Adds a new note to a MIDI file.

    Args:
        midi (MIDIFile): The MIDIFile object to which the note will be added.
        channel (int): The MIDI channel on which to play the note (0-15).
        pitch (int): The MIDI note number (0-127).
        time (float): The start time of the note in beats.
        duration (float): The duration of the note in beats.
        volume (int): The volume of the note (0-127).
        instrument (int): The instrument program number (0-127).
    """
    midi.addProgramChange(channel, 0, time, instrument)
    midi.addNote(channel, 0, pitch, time, duration, volume)

def prediction_to_midi(predicted_sequence: list, file_path: str):
    midi = MIDIFile(1, deinterleave=False)
    midi.addTempo(0, 0, 120)  # Tempo in BPM

    step=0
    for note in predicted_sequence:
        # Add a note with channel 0, pitch 60 (C4), start time 0, duration 1 beat, volume 100, and instrument 0 (piano)
        if "_" in note:
          note_list = note.split("_")
          for note in note_list:
            if note=="<PAD>" or note=="<pad>":
                pass
            else:
                add_note_to_midi(midi, 0, pretty_midi.note_name_to_number(note), step, 0.6, 65, 0)
        else:
            if note=="<PAD>" or note=="<pad>":
                pass
            else:
                add_note_to_midi(midi, 0, pretty_midi.note_name_to_number(note), step, 0.6, 65, 0)
        step+=0.6

    with open(file_path, "wb") as file:
        midi.writeFile(file)

# **Transformer Architecture**

In [47]:
def transformer_model(input_vocab_size, output_vocab_size, max_seq_length, d_model=128, num_heads=4, num_layers=2, dropout_rate=0.1):
    inputs = Input(shape=(max_seq_length,), dtype=tf.int32)
    outputs = inputs  # Placeholder

    # Embedding layers
    embedding_layer = tf.keras.layers.Embedding(input_vocab_size, d_model)
    outputs = embedding_layer(outputs)

    # Positional encoding
    positional_encoding = tf.keras.layers.Embedding(max_seq_length, d_model)
    position = tf.range(start=0, limit=max_seq_length, delta=1)
    position = positional_encoding(position)
    outputs += position

    # Transformer blocks
    for _ in range(num_layers):
        # Multi-head self-attention
        attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(outputs, outputs)
        attention_output = tf.keras.layers.Dropout(dropout_rate)(attention_output)
        attention_output = tf.keras.layers.LayerNormalization(epsilon=1e-7)(outputs + attention_output)

        # Layer normalization before feed-forward network
        attention_output = tf.keras.layers.LayerNormalization(epsilon=1e-7)(attention_output)

        # Feed forward network with additional depth and non-linearity
        dense_output = tf.keras.layers.Dense(units=d_model*4, activation='gelu')(attention_output)
        dense_output = tf.keras.layers.Dense(units=d_model, activation='gelu')(dense_output)
        dense_output = tf.keras.layers.Dropout(dropout_rate)(dense_output)

        # Residual connection
        outputs = tf.keras.layers.LayerNormalization(epsilon=1e-7)(attention_output + dense_output)

    # Output layer
    outputs = Dense(output_vocab_size, activation='softmax')(outputs)

    return Model(inputs=inputs, outputs=outputs)

# **Corpus Creation, Tokenizer Creation and Data Preparation Pipeline**

In [46]:
def corpus_tokenizer_data_preparation(composer: str, input_seq_size: int, output_seq_size: int, threshold=1):
  train_corpus = create_corpus("train")
  validation_corpus = create_corpus("validation")
  test_corpus = create_corpus("test")

  total_corpus = validation_corpus + train_corpus
  print("Composer: ", composer)
  print("Train corpus size: ", len(train_corpus))
  print("Validation corpus size: ", len(validation_corpus))
  print("Test corpus size: ", len(test_corpus))
  print("Total corpus size: ", len(total_corpus))

  tokenizer, vocab_size = create_filtered_tokenizer(total_corpus, threshold)

  max_seq_length = max(input_seq_size, output_seq_size)

  validation_data, _ = prepare_training_data(max_seq_length=max_seq_length, input_seq_size=input_seq_size, output_seq_size=output_seq_size, corpus=validation_corpus, tokenizer=tokenizer)
  train_data, _ = prepare_training_data(max_seq_length=max_seq_length, input_seq_size=input_seq_size, output_seq_size=output_seq_size, corpus=train_corpus, tokenizer=tokenizer)

  print("Vocabulary size: ", vocab_size)
  print("Train data size: ", len(train_data[0]))
  print("Validation data size: ", len(validation_data[0]))

  return (train_data, validation_data, tokenizer, vocab_size, max_seq_length, total_corpus)

# **Training**

**Wolgang Amadeus Mozart**
*   **Window=(50,50)**
*   **Filter=1**
*   **d_model=128**
*   **n_heads=4**
*   **n_layers=2**
*   **Dropout=0.25**

In [50]:
composer = "Wolfgang Amadeus Mozart"
input_seq_size, output_seq_size, threshold = 50, 50, 1

(train_data, validation_data, tokenizer, vocab_size, max_seq_length, total_corpus) = corpus_tokenizer_data_preparation(composer=composer, input_seq_size=input_seq_size, output_seq_size=output_seq_size, threshold=threshold)

['C5 A#4 A4 G4 F4 A4 G4 A#4 E4 F4 C4 C5 A#4 A4 G4 F4 A4 G4 A#4 E4 A3_F3 F4 C4 A3 C4 A3 C4 A3 C4 A#3_F#4_F3 C4_G4 A#3_A4 A#4_C4 A#3_C5 C#5_C4 A#3_D5 C4_E5 A3_F3_G5 C4_F5 A3_E5 C4_D5 A3_D5 C4_C5 A#4_A3 A4_C4 A#3_E4_F3_G4 C4 A#3_A4_G4 C4_F#4 A#3_G4 C4 A#3_C5 C4 A3_F4 C4 A#3 A3 G3 F3 A3 G3 A#3 E3 F3 C3 C4 A#3 A3 G3 F3 A3 G3 A#3 E3 C5_F3 C6 A5 C6 C5 C6 A5 C6 C5_F#3 C6_G3 A#5_A3 A#3_C6 C4_C5 C#4_C6 A#5_D4 C6_E4 C5_G4 C6_F4 A5_E4 C6_D4 C5_D4 C4_C6 A#3_A5 A3_C6 A#3_F#5 E4_G5 A5_C4 A#5_E4 A#3_C6 C#6_G4 C4_D6 E6_G4 A3_F6 F5 A4_C5_F6 A#4_E6_G4 A4_D6_F4 D5 A4_D6_F4 C6_E4_G4 A#5_D4_F4 A#4 A#5_D4_F4 A5_C4_E4 A#3_D4_G5 G4 D4_G5 A3_C4_F5 A#3_F5_G3 E5 A#3_E3 A5 G5 F5 E5 D5 C5 A3_C5_F3 F5 E5 D#5 A#2_C#5_G3 D5 A#4 G4 F4 A3_C3 A#3 G4 F4 E4 F4 A4_E4 G4 B3_D3_F3 F4 A5 G5 F5 E5 D5 C5 B4 A#4_F#3 E5_G3 A3_C5 A#3_E5 A#4_C4 C#4_G5 C5_D4 E4_G5 A4_F4 F3 A4_C5_F4 E4 D4 D3 A4_D4_F5 C4 A#3 A#2 A#3_D5_F5 A3 G3 G2 A#5_D5_G3 F3 A#5_C#5_F3 C5_E3 A#4_E5 A4 G4 F4 E4 D4 C4 A4_C4_F5 A#4_A5 G5 F5 E5 D4_D5 C5_E4 A4_C5_F4 F#4 A

In [None]:
mozart_improved_transformer = transformer_model(input_vocab_size=vocab_size, output_vocab_size=vocab_size, max_seq_length=max_seq_length, dropout_rate=0.25)
mozart_improved_transformer.compile(optimizer=Adam(), loss=SparseCategoricalCrossentropy(), metrics=[SparseCategoricalAccuracy()])
mozart_improved_transformer.fit(train_data[0], train_data[1], validation_data=(validation_data[0], validation_data[1]), batch_size=64, epochs=1, verbose=1)