# Setup

## Libraries

### Audio

In [None]:
!pip install pretty-midi
!pip install essentia
!pip install resampy
!pip install librosa

In [None]:
!sudo apt install -y fluidsynth
!pip install midi2audio

### Modelling

In [None]:
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
!pip install tensorflow

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
!pip install sentencepiece

## Imports

In [1]:
# Audio
import os
import librosa
import glob
import pretty_midi
from midi2audio import FluidSynth
import numpy as np
import pandas as pd

# Model
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
import torch

# Load midi data

In [None]:
os.getcwd()
!ls

In [None]:
data_folder = "../data"
midi_file_paths = glob.glob(f"{data_folder}/.mid")[:10]
midi_files = [pretty_midi.PrettyMIDI(p) for p in midi_file_paths]
print(f"Working with {len(midi_files)} files.")

In [None]:
test_title = "Hotel_California_1"
single_midi = pretty_midi.PrettyMIDI(f"{data_folder}/{test_title}.mid")

# Pop2Piano - out of the box

## Create wav file from midi

In [None]:
fs = FluidSynth()
fs.midi_to_audio(f"{data_folder}/{test_title}.mid", f'../test_data/{test_title}.wav')

## Use Pop2Piano to generate a piano arrangement

In [None]:
audio, sr = librosa.load(f"../test_data/{test_title}.wav", sr=44100)  # feel free to change the sr to a suitable value.
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano")
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")

inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt")
model_output = model.generate(input_features=inputs["input_features"], composer="composer1")
tokenizer_output = processor.batch_decode(
    token_ids=model_output, feature_extractor_output=inputs
)["pretty_midi_objects"][0]
tokenizer_output.write(f"../test_data/{test_title}_pop2piano.mid")

### Make piano arrangement sound like drum



In [None]:
piano_to_drum_hack = pretty_midi.PrettyMIDI(f"../test_data/{test_title}_pop2piano.mid")

In [None]:
piano_to_drum_hack.instruments[0].is_drum =True
piano_to_drum_hack.instruments

In [None]:
piano_to_drum_hack.write(f"../test_data/{test_title}_pop2piano_drum_hack.mid")

In [None]:
# This is basically a test thing
embedding = torch.nn.Embedding(num_embeddings=213356, embedding_dim=100)
output = embedding(padded_guitar_tensor) 

# Load and pad pre-tokenized data

## Load tokenized data from pickle

In [2]:
df = pd.read_pickle("../../midi_df_2199.pkl")
df.head(2)

Unnamed: 0,song_name,guitar_tracks,drum_tracks,standardized_guitar_bars,standardized_drum_bars,tokenized_guitar,tokenized_drums
0,data/No_Son_Of_Mine.mid,"Instrument(program=28, is_drum=False, name=""No...","Instrument(program=16, is_drum=True, name=""No ...","[[], [], [], [], [], [Note(start=1.022917, end...","[[], [], [], [], [], [], [], [], [], [], [], [...","[5, 5, 5, 5, 5, 165357, 164307, 163344, 162387...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
1,data/Deja-Vu.mid,"Instrument(program=30, is_drum=False, name=""Gu...","Instrument(program=0, is_drum=True, name=""Drums"")","[[], [], [], [], [], [], [], [], [], [Note(sta...","[[], [], [], [], [], [], [], [], [Note(start=1...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 168616, 82913, 711...","[5, 5, 5, 5, 5, 5, 5, 5, 212531, 147704, 14708..."


In [3]:
type(df["tokenized_guitar"][0])

list

## Add padding to make all rows same length

In [4]:
#df['tokenized_guitar_list'] = df['tokenized_guitar'].apply(eval)
#df['tokenized_drums_list'] = df['tokenized_drums'].apply(eval)

In [5]:
pad_to_max_len_list = max(df['tokenized_guitar'].apply(lambda x : len(x)))
pad_to_max_len_list

1005

In [6]:
df['tokenized_guitar_padded'] = df['tokenized_guitar'].apply(
    lambda x: np.pad(x, pad_width=(0, (pad_to_max_len_list-len(x))), mode='constant', constant_values=0))
df['tokenized_drums_padded'] = df['tokenized_drums'].apply(
    lambda x: np.pad(x, pad_width=(0, (pad_to_max_len_list-len(x))), mode='constant', constant_values=0))

In [7]:
df['tokenized_guitar_padded'].to_numpy().shape

(2199,)

In [8]:
padded_guitar_tensor = torch.LongTensor(df['tokenized_guitar_padded'].apply(lambda x: x.astype("int32")))
padded_drum_tensor = torch.LongTensor(df['tokenized_drums_padded'].apply(lambda x: x.astype("int32")))

  padded_guitar_tensor = torch.LongTensor(df['tokenized_guitar_padded'].apply(lambda x: x.astype("int32")))


In [9]:
padded_guitar_tensor.size()

torch.Size([2199, 1005])

# Train a T5 model with a custom config

In [11]:
from transformers import T5Config, T5ForConditionalGeneration

## Initialize a model with custom config

In [12]:
# Change the config!
config = T5Config()
config.vocab_size = 213356
config.decoder_start_token_id = 0
config

T5Config {
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.39.0.dev0",
  "use_cache": true,
  "vocab_size": 213356
}

In [13]:
custom_model = T5ForConditionalGeneration(config)

## Train the model

In [14]:
# forward pass
forward_pass = custom_model(input_ids=padded_guitar_tensor[:2], labels=padded_drum_tensor[:2])

In [16]:
forward_pass.loss

tensor(8.0127, grad_fn=<NllLossBackward0>)

## Make prediction

In [23]:
# Prep input data!
pred_input = [df["tokenized_guitar"][800]]

padded_pred_input = np.pad(
    pred_input, pad_width=(0, (pad_to_max_len_list-len(pred_input))), mode='constant', constant_values=0
)

padded_pred_tensor = torch.LongTensor(pred_input)

In [24]:
# The prediction input encoded
padded_pred_tensor

tensor([[     5,      5,  76202,  91139, 166530,  56924, 164763,  82674,  89941,
         162018,  89361,  89096,  88927,  80625,  64691, 156708,  87623, 154961,
         153994,  66694, 161171,  24913, 150738,  59593,  85453, 148377,  23963,
          23565,  23261, 145508, 144760, 144029, 143333, 142661,  21038,  48864,
         154025,  20584, 139481,  80672,  80427, 152015,  20627,  20514,  20404,
          20313,  34577, 134637,  20117, 133551,  42513,  39156,  19517,  40926,
          76101, 130649,  85784,      6]])

In [26]:
outputs = custom_model.generate(padded_pred_tensor, max_length=1000)
print(outputs)

tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

# Playing around with T5
Inspired by pop2piano! Check it out here: https://github.com/sweetcocoa/pop2piano/tree/main

## Let's explore the T5

In [None]:
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, T5Model

### Let's play around with a pretrained T5 model first

In [None]:
# load the tokenizers and model
pretrained_tokenizer = T5Tokenizer.from_pretrained("t5-small") # vocab size is 32100.
predtrained_model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [None]:
# for a phrase get the tokenised input ids
input_ids = tokenizer("translate English to French: I am going to the party.", return_tensors="pt").input_ids
# use the input ids to generte output
outputs = model.generate(input_ids)
# decode the output token ids to text
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
## Output --> 
## Ich werde zur Partei gehen.

### Tokenize with t5

In [None]:
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

In [None]:
''.join(str(df["tokenized_guitar"][0]))

In [None]:
# the following 2 hyperparameters are task-specific
max_source_length = 512
max_target_length = 128

# goal
task_prefix = "translate guitar to drums"

# Suppose we have the following training examples:
input_sequences = df["tokenized_guitar"].apply(lambda x: ''.join(str(x)))
output_sequences = df["tokenized_drums"].apply(lambda x: ''.join(str(x)))
# encode the inputs

encoding = tokenizer(
    [task_prefix + sequence for sequence in input_sequences],
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

# encode the targets
target_encoding = tokenizer(
    output_sequences,
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)
labels = target_encoding.input_ids

# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100

# forward pass
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
loss.item()

In [None]:
import tensorflow as tf

guitar_vector = df["tokenized_guitar"][:2].apply(eval).to_numpy()
drum_vector = df["tokenized_drums"][:2].apply(eval).to_numpy()

ragged_guitar_tensor = tf.ragged.constant(guitar_vector)
padded_guitar_tensor = ragged_guitar_tensor.to_tensor(default_value=0)

ragged_drum_tensor = tf.ragged.constant(drum_vector)
padded_drum_tensor = ragged_drum_tensor.to_tensor(default_value=0)


In [None]:
guitar_vector_padded = np.pad(guitar_vector, 'pad_width', mode='constant') 
guitar_vector_padded