# Setup

## Libraries

### Audio

In [None]:
!pip install pretty-midi
!pip install essentia
!pip install resampy
!pip install librosa

In [None]:
!sudo apt install -y fluidsynth
!pip install midi2audio

### Modelling

In [None]:
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
!pip install tensorflow

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
!pip install sentencepiece

## Imports

In [None]:
# Audio
import os
import librosa
import glob
import pretty_midi
from midi2audio import FluidSynth
import numpy as np

# Model
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
import torch

# Data

In [None]:
os.getcwd()
!ls

In [None]:
data_folder = "../data"
midi_file_paths = glob.glob(f"{data_folder}/.mid")[:10]
midi_files = [pretty_midi.PrettyMIDI(p) for p in midi_file_paths]
print(f"Working with {len(midi_files)} files.")

In [None]:
test_title = "Hotel_California_1"
single_midi = pretty_midi.PrettyMIDI(f"{data_folder}/{test_title}.mid")

# Pop2Piano - out of the box

## Create wav file from midi

In [None]:
fs = FluidSynth()
fs.midi_to_audio(f"{data_folder}/{test_title}.mid", f'../test_data/{test_title}.wav')

## Use Pop2Piano to generate a piano arrangement

In [None]:
audio, sr = librosa.load(f"../test_data/{test_title}.wav", sr=44100)  # feel free to change the sr to a suitable value.
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano")
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")

inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt")
model_output = model.generate(input_features=inputs["input_features"], composer="composer1")
tokenizer_output = processor.batch_decode(
    token_ids=model_output, feature_extractor_output=inputs
)["pretty_midi_objects"][0]
tokenizer_output.write(f"../test_data/{test_title}_pop2piano.mid")

### Make piano arrangement sound like drum



In [None]:
piano_to_drum_hack = pretty_midi.PrettyMIDI(f"../test_data/{test_title}_pop2piano.mid")

In [None]:
piano_to_drum_hack.instruments[0].is_drum =True
piano_to_drum_hack.instruments

In [None]:
piano_to_drum_hack.write(f"../test_data/{test_title}_pop2piano_drum_hack.mid")

# Custom implementation of T5
Inspired by pop2piano! Check it out here: https://github.com/sweetcocoa/pop2piano/tree/main

## Let's explore the T5

In [23]:
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, T5Model

### Let's play around with a pretrained T5 model first

In [24]:
# load the tokenizers and model
pretrained_tokenizer = T5Tokenizer.from_pretrained("t5-small") # vocab size is 32100.
predtrained_model = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# for a phrase get the tokenised input ids
input_ids = tokenizer("translate English to French: I am going to the party.", return_tensors="pt").input_ids
# use the input ids to generte output
outputs = model.generate(input_ids)
# decode the output token ids to text
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
## Output --> 
## Ich werde zur Partei gehen.

### Let's try some finetuning

In [10]:
import pandas as pd

In [11]:
df = pd.read_pickle("../../midi_df_2199.pkl")
df.head(2)

Unnamed: 0,song_name,guitar_tracks,drum_tracks,standardized_guitar_bars,standardized_drum_bars,tokenized_guitar,tokenized_drums
0,data/No_Son_Of_Mine.mid,"Instrument(program=28, is_drum=False, name=""No...","Instrument(program=16, is_drum=True, name=""No ...","[[], [], [], [], [], [Note(start=1.022917, end...","[[], [], [], [], [], [], [], [], [], [], [], [...","[5, 5, 5, 5, 5, 165357, 164307, 163344, 162387...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
1,data/Deja-Vu.mid,"Instrument(program=30, is_drum=False, name=""Gu...","Instrument(program=0, is_drum=True, name=""Drums"")","[[], [], [], [], [], [], [], [], [], [Note(sta...","[[], [], [], [], [], [], [], [], [Note(start=1...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 168616, 82913, 711...","[5, 5, 5, 5, 5, 5, 5, 5, 212531, 147704, 14708..."


In [6]:
type(df["tokenized_guitar"][0])

list

### Tokenize with t5

In [25]:
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [64]:
''.join(str(df["tokenized_guitar"][0]))

'[5, 5, 5, 5, 5, 165357, 164307, 163344, 162387, 161460, 160533, 159366, 158322, 157372, 156384, 155451, 154554, 153580, 152724, 151878, 151032, 150226, 149141, 148290, 147523, 146739, 145988, 145203, 144313, 143593, 142861, 142204, 141529, 140677, 140034, 139408, 138811, 138192, 137596, 136953, 136402, 135839, 135292, 134784, 133968, 133420, 132890, 132387, 131908, 131460, 130923, 130476, 130015, 129594, 129176, 128490, 128098, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 120200, 119936, 119693, 119443, 119235, 119003, 118789, 118352, 118146, 117938, 117750, 117557, 117387, 117171, 116995, 116825, 116643, 116466, 116195, 116056, 115892, 115748, 115586, 115441, 115284, 115143, 114995, 114864, 114704, 114422, 114308, 114185, 114076, 113949, 113844, 113729, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 111430, 111373, 111266, 111215, 111156, 111110, 111014, 110974, 110923, 110873, 110817, 110774, 110706, 110622, 110596, 110544, 110513,

In [65]:
# the following 2 hyperparameters are task-specific
max_source_length = 512
max_target_length = 128

# goal
task_prefix = "translate guitar to drums"

# Suppose we have the following training examples:
input_sequences = df["tokenized_guitar"].apply(lambda x: ''.join(str(x)))
output_sequences = df["tokenized_drums"].apply(lambda x: ''.join(str(x)))
# encode the inputs

encoding = tokenizer(
    [task_prefix + sequence for sequence in input_sequences],
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

# encode the targets
target_encoding = tokenizer(
    output_sequences,
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)
labels = target_encoding.input_ids

# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100

# forward pass
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
loss.item()

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
import tensorflow as tf

guitar_vector = df["tokenized_guitar"][:2].apply(eval).to_numpy()
drum_vector = df["tokenized_drums"][:2].apply(eval).to_numpy()

ragged_guitar_tensor = tf.ragged.constant(guitar_vector)
padded_guitar_tensor = ragged_guitar_tensor.to_tensor(default_value=0)

ragged_drum_tensor = tf.ragged.constant(drum_vector)
padded_drum_tensor = ragged_drum_tensor.to_tensor(default_value=0)


In [None]:
guitar_vector_padded = np.pad(guitar_vector, 'pad_width', mode='constant') 
guitar_vector_padded

### Add padding to rows

In [None]:
#df['tokenized_guitar_list'] = df['tokenized_guitar'].apply(eval)
#df['tokenized_drums_list'] = df['tokenized_drums'].apply(eval)

In [12]:
pad_to_max_len_list = max(df['tokenized_guitar'].apply(lambda x : len(x)))
pad_to_max_len_list

1005

In [13]:
df['tokenized_guitar_padded'] = df['tokenized_guitar'].apply(
    lambda x: np.pad(x, pad_width=(0, (pad_to_max_len_list-len(x))), mode='constant', constant_values=0))
df['tokenized_drums_padded'] = df['tokenized_drums'].apply(
    lambda x: np.pad(x, pad_width=(0, (pad_to_max_len_list-len(x))), mode='constant', constant_values=0))

In [14]:
df['tokenized_guitar_padded'].to_numpy().shape

(2199,)

In [15]:
padded_guitar_tensor = torch.IntTensor(df['tokenized_guitar_padded'].apply(lambda x: x.astype("int32")))
padded_drum_tensor = torch.IntTensor(df['tokenized_drums_padded'].apply(lambda x: x.astype("int32")))

  padded_guitar_tensor = torch.IntTensor(df['tokenized_guitar_padded'].apply(lambda x: x.astype("int32")))


In [16]:
padded_guitar_tensor.size()

torch.Size([2199, 1005])

### Create a T5 model with a custom config

In [17]:
from transformers import T5Config, T5ForConditionalGeneration

In [18]:
# This is basically a test thing
embedding = torch.nn.Embedding(num_embeddings=213356, embedding_dim=100)
output = embedding(padded_guitar_tensor) 

In [19]:
# Change the config!
config = T5Config()
config.vocab_size = 213356
config

T5Config {
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.39.0.dev0",
  "use_cache": true,
  "vocab_size": 213356
}

In [20]:
custom_model = T5ForConditionalGeneration(config)

In [None]:
# forward pass
custom_model(input_ids=padded_guitar_tensor[:100], labels=padded_drum_tensor[:100])

In [None]:
test_input = df["tokenized_guitar"][800]
input_ids = tokenizer(f"{test_input}", return_tensors="pt").input_ids
outputs = model.generate(input_ids, max_length=1000)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(len(decoded))
print(decoded)

In [None]:
print(eval(decoded.split("[")[1]))

In [None]:
df["tokenized_drums"][800]

In [None]:
df["tokenized_guitar"][800]

Song at idx 500: (5, 5, 168967, 86700, 168207, 82728, 167405, 78644, 166499, 74742, 165637, 71124, 164729, 67670, 163806, 64600, 162910, 61545, 162000, 58782, 161132, 56356, 160270, 53915, 159321, 51697, 158392, 49733, 157471, 47920)