# Setup

## Libraries

### Audio

In [None]:
!pip install pretty-midi
!pip install essentia
!pip install resampy
!pip install librosa

In [None]:
!sudo apt install -y fluidsynth
!pip install midi2audio

### Modelling

In [None]:
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
!pip install tensorflow

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
!pip install sentencepiece

## Imports

In [None]:
# Audio
import os
import librosa
import glob
import pretty_midi
from midi2audio import FluidSynth

# Model
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
import torch

# Data

In [None]:
os.getcwd()
!ls

In [None]:
# Collect up to ten MIDI files from the data folder and parse each one with pretty_midi.
data_folder = "../data"
# "*.mid" matches every file ending in .mid (a bare "/.mid" only matches a file literally named ".mid").
# Sorting makes the selected subset deterministic, because glob returns paths in arbitrary order.
midi_file_paths = sorted(glob.glob(f"{data_folder}/*.mid"))[:10]
midi_files = [pretty_midi.PrettyMIDI(p) for p in midi_file_paths]
print(f"Working with {len(midi_files)} files.")

In [None]:
# Pick one song as the running example; the cells below build their file names from test_title.
test_title = "Hotel_California_1"
single_midi_path = f"{data_folder}/{test_title}.mid"
single_midi = pretty_midi.PrettyMIDI(single_midi_path)

# Pop2Piano - out of the box

## Create wav file from midi

In [None]:
# Render the example MIDI file to a WAV file with FluidSynth.
# Create the output folder first so the render does not fail when it is missing.
os.makedirs("../test_data", exist_ok=True)
fs = FluidSynth()
fs.midi_to_audio(f"{data_folder}/{test_title}.mid", f'../test_data/{test_title}.wav')

## Use Pop2Piano to generate a piano arrangement

In [None]:
# Load the pretrained Pop2Piano checkpoint together with its processor.
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano")

# Read the rendered audio; sr can be changed to another suitable value.
audio, sr = librosa.load(f"../test_data/{test_title}.wav", sr=44100)

# Extract input features, generate output tokens, then decode them into a PrettyMIDI object.
inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt")
model_output = model.generate(input_features=inputs["input_features"], composer="composer1")
decoded = processor.batch_decode(token_ids=model_output, feature_extractor_output=inputs)
tokenizer_output = decoded["pretty_midi_objects"][0]

# Save the generated arrangement as a MIDI file.
tokenizer_output.write(f"../test_data/{test_title}_pop2piano.mid")

### Make the piano arrangement sound like drums



In [None]:
# Reload the generated arrangement from disk so it can be modified below.
pop2piano_midi_path = f"../test_data/{test_title}_pop2piano.mid"
piano_to_drum_hack = pretty_midi.PrettyMIDI(pop2piano_midi_path)

In [None]:
# Flag the first instrument track as a drum track, then display the instrument list.
drum_track = piano_to_drum_hack.instruments[0]
drum_track.is_drum = True
piano_to_drum_hack.instruments

In [None]:
# Save the drum-flagged version under its own file name.
drum_hack_path = f"../test_data/{test_title}_pop2piano_drum_hack.mid"
piano_to_drum_hack.write(drum_hack_path)

# Custom implementation of T5
Inspired by pop2piano! Check it out here: https://github.com/sweetcocoa/pop2piano/tree/main

## Let's explore the T5

In [23]:
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, T5Model

### Let's play around with a pretrained T5 model first

In [24]:
# Load the pretrained T5 tokenizer and model from the same checkpoint.
t5_checkpoint = "t5-small"
pretrained_tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint)  # vocab size is 32100.
# NOTE: "predtrained_model" is misspelled; the name is kept so existing references keep working.
predtrained_model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Tokenise a prompt into input ids with the pretrained T5 tokenizer loaded above.
# (`tokenizer` is not defined until a later cell, and `model` is still the Pop2Piano model here.)
input_ids = pretrained_tokenizer("translate English to French: I am going to the party.", return_tensors="pt").input_ids
# Use the input ids to generate output token ids with the pretrained T5 model.
outputs = predtrained_model.generate(input_ids)
# Decode the output token ids back to text.
print(pretrained_tokenizer.decode(outputs[0], skip_special_tokens=True))
## Output -->
## Ich werde zur Partei gehen.
## NOTE(review): the recorded output above is German although the prompt asks for French;
## it presumably comes from an earlier run with a different prompt — re-run to refresh.

### Let's try some finetuning

In [2]:
import pandas as pd

In [22]:
# Load the dataset of tokenized guitar/drum sequences and preview its first row.
midi_df_path = "../../midi_df_full.csv"
df = pd.read_csv(midi_df_path)
df.head(1)

Unnamed: 0.1,Unnamed: 0,song_name,guitar_tracks,drum_tracks,standardized_guitar_bars,standardized_drum_bars,tokenized_guitar,tokenized_drums
0,0,data/No_Son_Of_Mine.mid,"Instrument(program=28, is_drum=False, name=""No...","Instrument(program=16, is_drum=True, name=""No ...","[[], [], [], [], [], [Note(start=1.022917, end...","[[], [], [], [], [], [], [], [], [], [], [], [...","[5, 5, 5, 5, 5, 165357, 164307, 163344, 162387...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load a fresh T5-small tokenizer and model for the guitar -> drums experiment.
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

# the following 2 hyperparameters are task-specific
max_source_length = 512
max_target_length = 128

# Use the first 30 guitar/drum pairs as training examples.
input_sequences = df["tokenized_guitar"][:30].to_list()
output_sequences = df["tokenized_drums"][:30].to_list()


# encode the inputs
# The prefix ends with ": " so it is separated from the sequence and matches the
# "translate guitar to drums: ..." prompt used when generating from the model.
task_prefix = "translate guitar to drums: "

encoding = tokenizer(
    [task_prefix + sequence for sequence in input_sequences],
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

# encode the targets
target_encoding = tokenizer(
    output_sequences,
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)
labels = target_encoding.input_ids

# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100

# forward pass only: this computes the loss for the batch; no optimizer step is taken here
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
loss.item()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [26]:
# Prompt the model with one guitar sequence from the dataframe and print the decoded generation.
test_input = df["tokenized_guitar"][2000]
prompt = f"translate guitar to drums: {test_input}"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(input_ids, max_length = 200)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

to drums: [5, 91376, 168933, 168599, 168159, 167837, 167349, 166984, 166444, 166113, 165585, 165218, 164652, 89376, 89168, 88986, 108102, 84419, 166245, 87813, 165890, 87643, 81801, 100528, 80905, 80577, 107187, 103470, 1578
