# Organize Dataset

In [1]:
import glob
import mido

NOTE_MAX = 119.0

def get_message_value(msg):
    """Encode a note message as a signed, scaled pitch value.

    The message's ``note`` is divided by ``NOTE_MAX``. The value is negated
    when the message is a ``note_off`` or carries a velocity of 0, and left
    positive otherwise.

    Args:
        msg: A message object exposing ``type``, ``velocity`` and ``note``.

    Returns:
        The scaled note value; negative for releases, positive for presses.
    """
    # Short-circuit keeps ``velocity`` untouched for explicit note_off messages.
    released = msg.type == "note_off" or msg.velocity == 0
    # Negate before dividing so the arithmetic matches the signed-note form.
    signed_note = -msg.note if released else msg.note
    return signed_note / NOTE_MAX

def track_to_notes(track):
    """Convert a track into a list of encoded note values.

    Only messages whose ``type`` starts with ``"note_"`` are kept; each one
    is encoded with ``get_message_value``. Message order is preserved.

    Args:
        track: An iterable of message objects.

    Returns:
        A list with one encoded value per note message.
    """
    values = []
    for message in track:
        if message.type.startswith("note_"):
            values.append(get_message_value(message))
    return values

def get_author_tracks(author):
    """Collect every track from the MIDI files found for one author.

    Each path matched under ``dataset/<author>/`` is opened with
    ``mido.MidiFile`` and all of its tracks are appended to one flat list.

    NOTE: ``glob.glob`` is called without ``recursive=True``, so the ``**``
    in the pattern does not descend into sub-directories.

    Args:
        author: Name of the author's directory inside ``dataset/``.

    Returns:
        A flat list of tracks from all matched files.
    """
    pattern = "dataset/" + author + "/**"
    collected = []
    for path in glob.glob(pattern):
        collected += mido.MidiFile(path).tracks
    return collected

def get_author_note_list(author, min_note_count=600):
    """Return the encoded note lists of an author's sufficiently long tracks.

    Args:
        author: Name of the author's directory inside ``dataset/``.
        min_note_count: Minimum number of note values a track must contain
            to be kept.

    Returns:
        A list of note-value lists, one per kept track.
    """
    kept = []
    for track in get_author_tracks(author):
        notes = track_to_notes(track)
        # Tracks with too few notes are dropped.
        if len(notes) >= min_note_count:
            kept.append(notes)
    return kept

def get_author_min_note_list(note_lists):
    """Return the length of the shortest list in ``note_lists``.

    Args:
        note_lists: A non-empty iterable of sized sequences.

    Returns:
        The smallest ``len()`` among the given sequences.

    Raises:
        ValueError: If ``note_lists`` is empty.
    """
    return min(len(notes) for notes in note_lists)

def divide_note_list_into_inputs(note_list, group_size=600):
    """Split a note list into consecutive, equally sized chunks.

    Only complete chunks are returned; trailing values that do not fill a
    whole chunk are discarded.

    Args:
        note_list: The sequence of note values to split.
        group_size: Number of values per chunk. Must be a positive integer.

    Returns:
        A list of slices of ``note_list``, each exactly ``group_size`` long.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")
    # The chunk count must follow group_size (it was previously fixed at 600,
    # which produced wrong results for any other group size).
    chunk_count = len(note_list) // group_size
    return [
        note_list[group_size * i: group_size * (i + 1)]
        for i in range(chunk_count)
    ]

def get_tracks_chunks(tracks):
    """Chunk every track and return all chunks in one flat list.

    Each track is split with ``divide_note_list_into_inputs`` using its
    default group size; chunks keep track order, then in-track order.

    Args:
        tracks: An iterable of note-value lists.

    Returns:
        A flat list containing the chunks of all tracks.
    """
    return [
        chunk
        for track in tracks
        for chunk in divide_note_list_into_inputs(track)
    ]

# Load the encoded note lists for each composer. With the default
# min_note_count, only tracks holding at least 600 note values are kept.
albeniz_tracks = get_author_note_list("Albéniz Isaac")
agnew_tracks = get_author_note_list("Agnew Roy")
behr_tracks = get_author_note_list("Behr Franz")
liszt_tracks = get_author_note_list("Liszt Franz")
zierau_tracks = get_author_note_list("Zierau Fritz")
frontini_tracks = get_author_note_list("Frontini Francesco Paolo")

# Cut every kept track into fixed-size chunks (default group size), giving
# one flat list of chunks per composer.
albeniz_chunks = get_tracks_chunks(albeniz_tracks)
agnew_chunks = get_tracks_chunks(agnew_tracks)
behr_chunks = get_tracks_chunks(behr_tracks)
liszt_chunks = get_tracks_chunks(liszt_tracks)
zierau_chunks = get_tracks_chunks(zierau_tracks)
frontini_chunks = get_tracks_chunks(frontini_tracks)

# Run Model

In [None]:
import numpy as np
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Flatten, ZeroPadding1D, Reshape, GlobalAveragePooling1D, Dropout

def halve_array(arr):
    """Split a sequence into two halves at its midpoint.

    For an odd length the second half receives the extra element.

    Args:
        arr: A sliceable sequence.

    Returns:
        A ``(first_half, second_half)`` tuple of slices of ``arr``.
    """
    midpoint = len(arr) // 2
    first_half = arr[:midpoint]
    second_half = arr[midpoint:]
    return first_half, second_half

def create_labels(label, leng):
    """Build a list that repeats ``label`` a given number of times.

    Every entry refers to the same ``label`` object (no copies are made).

    Args:
        label: The label to repeat.
        leng: How many entries the resulting list should have.

    Returns:
        A list of length ``leng`` whose items are all ``label``.
    """
    return [label for _ in range(leng)]

def generate_data_from_chunks(chunks, label):
    """Split chunks into two halves and attach a label to every chunk.

    Args:
        chunks: Sequence of input chunks belonging to one class.
        label: The label assigned to each chunk.

    Returns:
        A dict with keys ``"train_data"``, ``"test_data"``,
        ``"train_labels"`` and ``"test_labels"``; the data entries are the
        first and second half of ``chunks``, and each label list has the
        same length as its data half.
    """
    first_half, second_half = halve_array(chunks)
    return {
        "train_data": first_half,
        "test_data": second_half,
        "train_labels": create_labels(label, len(first_half)),
        "test_labels": create_labels(label, len(second_half)),
    }

def train_data_from_datasets(datasets):
    """Stack the samples of all datasets into one array.

    NOTE: both the ``"train_data"`` and the ``"test_data"`` half of every
    dataset are included, in that order, dataset by dataset.

    Args:
        datasets: Iterable of dicts as produced by
            ``generate_data_from_chunks``.

    Returns:
        A NumPy array built from all collected samples.
    """
    samples = [
        sample
        for dataset in datasets
        for sample in dataset["train_data"] + dataset["test_data"]
    ]
    return np.array(samples)

def train_labels_from_datasets(datasets):
    """Stack the labels of all datasets into one array.

    NOTE: both the ``"train_labels"`` and the ``"test_labels"`` half of
    every dataset are included, in that order, dataset by dataset.

    Args:
        datasets: Iterable of dicts as produced by
            ``generate_data_from_chunks``.

    Returns:
        A NumPy array built from all collected labels.
    """
    labels = [
        label
        for dataset in datasets
        for label in dataset["train_labels"] + dataset["test_labels"]
    ]
    return np.array(labels)

# Number of values in a single input chunk, read from the first Albéniz chunk.
# NOTE(review): the same assignment is repeated further down, after the
# test_* helper definitions.
input_dim = len(albeniz_chunks[0])

def test_data_from_datasets(datasets):
    data = []
    for d in datasets:
        data += d["test_data"]
    return np.array(data)

def test_labels_from_datasets(datasets):
    data = []
    for d in datasets:
        data += d["test_labels"]
    return np.array(data)

# Number of values in one input chunk; used below as the model's flat input size.
input_dim = len(albeniz_chunks[0])

# One dataset per composer, each tagged with a six-element one-hot label; the
# position of the 1.0 identifies the composer.
albeniz_dataset = generate_data_from_chunks(albeniz_chunks, [1.0, 0.0, 0.0, 0.0, 0.0, 0.0])
behr_dataset = generate_data_from_chunks(behr_chunks, [0.0, 1.0, 0.0, 0.0, 0.0, 0.0])
frontini_dataset = generate_data_from_chunks(frontini_chunks, [0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
liszt_dataset = generate_data_from_chunks(liszt_chunks, [0.0, 0.0, 0.0, 1.0, 0.0, 0.0])
zierau_dataset = generate_data_from_chunks(zierau_chunks, [0.0, 0.0, 0.0, 0.0, 1.0, 0.0])
agnew_dataset = generate_data_from_chunks(agnew_chunks, [0.0, 0.0, 0.0, 0.0, 0.0, 1.0])
datasets = [albeniz_dataset, behr_dataset, frontini_dataset, liszt_dataset, zierau_dataset, agnew_dataset]

# NOTE: the train_* helpers concatenate both the "train" and the "test" half of
# every dataset, so all chunks end up in these arrays.
train_data = train_data_from_datasets(datasets)
train_labels = train_labels_from_datasets(datasets)

# Separate test arrays are currently disabled.
# test_data = test_data_from_datasets(datasets)
# test_labels = test_labels_from_datasets(datasets)

# Shuffle samples and labels with the same permutation so pairs stay aligned.
final_train, final_labels = shuffle(train_data, train_labels)

# 1-D convolutional classifier over a flat chunk of input_dim values, ending in
# a six-way softmax that matches the six-element one-hot labels.
# NOTE(review): Reshape((30, 20)) hard-codes 30 * 20 = 600 values, so the model
# only fits when input_dim is 600 — confirm before changing the chunk size.
model = Sequential([
    # Fold the flat input into 30 steps of 20 values each.
    Reshape((30, 20), input_shape=(input_dim, )),
    # NOTE(review): input_shape here repeats the Reshape output shape and looks redundant.
    Conv1D(30, 10, activation="relu", input_shape=((30, 20))),
    MaxPooling1D(3),
    Conv1D(10, 2, activation='relu'),
    GlobalAveragePooling1D(),
    Dropout(0.5),
    Dense(6, activation='softmax')
])
model.summary()

# Adam optimizer with an explicit learning rate of 0.003.
opt = keras.optimizers.Adam(learning_rate=0.003)
# NOTE(review): "mse" is combined with a softmax output and one-hot labels;
# categorical cross-entropy is the usual loss for that setup — confirm whether
# MSE is intentional.
model.compile(
  optimizer=opt,
  loss="mse",
  metrics=["accuracy"],
)

# Train for 200 epochs on the shuffled arrays; validation_split=0.2 asks Keras
# to hold out a fraction of the provided data for validation.
model.fit(
  final_train,
  final_labels,
  validation_split=0.2,
  epochs=200,
  shuffle=True
)

# Build a batch of one from the first chunk in the Albéniz "test" half.
# NOTE(review): train_data_from_datasets also includes the "test" halves, so
# this chunk is part of the data passed to fit() and is not an unseen sample.
test_01 = np.array([
    albeniz_dataset["test_data"][0],
])

# Print the six class probabilities predicted for that chunk.
print(model.predict(test_01))