In [None]:
import json
import pretty_midi
import numpy as np
import pandas as pd
#import my methods
from modules.midiMethods import *
from modules.dataMethods import *
import modules.models as models
import modules.mlClasses as mlClasses

## Data Generation
The code blocks below generate X sequences only; that is, they have no target sequences. This is because the targets are simply the original sequence shifted along by a single time step, so storing them separately would result in a lot of redundant storage.

The following code blocks are not comprehensive, but provide some examples of how to use my methods to generate training data, or chroma data from training data.

### Read in information about where data is stored
The following reads in the filenames and locations of the train and validation partitions of the MAESTRO dataset.

In [None]:
# Directory handed to the data-generation helpers together with the MIDI
# filenames below (presumably the root the filenames are relative to - confirm).
data_path = 'training_data/MaestroV2.00/maestro-v2.0.0/'

# MAESTRO metadata table; the first CSV column is used as the index.
maestro = pd.read_csv('training_data/maestro-v2.0.0.csv', index_col=0)

# MIDI filenames belonging to the train and validation partitions.
filenames_train = maestro.loc[maestro['split'] == 'train', 'midi_filename'].tolist()
filenames_val = maestro.loc[maestro['split'] == 'validation', 'midi_filename'].tolist()

### Generating Performance Representation Data
Throughout this project, I referred to Performance Representation as oore. The following produces training and validation examples of the second version of oore that I used, which represented velocity and time shifts at lower resolution than the original paper, using 20ms increments for time shifts, and 16 possible velocity values.
Training examples are produced at three speeds. Harmonic augmentation is not stored as separate data; it is applied on the fly.

In [None]:
# Generate oore (Performance Representation, v2) examples and store them as JSON.
#
# for oore, get 601 so that we can use 600 at train time
# (Maybe I corrected in the function for this already!)

# Training data: one file per playback speed.
for speed in [0.9, 1, 1.1]:
    X = get_processed_oore2_data(data_path, filenames_train, skip=1, n_events=601, speed=speed)
    print('examples in X: ', len(X))
    with open(f'training_data/oore_v2/oore2_train_{speed}.json', 'w') as f:
        json.dump(X, f)

# Validation data: a single file at the original speed. The speed is given
# explicitly rather than reusing the loop variable, which would still hold the
# last training speed (1.1) and silently time-stretch the validation set.
X_val = get_processed_oore2_data(data_path, filenames_val, skip=1, n_events=601, speed=1)
# Report the size of the validation set itself (not of the last training set).
print('examples in X_val: ', len(X_val))
with open('training_data/oore_v2/oore2_val.json', 'w') as f:
    json.dump(X_val, f)


### Generating chroma for NoteTuple data
The first code block generates note bin data (called NoteTuple in the original paper in which it was introduced), whilst the second generates the corresponding chroma data.

Chroma takes up quite a bit of space, especially since three different versions are needed for the different speeds. Ideally, the speed augmentation would take place as the data is being fed into the model, which would mean only one version of the chroma is needed, but there is a potential bottleneck there. In fact, there is nothing stopping the same chroma data being used for different speeds - it would just require some reorganizing of how I store and get the examples. The same is not true for oore data, in which sequence length changes with speed, due to the differing numbers of tokens needed to represent time shifts of different lengths.

In [None]:
# Generate note bin (NoteTuple) examples and store them as JSON.

# Training data: one file per playback speed. Speed 1 is included because the
# chroma cell that follows reads nb_220_train1.json as well as the 0.9 and 1.1
# files.
for speed in [0.9, 1, 1.1]:
    X = files2note_bin_examples(data_path, filenames_train, skip=1, n_notes=220, speed=speed)
    with open(f'training_data/note_bin_v2/nb_220_train{speed}.json', 'w') as f:
        json.dump(X, f)

# Validation data: built from the validation partition (not the training
# files) at the original speed.
X_val = files2note_bin_examples(data_path, filenames_val, skip=1, n_notes=220, speed=1)
with open('training_data/note_bin_v2/nb_220_val.json', 'w') as f:
    json.dump(X_val, f)

In [None]:
# Compute chroma for the stored note bin examples, once per chroma mode, and
# write each result next to the note bin file it was derived from.

# NOTE(review): this rebinds data_path, which the generation cells above use
# for the raw MAESTRO directory. Re-run the metadata cell before re-running
# those cells.
data_path = 'training_data/note_bin_v2/'

# chroma modes to make data for
chroma_modes = ('weighted', 'normal', 'lowest')

for speed in [0.9, 1, 1.1]:
    # The examples do not depend on the chroma mode, so load them once per
    # speed instead of re-reading the same JSON file for every mode.
    with open(data_path + f'nb_220_train{speed}.json', 'r') as f:
        examples = json.load(f)
    print(len(examples[0]))
    for mode in chroma_modes:
        # A fresh array is built for every call so each mode sees untouched input.
        chroma = nb_data2chroma(np.array(examples),  mode=mode)
        with open(data_path + f'nb_220_train{speed}_chroma{mode}.json', 'w') as f:
            json.dump(chroma.tolist(), f)

# Validation examples are likewise loaded once and reused for every mode.
with open(data_path + 'nb_220_val.json', 'r') as f:
    val = json.load(f)

for mode in chroma_modes:
    chroma_val = nb_data2chroma(np.array(val),  mode=mode)
    # NOTE(review): validation files are named '..._val_{mode}.json' whereas the
    # training files use '..._chroma{mode}.json'; kept as-is because downstream
    # loaders may rely on these names.
    with open(data_path + f'nb_220_val_{mode}.json', 'w') as f:
        json.dump(chroma_val.tolist(), f)