# Music Generation
This notebook generates music (notes/chords and their durations) from the trained models.

In [1]:
# import the necessary libraries
import pickle
import numpy as np
from music21 import instrument, note, stream, chord
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import BatchNormalization as BatchNorm
from tensorflow.keras.layers import Activation, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [2]:
# import the original dataset first
#
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell against asset files you created yourself or otherwise trust.


# import note stuff

# notes: iterable of note/chord tokens; later used to rebuild the int -> note table
with open('../assets_notes_times/notes', 'rb') as filepath:
    notes = pickle.load(filepath)

# note_to_int: token -> integer mapping used to encode sequences; its length is the notes vocab size
with open('../assets_notes_times/note_to_int', 'rb') as filepath:
    note_to_int = pickle.load(filepath)
    
# song_notes: per-song token sequences of the training set (passed to prediction_input as training data)
with open('../assets_notes_times/song_notes', 'rb') as filepath:
    song_notes = pickle.load(filepath)
    

# import time stuff

# time: iterable of duration tokens; later used to rebuild the int -> duration table
# NOTE(review): this name would shadow the stdlib `time` module if it were ever imported here
with open('../assets_notes_times/time', 'rb') as filepath:
    time = pickle.load(filepath)

# time_to_int: duration token -> integer mapping; its length is the duration vocab size
with open('../assets_notes_times/time_to_int', 'rb') as filepath:
    time_to_int = pickle.load(filepath)
    
# song_times: per-song duration sequences of the training set
with open('../assets_notes_times/song_times', 'rb') as filepath:
    song_times = pickle.load(filepath)

In [3]:
# load the song_notes and song_times for test dataset
# These are encoded with the training mappings (note_to_int / time_to_int) further down;
# tokens missing from those mappings fall back to index 0 in prediction_input.
# NOTE(review): same pickle caveat as above - only load trusted files.
with open('../assets_test_songs/song_notes', 'rb') as filepath:
    test_song_notes = pickle.load(filepath)
        
with open('../assets_test_songs/song_times', 'rb') as filepath:
    test_song_times = pickle.load(filepath)

## Prepare Prediction Input

Create 2 different inputs using the test song data
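- **network_input**: the list of 100-note sequences built from the test songs; one of them is picked at random as the seed for prediction
- **model_input**: the 100-note sequences built from the training songs, reshaped for the LSTM layers and passed along when recreating the trained model architecture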

In [4]:
def _encode_windows(songs, something_to_dict, sequence_length):
    """Return every sliding window of each song, encoded as a list of integers.

    For a song of length n this yields n - sequence_length windows (none when the
    song is not longer than sequence_length). The final possible window is skipped
    on purpose: presumably this mirrors training, where the element following each
    window was the prediction target.
    """
    windows = []
    for song in songs:
        for i in range(len(song) - sequence_length):
            window = song[i:i + sequence_length]
            # .get(token, 0): tokens missing from the mapping fall back to index 0,
            # which this notebook reserves for the 'unkw' placeholder.
            windows.append([something_to_dict.get(token, 0) for token in window])
    return windows


def prediction_input(some_notes_train, some_notes_test, something_to_dict, sequence_length=100):
    """Build the seed sequences for generation and the training-shaped model input.

    Parameters
    ----------
    some_notes_train : list of sequences
        song_notes / song_times of the training songs.
    some_notes_test : list of sequences
        song_notes / song_times of the test songs.
    something_to_dict : dict
        note_to_int / time_to_int mapping from token to integer.
    sequence_length : int, optional
        Window length; defaults to 100, the value used throughout this notebook.

    Returns
    -------
    tuple
        (network_input, model_input): network_input is a plain list of encoded
        test windows (lists of ints); model_input is a numpy array of encoded
        training windows with shape (n_patterns, sequence_length, 1).
    """
    network_input = _encode_windows(some_notes_test, something_to_dict, sequence_length)
    model_input = _encode_windows(some_notes_train, something_to_dict, sequence_length)

    # reshape the training windows into (patterns, timesteps, 1)
    n_patterns = len(model_input)
    model_input = np.reshape(model_input, (n_patterns, sequence_length, 1))

    return (network_input, model_input)

In [5]:
# prepare note inputs
# network_input_notes: encoded windows from the test songs (seed candidates for generation)
# model_input_notes: encoded windows from the training songs, reshaped to (n, 100, 1)
network_input_notes, model_input_notes = prediction_input(song_notes, test_song_notes, note_to_int)

In [6]:
# prepare duration inputs
# same construction as for the notes, but encoded with the duration mapping (time_to_int)
network_input_times, model_input_times = prediction_input(song_times, test_song_times, time_to_int)

In [7]:
# sanity check: the note and duration inputs are expected to have matching lengths,
# because the same `start` index is later used to seed both generators
print(len(network_input_notes))
print(len(model_input_notes))
print(len(network_input_times))
print(len(model_input_times))

84424
53167
84424
53167


## Recreating the Model Architecture
This will allow us to load the trained model weights and get the predictions.

In [8]:
def create_network(model_input, vocab):
    """Rebuild the network architecture so that saved weights can be loaded into it.

    Parameters
    ----------
    model_input
        Accepted for interface compatibility; it is not read by this function.
    vocab : int
        Vocabulary size (n_vocab / t_vocab). Used as the embedding input
        dimension and as the width of the final softmax layer.

    Returns
    -------
    A compiled Sequential model: embedding -> three bidirectional LSTMs ->
    batch-norm/dropout -> dense(256, relu) -> batch-norm/dropout -> softmax.
    """
    model = Sequential()

    # layers listed in the exact order they are stacked
    stack = [
        Embedding(vocab, 512, input_length=100),
        Bidirectional(LSTM(512, recurrent_dropout=0, return_sequences=True)),
        Bidirectional(LSTM(512, return_sequences=True, recurrent_dropout=0)),
        Bidirectional(LSTM(512)),
        BatchNorm(),
        Dropout(0.3),
        Dense(256),
        Activation('relu'),
        BatchNorm(),
        Dropout(0.3),
        Dense(vocab),
        Activation('softmax'),
    ]
    for layer in stack:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['acc'],
    )
    return model

In [9]:
# notes vocab
# number of entries in the token -> int mapping; create_network uses it as the
# embedding input size and as the width of the softmax output layer
n_vocab = len(note_to_int)
print(n_vocab)

# duration vocab
# same idea for the duration model
t_vocab = len(time_to_int)
print(t_vocab)

383
174


In [10]:
# recreate notes model
# the architecture is rebuilt with untrained weights here; trained weights are loaded in a later cell
model_notes = create_network(model_input_notes, n_vocab)
model_notes.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 512)          196096    
                                                                 
 bidirectional (Bidirectiona  (None, 100, 1024)        4198400   
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 1024)        6295552   
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 1024)             6295552   
 nal)                                                            
                                                                 
 batch_normalization (BatchN  (None, 1024)             4096      
 ormalization)                                          

In [43]:
# load note weights
# Exactly one load_weights line should be active. The commented-out lines are earlier
# checkpoints, kept so the output at different training stages can be compared
# (accuracy figures are the author's annotations).
# model_notes.load_weights('../weights/notes/weights-09-1.8850.hdf5') # 50% accuracy
# model_notes.load_weights('../weights/notes/weights-15-0.6089.hdf5') # 80% accuracy
# model_notes.load_weights('../weights/notes/weights-19-0.3403.hdf5') # 90% accuracy
model_notes.load_weights('../weights/notes/weights-58-0.1027.hdf5') # 97% accuracy

In [12]:
# recreate duration model
# same architecture as the notes model, but sized for the duration vocabulary (t_vocab)
model_time = create_network(model_input_times, t_vocab)
model_time.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 512)          89088     
                                                                 
 bidirectional_3 (Bidirectio  (None, 100, 1024)        4198400   
 nal)                                                            
                                                                 
 bidirectional_4 (Bidirectio  (None, 100, 1024)        6295552   
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 1024)             6295552   
 nal)                                                            
                                                                 
 batch_normalization_2 (Batc  (None, 1024)             4096      
 hNormalization)                                      

In [44]:
# load duration weights
# Exactly one load_weights line should be active. The commented-out lines are earlier
# checkpoints, kept so the output at different training stages can be compared
# (accuracy figures are the author's annotations).
# model_time.load_weights('../weights/times/weights-01-2.2547.hdf5') # 50% accuracy
# model_time.load_weights('../weights/times/weights-21-0.6097.hdf5') # 80% accuracy
# model_time.load_weights('../weights/times/weights-28-0.3240.hdf5') # 90% accuracy
model_time.load_weights('../weights/times/weights-52-0.0870.hdf5') # 97% accuracy

## Generate Notes
We will generate 100 notes/chords and durations. To start us off, we randomly select one of our 100-note inputs from the test set.

In [14]:
# Build the int -> token lookup tables used to decode model predictions.
# Index 0 is the 'unkw' placeholder; the real tokens follow in sorted order.
pitchnames = ['unkw'] + sorted(set(notes))
int_to_note = dict(enumerate(pitchnames))

timenames = ['unkw'] + sorted(set(time))
int_to_time = dict(enumerate(timenames))

The first note generation technique involves generating a note/chord or duration and feeding this generated value back into the input sequence, which is then used to predict the next one. The process is repeated once for every note in the desired song length.

I find that this works best for note/chord generation.

In [15]:
def generation(model, network_input, int_to_something, start, song_length):
    """Generate tokens autoregressively, feeding each prediction back into the input.

    Parameters
    ----------
    model
        Object exposing ``predict(batch, verbose=0)`` that returns one score per
        vocabulary index.
    network_input : list
        Encoded seed windows (each a list of ints); ``network_input[start]``
        is used as the initial window.
    int_to_something : dict
        int_to_note / int_to_time mapping used to decode predicted indices.
    start : int
        Index of the seed window inside ``network_input``.
    song_length : int
        Number of tokens to generate.

    Returns
    -------
    list
        The ``song_length`` decoded tokens, in generation order.
    """
    # Work on a copy: appending predictions to the seed must not modify
    # network_input[start], otherwise a re-run with the same start would see a
    # corrupted (too long) seed window.
    pattern = list(network_input[start])
    prediction_output = []

    for _ in range(song_length):
        # shape the current window as a single-sample batch: (1, timesteps, 1)
        model_batch = np.reshape(pattern, (1, len(pattern), 1))
        prediction = model.predict(model_batch, verbose=0)

        # index of the highest-scoring vocabulary entry, decoded back to its token
        index = int(np.argmax(prediction))
        prediction_output.append(int_to_something[index])

        # slide the window: add the prediction at the end, drop the oldest element
        pattern.append(index)
        pattern = pattern[1:]

    return prediction_output

The second note generation technique involves not inserting the generated notes back into the input, but rather shifting the input window along the original song by one note with each cycle.

I find this works best for duration generation.

In [16]:
def generation_2(model, network_input, int_to_something, start, song_length):
    """Generate tokens by predicting from consecutive existing input windows.

    Unlike ``generation``, predictions are never fed back: step ``k`` predicts
    from ``network_input[start + k]``.

    Parameters
    ----------
    model
        Object exposing ``predict(batch, verbose=0)`` that returns one score per
        vocabulary index.
    network_input : list
        Encoded input windows (each a list of ints).
    int_to_something : dict
        int_to_note / int_to_time mapping used to decode predicted indices.
    start : int
        Index of the first window to predict from.
    song_length : int
        Number of tokens to generate; windows ``start`` .. ``start + song_length - 1``
        are read.

    Returns
    -------
    list
        The ``song_length`` decoded tokens, in generation order.

    Raises
    ------
    IndexError
        If ``start + song_length - 1`` is past the end of ``network_input``.
    """
    # NOTE(review): network_input concatenates windows from several songs, so
    # consecutive indices presumably cross song boundaries now and then - confirm
    # this is acceptable for the intended use.
    prediction_output = []

    for step in range(song_length):
        # Look the window up at the start of each step, so that no window beyond
        # the last one actually needed is ever requested.
        pattern = network_input[start + step]

        # shape the window as a single-sample batch: (1, timesteps, 1)
        model_batch = np.reshape(pattern, (1, len(pattern), 1))
        prediction = model.predict(model_batch, verbose=0)

        # index of the highest-scoring vocabulary entry, decoded back to its token
        index = int(np.argmax(prediction))
        prediction_output.append(int_to_something[index])

    return prediction_output

In [45]:
# get the start
# NOTE(review): no random seed is set, so every run picks a different seed window
# and therefore produces a different song.
start = np.random.randint(0, len(network_input_notes)) # both notes and time input have same length

# set song_length
song_length = 100

# get the predicted notes
prediction_notes = generation(model_notes, network_input_notes, int_to_note, start, song_length)

# get the predicted duration
# NOTE(review): the accompanying text recommends generation_2 for durations, but
# generation is called here - confirm which one is intended.
prediction_times = generation(model_time, network_input_times, int_to_time, start, song_length)

## Generate MIDI
Finally, we can generate the MIDI file! We will combine the predicted notes together with the durations.

In [18]:
# create function to turn the string durations back into float
def convert_to_float(frac_str):
    try:
        return float(frac_str)
    except ValueError:
        num, denom = frac_str.split('/')
        try:
            leading, num = num.split(' ')
            whole = float(leading)
        except ValueError:
            whole = 0
        frac = float(num) / float(denom)
        return whole - frac if whole < 0 else whole + frac

In [46]:
def create_midi(prediction_notes, prediction_times, output_path='../midi_generate_classical/97%.mid'):
    """Combine predicted notes and durations into a stream and write it as a MIDI file.

    Parameters
    ----------
    prediction_notes : list of str
        Predicted tokens. A token containing '.' or consisting only of digits is
        treated as a chord (dot-separated integers), a token containing 'rest'
        as a rest, anything else as a single note name.
    prediction_times : list of str
        Predicted duration strings, one per entry of ``prediction_notes``;
        converted to quarter lengths with ``convert_to_float``.
    output_path : str, optional
        Destination of the MIDI file. Defaults to the path this notebook has
        always written to; pass another path to keep several generations.
    """
    output_notes = []

    # create note, chord and rest objects based on the values generated by the models
    for position, pattern in enumerate(prediction_notes):
        # the duration at the same position belongs to this note
        quarter_length = convert_to_float(prediction_times[position])

        if ('.' in pattern) or pattern.isdigit():
            # pattern is a chord: build one Note per dot-separated integer
            chord_notes = []
            for current_note in pattern.split('.'):
                new_note = note.Note(int(current_note))
                new_note.storedInstrument = instrument.Piano()
                chord_notes.append(new_note)
            output_notes.append(chord.Chord(chord_notes, quarterLength=quarter_length))
        elif 'rest' in pattern:
            # pattern is a rest; the token text is not passed on, because the
            # first positional argument of Rest is not a name
            new_rest = note.Rest(quarterLength=quarter_length)
            new_rest.storedInstrument = instrument.Piano()
            output_notes.append(new_rest)
        else:
            # pattern is a single note name
            new_note = note.Note(pattern, quarterLength=quarter_length)
            new_note.storedInstrument = instrument.Piano()
            output_notes.append(new_note)

    midi_stream = stream.Stream(output_notes)
    midi_stream.write('midi', fp=output_path)

In [47]:
# write the generated song to disk as a MIDI file
create_midi(prediction_notes, prediction_times)

## Generated Music

Thanks to the weights of the model, we are able to see the progress of the music generation at various stages of the training:

### 50% accuracy

https://soundcloud.com/zhe-wei-3/50a?utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing

As can be heard, the generated music is very repetitive, with only slight variations in the notes and no variation in the durations. This can be expected as it is still very early in the training.

### 80% accuracy

https://soundcloud.com/zhe-wei-3/80a?utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing

This generated music is slightly better, with more variation in the notes and durations, but still prone to repetitive patterns.

### 90% accuracy

https://soundcloud.com/zhe-wei-3/90a?utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing

This song starts out strong, with some interesting ideas at the start. Much more melodic and complex than the last 2, it nevertheless falls back to some repetition towards the end.

### 97% accuracy

https://soundcloud.com/zhe-wei-3/97a?utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing

This is the most musical piece out of the 4. It has motifs, with the repeated patterns gradually changing from one to another, showing a high degree of musical literacy.

### Best Songs

This is a collection of some of the best and most interesting generations from the model. I used the **generation** function for the notes/chords, and the **generation_2** function for the durations. Enjoy!

#### Emotional
https://soundcloud.com/zhe-wei-3/emotional?utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing

#### Fast
https://soundcloud.com/zhe-wei-3/fast?utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing

#### Happy
https://soundcloud.com/zhe-wei-3/happy?utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing

#### Melodic
https://soundcloud.com/zhe-wei-3/melodic?utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing

#### Moody
https://soundcloud.com/zhe-wei-3/moody?utm_source=clipboard&utm_medium=text&utm_campaign=social_sharing