# Music Generation Using an LSTM Neural Network

This notebook walks through the process of building a music generation model based on LSTM layers using Keras.

## Imports

In [66]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import music21 as m21
import numpy as np
import glob

## Preprocessing song dataset

I'm using a collection of piano melodies in MIDI format, mostly from Final Fantasy soundtracks.

### Loading

In [33]:
# Directory that is searched for the '*.mid' training files. The trailing
# slash matters: the glob pattern is built by plain string concatenation.
DATA_PATH = '../data/'

The code below parses all MIDI files and builds a list of all the notes and chords they contain.

In [34]:
# Flat list of string tokens, one per note or chord, across all MIDI files.
notes = []

# glob returns paths in arbitrary order; sorting keeps the token sequence
# (and therefore the training windows built from it) the same on every run.
for file in sorted(glob.glob(DATA_PATH + '*.mid')):
    midi = m21.converter.parse(file)

    parts = m21.instrument.partitionByInstrument(midi)
    if parts:
        # The file has instrument partitions: read the first one.
        notes_to_parse = parts.parts[0].recurse()
    else:
        # No instrument partitions: read the notes of the flattened stream.
        notes_to_parse = midi.flat.notes

    for element in notes_to_parse:
        if isinstance(element, m21.note.Note):
            # A single note is stored as the string form of its pitch.
            notes.append(str(element.pitch))
        elif isinstance(element, m21.chord.Chord):
            # A chord is stored as its normalOrder values joined by dots.
            notes.append('.'.join(str(n) for n in element.normalOrder))

### Shaping data

In [35]:
# Number of consecutive notes in each input sequence; the note that follows
# the sequence is used as its prediction target.
SEQ_LEN = 100

Get all unique pitchnames.

In [70]:
# Vocabulary: every distinct note/chord token, in sorted order.
pitchnames = list(set(notes))
pitchnames.sort()

# Vocabulary size; displayed as the cell output.
n_vocab = len(pitchnames)
n_vocab

326

Create a note_to_int dictionary to map pitches to integers.

In [52]:
# Each token's integer code is its position in the sorted vocabulary.
note_to_int = dict(zip(pitchnames, range(len(pitchnames))))

# Spot check: the token at position 52 should map back to 52.
note_to_int[pitchnames[52]]

52

Create input sequences and the corresponding outputs.

In [62]:
# Slide a window of SEQ_LEN notes over the token list. Each window (encoded
# as integers) is one input; the note right after the window is its target.
network_input = [
    [note_to_int[token] for token in notes[begin:begin + SEQ_LEN]]
    for begin in range(len(notes) - SEQ_LEN)
]
network_output = [
    note_to_int[notes[begin + SEQ_LEN]]
    for begin in range(len(notes) - SEQ_LEN)
]

Reshape the input into a format compatible with LSTM layers.

In [64]:
# Arrange the windows as (number of sequences, SEQ_LEN, 1): one feature per
# time step, matching the input_shape passed to the first LSTM layer.
network_input = np.asarray(network_input).reshape((len(network_input), SEQ_LEN, 1))

Normalize input and one-hot encode the output.

In [65]:
# NOTE: both statements overwrite their own input, so this cell must be run
# exactly once after the reshape cell; re-running it would scale the input
# again and re-encode the already one-hot targets.

# Scale the integer note codes by the vocabulary size.
network_input = network_input / float(n_vocab)

# One-hot encode the targets. num_classes is given explicitly so the width
# always equals n_vocab (the size of the model's output layer), even when the
# highest note code never occurs as a target.
network_output = tf.keras.utils.to_categorical(network_output, num_classes=n_vocab)

In [67]:
# Sanity check: expected to be (number of sequences, SEQ_LEN, 1).
network_input.shape

(45876, 100, 1)

## Model

### Neural network architecture

In [None]:
# Three stacked LSTM layers followed by a dense classifier over the vocabulary.
model = Sequential()
model.add(LSTM(
    512,
    # (time steps, features per step), taken from the prepared input array.
    input_shape=(network_input.shape[1], network_input.shape[2]),
    return_sequences=True
))
model.add(Dropout(0.3))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
# The last LSTM does not return the full sequence, so the Dense layers
# receive a single vector per sample.
model.add(LSTM(512))
model.add(Dense(256))
model.add(Dropout(0.3))
# Softmax is set on the output layer itself, so no separate Activation layer
# (which this notebook does not import) is needed.
model.add(Dense(n_vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

### Fitting

Define a checkpoint callback that saves the model whenever the training loss improves, so training can be stopped at any time without losing progress.

In [None]:
# Checkpoint file name template; the epoch number and loss value are filled
# in from the placeholders.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}-bigger.hdf5"

# ModelCheckpoint is referenced through the imported `tf` module because the
# notebook's import cell does not import it by name.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=0,
    # Write a new file only when the monitored loss reaches a new minimum.
    save_best_only=True,
    mode='min'
)
callbacks_list = [checkpoint]

Fit the model.

In [None]:
# Train on the prepared sequences; the checkpoint callback runs during training.
model.fit(
    x=network_input,
    y=network_output,
    epochs=200,
    batch_size=64,
    callbacks=callbacks_list,
)

## Generation

Set up the network model in the same way as before.

In [None]:
# Rebuild the same architecture used for training so that saved weights can
# be loaded into it.
model = Sequential()
model.add(LSTM(
    512,
    # (time steps, features per step), taken from the prepared input array.
    input_shape=(network_input.shape[1], network_input.shape[2]),
    return_sequences=True
))
model.add(Dropout(0.3))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
# The last LSTM does not return the full sequence, so the Dense layers
# receive a single vector per sample.
model.add(LSTM(512))
model.add(Dense(256))
model.add(Dropout(0.3))
# Softmax is set on the output layer itself, so no separate Activation layer
# (which this notebook does not import) is needed.
model.add(Dense(n_vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

Load the trained weights into the model.

In [None]:
# Expects a file named 'weights.hdf5' in the working directory. The
# checkpoint callback writes files named with a different pattern
# ("weights-improvement-...-bigger.hdf5"), so the chosen checkpoint has to be
# copied or renamed to this name first.
model.load_weights(filepath='weights.hdf5')

Pick a random input sequence to use as the starting point for generation.

In [None]:
# Choose a random training window to seed the generation. The notebook
# imports NumPy as `np`, and randint's upper bound is exclusive, so every
# window index is a valid draw.
start = np.random.randint(0, len(network_input))

# Rebuild the seed as a plain Python list of integer note codes. By this
# point network_input is a normalised float array: it has no .append(), and
# the generation loop divides by n_vocab itself, so using it directly would
# scale the values twice. Window `start` of network_input was built from
# notes[start:start + SEQ_LEN], so this is the same sequence.
pattern = [note_to_int[token] for token in notes[start:start + SEQ_LEN]]

# Generated note/chord tokens are collected here.
prediction_output = []

Create an `int_to_note` dictionary to map integers back to pitches.

In [None]:
# Inverse lookup: position in the sorted vocabulary -> note/chord token.
int_to_note = dict(enumerate(pitchnames))

Generate 500 notes.

In [None]:
# Generate 500 tokens one at a time: predict the next note from the current
# window, record it, then slide the window forward by one step.
# `pattern` must be a plain Python list of integer note codes (not yet
# divided by n_vocab); the scaling is applied here on every step.
for note_index in range(500):
    # Shape the window as a single-sample batch: (1, time steps, 1 feature).
    prediction_input = np.reshape(pattern, (1, len(pattern), 1))
    prediction_input = prediction_input / float(n_vocab)

    prediction = model.predict(prediction_input, verbose=0)

    # Take the most probable class; cast to a plain int so the window stays a
    # list of Python integers.
    index = int(np.argmax(prediction))
    result = int_to_note[index]
    prediction_output.append(result)

    # Append the new note and drop the oldest one to keep the window length.
    pattern.append(index)
    pattern = pattern[1:]