# Audio2Map
This is an encoder-decoder model based on seq2seq.

It takes an mp3 audio file of a song as input and outputs a fully functional map for the hit rhythm game osu!

In [1]:
import librosa
import os
import numpy as np
import matplotlib.pyplot as plt
import pickle
import tensorflow as tf
import keras
from functools import reduce

## Preprocessing

Here, we transform the input audio into a Constant-Q spectrogram with 84 semitone bins (seven octaves); with librosa's default lowest note of C1, this spans C1 to B7. Then, for training, we load the pkl file containing the vector representation of the target output map.

We also obtain the difficulty settings of the target map to feed into the decoder. When the model is deployed, these will be supplied by the user instead.

In [2]:
import datetime
def convert_to_spectrogram(filename):
	"""Load an audio file and return its Constant-Q spectrogram in decibels.

	The audio is loaded at a target sample rate of 11025 Hz, transformed
	with an 84-bin Constant-Q transform (12 bins per octave), and the
	magnitudes are converted to dB with ``ref=np.max``.

	Args:
		filename: Path to the audio file to load.

	Returns:
		The array returned by ``librosa.amplitude_to_db``, or ``None`` if
		loading or transforming the file failed.
	"""
	targetSampleRate = 11025
	try:
		y, _ = librosa.load(filename, sr=targetSampleRate)
		C = np.abs(librosa.cqt(y, sr=targetSampleRate, n_bins=84, bins_per_octave=12))
		S = librosa.amplitude_to_db(C, ref=np.max)
	except Exception:
		# Catch Exception instead of using a bare except so Ctrl-C
		# (KeyboardInterrupt) and SystemExit still stop a long preprocessing run.
		# NOTE(review): the message says "Removed file", but nothing is deleted here.
		tsprint("ERROR: cannot convert to spectrogram. Removed file " + filename + ".")
		return None

	# To inspect the spectrogram visually:
	#   plt.figure(figsize=(12, 4))
	#   librosa.display.specshow(S, sr=targetSampleRate, x_axis='time', y_axis='cqt_note')
	#   plt.colorbar(format='%+2.0f dB')
	#   plt.title('Constant-Q power spectrogram')
	#   plt.tight_layout()
	#   plt.show()
	return S

def get_pkl(filename):
	"""Load and return the object stored in a pickle file.

	Args:
		filename: Path to the ``.pkl`` file.

	Returns:
		The unpickled object, or ``-1`` if the file could not be opened or
		unpickled.
	"""
	try:
		# The context manager closes the handle even if unpickling fails.
		with open(filename, 'rb') as pkl_file:
			# NOTE: unpickling can execute arbitrary code; only load trusted files.
			return pickle.load(pkl_file)
	except FileNotFoundError:
		tsprint("ERROR: .pkl file does not exist.")
	except Exception:
		# Unpickling can raise many exception types (EOFError, AttributeError,
		# ImportError, ...); report them separately from a missing file.
		tsprint("ERROR: .pkl file could not be loaded.")
	return -1

def tsprint(s):
	"""Print the string ``s`` prefixed with the current local time in brackets.

	The prefix has the form ``[YYYY-MM-DD HH:MM:SS] ``.
	"""
	stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	prefix = f"[{stamp}] "
	# Plain concatenation keeps the original requirement that s is a str.
	print(prefix + s)

   
def parse_difficulty(filename):
	"""Read the six difficulty settings from an osu! beatmap (``.osu``) file.

	Every line of the file is scanned for the six ``Key:value`` settings, so
	the settings are found regardless of what precedes them in the file.

	Args:
		filename: Path to the ``.osu`` file, e.g. ``"maps/<name>.osu"``.

	Returns:
		A list of floats ordered as ``[HPDrainRate, CircleSize,
		OverallDifficulty, ApproachRate, SliderMultiplier, SliderTickRate]``,
		or ``-1`` if the file is missing, unreadable, or lacks a usable value
		for any of the six settings.

	Side effects:
		If the map file does not exist, the pickle with the same base name
		under ``pickles/`` is deleted when present. If the map exists but is
		missing settings, the map file itself is deleted.
	"""
	if not os.path.isfile(filename):
		tsprint("ERROR: map file does not exist. Removing.")
		stem = os.path.basename(filename).split(".")[0]
		try:
			os.remove("pickles/" + stem + ".pkl")
		except FileNotFoundError:
			# Nothing to clean up; the pickle is already gone.
			pass
		return -1

	try:
		with open(filename, "r") as f:
			lines = f.readlines()
	except (OSError, UnicodeError):
		# Without this early return, `lines` would be undefined below.
		tsprint("ERROR: cannot read lines of .osu file.")
		return -1

	stat_names = (
		"HPDrainRate",
		"CircleSize",
		"OverallDifficulty",
		"ApproachRate",
		"SliderMultiplier",
		"SliderTickRate",
	)
	slot_of = {name: slot for slot, name in enumerate(stat_names)}
	# -1 marks a setting that has not been found yet.
	difficulty = [-1] * len(stat_names)

	for line in lines:
		key, colon, raw_value = line.partition(":")
		slot = slot_of.get(key.strip())
		if slot is None or not colon:
			# Unrelated lines are skipped instead of ending the scan.
			continue
		try:
			difficulty[slot] = float(raw_value)
		except ValueError:
			# A malformed value leaves the slot at -1, so the map is rejected below.
			continue

	#check if all the difficulty stats are there
	if -1 in difficulty:
		tsprint("ERROR: Not a valid osu! map due to insufficient stats. Removed file " + filename + ".")
		os.remove(filename)
		return -1

	return difficulty

def load_data():
	"""Build the (inputs, diffs, targets) dataset from the files under ``pickles/``.

	For each pickle file found by walking ``pickles``, this appends:
	  * the spectrogram of ``audio/<prefix before the first "_">.mp3``,
	  * the difficulty parsed from ``maps/<name before the first ".">.osu``,
	  * the object loaded from ``pickles/<pickle file>``.

	Failed steps append the failure value of the helper (``None`` or ``-1``)
	so the three lists stay the same length.

	Progress is checkpointed to ``loaded_save.pkl`` every 100 files and once
	more at the end. If that checkpoint exists on entry, the first
	``len(inputs)`` files of the walk are skipped.

	Returns:
		A tuple ``(inputs, diffs, targets)`` of three lists.
	"""
	checkpoint_path = "loaded_save.pkl"
	inputs = []
	diffs = []
	targets = []

	def save_checkpoint():
		# Close the handle explicitly so the checkpoint is flushed to disk.
		with open(checkpoint_path, 'wb') as checkpoint:
			pickle.dump([inputs, diffs, targets], checkpoint)
		tsprint("Saved progress.")

	if os.path.isfile(checkpoint_path):
		with open(checkpoint_path, 'rb') as checkpoint:
			inputs, diffs, targets = pickle.load(checkpoint)
	curr_length = len(inputs)

	counter = 0
	unsaved = 0
	# NOTE(review): resuming by count assumes os.walk yields the files in the
	# same order as in the run that wrote the checkpoint.
	for pickle_root, pickle_dirs, pickle_files in os.walk("pickles"):
		for pickle_file in pickle_files:
			counter += 1
			# "<=" (not "<"): the first curr_length files are already cached,
			# so the curr_length-th file must be skipped too, not parsed twice.
			if counter <= curr_length:
				continue

			tsprint("Parsing file " + pickle_file)
			inputs.append(convert_to_spectrogram(os.path.join("audio/", pickle_file.split("_")[0] + ".mp3")))
			diffs.append(parse_difficulty("maps/" + pickle_file.split(".")[0] + ".osu"))
			targets.append(get_pkl("pickles/" + pickle_file))
			unsaved += 1

			if counter % 100 == 0:
				save_checkpoint()
				unsaved = 0
				tsprint("Parsed " + str(counter) + " files.")

	# Persist the files parsed since the last multiple-of-100 checkpoint.
	if unsaved:
		save_checkpoint()

	return inputs, diffs, targets

In [3]:

# Load the cached [inputs, diffs, targets] lists from the checkpoint file.
# The context manager closes the file handle once loading is done.
# NOTE: unpickling can execute arbitrary code; only load a trusted file.
with open("loaded_save.pkl", 'rb') as saved_data:
    inputs, diffs, targets = pickle.load(saved_data)


In [6]:
# Transpose every spectrogram that was produced and convert every
# non-int difficulty entry to a tensor. Entries marking a failed sample
# (None spectrogram, int difficulty) are left as they are.
for idx, spectrogram in enumerate(inputs):
	if spectrogram is not None:
		inputs[idx] = spectrogram.T
	difficulty = diffs[idx]
	if not isinstance(difficulty, int):
		diffs[idx] = tf.convert_to_tensor(difficulty)

In [7]:
from sklearn.model_selection import train_test_split
import copy
# Now get the data >:D
#inputs, diffs, targets = load_data()

# Drop samples whose preprocessing failed (None spectrogram, int difficulty
# marker, or None/int target) BEFORE splitting. Filtering only the decoder
# targets afterwards would leave them shorter than, and misaligned with,
# the encoder inputs and difficulties they are trained against.
valid_samples = [
    (x, d, y)
    for x, d, y in zip(inputs, diffs, targets)
    if x is not None
    and not isinstance(d, int)
    and y is not None
    and not isinstance(y, int)
]
clean_inputs = [sample[0] for sample in valid_samples]
clean_diffs = [sample[1] for sample in valid_samples]
clean_targets = [sample[2] for sample in valid_samples]

# 90% / 10% train/test split, then 10% of the training part held out for validation.
train_x, test_x, train_diffs, test_diffs, train_y, test_y = train_test_split(clean_inputs, clean_diffs, clean_targets, test_size=0.1)
train_x, val_x, train_diffs, val_diffs, train_y, val_y = train_test_split(train_x, train_diffs, train_y, test_size=0.1)

decoder_inputs = train_y

# One target per decoder input: element [0] of the sample with its last
# column removed.
# NOTE(review): this slice starts at column 0, so it drops the final column
# rather than shifting by one step -- confirm this is the intended target.
decoder_targets = [
    tf.sparse.slice(
        sample[0],
        [0, 0],
        [sample[0].dense_shape[0], sample[0].dense_shape[1] - 1],
    )
    for sample in decoder_inputs
]


In [13]:
# Show which GPU device name TensorFlow reports.
gpu_device = tf.test.gpu_device_name()
print(gpu_device)




In [7]:
"""
- Given a song, we can generate a spectrogram
- Take the spectrogram and produce a list of times (rhythmic beats)
"""
# Encoder
audio_dim = 84       # features per input frame (the spectrogram uses n_bins=84)
enc_hidden_dim = 64  # LSTM units per direction
enc_input = keras.Input(shape=(None, audio_dim))

# Run a bidirectional LSTM over the input sequence. With return_state=True
# the call returns the output followed by the forward and backward (h, c)
# states; only the states are kept.
bidirectional_lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(enc_hidden_dim, return_state=True, dropout=0.5)
)
_, forward_h, forward_c, backward_h, backward_c = bidirectional_lstm(enc_input)

# Join both directions into one hidden state and one cell state.
state_h = keras.layers.Concatenate()([forward_h, backward_h])
state_c = keras.layers.Concatenate()([forward_c, backward_c])

encoder = keras.Model(enc_input, outputs=[state_h, state_c], name='encoder')
encoder.summary()

In [8]:
# Decoder

# The encoder concatenates forward and backward states, so the decoder
# state is twice the per-direction encoder size.
dec_hidden_dim = enc_hidden_dim * 2

# Stand-alone decoder inputs: the initial hidden/cell state and the
# sequence fed to the LSTM (audio_dim features plus 6 more per step).
decoder_input_h = keras.Input(shape=(dec_hidden_dim,), name='decoder_input_h')
decoder_input_c = keras.Input(shape=(dec_hidden_dim,), name='decoder_input_c')
decoder_input = keras.Input(
    shape=(None, audio_dim+6),
    name='decoder_input',
    sparse=True,
)
decoder_initial_state = [decoder_input_h, decoder_input_c]

# The LSTM returns the full output sequence plus its final (h, c) state.
decoder_lstm = tf.keras.layers.LSTM(
    dec_hidden_dim,
    return_sequences=True,
    return_state=True,
    dropout=0.5,
    name='decoder_lstm',
)
decoder_lstm_outputs, state_h, state_c = decoder_lstm(
    decoder_input, initial_state=decoder_initial_state
)

# Project every LSTM output step to 8 values.
decoder_dense = keras.layers.Dense(8)
decoder_outputs = decoder_dense(decoder_lstm_outputs)

decoder = keras.Model(
    [decoder_input] + decoder_initial_state,
    outputs=[decoder_outputs, state_h, state_c],
    name='decoder',
)
decoder.summary()

In [9]:
# Symbolic inputs of the combined training model.
encoder_input_x = keras.Input(
    shape=(None, audio_dim),
    name='encoder_input_x',
)
decoder_input_x = keras.Input(
    shape=(None, audio_dim),
    name='decoder_input_x',
    sparse=True,
)
# output difficulty target
output_diff = keras.Input(
    shape=(None, 6),
    name='output_diff',
)

class Dumb_Layer(keras.layers.Layer):
    """Keras layer that repeats its input along axis 1.

    The repeat count is the input's own axis-1 length, so an input whose
    axis 1 has length ``T`` comes out with length ``T * T`` on that axis.
    """

    def __init__(self):
        super().__init__()
        # Multiples are [1, T, 1]: only axis 1 is tiled.
        # NOTE(review): an input with T == 1 passes through unchanged --
        # confirm this is the intended expansion.
        self.tile_layer = keras.layers.Lambda(
            lambda x: tf.tile(x, [1, tf.shape(x)[1], 1])
        )

    def call(self, bruh):
        return self.tile_layer(bruh)

# Tile the difficulty input and append it to the decoder input features.
expandeddiff = Dumb_Layer()(output_diff)
concat_decoder_input = keras.layers.Concatenate()([decoder_input_x, expandeddiff])

# Reuse the encoder model and the decoder's LSTM/Dense layers so the
# training model shares weights with the stand-alone encoder and decoder.
encoder_states = encoder(encoder_input_x)
decoder_lstm_out = decoder_lstm(concat_decoder_input, initial_state=encoder_states)[0]
decoder_pred = decoder_dense(decoder_lstm_out)

audio2map = keras.Model(
    inputs=[encoder_input_x, decoder_input_x, output_diff],
    outputs=decoder_pred,
    name='audio2map',
)
audio2map.summary()




## Training Time!

In [12]:
# 'loss' is not a metric identifier, so it is not listed under metrics;
# Keras records the loss in history.history['loss'] without it.
audio2map.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
# Show one difficulty sample as a sanity check before training.
print(train_diffs[0])
history = audio2map.fit([train_x, decoder_inputs, train_diffs], decoder_targets, epochs=10, batch_size=32)
audio2map.save('audio2map.h5') # This may not work well, but just in case we can

[8.0, 4.0, 8.0, 5.0, 1.5, 2.0]


In [None]:
# Plot the training loss recorded for each epoch.
train_loss = history.history['loss']
loss_epochs = np.arange(len(train_loss))
plt.plot(loss_epochs, train_loss, 'b', label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Plot the training accuracy recorded for each epoch.
train_acc = history.history['accuracy']
acc_epochs = np.arange(len(train_acc))
plt.plot(acc_epochs, train_acc, 'b', label='Training Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

## Make some predictions :O

Do some fine tuning for the model as necessary

In [None]:
def decode_audio(audio):
	"""Encode ``audio`` and run the decoder once per ``audio.shape[0]``.

	The encoder's predicted states seed the decoder. Each step feeds the
	same all-zero array of shape (1, 1, 84) plus the current states to
	``decoder.predict``, stores the output, and carries the returned
	states into the next step.

	Args:
		audio: Array passed directly to ``encoder.predict``.

	Returns:
		A list holding the decoder output of every step.
	"""
	# NOTE(review): the decoder model is declared with audio_dim+6 input
	# features while this input has 84 -- confirm the expected width.
	# NOTE(review): the decoder input is never updated from the previous
	# output, and `audio` gets no batch axis added -- confirm both.
	current_states = encoder.predict(audio)
	zero_step = np.zeros((1, 1, 84))
	step_count = audio.shape[0]

	decoded_map = []
	for _ in range(step_count):
		step_output, hidden, cell = decoder.predict([zero_step] + current_states)
		decoded_map.append(step_output)
		current_states = [hidden, cell]

	return decoded_map

In [None]:
# Decode the first ten test samples and print each next to its ground
# truth, followed by the mean squared error between the two.
for sample_idx in range(10):
	decoded_map = decode_audio(test_x[sample_idx])
	print("Actual: ")
	print(test_y[sample_idx])
	print("Predicted: ")
	print(decoded_map)
	sample_mse = keras.losses.MSE(test_y[sample_idx], decoded_map)
	print(sample_mse)

## Evaluate that bish B)

In [None]:
# Build the decoder targets: along axis 1, position t holds the target
# sequence's value at t + 1, and the last position stays zero.
# The values must come from `targets`; copying the zero array onto itself
# would train against all-zero targets.
# NOTE(review): np.asarray assumes `targets` can be stacked into one
# rectangular numeric array -- confirm against the pickled data.
target_array = np.asarray(targets)
decoder_targets = np.zeros(target_array.shape)
decoder_targets[:, 0:-1] = target_array[:, 1:]

# 'loss' is not a metric identifier; Keras records it in history.history['loss'] anyway.
# NOTE(review): confirm 'precision', 'recall' and 'f1' resolve to metrics in
# the installed Keras version and suit this regression-style output.
audio2map.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy', 'precision', 'recall', 'f1'])
history = audio2map.fit([inputs, targets, diffs], decoder_targets, epochs=10, batch_size=32, validation_split=0.15)
audio2map.save('audio2map_full.h5') # This may not work well, but just in case we can

In [None]:
# Print each recorded metric series from the last fit, one label per metric.
metric_labels = (
    ("Final Loss: ", 'loss'),
    ("Final Accuracy: ", 'accuracy'),
    ("Final Precision: ", 'precision'),
    ("Final Recall: ", 'recall'),
    ("Final F1: ", 'f1'),
)
for label, metric_key in metric_labels:
    print(label)
    print(history.history[metric_key])