# Metric learning for MIR coding demo (2)

# Training

## Enabling and testing the GPU

First, you'll need to enable GPUs for the notebook:

- Navigate to **Edit → Notebook settings**
- Select **GPU** from the **Hardware accelerator** drop-down

Next, we'll confirm that we can connect to the GPU with TensorFlow:

> Source: https://colab.research.google.com/notebooks/gpu.ipynb

In [None]:
# Colab line magic: select the TensorFlow 2.x runtime before TensorFlow is imported.
%tensorflow_version 2.x
import tensorflow as tf
# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()
# Abort early unless TensorFlow reports the first GPU.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
print(f'TensorFlow version: {tf.__version__}')

Found GPU at: /device:GPU:0
TensorFlow version: 2.3.0


## Preparing the dataset

In [None]:
# Install a Google Drive downloading tool
!pip install gdown

# Download the dataset from Google Drive by file ID:
#   first ID  -> dim-sim_mel.tar.gz (archive of mel-spectrograms)
#   second ID -> dim-sim_all.json   (metadata loaded later as `trainset`)
# NOTE(review): newer gdown releases reportedly deprecate the `--id` flag — confirm against the installed version.
!gdown --id 1MycZ6p3Y4OPtQVQXddqbOOTi7f7Wh_8f
!gdown --id 17Yl_K84dbADoHude6v_ON6pGqsPCMPPA

# Extract mel-spectrograms
# (later cells read the .npy files from ./dim-sim_mel/, presumably created by this archive)
!tar zxf dim-sim_mel.tar.gz

Downloading...
From: https://drive.google.com/uc?id=1MycZ6p3Y4OPtQVQXddqbOOTi7f7Wh_8f
To: /content/dim-sim_mel.tar.gz
721MB [00:06, 114MB/s]
Downloading...
From: https://drive.google.com/uc?id=17Yl_K84dbADoHude6v_ON6pGqsPCMPPA
To: /content/dim-sim_all.json
3.07MB [00:00, 146MB/s]


## Importing packages

In [None]:
import json
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K

from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, CSVLogger
from tensorflow.keras.layers import (Conv1D, MaxPool1D, BatchNormalization, GlobalAvgPool1D, Dense, dot, 
                                     Activation, Input, Flatten, Lambda, Embedding, Concatenate, Layer, Reshape)
from sklearn.preprocessing import normalize

## Loading the metadata

In [None]:
# Load json metadata
def load_json(file_name):
	"""Load a JSON document from disk.

	Args:
		file_name: Path of the JSON file to read.

	Returns:
		The decoded content, i.e. whatever `json.load` produces for the file.
	"""
	# Decode explicitly as UTF-8 (the standard JSON encoding) so the result
	# does not depend on the platform's default locale encoding.
	with open(file_name, 'r', encoding='utf-8') as f:
		return json.load(f)
	
# Read the triplet metadata downloaded earlier; one entry per triplet.
trainset = load_json('dim-sim_all.json')

print('The number of training examples: {}'.format(len(trainset)))

The number of training examples: 3781


## Creating data loaders

In [None]:
# Mini-batch size, and the number of whole batches in one pass over the triplets
# (a trailing partial batch is dropped by the floor division).
batch_size = 10
steps_per_epoch = len(trainset) // batch_size

def data_loader(dataset):
	"""Endlessly yield full-sized training batches in dataset order.

	Walks the triplet IDs of `dataset` in consecutive slices of the
	module-level `batch_size`. Once fewer than `batch_size` triplets remain,
	iteration restarts from the first triplet, so a trailing partial batch is
	never produced. No shuffling is performed.

	Args:
		dataset: Mapping from triplet ID to triplet record, in the form
			consumed by `batch_triplet_loader`.

	Yields:
		The `(batch_x, batch_y)` pair returned by `batch_triplet_loader`.

	Raises:
		ValueError: On first iteration, if `dataset` holds fewer than
			`batch_size` triplets (no full batch can be formed).
	"""
	# IDs for dataset.
	triplet_ids = list(dataset.keys())
	if len(triplet_ids) < batch_size:
		# Without this guard the generator could spin forever without yielding.
		raise ValueError(
			'dataset has {} triplets, fewer than batch_size={}'.format(len(triplet_ids), batch_size))

	# Generator.
	count_triplet = 0
	while True:
		# Wrap around as soon as a full batch no longer fits.
		if count_triplet > len(triplet_ids) - batch_size:
			count_triplet = 0

		batch_ids = triplet_ids[count_triplet: count_triplet + batch_size]
		count_triplet += batch_size
		yield batch_triplet_loader(dataset, batch_ids)

def mel_normalization(mel, offset=0.20, scale=0.25):
	"""Shift and rescale mel-spectrogram values without modifying the input.

	Computes `(mel - offset) / scale` into a new array, so the caller's array
	is left untouched and normalizing the same array twice cannot silently
	double-apply the transform.

	Args:
		mel: Array-like of mel-spectrogram values.
		offset: Value subtracted from every element (default 0.20;
			presumably a dataset-level statistic — confirm).
		scale: Value every shifted element is divided by (default 0.25;
			presumably a dataset-level statistic — confirm).

	Returns:
		A new NumPy array holding the normalized values.
	"""
	return (np.asarray(mel) - offset) / scale

def _load_normalized_mel(track_id):
	"""Load one track's mel-spectrogram from `./dim-sim_mel/` and normalize it."""
	return mel_normalization(np.load('./dim-sim_mel/' + track_id + '.npy'))


def batch_triplet_loader(dataset, triplet_ids):
	"""Build one batch of model inputs and labels for the given triplets.

	Args:
		dataset: Mapping from triplet ID to a record whose `'anchor'`,
			`'positive'` and `'negative'` entries each carry a track `'id'`.
		triplet_ids: Sequence of triplet IDs to include in the batch.

	Returns:
		A `(batch_x, batch_y)` tuple. `batch_x` maps `'anchor_input'`,
		`'positive_input'` and `'negative_input'` to arrays of normalized
		mel-spectrograms, one entry per triplet. `batch_y` has one row per
		triplet with a 1 in column 0 and a 0 in column 1.
	"""
	anchor_col = []
	positive_col = []
	negative_col = []
	for triplet_id in triplet_ids:
		triplet = dataset[triplet_id]
		anchor_col.append(_load_normalized_mel(triplet['anchor']['id']))
		positive_col.append(_load_normalized_mel(triplet['positive']['id']))
		negative_col.append(_load_normalized_mel(triplet['negative']['id']))

	batch_x = {
		'anchor_input': np.array(anchor_col),
		'positive_input': np.array(positive_col),
		'negative_input': np.array(negative_col)
	}

	# Size the labels by the triplets actually loaded rather than by the
	# global batch size, so inputs and labels always have matching lengths.
	batch_y = np.zeros((len(triplet_ids), 2))
	batch_y[:, 0] = 1
	return batch_x, batch_y

## Creating a backbone model

In [None]:
# Basic block.
def basic_block(x, num_features, fp_length):
	"""Apply one Conv1D -> BatchNormalization -> ReLU -> MaxPool1D stage.

	Args:
		x: Input tensor for the stage.
		num_features: Number of convolution filters.
		fp_length: Used both as the convolution kernel size and as the
			max-pooling window size.

	Returns:
		The tensor produced by the max-pooling layer.
	"""
	stage_layers = (
		Conv1D(num_features, fp_length, padding='same', use_bias=True, kernel_initializer='he_uniform'),
		BatchNormalization(),
		Activation('relu'),
		MaxPool1D(pool_size=fp_length, padding='valid'),
	)
	out = x
	for layer in stage_layers:
		out = layer(out)
	return out

# Backbone model: four conv/pool stages followed by global average pooling.
num_frames = 130
x_in = Input(shape=(num_frames, 128))
x = x_in
# Per-stage kernel/pool length; every stage uses 64 filters.
for fp_length in (4, 4, 4, 2):
	x = basic_block(x, 64, fp_length)
x = GlobalAvgPool1D()(x)
backbone_model = Model(inputs=[x_in], outputs=[x], name='backbone')
backbone_model.summary()

Model: "backbone"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 130, 128)]        0         
_________________________________________________________________
conv1d (Conv1D)              (None, 130, 64)           32832     
_________________________________________________________________
batch_normalization (BatchNo (None, 130, 64)           256       
_________________________________________________________________
activation (Activation)      (None, 130, 64)           0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 32, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 32, 64)            16448     
_________________________________________________________________
batch_normalization_1 (Batch (None, 32, 64)            256

## Creating a triplet model

In [None]:
# Triplet model.
# One input per triplet role; the input names match the keys of the `batch_x`
# dict produced by `batch_triplet_loader`.
anchor = Input(shape = (num_frames, 128), name='anchor_input')
positive = Input(shape = (num_frames, 128), name='positive_input')
negative = Input(shape = (num_frames, 128), name='negative_input')

# The same `backbone_model` instance is applied to all three inputs, so the
# three branches share a single set of weights.
anchor_embedding = backbone_model(anchor)
positive_embedding = backbone_model(positive)
negative_embedding = backbone_model(negative)

# Cosine similarity.
# `normalize=True` makes `dot` L2-normalize both operands first, so despite the
# `dist_` prefix these values are similarities (larger = more alike), not distances.
dist_fn = Lambda(lambda x: dot(x, axes=1, normalize=True))
dist_anchor_positive = dist_fn([anchor_embedding, positive_embedding])
dist_anchor_negative = dist_fn([anchor_embedding, negative_embedding])

# Stack the two similarity scores along axis 1 (index 0: anchor-positive,
# index 1: anchor-negative) and wrap everything in the triplet model.
similarity_scores = Lambda(lambda vects: K.stack(vects, axis=1))([dist_anchor_positive, dist_anchor_negative])
tripletmodel = Model(inputs=[anchor, positive, negative], outputs=similarity_scores, name='triplet')
tripletmodel.summary()

Model: "triplet"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
anchor_input (InputLayer)       [(None, 130, 128)]   0                                            
__________________________________________________________________________________________________
positive_input (InputLayer)     [(None, 130, 128)]   0                                            
__________________________________________________________________________________________________
negative_input (InputLayer)     [(None, 130, 128)]   0                                            
__________________________________________________________________________________________________
backbone (Functional)           (None, 64)           75008       anchor_input[0][0]               
                                                                 positive_input[0][0]       

## Defining the triplet loss function

In [None]:
# Define the loss function
def triplet_hinge_loss(y_true, y_pred):
	"""Hinge loss over a pair of similarity scores with a margin of 0.1.

	Args:
		y_true: Unused; accepted only because Keras passes labels to every loss.
		y_pred: Model output whose index 0 along axis 1 is the anchor-positive
			score and index 1 the anchor-negative score.

	Returns:
		The mean of `max(0, margin + negative - positive)` over `y_pred`.
	"""
	margin = 0.1
	sim_pos, sim_neg = y_pred[:, 0], y_pred[:, 1]
	# Zero loss once the positive score beats the negative one by at least the margin.
	return K.mean(K.maximum(0., margin + sim_neg - sim_pos))

## Training!

In [None]:
# Create an optimizer.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = Adam(learning_rate=0.001)

# Compile the model with the loss
tripletmodel.compile(optimizer, loss=triplet_hinge_loss)

# Kick off the training!
# The generator never terminates, so `steps_per_epoch` defines the epoch length.
tripletmodel.fit(data_loader(trainset),
		epochs=20,
		verbose=1,
		steps_per_epoch=steps_per_epoch,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f78f3b92048>



---
# Evaluation


## Preparing input data

In [None]:
# Collect unique tracks, keeping first-seen order.
# A track that appears in several triplets is listed (and loaded) only once, so
# `track_ids` lines up one-to-one with the rows of `mels` built below.
triplet_ids = list(trainset.keys())
track_ids = list(dict.fromkeys(
	trainset[triplet_id][role]['id']
	for triplet_id in triplet_ids
	for role in ('anchor', 'positive', 'negative')
))

# Load mel.
track_id_to_mel = {}
for track_id in track_ids:
	mel = np.load('./dim-sim_mel/' + track_id + '.npy')
	# Normalize mel.
	mel = mel_normalization(mel)
	mel = np.expand_dims(mel, axis=0)
	track_id_to_mel[track_id] = mel

# Prepare input mel-spectrograms, one row per entry of `track_ids`.
mels = np.squeeze(np.array([track_id_to_mel[track_id] for track_id in track_ids]))

## Extracting embedding features

In [None]:
# Extract embedding features of the tracks
embedding_features = backbone_model.predict(mels, batch_size=64)

# Collect the embedding features.
# Row i of `embedding_features` belongs to the i-th key of `track_id_to_mel`
# (the order `mels` was assembled in). Indexing through a list that may repeat
# a track would pair rows with the wrong IDs.
assert len(embedding_features) == len(track_id_to_mel), 'expected one embedding row per unique track'
track_id_to_features = dict(zip(track_id_to_mel, embedding_features))

## Computing distances and scores (triplet prediction)

In [None]:
# Define a distance function
def euclidean_distance(x1, x2):
	"""Return the Euclidean distance between `x1` and `x2`.

	The summed squared difference is clamped to at least 1e-07 before taking
	the square root, so the result is never exactly zero.
	"""
	delta = x1 - x2
	squared_norm = np.sum(np.square(delta))
	return np.sqrt(np.maximum(squared_norm, 1e-07))

# Define an evaluation metric
def calculate_accuracy(prediction, groundtruth):
	"""Return the fraction of rows whose predicted choice matches the label.

	Args:
		prediction: Array with one row per triplet and one distance per
			candidate; the candidate with the smallest value is the prediction.
		groundtruth: Array with the same layout in which the largest value of
			each row marks the correct candidate.

	Returns:
		The accuracy as a Python float.
	"""
	y_true = np.argmax(groundtruth, axis=-1)
	y_pred = np.argmin(prediction, axis=-1)
	# Count matches in NumPy rather than iterating the array with the builtin `sum`.
	num_correct = np.count_nonzero(y_true == y_pred)
	return float(num_correct) / len(groundtruth)

# One row per triplet, two columns: anchor-positive and anchor-negative.
score_shape = (len(triplet_ids), 2)
# Placeholder for the triplet-model distances.
prediction = np.zeros(score_shape)
# Placeholder for the baseline distances.
mel_prediction = np.zeros(score_shape)
# Ground truth: column 0 is always the correct choice.
groundtruth = np.zeros(score_shape)
groundtruth[:, 0] = 1

0.9000264480296218
0.608833641893679


In [None]:
def _unit_vector(values):
	"""Flatten `values` into a single row and scale it to unit L2 norm."""
	return np.squeeze(normalize(values.reshape(1, -1), 'l2'))


def _triplet_distances(vectors, anchor_id, positive_id, negative_id):
	"""Return (anchor-positive, anchor-negative) Euclidean distances.

	`vectors` maps a track ID to its array; every array is L2-normalized first.
	The anchor is normalized once and reused for both distances.
	"""
	anchor_unit = _unit_vector(vectors[anchor_id])
	return (
		euclidean_distance(anchor_unit, _unit_vector(vectors[positive_id])),
		euclidean_distance(anchor_unit, _unit_vector(vectors[negative_id])),
	)


# Compute distances and scores
for i in range(len(triplet_ids)):
	triplet = trainset[triplet_ids[i]]
	anchor = triplet['anchor']['id']
	positive = triplet['positive']['id']
	negative = triplet['negative']['id']

	# Distances between the learned embeddings.
	prediction[i] = _triplet_distances(track_id_to_features, anchor, positive, negative)

	# mel similarity (baseline): the same measure on the flattened mel-spectrograms.
	mel_prediction[i] = _triplet_distances(track_id_to_mel, anchor, positive, negative)


accuracy = calculate_accuracy(prediction, groundtruth)
mel_accuracy = calculate_accuracy(mel_prediction, groundtruth)
print(f'Triplet model accuracy: {accuracy:.2f}')
print(f'Baseline accuracy     : {mel_accuracy:.2f}')

Triplet model accuracy: 0.90
Baseline accuracy     : 0.61
