# Import Dependencies

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import datetime
import seaborn as sn
import pandas as pd
import sklearn
import itertools
import plotly.express as px
import plotly.graph_objects as go

from PIL import Image, ImageChops
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Activation, Dropout, Flatten, Dense, Rescaling, GlobalAveragePooling2D
from keras.utils import to_categorical
from keras import backend as K
from keras.callbacks import Callback
from keras.applications.vgg16 import VGG16
from IPython.display import Audio, display


# Root folder that holds one sub-folder of spectrogram images per genre.
dataset_path = "/content/drive/MyDrive/Extended_Essay/Data_GZAN/images_original"

# Class names. The position of a genre in this list is used as its class
# index (one-hot column) when the labels are built in get_dataset().
music_genres_list = ['blues', 'classical', 'country', 'disco',
                     'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

# Work from the folder where the cached .npy arrays are saved/loaded, and
# report the name of the GPU device TensorFlow can see.
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data
tf.test.gpu_device_name()

/content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data


'/device:GPU:0'

# Prepare Image Data

In [None]:


# A manual way to load the image data: each image is converted into a NumPy
# array and the whole dataset is held directly in memory.


def trim_white_space(im):
    """Crop the uniform border around a PIL image.

    The colour of the top-left pixel is taken as the border colour. Pixels
    whose difference from that colour is at most 100 are treated as border,
    and the image is cropped to the bounding box of what remains.

    Args:
        im: PIL image to trim.

    Returns:
        The cropped image. If no bounding box is found the image is retried
        once in "RGB" mode; an RGB image with no bounding box (e.g. a
        completely uniform image) is returned unchanged.
    """
    bg = Image.new(im.mode, im.size, im.getpixel((0,0)))
    diff = ImageChops.difference(im, bg)
    # (diff + diff) / 2.0 - 100: subtract 100 so small differences clip to 0.
    diff = ImageChops.add(diff, diff, 2.0, -100)
    bbox = diff.getbbox()
    if bbox:
        return im.crop(bbox)
    if im.mode != 'RGB':
        # Failed to find the borders, convert to "RGB" and try again
        return trim_white_space(im.convert('RGB'))
    # Already RGB and still nothing to crop: recursing again would never
    # terminate, so hand the image back as-is.
    return im


def get_images(input_images_dir):
  """Load every image file of one directory into a single NumPy array.

  Each file is loaded at 288x432 (height x width), trimmed with
  trim_white_space(), scaled to the 0-1 range and stored in one row of the
  result.

  Args:
    input_images_dir: directory containing the image files of one genre.

  Returns:
    Array of shape (num_files, 218, 336, 3) with pixel values in [0, 1].
    Files are taken in sorted name order; sub-directories are ignored.
  """
  data_loader_size = (288, 432)   # (height, width) requested from keras.utils.load_img
  final_image_size = (218, 336)   # (height, width) of every stored image

  # List the directory once. Keeping regular files only matches the counting
  # done in seperate_genre_data(); sorting makes the row order deterministic.
  file_names = sorted(
      name for name in os.listdir(input_images_dir)
      if os.path.isfile(os.path.join(input_images_dir, name)))

  # (num_imgs, image_dim1, image_dim2, number of colour channels)
  dataset = np.zeros(shape=(len(file_names), final_image_size[0], final_image_size[1], 3))

  for image_index, file_name in enumerate(file_names):
    image_path = f"{input_images_dir}/{file_name}"
    print(image_path)

    # load img from directory as a PIL instance; interpolation is used when
    # the file has to be rescaled to target_size
    img = keras.utils.load_img(
        path=image_path,
        color_mode='rgb',
        target_size=data_loader_size,
        interpolation='bilinear')

    img = trim_white_space(img)  # trim the surrounding white space

    # The trimmed size depends on the image content. PIL sizes are
    # (width, height); resize anything that does not match so the assignment
    # into the pre-allocated array cannot fail.
    expected_pil_size = (final_image_size[1], final_image_size[0])
    if img.size != expected_pil_size:
      img = img.resize(expected_pil_size)

    img = keras.utils.img_to_array(img)  # PIL instance -> numpy array
    img = img.astype("float32") / 255    # scale pixel values to 0-1
    dataset[image_index] = img

  # dataset is already an ndarray; returning it directly avoids a full copy
  return dataset



def get_dataset(dataset_dir):
  """Load the images of every genre and build matching one-hot labels.

  Args:
    dataset_dir: folder containing one sub-folder per entry of
      music_genres_list.

  Returns:
    (input_data, target_labels): the images of all genres stacked in the
    order of music_genres_list, and an array of shape
    (num_images, num_genres) with a 1 in the column of each image's genre.
  """
  input_data = []
  target_labels = []
  num_classes = len(music_genres_list)

  for genre_index, music_genre in enumerate(music_genres_list):
    genre_img_data = get_images(f"{dataset_dir}/{music_genre}")  # images of this genre

    # one row per image, with a 1 in this genre's column
    ground_truths = np.zeros(shape=(len(genre_img_data), num_classes))
    ground_truths[:, genre_index] = 1

    # collect per-genre arrays; they are concatenated after the loop
    input_data.append(genre_img_data)
    target_labels.append(ground_truths)

  input_data = np.vstack(input_data)  # concatenate all classes
  target_labels = np.vstack(target_labels)

  # take an example from dataset for a sanity check; clamp the index so a
  # dataset with fewer than 311 images does not raise IndexError
  example_num = min(310, len(input_data) - 1)
  print(f"\npredict data shape: {input_data.shape}\ntarget labels shape: {target_labels.shape}")
  if example_num >= 0:
    print(f"Example input size: {input_data[example_num].shape}\nExample target_label {target_labels[example_num]}")

  return input_data, target_labels




# Build the full in-memory dataset (images and one-hot labels) from the
# per-genre image folders under dataset_path.
spectrogram_train, spectrogram_targets = get_dataset(dataset_path)
print(f"\n\nLoaded train dataset: {spectrogram_train.shape}, {spectrogram_targets.shape}")

# Save to disk

In [None]:
# Save the spectrogram arrays to disk so the slow image loading above does
# not have to be repeated in every session.
%cd "/content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data"

# takes 18s
np.save('spectrogram_train.npy', spectrogram_train)
np.save('spectrogram_targets.npy', spectrogram_targets)

/content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data


# Load from disk

In [None]:
# Reload the cached image and label arrays; the relative file names resolve
# against the current working directory.
spectrogram_train = np.load('spectrogram_train.npy')
spectrogram_targets = np.load('spectrogram_targets.npy')

# Make a prediction

In [None]:
# Predict the genre of a single track and play it back.
# Note: rock, track 99 is misclassified as jazz although it is labelled rock
# (it does sound rather like jazz).

data_loading_image_size = (288, 432)

# An empty genre produces a path that does not exist (FileNotFoundError);
# default to the track discussed in the note above.
genre = "rock"
track_num = "99"

# The original pattern ("000" + two-digit number) yields a five-digit index,
# e.g. rock00099.png / rock.00099.wav. Zero-pad explicitly so one- and
# three-digit track numbers produce the same width.
track_id = f"{int(track_num):05d}"

# specify path to test song
example_song = f"{genre}/{genre}{track_id}.png"
audio_file = f"{genre}/{genre}.{track_id}.wav"
test_song_path = f"/content/drive/MyDrive/Extended_Essay/Data_GZAN/images_original/{example_song}"
test_audio_path = f"/content/drive/MyDrive/Extended_Essay/Data_GZAN/genres_original/{audio_file}"

# load test song as PIL using Keras
test_song = keras.utils.load_img(
        path=test_song_path,
        color_mode='rgb',
        target_size=data_loading_image_size,
        interpolation='bilinear')

test_song = trim_white_space(test_song)

# convert test song to numpy array and scale the pixels to 0-1
test_song_array = keras.utils.img_to_array(test_song)
test_song_array = test_song_array.astype("float32") / 255
# add a leading batch dimension of 1 --> (1, height, width, 3), where
# height/width are those of the trimmed image
test_song_array = np.expand_dims(test_song_array, axis=0)

# make model predictions; the index of the highest score selects the genre
predicted_scores = spectrogram_model.predict(test_song_array)
predicted_label = int(np.argmax(predicted_scores, axis=1)[0])
predicted_music_genre = music_genres_list[predicted_label]

# print the raw scores and plot the mel spectrogram image
print(predicted_scores)
print(predicted_music_genre)
plt.imshow(test_song)
plt.show()

# make nice audio slider, note standard sampling rate is 22-40kHz
sampling_rate = 22000
display(Audio(test_audio_path, rate=sampling_rate, autoplay=False))

FileNotFoundError: ignored

In [None]:
# Plot the training curves stored in `history` (the object returned by fit).
# Borrowed from https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/

# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.title('LSTM Small Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.show()

# summarize history for loss (this figure was previously titled "Accuracy")
plt.plot(history.history['loss'])
plt.title('LSTM Small Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.show()

In [None]:
# Reload the cached arrays from the spectrogram_data folder.
# takes 28s
%cd "/content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data"
spectrogram_train = np.load('spectrogram_train.npy')
spectrogram_targets = np.load('spectrogram_targets.npy')

# Load separate genre data

In [None]:
def seperate_genre_data(train_dataset):
  """Split the stacked dataset back into one slice per genre.

  The number of files in each genre's image folder decides how many
  consecutive rows of train_dataset belong to that genre; genres are taken in
  the order of music_genres_list.

  Args:
    train_dataset: array holding the images of all genres, genre after genre.

  Returns:
    Dict mapping genre name -> slice of train_dataset for that genre.
  """
  genre_individual_data = {}
  # rows not yet assigned to a genre; slicing never modifies train_dataset
  remaining = train_dataset

  for music_genre in music_genres_list:
    genre_directory = f"/content/drive/MyDrive/Extended_Essay/Data_GZAN/images_original/{music_genre}"

    # count the regular files (not sub-directories) in the genre folder
    count = sum(
        1 for entry in os.listdir(genre_directory)
        if os.path.isfile(os.path.join(genre_directory, entry)))

    # hand the first `count` rows to this genre, keep the rest for the next
    genre_individual_data[music_genre], remaining = remaining[:count], remaining[count:]

  return genre_individual_data

# Split the loaded spectrogram array into one sub-array per genre.
genre_seperate_data = seperate_genre_data(spectrogram_train)

# Click for smaller model

In [None]:
# Small CNN: three Conv -> ReLU -> MaxPool stages followed by a dense head.
# Example taken from: https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html
input_dims = (218, 336, 3)

# drop the reference to any previously built model before creating a new one
spectrogram_model = None

spectrogram_model = Sequential()
# ReLU is applied once, by the Activation layer (previously it was applied
# both inside Conv2D and by the following Activation layer).
spectrogram_model.add(Conv2D(32, (3, 3), input_shape=input_dims))
spectrogram_model.add(Activation('relu'))
spectrogram_model.add(MaxPooling2D(pool_size=(2, 2)))

spectrogram_model.add(Conv2D(32, (3, 3)))
spectrogram_model.add(Activation('relu'))
spectrogram_model.add(MaxPooling2D(pool_size=(2, 2)))

spectrogram_model.add(Conv2D(32, (3, 3)))
spectrogram_model.add(Activation('relu'))
spectrogram_model.add(MaxPooling2D(pool_size=(2, 2)))

spectrogram_model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
spectrogram_model.add(Dense(64))
spectrogram_model.add(Activation('relu'))
spectrogram_model.add(Dense(32))
spectrogram_model.add(Activation('relu'))
spectrogram_model.add(Dense(10))  # change to num of predicted classes
# One genre per track with one-hot targets and categorical cross-entropy:
# the output must be a probability distribution over the classes, so use
# softmax rather than independent sigmoids.
spectrogram_model.add(Activation('softmax'))


spectrogram_model.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])


spectrogram_model.summary()


In [None]:
# Mini-batch size and number of training epochs for this model.
batch_size = 32
epochs = 20

class CustomEarlyStopping(Callback):
    """Stop training as soon as the training accuracy reaches a target.

    Args:
        target_accuracy: value of the 'accuracy' metric at (or above) which
            training is stopped.
    """

    def __init__(self, target_accuracy=1.0):
        super(CustomEarlyStopping, self).__init__()
        self.target_accuracy = target_accuracy

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's accuracy and request a stop when it is reached."""
        # logs may be None, or may not carry 'accuracy' if the model was
        # compiled without that metric; in both cases there is nothing to test.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is None:
            return
        if accuracy >= self.target_accuracy:
            print(f"\nReached target accuracy ({self.target_accuracy}). Training stopped.")
            # Printing alone does not end training; this flag does.
            self.model.stop_training = True

# Define custom early stopping callback
custom_early_stopping = CustomEarlyStopping(target_accuracy=1.0)

# The dataset is stored genre after genre, and validation_split takes the
# LAST 20% of the samples before any shuffling. Without shuffling first, the
# validation set would hold only the last genres and the model would never
# train on them. The shuffled copies are local to this call, so
# spectrogram_train itself keeps its per-genre order for the later cells.
# (Indexing creates a temporary copy of the data in memory.)
shuffled_indices = np.random.permutation(len(spectrogram_train))

# Experiment with `epochs` or `batch_size` (defined at the top of this cell).
history = spectrogram_model.fit(
                                x=spectrogram_train[shuffled_indices],
                                y=spectrogram_targets[shuffled_indices],
                                epochs=epochs,
                                batch_size=batch_size,
                                validation_split=0.2,
                                callbacks=[custom_early_stopping]
                               )


 Plot learning curve

In [None]:
# Learning curves of the small CNN, read from the History object returned by
# fit(). Only the training metrics are drawn; the validation metrics are not
# plotted here.
# Borrowed from https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.title('CNN (Small) Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.ylim(0, 1.2)  # Set the y-axis limit from 0 to 1.2
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.title('CNN (Small) loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
#plt.legend(['train', 'validation'], loc='upper left')
plt.show()

Create confusion matrix

In [None]:
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data

def get_confusion_matrix():
  """Run the current model on every genre's data and tabulate the results.

  Reads the module-level genre_seperate_data, music_genres_list and
  spectrogram_model.

  Returns:
    (generated_conf_matrix, correct_classifications_dict): the confusion
    matrix with rows (true genre) and columns (predicted genre) ordered as in
    music_genres_list, and a dict mapping genre -> number of its tracks
    that were classified correctly.
  """
  correct_classifications_dict = {}
  y_true = []
  y_predictions = []

  # recall the separate per-genre data created earlier
  for music_genre, spectrogram_test_features in genre_seperate_data.items():
    genre_index = music_genres_list.index(music_genre)

    # a genre without any tracks has nothing to predict
    if len(spectrogram_test_features) == 0:
      correct_classifications_dict[music_genre] = 0
      continue

    # perform predictions and take the highest-scoring class of each track
    predictions = spectrogram_model.predict(spectrogram_test_features)
    max_indices = np.argmax(predictions, axis=1)

    correct_classifications_dict[music_genre] = int(np.sum(max_indices == genre_index))

    # collect true / predicted genre names for the confusion matrix
    y_true.extend([music_genre] * len(max_indices))
    y_predictions.extend(music_genres_list[label] for label in max_indices)

  # Pass the label order explicitly so the matrix always has one row/column
  # per genre in music_genres_list order, instead of relying on sklearn's
  # sorted order of the labels that happen to occur.
  generated_conf_matrix = sklearn.metrics.confusion_matrix(
      y_true, y_predictions, labels=music_genres_list)

  return generated_conf_matrix, correct_classifications_dict

# Build the confusion matrix for the current spectrogram_model and print it
# together with the per-genre counts of correct classifications.
spec_conf_matrix, correct_dict = get_confusion_matrix()
print(spec_conf_matrix)
print(correct_dict)

Display Confusion Matrix

In [None]:
# Draw the confusion matrix as a heatmap with the count written in each cell.
plt.figure(figsize=(8, 6))
plt.imshow(spec_conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('CNN (Small) Confusion Matrix')
plt.colorbar()

# one tick per genre on both axes
tick_marks = np.arange(len(music_genres_list))
plt.xticks(tick_marks, music_genres_list, rotation=45)
plt.yticks(tick_marks, music_genres_list)

# Cells above half of the largest count are dark, so their number is written
# in white; all other cells use black text.
half_of_max = np.max(spec_conf_matrix) / 2
for i, j in itertools.product(range(len(music_genres_list)), repeat=2):
    cell_value = spec_conf_matrix[i, j]
    text_color = 'black' if cell_value <= half_of_max else 'white'
    plt.text(j, i, str(cell_value), horizontalalignment='center',
             verticalalignment='center', color=text_color)

plt.tight_layout()
plt.ylabel('True genre')
plt.xlabel('Predicted genre')

plt.show()

Get last song representations, apply TSNE and obtain centroids

In [None]:
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data

# Define a new model that outputs the 32-unit hidden representation. In the
# small model the last four layers are Dense(32), Activation('relu'),
# Dense(10) and the output Activation, so layers[-3] is the ReLU output of
# Dense(32).
second_last_layer_model = keras.models.Model(inputs=spectrogram_model.input, outputs=spectrogram_model.layers[-3].output)


def get_hidden_states():
  """Compute the hidden-layer vectors of every track, genre by genre.

  Reads the module-level genre_seperate_data and second_last_layer_model.

  Returns:
    (genre_track_num, outputs_array): a dict mapping genre -> number of
    tracks, and the hidden vectors of all genres stacked row-wise in the same
    genre order.
  """
  per_genre_vectors = []
  genre_track_num = {}

  for music_genre, genre_features in genre_seperate_data.items():
    # hidden-layer output for all tracks of this genre
    hidden_vectors = second_last_layer_model.predict(genre_features)
    # remember how many rows belong to the genre so they can be split later
    genre_track_num[f"{music_genre}"] = hidden_vectors.shape[0]
    per_genre_vectors.append(hidden_vectors)

  # stack the per-genre arrays into one 2-D array
  return genre_track_num, np.vstack(per_genre_vectors)

# Per-genre track counts and the stacked hidden vectors for the t-SNE step.
genres_totals, raw_learned_representations = get_hidden_states()

In [None]:
# Reduce the hidden-state vectors to 2 dimensions so they can be plotted:
# t-SNE maps (num_tracks, hidden_dim) --> (num_tracks, 2).
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data

tsne_model = sklearn.manifold.TSNE(learning_rate='auto', perplexity=50)
transformed_values = tsne_model.fit_transform(raw_learned_representations)

# Split the embedded points back into one array per genre.
TSNE_dict = {}

# The rows are in the same genre order as genres_totals, so each genre takes
# the next num_tracks rows. transformed_values is consumed in the process
# (it is rebound to the not-yet-assigned rows and ends up empty).
for music_genre in genres_totals:
  num_tracks = genres_totals[music_genre]
  TSNE_dict[music_genre] = transformed_values[:num_tracks]
  transformed_values = transformed_values[num_tracks:]


/content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data


In [None]:
# Centroid of each genre in the 2-D t-SNE space, plus the average distance of
# the genre's points from that centroid.
genre_centroids = {}
# The average distance used to be computed and then thrown away on every
# iteration; keep it per genre so it can actually be inspected.
genre_average_distances = {}

for genre, data in TSNE_dict.items():
  # mean x / mean y of the genre's points
  centroid = np.array([np.mean(data[:, 0]), np.mean(data[:, 1])])

  # euclidean distance (L2 norm) of every point from the centroid, averaged
  average_distance = np.linalg.norm(data - centroid, axis=1).mean()

  genre_centroids[genre] = centroid
  genre_average_distances[genre] = average_distance

Plot learned representations

In [None]:
# Scatter plot of the 2-D song representations using plotly: one marker trace
# per genre, plus a star at each genre's centroid.
fig = px.scatter()

for label, data_points in TSNE_dict.items():
    xs, ys = data_points[:, 0], data_points[:, 1]
    fig.add_scatter(x=xs, y=ys, mode='markers', name=label)

# configure plot layout
fig.update_layout(title='CNN (Small) Song Representations',
                  xaxis_title='X-axis',
                  yaxis_title='Y-axis')

# Centroids are drawn as large black-outlined stars and kept out of the legend.
centroid_marker = dict(size=15, symbol='star', line=dict(color='black', width=2))
for genre, centroid in genre_centroids.items():
    fig.add_trace(go.Scatter(x=[centroid[0]],
                             y=[centroid[1]],
                             mode='markers',
                             marker=centroid_marker,
                             showlegend=False))

fig.show()

# Click for bigger model


In [None]:
# Bigger CNN: three Conv -> MaxPool stages with 32/64/128 filters followed by
# four dense layers and a softmax output.
# Example taken from: https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html
input_dims = (218, 336, 3)

# drop the reference to any previously built model before creating a new one
spectrogram_model = None

spectrogram_model = Sequential()

# Convolutional layers
spectrogram_model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_dims))
spectrogram_model.add(MaxPooling2D(pool_size=(2, 2)))

spectrogram_model.add(Conv2D(64, (3, 3), activation='relu'))
spectrogram_model.add(MaxPooling2D(pool_size=(2, 2)))

spectrogram_model.add(Conv2D(128, (3, 3), activation='relu'))
spectrogram_model.add(MaxPooling2D(pool_size=(2, 2)))

# Flatten before dense layers
spectrogram_model.add(Flatten())

# Dense layers
spectrogram_model.add(Dense(256, activation='relu'))
spectrogram_model.add(Dense(128, activation='relu'))
spectrogram_model.add(Dense(64, activation='relu'))
spectrogram_model.add(Dense(32, activation='relu'))

# Output layer
num_classes = 10  # Change this to the number of predicted classes
spectrogram_model.add(Dense(num_classes, activation='softmax'))


spectrogram_model.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])


# Save a diagram of the architecture. `plot_model` is not imported on its own
# and there is no variable called `model`, so use the fully qualified function
# on the model built above.
keras.utils.plot_model(spectrogram_model, to_file='Mel-spectrogram (Big).png', show_shapes=True, show_layer_names=True)


In [None]:
# Bigger CNN, rebuilt from scratch so the training below starts from fresh
# weights: three Conv -> MaxPool stages, four dense layers, softmax output.
input_dims = (218, 336, 3)

# drop the reference to any previously built model before creating a new one
spectrogram_model = None

spectrogram_model = Sequential()

# Convolutional layers
spectrogram_model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_dims))
spectrogram_model.add(MaxPooling2D(pool_size=(2, 2)))

spectrogram_model.add(Conv2D(64, (3, 3), activation='relu'))
spectrogram_model.add(MaxPooling2D(pool_size=(2, 2)))

spectrogram_model.add(Conv2D(128, (3, 3), activation='relu'))
spectrogram_model.add(MaxPooling2D(pool_size=(2, 2)))

# Flatten before dense layers
spectrogram_model.add(Flatten())

# Dense layers
spectrogram_model.add(Dense(256, activation='relu'))
spectrogram_model.add(Dense(128, activation='relu'))
spectrogram_model.add(Dense(64, activation='relu'))
spectrogram_model.add(Dense(32, activation='relu'))

# Output layer
num_classes = 10  # Change this to the number of predicted classes
spectrogram_model.add(Dense(num_classes, activation='softmax'))


spectrogram_model.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])


# Save a diagram of the architecture. `plot_model` is not imported on its own
# and there is no variable called `model`, so use the fully qualified function
# on the model built above.
keras.utils.plot_model(spectrogram_model, to_file='Mel-spectrogram (Big).png', show_shapes=True, show_layer_names=True)


# Mini-batch size and number of training epochs for this model.
batch_size = 32
epochs = 20

class CustomEarlyStopping(Callback):
    """Stop training as soon as the training accuracy reaches a target.

    Args:
        target_accuracy: value of the 'accuracy' metric at (or above) which
            training is stopped.
    """

    def __init__(self, target_accuracy=1.0):
        super(CustomEarlyStopping, self).__init__()
        self.target_accuracy = target_accuracy

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's accuracy and request a stop when it is reached."""
        # logs may be None, or may not carry 'accuracy' if the model was
        # compiled without that metric; in both cases there is nothing to test.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is None:
            return
        if accuracy >= self.target_accuracy:
            print(f"\nReached target accuracy ({self.target_accuracy}). Training stopped.")
            # Printing alone does not end training; this flag does.
            self.model.stop_training = True

# Define custom early stopping callback
custom_early_stopping = CustomEarlyStopping(target_accuracy=1.0)

# The dataset is stored genre after genre, and validation_split takes the
# LAST 20% of the samples before any shuffling. Without shuffling first, the
# validation set would hold only the last genres and the model would never
# train on them. The shuffled copies are local to this call, so
# spectrogram_train itself keeps its per-genre order for the later cells.
# (Indexing creates a temporary copy of the data in memory.)
shuffled_indices = np.random.permutation(len(spectrogram_train))

# Experiment with `epochs` or `batch_size` (defined earlier in this cell).
history = spectrogram_model.fit(
                                x=spectrogram_train[shuffled_indices],
                                y=spectrogram_targets[shuffled_indices],
                                epochs=epochs,
                                batch_size=batch_size,
                                validation_split=0.2,
                                callbacks=[custom_early_stopping]
                               )


 Plot learning curve

In [None]:
# Learning curves of the big CNN, read from the History object returned by
# fit(). Only the training metrics are drawn; the validation metrics are not
# plotted here.
# Borrowed from https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.title('CNN (Big) Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
#plt.legend(['train', 'validation'], loc='upper left')
plt.ylim(0, 1.2)  # Set the y-axis limit from 0 to 1.2
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
#plt.plot(history.history['val_loss'])
plt.title('CNN (Big) loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
#plt.legend(['train', 'validation'], loc='upper left')
plt.show()

Create confusion matrix

In [None]:
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data

def get_confusion_matrix():
  """Run the current model on every genre's data and tabulate the results.

  Reads the module-level genre_seperate_data, music_genres_list and
  spectrogram_model.

  Returns:
    (generated_conf_matrix, correct_classifications_dict): the confusion
    matrix with rows (true genre) and columns (predicted genre) ordered as in
    music_genres_list, and a dict mapping genre -> number of its tracks
    that were classified correctly.
  """
  correct_classifications_dict = {}
  y_true = []
  y_predictions = []

  # recall the separate per-genre data created earlier
  for music_genre, spectrogram_test_features in genre_seperate_data.items():
    genre_index = music_genres_list.index(music_genre)

    # a genre without any tracks has nothing to predict
    if len(spectrogram_test_features) == 0:
      correct_classifications_dict[music_genre] = 0
      continue

    # perform predictions and take the highest-scoring class of each track
    predictions = spectrogram_model.predict(spectrogram_test_features)
    max_indices = np.argmax(predictions, axis=1)

    correct_classifications_dict[music_genre] = int(np.sum(max_indices == genre_index))

    # collect true / predicted genre names for the confusion matrix
    y_true.extend([music_genre] * len(max_indices))
    y_predictions.extend(music_genres_list[label] for label in max_indices)

  # Pass the label order explicitly so the matrix always has one row/column
  # per genre in music_genres_list order, instead of relying on sklearn's
  # sorted order of the labels that happen to occur.
  generated_conf_matrix = sklearn.metrics.confusion_matrix(
      y_true, y_predictions, labels=music_genres_list)

  return generated_conf_matrix, correct_classifications_dict

# Build the confusion matrix for the current spectrogram_model and print it
# together with the per-genre counts of correct classifications.
spec_conf_matrix, correct_dict = get_confusion_matrix()
print(spec_conf_matrix)
print(correct_dict)

Display Confusion Matrix

In [None]:
# Draw the confusion matrix as a heatmap with the count written in each cell.
plt.figure(figsize=(8, 6))
plt.imshow(spec_conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('CNN (Big) Confusion Matrix')
plt.colorbar()

# one tick per genre on both axes
tick_marks = np.arange(len(music_genres_list))
plt.xticks(tick_marks, music_genres_list, rotation=45)
plt.yticks(tick_marks, music_genres_list)

# Cells above half of the largest count are dark, so their number is written
# in white; all other cells use black text.
half_of_max = np.max(spec_conf_matrix) / 2
for i, j in itertools.product(range(len(music_genres_list)), repeat=2):
    cell_value = spec_conf_matrix[i, j]
    text_color = 'black' if cell_value <= half_of_max else 'white'
    plt.text(j, i, str(cell_value), horizontalalignment='center',
             verticalalignment='center', color=text_color)

plt.tight_layout()
plt.ylabel('True genre')
plt.xlabel('Predicted genre')

plt.show()

Get last song representations, apply TSNE and obtain centroids

In [None]:
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data

# Define a new model that outputs the Dense(32) layer. In the big model the
# activations are part of the Dense layers, so the last three layers are
# Dense(64), Dense(32) and the softmax output: Dense(32) is layers[-2]
# (layers[-3] would be Dense(64)).
second_last_layer_model = keras.models.Model(inputs=spectrogram_model.input, outputs=spectrogram_model.layers[-2].output)


def get_hidden_states():
  """Compute the hidden-layer vectors of every track, genre by genre.

  Reads the module-level genre_seperate_data and second_last_layer_model.

  Returns:
    (genre_track_num, outputs_array): a dict mapping genre -> number of
    tracks, and the hidden vectors of all genres stacked row-wise in the same
    genre order.
  """
  per_genre_vectors = []
  genre_track_num = {}

  for music_genre, genre_features in genre_seperate_data.items():
    # hidden-layer output for all tracks of this genre
    hidden_vectors = second_last_layer_model.predict(genre_features)
    # remember how many rows belong to the genre so they can be split later
    genre_track_num[f"{music_genre}"] = hidden_vectors.shape[0]
    per_genre_vectors.append(hidden_vectors)

  # stack the per-genre arrays into one 2-D array
  return genre_track_num, np.vstack(per_genre_vectors)

# Per-genre track counts and the stacked hidden vectors for the t-SNE step.
genres_totals, raw_learned_representations = get_hidden_states()

In [None]:
# Reduce the hidden-state vectors to 2 dimensions so they can be plotted:
# t-SNE maps (num_tracks, hidden_dim) --> (num_tracks, 2).
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data

tsne_model = sklearn.manifold.TSNE(learning_rate='auto', perplexity=50)
transformed_values = tsne_model.fit_transform(raw_learned_representations)

# Split the embedded points back into one array per genre.
TSNE_dict = {}

# The rows are in the same genre order as genres_totals, so each genre takes
# the next num_tracks rows. transformed_values is consumed in the process
# (it is rebound to the not-yet-assigned rows and ends up empty).
for music_genre in genres_totals:
  num_tracks = genres_totals[music_genre]
  TSNE_dict[music_genre] = transformed_values[:num_tracks]
  transformed_values = transformed_values[num_tracks:]


/content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data


In [None]:
# Centroid of each genre in the 2-D t-SNE space, plus the average distance of
# the genre's points from that centroid.
genre_centroids = {}
# The average distance used to be computed and then thrown away on every
# iteration; keep it per genre so it can actually be inspected.
genre_average_distances = {}

for genre, data in TSNE_dict.items():
  # mean x / mean y of the genre's points
  centroid = np.array([np.mean(data[:, 0]), np.mean(data[:, 1])])

  # euclidean distance (L2 norm) of every point from the centroid, averaged
  average_distance = np.linalg.norm(data - centroid, axis=1).mean()

  genre_centroids[genre] = centroid
  genre_average_distances[genre] = average_distance

Plot learned representations

In [None]:
# Scatter plot of the 2-D song representations using plotly: one marker trace
# per genre, plus a star at each genre's centroid.
fig = px.scatter()

for label, data_points in TSNE_dict.items():
    xs, ys = data_points[:, 0], data_points[:, 1]
    fig.add_scatter(x=xs, y=ys, mode='markers', name=label)

# configure plot layout
fig.update_layout(title='CNN (Big) Song Representations',
                  xaxis_title='X-axis',
                  yaxis_title='Y-axis')

# Centroids are drawn as large black-outlined stars and kept out of the legend.
centroid_marker = dict(size=15, symbol='star', line=dict(color='black', width=2))
for genre, centroid in genre_centroids.items():
    fig.add_trace(go.Scatter(x=[centroid[0]],
                             y=[centroid[1]],
                             mode='markers',
                             marker=centroid_marker,
                             showlegend=False))

fig.show()

# Click for pre-trained model

In [None]:
# Transfer-learning model: a frozen VGG16 feature extractor with a small new
# classification head on top.
# Example taken from: https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html
input_dims = (218, 336, 3)

# VGG16 with ImageNet weights and without its original classification layers
base_model = VGG16(include_top=False, weights='imagenet', input_shape=input_dims)

# Freeze every layer of the base model so only the new head is trained
for layer in base_model.layers:
    layer.trainable = False

# New head: global average pooling -> Dense(256, relu) -> 10-way softmax
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(256, activation='relu')(x)
predictions = Dense(10, activation='softmax')(x)  # 10 classes

# Join the base model's input to the new head's output
spectrogram_model = keras.Model(inputs=base_model.input, outputs=predictions)

spectrogram_model.compile(
    loss='categorical_crossentropy',
    optimizer="adam",
    metrics=['accuracy'],
)

spectrogram_model.summary()


In [None]:
# Mini-batch size and number of training epochs for this model.
batch_size = 32
epochs = 20

class CustomEarlyStopping(Callback):
    """Stop training as soon as the training accuracy reaches a target.

    Args:
        target_accuracy: value of the 'accuracy' metric at (or above) which
            training is stopped.
    """

    def __init__(self, target_accuracy=1.0):
        super(CustomEarlyStopping, self).__init__()
        self.target_accuracy = target_accuracy

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's accuracy and request a stop when it is reached."""
        # logs may be None, or may not carry 'accuracy' if the model was
        # compiled without that metric; in both cases there is nothing to test.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is None:
            return
        if accuracy >= self.target_accuracy:
            print(f"\nReached target accuracy ({self.target_accuracy}). Training stopped.")
            # Printing alone does not end training; this flag does.
            self.model.stop_training = True

# Define custom early stopping callback
custom_early_stopping = CustomEarlyStopping(target_accuracy=1.0)

# The dataset is stored genre after genre, and validation_split takes the
# LAST 20% of the samples before any shuffling. Without shuffling first, the
# validation set would hold only the last genres and the model would never
# train on them. The shuffled copies are local to this call, so
# spectrogram_train itself keeps its per-genre order for the later cells.
# (Indexing creates a temporary copy of the data in memory.)
shuffled_indices = np.random.permutation(len(spectrogram_train))

# Experiment with `epochs` or `batch_size` (defined at the top of this cell).
history = spectrogram_model.fit(
                                x=spectrogram_train[shuffled_indices],
                                y=spectrogram_targets[shuffled_indices],
                                epochs=epochs,
                                batch_size=batch_size,
                                validation_split=0.2,
                                callbacks=[custom_early_stopping]
                               )


 Plot learning curve

In [None]:
# Learning curves of the pre-trained CNN, read from the History object
# returned by fit(). Only the training metrics are drawn; the validation
# metrics are not plotted here.
# Borrowed from https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/
# Plot accuracy
plt.plot(history.history['accuracy'])
plt.title('CNN (Pre-trained) Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.ylim(0, 1.2)  # Set the y-axis limit from 0 to 1.2
plt.show()

# Plot loss
plt.plot(history.history['loss'])
plt.title('CNN (Pre-trained) Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.show()

Create confusion matrix

In [None]:
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data

def get_confusion_matrix():
  """Run the current model on every genre's data and tabulate the results.

  Reads the module-level genre_seperate_data, music_genres_list and
  spectrogram_model.

  Returns:
    (generated_conf_matrix, correct_classifications_dict): the confusion
    matrix with rows (true genre) and columns (predicted genre) ordered as in
    music_genres_list, and a dict mapping genre -> number of its tracks
    that were classified correctly.
  """
  correct_classifications_dict = {}
  y_true = []
  y_predictions = []

  # recall the separate per-genre data created earlier
  for music_genre, spectrogram_test_features in genre_seperate_data.items():
    genre_index = music_genres_list.index(music_genre)

    # a genre without any tracks has nothing to predict
    if len(spectrogram_test_features) == 0:
      correct_classifications_dict[music_genre] = 0
      continue

    # perform predictions and take the highest-scoring class of each track
    predictions = spectrogram_model.predict(spectrogram_test_features)
    max_indices = np.argmax(predictions, axis=1)

    correct_classifications_dict[music_genre] = int(np.sum(max_indices == genre_index))

    # collect true / predicted genre names for the confusion matrix
    y_true.extend([music_genre] * len(max_indices))
    y_predictions.extend(music_genres_list[label] for label in max_indices)

  # Pass the label order explicitly so the matrix always has one row/column
  # per genre in music_genres_list order, instead of relying on sklearn's
  # sorted order of the labels that happen to occur.
  generated_conf_matrix = sklearn.metrics.confusion_matrix(
      y_true, y_predictions, labels=music_genres_list)

  return generated_conf_matrix, correct_classifications_dict

# Build the confusion matrix for the current spectrogram_model and print it
# together with the per-genre counts of correct classifications.
spec_conf_matrix, correct_dict = get_confusion_matrix()
print(spec_conf_matrix)
print(correct_dict)

Display Confusion Matrix

In [None]:
# Draw the confusion matrix as a heatmap with the count written in each cell.
plt.figure(figsize=(8, 6))
plt.imshow(spec_conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('CNN (Pre-trained) Confusion Matrix')
plt.colorbar()

# one tick per genre on both axes
tick_marks = np.arange(len(music_genres_list))
plt.xticks(tick_marks, music_genres_list, rotation=45)
plt.yticks(tick_marks, music_genres_list)

# Cells above half of the largest count are dark, so their number is written
# in white; all other cells use black text.
half_of_max = np.max(spec_conf_matrix) / 2
for i, j in itertools.product(range(len(music_genres_list)), repeat=2):
    cell_value = spec_conf_matrix[i, j]
    text_color = 'black' if cell_value <= half_of_max else 'white'
    plt.text(j, i, str(cell_value), horizontalalignment='center',
             verticalalignment='center', color=text_color)

plt.tight_layout()
plt.ylabel('True genre')
plt.xlabel('Predicted genre')

plt.show()

Get last song representations, apply TSNE and obtain centroids

In [None]:
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data

# Define a new model that outputs the second-to-last layer. In the
# pre-trained model the head is GlobalAveragePooling2D -> Dense(256) ->
# Dense(10), so layers[-2] is the Dense(256) layer (not a Dense(32) layer).
second_last_layer_model = keras.models.Model(inputs=spectrogram_model.input, outputs=spectrogram_model.layers[-2].output)


def get_hidden_states():
  """Compute the hidden-layer vectors of every track, genre by genre.

  Reads the module-level genre_seperate_data and second_last_layer_model.

  Returns:
    (genre_track_num, outputs_array): a dict mapping genre -> number of
    tracks, and the hidden vectors of all genres stacked row-wise in the same
    genre order.
  """
  per_genre_vectors = []
  genre_track_num = {}

  for music_genre, genre_features in genre_seperate_data.items():
    # hidden-layer output for all tracks of this genre
    hidden_vectors = second_last_layer_model.predict(genre_features)
    # remember how many rows belong to the genre so they can be split later
    genre_track_num[f"{music_genre}"] = hidden_vectors.shape[0]
    per_genre_vectors.append(hidden_vectors)

  # stack the per-genre arrays into one 2-D array
  return genre_track_num, np.vstack(per_genre_vectors)

# Per-genre track counts and the stacked hidden vectors for the t-SNE step.
genres_totals, raw_learned_representations = get_hidden_states()

In [None]:
# Reduce the hidden-state vectors to 2 dimensions so they can be plotted:
# t-SNE maps (num_tracks, hidden_dim) --> (num_tracks, 2).
%cd /content/drive/MyDrive/Extended_Essay/Data_GZAN/spectrogram_data

tsne_model = sklearn.manifold.TSNE(learning_rate='auto', perplexity=50)
transformed_values = tsne_model.fit_transform(raw_learned_representations)

# Split the embedded points back into one array per genre.
TSNE_dict = {}

# The rows are in the same genre order as genres_totals, so each genre takes
# the next num_tracks rows. transformed_values is consumed in the process
# (it is rebound to the not-yet-assigned rows and ends up empty).
for music_genre in genres_totals:
  num_tracks = genres_totals[music_genre]
  TSNE_dict[music_genre] = transformed_values[:num_tracks]
  transformed_values = transformed_values[num_tracks:]


In [None]:
# Centroid of each genre in the 2-D t-SNE space, plus the average distance of
# the genre's points from that centroid.
genre_centroids = {}
# The average distance used to be computed and then thrown away on every
# iteration; keep it per genre so it can actually be inspected.
genre_average_distances = {}

for genre, data in TSNE_dict.items():
  # mean x / mean y of the genre's points
  centroid = np.array([np.mean(data[:, 0]), np.mean(data[:, 1])])

  # euclidean distance (L2 norm) of every point from the centroid, averaged
  average_distance = np.linalg.norm(data - centroid, axis=1).mean()

  genre_centroids[genre] = centroid
  genre_average_distances[genre] = average_distance

In [None]:
# Scatter plot of the 2-D song representations using plotly: one marker trace
# per genre, plus a star at each genre's centroid.
fig = px.scatter()

for label, data_points in TSNE_dict.items():
    xs, ys = data_points[:, 0], data_points[:, 1]
    fig.add_scatter(x=xs, y=ys, mode='markers', name=label)

# configure plot layout
fig.update_layout(title='CNN (Pre-trained) Song Representations',
                  xaxis_title='X-axis',
                  yaxis_title='Y-axis')

# Centroids are drawn as large black-outlined stars and kept out of the legend.
centroid_marker = dict(size=15, symbol='star', line=dict(color='black', width=2))
for genre, centroid in genre_centroids.items():
    fig.add_trace(go.Scatter(x=[centroid[0]],
                             y=[centroid[1]],
                             mode='markers',
                             marker=centroid_marker,
                             showlegend=False))

fig.show()