In [26]:
import pandas as pd
import numpy as np
import os
import pathlib
import matplotlib.pyplot as plt
import tensorflow as tf
import sys
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
%cd '/content/drive/My Drive/Colab Notebooks/OSU/CS467_shared'

/content/drive/My Drive/Colab Notebooks/OSU/CS467_shared


In [29]:
IMG_PIXELS = 67000
IMG_WIDTH = 335
IMG_HEIGHT = 200
NUM_LABELS = 32
NUM_FEATURES = 45

In [30]:
FMA_images_dataset = list(tf.data.experimental.load('./Data/FMA_images_dataset',(tf.TensorSpec(shape=(IMG_PIXELS,), dtype=tf.uint8, name=None), tf.TensorSpec(shape=(NUM_LABELS,), dtype=tf.int8, name=None))))
FMA_features_dataset = list(tf.data.experimental.load('./Data/FMA_features_dataset',(tf.TensorSpec(shape=(NUM_FEATURES,), dtype=tf.float32, name=None), tf.TensorSpec(shape=(NUM_LABELS,), dtype=tf.int8, name=None))))
labels_key = pd.read_csv('./Data/labels_key.csv')['category']

In [31]:
images = []
labels = []
features = []
track_ids = []

for i, (x, y) in enumerate(FMA_images_dataset):
  images.append(x.numpy())
  features.append(FMA_features_dataset[i][0][1:].numpy())
  labels.append(y.numpy())
  track_ids.append(FMA_features_dataset[i][0][0].numpy())

images = np.array(images)
labels = np.array(labels)
features = np.array(features)
track_ids = np.array(track_ids)

FMA_images_dataset = None
FMA_features_dataset = None

In [34]:
print(len(images))
print(len(features))
print(len(labels))
print(len(track_ids))

100817
100817
100817
100817


In [35]:
model = tf.keras.models.load_model('FMA_model.h5')

In [37]:
images = images.reshape(images.shape[0], IMG_HEIGHT, IMG_WIDTH, 1)

# sc = pickle.load(open('./std_scaler.pkl','rb'))
# features = sc.transform(features)
# features = StandardScaler().fit_transform(features)

sc = StandardScaler()
features = sc.fit_transform(features)

# export scaler object to be used for 
pickle.dump(sc, open('std_scaler_B.pkl','wb'))

In [None]:
top5_count = 0
top1_count = 0
cur_score = 0
highest_score_possible = 0

for n, label in enumerate(labels):
  num_genres = np.count_nonzero(label)
  genre_weight = 1/num_genres
  highest_score_possible += 5

  # add indicies of track label to label_index array
  label_index = []
  for i, v in enumerate(label):
    if v == 1:
      label_index.append(i)

  # top 5 predictions
  prediction = model.predict([np.array([features[n]]), np.array([images[n]])])
  topN_index = []
  topN = np.argsort(-prediction[0])[:5]
  for i, val in enumerate(topN, start=1):
    topN_index.append(val)

  # compare label indicies to determine if prediction includes correct label
  # top1
  for i in label_index:
    if i == topN_index[0]:
      top1_count += 1
  # top5
  for i in topN_index:
    if i in label_index:
      top5_count += 1
      break

  # calculate per-track genre score and add to total model score 
  for j, i in enumerate(topN_index):
    if i in label_index:
      # if top most genre is correct and the track only has a single genre 
      # the formula doesnt work so we handle that case independenly
      if i == label_index[0] and num_genres == 1:
        genre_score = 5
        cur_score += genre_score
        break
      else:
        genre_score = genre_weight * (5-j) + genre_weight
        # print('genre_score: ', genre_score)
        cur_score += genre_score
        if genre_score == 0:
          print('genre not found')


  if n > 0 and n % 10000 == 0:
    print('''sample: {n} \n
          top1_count: {top1_count} \n
          top1 accuracy: {top1_acc} \n
          top5_count: {top5_count} \n
          top5 accuracy: {top5_acc} \n
          total score: {total_score} \n
          score: {score} \n'''.format(
                      n=n, top1_count=top1_count, top1_acc=top1_count/n, 
                      top5_count=top5_count, top5_acc=top5_count/n,
                      total_score=cur_score, 
                      score=cur_score/highest_score_possible))

total_n = len(labels)
top1_acc = top1_count / total_n
top5_count = top5_count
top5_acc = top5_count / total_n
total_score = cur_score
score = cur_score / highest_score_possible

sample: 10000 

          top1_count: 4625 

          top1 accuracy: 0.4625 

          top5_count: 7680 

          top5 accuracy: 0.768 

          total score: 32081.61666666686 

          score: 0.6415681765156857 

sample: 20000 

          top1_count: 9800 

          top1 accuracy: 0.49 

          top5_count: 15485 

          top5 accuracy: 0.77425 

          total score: 62602.61111111017 

          score: 0.6259948113705331 

sample: 30000 

          top1_count: 14590 

          top1 accuracy: 0.48633333333333334 

          top5_count: 22979 

          top5 accuracy: 0.7659666666666667 

          total score: 90252.3398989919 

          score: 0.6016622105862598 



In [None]:
print(total_n)
print(top1_acc)
print(top5_count)
print(top5_acc)
print(total_score)
print(score)

100817
0.501958995010762
79617
0.789718003908071
296158.4006132713
0.587516789059923
