In [66]:
import pandas as pd
import tensorflow as tf
import numpy as np
import numpy.ma as ma
from ipywidgets import IntProgress
from IPython.display import display
import time
import math
from numba import jit, cuda
import tabulate

In [67]:
df = pd.read_csv('data.csv')
filt = (df['year'] > 1999)
df = df[filt].reset_index(drop=True)
meta_df = df[['name', 'artists', 'year']]
song_df = df.drop(columns=list(meta_df.columns.values))
song_df.drop(columns=['release_date', 'id', 'duration_ms', 'mode', 'popularity'], inplace=True)
main_df = pd.concat([meta_df, song_df], axis=1)

In [68]:
song_array = np.array(song_df)
num_song_features = song_array.shape[1]

In [69]:
@jit(target_backend='cuda')
def sq_dist(a, b):
#     Args: a (ndarray) (n,): vector with n features
#     Args: b (ndarray) (n,): vector with n features
    n = a.shape[0]
    d = 0
    for i in range(n):
        d+= (a[i] - b[i])**2
    return d

In [70]:
dim = len(song_array)
total = dim**2

In [71]:
@jit(target_backend='cuda')
def generate_song_matrix(vectors, dimensions, iters):
    
    percentage = 0
    count = 0
    dist = np.zeros((dimensions, dimensions))
    for i in range(dimensions):
        for j in range(dimensions):
            dist[i, j] = sq_dist(vectors[i], vectors[j])
            count += 1
            if count % math.ceil(iters/100) == 0:
                percentage += 1
                print(f"{percentage}% loaded.")
    return dist

In [None]:
start_time = time.time()
dist = generate_song_matrix(song_array, dim, total)
print("--- %s seconds ---" % (time.time() - start_time))

In [65]:
small_dist = dist
m_dist = ma.masked_array(small_dist, mask=np.identity(small_dist.shape[0]))

MemoryError: Unable to allocate 12.8 GiB for an array with shape (41450, 41450) and data type float64

In [55]:
count = 50 

disp = [["song1", "artists", "song2", "artists"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    song1_series = meta_df.iloc[i]
    song2_series = meta_df.iloc[min_idx]
    disp.append( [song1_series['name'], song1_series['artists'],
                  song2_series['name'], song2_series['artists']]
               )
table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")

In [56]:
table

song1,artists,song2,artists.1
Yellow,['Coldplay'],You Are The Reason,['Calum Scott']
Ms. Jackson,['OutKast'],One Of Them Girls,['Lee Brice']
In the End,['Linkin Park'],Telepathy,['BTS']
Kryptonite,['3 Doors Down'],I'm a Mess,['Bebe Rexha']
The Real Slim Shady,['Eminem'],Call You Mine,"['The Chainsmokers', 'Bebe Rexha']"
Down with the Sickness,['Disturbed'],Renegades,['X Ambassadors']
Ride Wit Me,"['Nelly', 'City Spud']",CIVIL WAR - Bonus,['Russ']
Stan,"['Eminem', 'Dido']",parents,['YUNGBLUD']
Bye Bye Bye,['*NSYNC'],Unstoppable,['Sia']
It's A Great Day To Be Alive,['Travis Tritt'],Unconditionally,['Katy Perry']


In [46]:
filt = (df['name'] == "Yellow") | (df['name'] == "Conteo Regresivo - Salsa Version")
df.loc[filt]

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.285,2000,0.00239,['Coldplay'],0.429,266773,0.661,0,3AJwUDP919kvQ9QcozQPxg,0.000121,11,0.234,-7.227,1,Yellow,84,2000-07-10,0.0281,173.372
5721,0.96,2007,0.778,['Gilberto Santa Rosa'],0.777,266787,0.72,0,37nXsqE1XW0cxOQvIMsmFc,0.0,8,0.069,-6.162,1,Conteo Regresivo - Salsa Version,68,2007-11-20,0.0803,163.905
12607,0.285,2000,0.00239,['Coldplay'],0.429,266773,0.661,0,3e0wYnFxkqinmtXebYPMSt,0.000121,11,0.234,-7.227,1,Yellow,55,2000-06-26,0.0281,173.372
14191,0.96,2008,0.778,['Gilberto Santa Rosa'],0.777,266787,0.72,0,5YfqVqJwjxiyhxvofbCnHp,0.0,8,0.069,-6.162,1,Conteo Regresivo - Salsa Version,59,2008-07-21,0.0803,163.905
18039,0.0986,2006,0.898,['Rockabye Baby!'],0.454,265560,0.233,0,7fezkLNTs7KdKzL8OJJ0fK,0.81,11,0.0965,-19.064,1,Yellow,49,2006-08-29,0.0367,173.48
36557,0.217,2017,0.139,"['Aminé', 'Nelly']",0.869,180000,0.526,1,4Hj5yNHgu2dyrnzRQN7Gld,0.0,0,0.119,-5.766,1,Yellow,58,2017-07-28,0.111,128.996
