In [60]:
import pandas as pd
import tensorflow as tf
import numpy as np
import numpy.ma as ma
from ipywidgets import IntProgress
from IPython.display import display
import time
import math
from numba import jit, cuda
import heapq
import tabulate

In [95]:
df = pd.read_csv('data.csv')
filt = (df['year'] > 1999)
df = df[filt].reset_index(drop=True)
meta_df = df[['name', 'artists', 'year']]
song_df = df.drop(columns=list(meta_df.columns.values))
song_df.drop(columns=['release_date', 'id', 'duration_ms', 'mode','popularity'], inplace=True)
main_df = pd.concat([meta_df, song_df], axis=1)
song_df

Unnamed: 0,valence,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,speechiness,tempo
0,0.285,0.00239,0.429,0.661,0,0.000121,11,0.2340,-7.227,0.0281,173.372
1,0.613,0.14300,0.843,0.806,1,0.000000,4,0.0771,-5.946,0.2690,94.948
2,0.400,0.00958,0.556,0.864,0,0.000000,3,0.2090,-5.870,0.0584,105.143
3,0.543,0.00664,0.545,0.865,0,0.000011,11,0.1680,-5.708,0.0286,99.009
4,0.760,0.03020,0.949,0.661,1,0.000000,5,0.0454,-4.244,0.0572,104.504
...,...,...,...,...,...,...,...,...,...,...,...
41445,0.608,0.08460,0.786,0.808,0,0.000289,7,0.0822,-3.702,0.0881,105.029
41446,0.734,0.20600,0.717,0.753,0,0.000000,7,0.1010,-6.020,0.0605,137.936
41447,0.637,0.10100,0.634,0.858,0,0.000009,4,0.2580,-2.226,0.0809,91.688
41448,0.195,0.00998,0.671,0.623,1,0.000008,2,0.6430,-7.161,0.3080,75.055


In [72]:
song_array = np.array(song_df)
num_song_features = song_array.shape[1]

In [63]:
@jit(target_backend='cuda')
def sq_dist(a, b):
#     Args: a (ndarray) (n,): vector with n features
#     Args: b (ndarray) (n,): vector with n features
    n = a.shape[0]
    d = 0
    for i in range(n):
        d+= (a[i] - b[i])**2
    return d

In [64]:
dim = len(song_array)
total = dim**2

In [65]:
@jit(target_backend='cuda')
def generate_song_matrix(vectors, dimensions, iters):
    
    percentage = 0
    count = 0
    dist = np.zeros((dimensions, dimensions))
    for i in range(dimensions):
        for j in range(dimensions):
            dist[i, j] = sq_dist(vectors[i], vectors[j])
            count += 1
            if count % math.ceil(iters/100) == 0:
                percentage += 1
                print(f"{percentage}% loaded.")
    return dist

In [66]:
start_time = time.time()
dist = generate_song_matrix(song_array, dim, total)
print("--- %s seconds ---" % (time.time() - start_time))

1% loaded.
2% loaded.
3% loaded.
4% loaded.
5% loaded.
6% loaded.
7% loaded.
8% loaded.
9% loaded.
10% loaded.
11% loaded.
12% loaded.
13% loaded.
14% loaded.
15% loaded.
16% loaded.
17% loaded.
18% loaded.
19% loaded.
20% loaded.
21% loaded.
22% loaded.
23% loaded.
24% loaded.
25% loaded.
26% loaded.
27% loaded.
28% loaded.
29% loaded.
30% loaded.
31% loaded.
32% loaded.
33% loaded.
34% loaded.
35% loaded.
36% loaded.
37% loaded.
38% loaded.
39% loaded.
40% loaded.
41% loaded.
42% loaded.
43% loaded.
44% loaded.
45% loaded.
46% loaded.
47% loaded.
48% loaded.
49% loaded.
50% loaded.
51% loaded.
52% loaded.
53% loaded.
54% loaded.
55% loaded.
56% loaded.
57% loaded.
58% loaded.
59% loaded.
60% loaded.
61% loaded.
62% loaded.
63% loaded.
64% loaded.
65% loaded.
66% loaded.
67% loaded.
68% loaded.
69% loaded.
70% loaded.
71% loaded.
72% loaded.
73% loaded.
74% loaded.
75% loaded.
76% loaded.
77% loaded.
78% loaded.
79% loaded.
80% loaded.
81% loaded.
82% loaded.
83% loaded.
84% loaded.
8

In [67]:
small_dist = dist
m_dist = ma.masked_array(small_dist, mask=np.identity(small_dist.shape[0]))

In [107]:
count = 50 

disp = [["Song", "Artist", "Similar Song", "Artist"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    song1_series = meta_df.iloc[i]
    song2_series = meta_df.iloc[min_idx]
    if song1_series['name'] == song2_series['name']:
        tmp_m_dist = m_dist[i].copy()
        tmp_m_dist2 = np.sort(tmp_m_dist)
        min_val = tmp_m_dist2[1]
        min_idx = np.where(tmp_m_dist==min_val)
        min_idx = min_idx[0][0]
        song2_series = meta_df.iloc[min_idx]
    disp.append( [song1_series['name'], song1_series['artists'],
                  song2_series['name'], song2_series['artists']]
               )
    

table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")

In [108]:
table

Song,Artist,Similar Song,Artist.1
Yellow,['Coldplay'],Be With You,['Akon']
Ms. Jackson,['OutKast'],My Projects,['Coo Coo Cal']
In the End,['Linkin Park'],Raspberry,['Grouplove']
Kryptonite,['3 Doors Down'],Afrodisiac,['Brandy']
The Real Slim Shady,['Eminem'],The Real Slim Shady,['Eminem']
Down with the Sickness,['Disturbed'],Everyday Normal Guy 2,['Jon Lajoie']
Ride Wit Me,"['Nelly', 'City Spud']",I Took A Pill In Ibiza - Seeb Remix,"['Mike Posner', 'Seeb']"
Stan,"['Eminem', 'Dido']",Testimony,['Kodak Black']
Bye Bye Bye,['*NSYNC'],Candyman,['Christina Aguilera']
It's A Great Day To Be Alive,['Travis Tritt'],Unconditionally,['Katy Perry']
