# Spotify Song Similarity Search
This notebook takes in a dataset of Spotify songs with several features for each song. We train a KDTree on the dataset, then query the tree for each song's 10 nearest neighbors. We export the results as a CSV file that can be added (along with the original dataset) to a database for the website.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree

In [3]:
# Load the Spotify audio-features CSV into a pandas dataframe and preview
# the first few rows.
dataset_path = 'dataset/SpotifyAudioFeaturesNov2018.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,44
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,10
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,63
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,9
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,8


In [3]:
# Create a dataframe of the numerical columns that similarity searches will be
# based on, by dropping the text / identifier columns.
df_features = df.drop(columns=['artist_name', 'track_id', 'track_name'])

# NOTE(review): every column that remains is used as a feature, including
# popularity and the categorical-looking key / mode / time_signature columns
# shown in the preview above -- confirm that this is intended.

# Create a numpy array where the numerical features are scaled to a mean of 0
# and a standard deviation of 1.
# The frame is cast to float64 first: some columns in the preview are integers
# (e.g. duration_ms, key, mode), and handing mixed int/float data to
# StandardScaler presumably is what triggered the conversion warning seen in
# this cell's output. The scaler works in floating point either way, so the
# resulting X is unchanged.
scaler = StandardScaler()
X = scaler.fit_transform(df_features.astype('float64'))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [5]:
# Create the tree: build a KDTree index over the scaled feature matrix X so
# that nearest-neighbour queries can be answered from it. No other arguments
# are passed, so scikit-learn's defaults (leaf size, distance metric) apply.
tree = KDTree(X)

In [6]:
%%time
# Look up the nearest neighbours of every song in X.
# One more neighbour than we want to keep is requested (10 + 1 = 11) because
# each song is itself a point in the tree and is expected to come back as its
# own closest match.
# NOTE(review): if two rows have identical feature vectors they tie at
# distance 0, so the self-match is presumably not guaranteed to be first.
n_neighbors = 10
dist, ind = tree.query(X, k=n_neighbors + 1)

CPU times: user 6min 17s, sys: 1.6 s, total: 6min 18s
Wall time: 6min 22s


In [11]:
# ind is an array of the indices of all the nearest neighbors, as found in X and df:
# one row per queried song, 11 neighbour indices per row (k=11 in the query).
# Display the first 10 rows as a sanity check.
ind[:10]

array([[     0,  95873,  22119,  38864,   8443, 101583,  29862, 113771,
        107896, 113505, 106525],
       [     1,  79984,  83083,  21375,  51301,  68037,  81472,  78112,
         55646,  84875,  69518],
       [     2, 112473,  27203, 107303, 102422, 101645, 115723, 111076,
        111204, 107523,  97299],
       [     3,     36,     34,  51118,     29,   2004,  93346,  59543,
         93409,   2369,  47635],
       [     4,  22270,  32639,  31740,   2406,  74248,  31700,  13342,
         15898,  32238,  29871],
       [     5,  20949,   7284,  31081,   2199,   2090,  26201,   2781,
           299,   7177,  18829],
       [     6,   2817,  47165,   2369,  16566,  31753,   1752,   2004,
         94030,  70694,  68281],
       [     7,  44545,  31378,  23086,  20855,  69518,  56788,  89771,
         54828,  88959,  68037],
       [     8,   2768,     18,  94386,   2420,  58920,     27,  72590,
         73625,  93439,  92879],
       [     9,    898,   4586,     43,   7569,  11492,

In [47]:
# ids_only is a 1-D array holding the track_id of every row of df, in the same
# row order, so that a row index taken from ind looks up the matching track ID.
ids_only = np.array(df['track_id'])
ids_only

array(['2RM4jf1Xa9zPgMGRDiht8O', '1tHDG53xJNGsItRA3vfVgs',
       '6Wosx2euFPMT14UXiWudMy', ..., '0B2LhMYcGR9Gmi6BQLdzlO',
       '0yzA9b21pJgnlLQDirsxAm', '2ud0K7X5LzmxJP2LEvkHio'], dtype=object)

In [48]:
def process(ids_only, ind):
    """Translate rows of neighbour indices into rows of track IDs.

    Parameters
    ----------
    ids_only : indexable
        Track IDs, looked up by integer position (``ids_only[i]``).
    ind : iterable of iterables of int
        One inner iterable per song; each value is a position in
        ``ids_only``.

    Returns
    -------
    list of list
        A list with one entry per row of ``ind``; each entry is the list of
        ``ids_only`` values at that row's positions, in the same order.
    """
    return [[ids_only[position] for position in neighbours] for neighbours in ind]

In [49]:
%%time
# Convert every row of neighbour indices in ind into the corresponding track
# IDs from ids_only; the result is a list of lists, one inner list per song.
similars = process(ids_only, ind)

CPU times: user 1.18 s, sys: 370 ms, total: 1.55 s
Wall time: 1.61 s


In [51]:
# The first item in each row is a song in the original database.
# The next 10 items are its closest neighbors in the KDTree.
similars[0]

['2RM4jf1Xa9zPgMGRDiht8O',
 '3EOuVx8f5Bb9AIt3PrRHKO',
 '61KEpN0YvwDrwaqqT65ecD',
 '0X8XHADrYzkNGK7AlEZNBN',
 '0UGcmjbvLBf7JR4PHntTpd',
 '7Jbt6KWzS8cpP1xnW0tQ96',
 '1qEu5WXxkmh4nwldLYJgYC',
 '0MiAP4EQGIuikH23RTyP53',
 '75OGmcs4DTNJqWHCKTGegx',
 '3cYNoOv4SEWPIZmh34JrUv',
 '3XcnvbkurHelpe04EETr9k']

In [53]:
# Turn this list into a dataframe for export: an 'id' column for the song
# itself, followed by 'sim1' ... 'sim10' for its ten neighbours.
cols = ['id'] + ['sim%d' % rank for rank in range(1, 11)]
df2 = pd.DataFrame(similars, columns=cols)

In [54]:
# Remove duplicates: when the same song id appears in more than one row, keep
# only the first of those rows.
# NOTE(review): this runs after the neighbour search, so a track that was
# duplicated in the source data may still show up in another row's sim columns
# -- consider de-duplicating before building the tree.
is_repeat = df2.duplicated(subset=['id'], keep='first')
df3 = df2[~is_repeat]

In [None]:
# Export to CSV
# NOTE(review): to_csv is called with pandas' defaults, which also write the
# dataframe index as an extra leading column; pass index=False if the database
# import should only see id and sim1..sim10 -- confirm against the loader.
similars_path = 'dataset/similars.csv'
df3.to_csv(similars_path)