In [1]:
import pandas as pd
import plotly.express as px
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [2]:
# load data
# Read the encoded song table straight from the project's GitHub repository.
data = pd.read_csv(
    "https://raw.githubusercontent.com/Lambda-Spotify-Song-Suggester-3/datascience/master/kaggle_data/encoded.csv")
# Work on a copy so `data` keeps every original column; feature_average()
# later reads columns from `data` that are dropped from `df`.
df = data.copy()

# Lookup table used by predictor() to translate a track_key into a row and a
# row back into a track_id. Despite the name this is a DataFrame, not a dict.
dictionary = df[['artist_name', 'track_name', 'track_key', 'track_id']]

In [3]:
# Sanity check: report the frame's dimensions, then preview the first rows.
print(df.shape)
df.head()

(130663, 19)


Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,track_key,artist_key
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15,1,1
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0,2,1
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56,3,2
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0,4,3
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0,5,3


In [4]:
# Preview the lookup table (artist, track name, track_key, track_id).
dictionary.head()

Unnamed: 0,artist_name,track_name,track_key,track_id
0,YG,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",1,2RM4jf1Xa9zPgMGRDiht8O
1,YG,BAND DRUM (feat. A$AP Rocky),2,1tHDG53xJNGsItRA3vfVgs
2,R3HAB,Radio Silence,3,6Wosx2euFPMT14UXiWudMy
3,Chris Cooq,Lactose,4,3J2Jpw61sO7l6Hc7qdYV91
4,Chris Cooq,Same - Original mix,5,2jbYvQCyPgX3CdmAzeVeuS


In [5]:
# drop columns for training
# Built from the untouched `data` frame rather than from `df` itself, so this
# cell yields the same feature frame no matter how many times it is run
# (dropping from `df` in place of itself fails on a second run because the
# columns are already gone).
# NOTE(review): `artist_key` stays in as a numeric feature; it looks like an
# arbitrary per-artist id, so its distance may not be meaningful - confirm.
df = data.drop(
    columns=[
        # identifiers and free text, not usable as numeric features
        'artist_name',
        'track_id',
        'track_name',
        'track_key',
        # audio attributes left out of the similarity model
        'duration_ms',
        'mode',
        'loudness',
        'time_signature'])


In [6]:
# Preview the remaining feature columns that will be fed to the scaler.
df.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,speechiness,tempo,valence,popularity,artist_key
0,0.00582,0.743,0.339,0.0,1,0.0812,0.409,203.927,0.118,15,1
1,0.0244,0.846,0.557,0.0,8,0.286,0.457,159.009,0.371,0,1
2,0.025,0.603,0.723,0.0,9,0.0824,0.0454,114.966,0.382,56,2
3,0.0294,0.8,0.579,0.912,5,0.0994,0.0701,123.003,0.641,0,3
4,3.5e-05,0.783,0.792,0.878,7,0.0332,0.0661,120.047,0.928,0,3


In [7]:
# Scale the data
# Standardise every remaining column of `df` before the nearest-neighbour
# search in predictor().
scaler = StandardScaler()
# predictor() indexes `df_s` by row position, so its rows must stay in the
# same order as `dictionary`.
df_s = scaler.fit_transform(df)

In [8]:
# Fitted neighbour models keyed by neighbour count. Each entry also keeps the
# feature matrix it was fitted on, so a re-created `df_s` triggers a refit.
_NEIGHBOR_MODELS = {}


def _fitted_neighbors(features, n_neighbors):
    '''
    Return a NearestNeighbors model fitted on "features", reusing the
    previously fitted model when the very same matrix object is passed again.
    '''
    cached = _NEIGHBOR_MODELS.get(n_neighbors)
    if cached is None or cached[0] is not features:
        model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='kd_tree')
        model.fit(features)
        cached = (features, model)
        _NEIGHBOR_MODELS[n_neighbors] = cached
    return cached[1]


def predictor(track_key, n_neighbors=10):
    '''
    Take the "track_key" of a song of interest from the dataframe and return
    the track_ids of its closest neighbours in the scaled feature space.

    Parameters:
        track_key: value looked up in the "track_key" column of "dictionary".
        n_neighbors: how many track_ids to return (default 10).

    Returns:
        A list of track_id strings, nearest first. The query song is part of
        the fitted data, so it is typically the first entry itself.

    Raises:
        IndexError: if no song in "dictionary" has the given track_key.

    output format:
    ['4fbaKWFRghusXd4bSBvvfN', ...]

    '''

    # Convert "track_key" to the row POSITION of the song. A position (not an
    # index label) is required because df_s is indexed positionally.
    matches = (dictionary['track_key'] == track_key).to_numpy().nonzero()[0]
    if matches.size == 0:
        raise IndexError(
            'track_key {!r} not found in dictionary'.format(track_key))
    input_position = int(matches[0])

    # Nearest Neighbors model (fitted once and reused across calls)
    nn = _fitted_neighbors(df_s, n_neighbors)

    # kneighbors returns (distances, positions); only the positions are needed
    _, neighbor_positions = nn.kneighbors([df_s[input_position]])

    # Translate row positions back into track_ids
    return dictionary['track_id'].iloc[neighbor_positions[0]].tolist()

In [9]:
# testing functionality
# `track_key` is left at module level on purpose: the feature_average and
# radar-chart cells further down reuse it.
track_key = 1
print(predictor(track_key))

['2RM4jf1Xa9zPgMGRDiht8O', '02s1Voowwhr0qTSOrMVEXk', '6PIdcDdMuoyAWeVqMRMLlR', '0MiAP4EQGIuikH23RTyP53', '2lD4dQpcac8H2Rq5nttJJh', '07lPxulz2gpCAVvgemzedl', '5MccaBOgJh4MsJNoj72eij', '3QBTZEqaCbJxztWh5NTKHs', '6yoM6KoDpWXKIWo9UnZ3Ef', '7Jbt6KWzS8cpP1xnW0tQ96']


In [10]:
# song features, for plotting
def feature_average(track_key):
    '''
    Return the mean of selected audio features over the songs recommended
    for "track_key", each rounded to two decimals.

    The values come back as a list in this fixed order:
    acousticness, danceability, energy, instrumentalness,
    liveness, mode, speechiness, valence.
    '''
    feature_columns = ['acousticness', 'danceability',
                       'energy', 'instrumentalness',
                       'liveness', 'mode',
                       'speechiness', 'valence']

    # track_ids suggested by the nearest-neighbour model
    recommended_ids = predictor(track_key)

    # Rows of the full dataset belonging to those tracks, feature columns only.
    # NOTE(review): every row whose track_id matches is kept, so repeated
    # track_ids in `data` would add extra rows to the average - confirm.
    recommended_rows = data.loc[
        data["track_id"].isin(recommended_ids), feature_columns]

    # One rounded mean per feature, in the order listed above
    return [round(recommended_rows[column].mean(), 2)
            for column in feature_columns]

In [11]:
# testing functionality
# Uses the `track_key` set in the predictor test cell above.
print(feature_average(track_key))

[0.06, 0.73, 0.42, 0.0, 0.12, 1.0, 0.41, 0.25]


In [12]:
# plotly radar chart (https://plotly.com/python/radar-chart/)
# `px` comes from the notebook's import cell; pandas is not needed here.

# Radial values: mean feature values of the recommended tracks
r = feature_average(track_key)

# Axis labels, in the same order as the values returned by feature_average()
attributes = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'mode',
    'speechiness',
    'valence']

# line_close joins the last point back to the first; fill shades the polygon
fig = px.line_polar(r=r, theta=attributes, line_close=True)
fig.update_traces(fill='toself')
fig.show()