# Song Recommender

Given a Spotify username and a playlist, we introduce an algorithm that outputs song recommendations based on, and similar to, the songs in that playlist. We draw upon existing datasets of 170,000+ songs, 2,900+ genres, and information about songs over time to implement our algorithm.

Reference: https://towardsdatascience.com/how-to-build-an-amazing-music-recommendation-system-4cce2719a572
Dataset source: https://github.com/AmolMavuduru/SpotifyRecommenderSystem/tree/master/data

### Import libraries

In [None]:
# import libraries
import numpy as np
import pandas as pd
import spotipy
import os

### Read in data

In [None]:
# Load the songs, genres and per-year datasets (in that order) from ./data,
# then show the first ten rows of the songs dataset as an example.
spotify_data, genre_data, data_by_year = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/data.csv.zip',
        './data/data_by_genres.csv',
        './data/data_by_year.csv',
    )
)
spotify_data.head(10)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665
5,0.196,1921,0.579,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.697,395076,0.346,0,4pyw9DVHGStUre4J6hPngr,0.168,2,0.13,-12.506,1,Gati Mardika,6,1921,0.07,119.824
6,0.406,1921,0.996,['John McCormack'],0.518,159507,0.203,0,5uNZnElqOS3W4fRmRYPk4T,0.0,0,0.115,-10.589,1,The Wearing of the Green,4,1921,0.0615,66.221
7,0.0731,1921,0.993,['Sergei Rachmaninoff'],0.389,218773,0.088,0,02GDntOXexBFUvSgaXLPkd,0.527,1,0.363,-21.091,0,"Morceaux de fantaisie, Op. 3: No. 2, Prélude i...",2,1921,0.0456,92.867
8,0.721,1921,0.996,['Ignacio Corsini'],0.485,161520,0.13,0,05xDjWH9ub67nJJk82yfGf,0.151,5,0.104,-21.508,0,La Mañanita - Remasterizado,0,1921-03-20,0.0483,64.678
9,0.771,1921,0.982,['Fortugé'],0.684,196560,0.257,0,08zfJvRLp7pjAb94MA9JmF,0.0,8,0.504,-16.415,1,Il Etait Syndiqué,0,1921,0.399,109.378


In [None]:
# Print a summary of the songs dataset: its columns, non-null counts and dtypes
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [None]:
# Print a summary of the genres dataset: its columns, non-null counts and dtypes
genre_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2973 entries, 0 to 2972
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              2973 non-null   int64  
 1   genres            2973 non-null   object 
 2   acousticness      2973 non-null   float64
 3   danceability      2973 non-null   float64
 4   duration_ms       2973 non-null   float64
 5   energy            2973 non-null   float64
 6   instrumentalness  2973 non-null   float64
 7   liveness          2973 non-null   float64
 8   loudness          2973 non-null   float64
 9   speechiness       2973 non-null   float64
 10  tempo             2973 non-null   float64
 11  valence           2973 non-null   float64
 12  popularity        2973 non-null   float64
 13  key               2973 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 325.3+ KB


In [None]:
# Print a summary of the per-year dataset: its columns, non-null counts and dtypes
data_by_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              100 non-null    int64  
 1   year              100 non-null    int64  
 2   acousticness      100 non-null    float64
 3   danceability      100 non-null    float64
 4   duration_ms       100 non-null    float64
 5   energy            100 non-null    float64
 6   instrumentalness  100 non-null    float64
 7   liveness          100 non-null    float64
 8   loudness          100 non-null    float64
 9   speechiness       100 non-null    float64
 10  tempo             100 non-null    float64
 11  valence           100 non-null    float64
 12  popularity        100 non-null    float64
 13  key               100 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 11.1 KB


### Create the pipelines that run clustering and dimensionality reduction algorithms on the song and genre datasets

In [None]:
'''Cluster 2900+ genres into 10 clusters (based on similar features)
using the unsupervised K-means algorithm'''

# import libraries
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardize the features, then run K-means with 10 clusters.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10)),
])

# Use only the numeric genre features. The 'cluster' column written at the end
# of this cell is numeric too, so drop it first: otherwise re-running the cell
# would feed the previous run's labels back in as an extra feature.
X = genre_data.drop(columns=['cluster'], errors='ignore').select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

In [None]:
'''Project the multi-dimensional genre data onto a two-dimensional space
with tSNE (t-Distributed Stochastic Neighbor Embedding), an unsupervised
dimensionality reduction algorithm
'''

# import libraries
from sklearn.manifold import TSNE

# Standardize the features, then embed them into two components.
tsne_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=2)),
])
genre_embedding = tsne_pipeline.fit_transform(X)

# One row per genre: its 2-D coordinates plus its name and K-means cluster.
projection = pd.DataFrame(genre_embedding, columns=['x', 'y']).assign(
    genres=genre_data['genres'],
    cluster=genre_data['cluster'],
)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2973 samples in 0.005s...
[t-SNE] Computed neighbors for 2973 samples in 0.251s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2973
[t-SNE] Computed conditional probabilities for sample 2000 / 2973
[t-SNE] Computed conditional probabilities for sample 2973 / 2973
[t-SNE] Mean sigma: 0.777516
[t-SNE] Computed conditional probabilities in 0.106s
[t-SNE] Iteration 50: error = 81.9701538, gradient norm = 0.0088686 (50 iterations in 1.595s)
[t-SNE] Iteration 100: error = 77.4159164, gradient norm = 0.0103270 (50 iterations in 1.612s)
[t-SNE] Iteration 150: error = 76.4589081, gradient norm = 0.0116691 (50 iterations in 1.287s)
[t-SNE] Iteration 200: error = 76.2070007, gradient norm = 0.0020013 (50 iterations in 0.993s)
[t-SNE] Iteration 250: error = 76.1452026, gradient norm = 0.0007351 (50 iterations in 1.043s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.145203
[t-SNE] Iteration 300: erro

In [None]:
'''Cluster 160k+ songs into 20 clusters (based on similar features)
using the unsupervised K-means algorithm'''

# Standardize the features, then run K-means with 20 clusters
# (4 initializations, verbose progress output).
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=20, verbose=2, n_init=4)),
    ],
    verbose=True,
)

# X: the numeric columns of the songs dataset, one row per song (valence, year,
# acousticness, danceability, duration_ms, energy, explicit, instrumentalness,
# key, liveness, loudness, mode, popularity, speechiness, tempo).
# 'cluster_label' is added to spotify_data by a later cell and is numeric, so
# drop it first: otherwise re-running this cell would treat the old labels as a
# feature and also leak the column into number_cols.
X = spotify_data.drop(columns=['cluster_label'], errors='ignore').select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s
Initialization complete
Iteration 0, inertia 1561516.2872809106
Iteration 1, inertia 1187251.4876109627
Iteration 2, inertia 1132468.9030263308
Iteration 3, inertia 1114156.4008767358
Iteration 4, inertia 1104009.5306416806
Iteration 5, inertia 1097014.7721047723
Iteration 6, inertia 1092804.2839283496
Iteration 7, inertia 1089293.8844267218
Iteration 8, inertia 1086063.9742611637
Iteration 9, inertia 1082496.080640119
Iteration 10, inertia 1078351.8049275968
Iteration 11, inertia 1074575.3232945558
Iteration 12, inertia 1071992.220184221
Iteration 13, inertia 1070199.108665891
Iteration 14, inertia 1069133.5735853238
Iteration 15, inertia 1068508.9377803025
Iteration 16, inertia 1068117.5263037367
Iteration 17, inertia 1067858.1339910775
Iteration 18, inertia 1067684.4205020852
Iteration 19, inertia 1067556.4681490725
Iteration 20, inertia 1067459.2489195762
Iteration 21, inertia 1067375.8895289965
Iteration 22, in

Pipeline(steps=[('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=20, n_init=4, verbose=2))],
         verbose=True)

In [None]:
# Predict a cluster for every song with the fitted pipeline and store the
# labels both in song_cluster_labels and on the songs dataset.
song_cluster_labels = spotify_data['cluster_label'] = song_cluster_pipeline.predict(X)

NameError: name 'song_cluster_pipeline' is not defined

In [None]:
'''Compress the multi-dimensional song data onto a two-dimensional space with
PCA (principal components analysis), another dimensionality reduction technique
that runs faster than TSNE, preserving only the most important two dimensions'''

from sklearn.decomposition import PCA

# Standardize the features, then keep the first two principal components.
pca_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# One row per song: its 2-D coordinates plus its title and K-means cluster.
projection = pd.DataFrame(song_embedding, columns=['x', 'y']).assign(
    title=spotify_data['name'],
    cluster=spotify_data['cluster_label'],
)

KeyError: 'cluster_label'

### Build a song recommendation algorithm

In [None]:
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Read the Spotify API credentials from the environment instead of hard-coding
# them: a secret stored in the notebook is exposed to everyone who can read it.
# NOTE(review): the client secret that used to be embedded here should be
# treated as leaked and rotated in the Spotify developer dashboard.
CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError('Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET '
                       'environment variables before running this cell')

# Client used by every Spotify API call below (client-credentials flow).
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=CLIENT_ID,
                                                           client_secret=CLIENT_SECRET))


def find_song(name, year):
    """Look a song up on Spotify and return its features as a one-row DataFrame.

    Args:
        name: title of the song to search for.
        year: release year of the song.

    Returns:
        A pandas DataFrame with a single row holding the given name and year,
        the track's explicit flag, duration and popularity, and every entry of
        the track's audio features; or None when the search returns no track or
        Spotify has no audio features for it.
    """
    # NOTE(review): Spotify's documented field-filter syntax is
    # `track:NAME year:YYYY` with no space after the colon -- confirm that this
    # query actually applies the filters.
    results = sp.search(q= 'track: {} year: {}'.format(name,
                                                       year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track['id'])[0]
    if audio_features is None:
        # No audio features available for this track; calling .items() on None
        # would raise AttributeError, so report "not found" instead.
        return None

    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }

    # Audio-feature entries override same-named keys set above (e.g. duration_ms).
    for key, value in audio_features.items():
        song_data[key] = [value]

    return pd.DataFrame(song_data)
    

In [None]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

# Names of the numeric song features used to build and compare song vectors.
number_cols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]

def get_song_data(song, spotify_data):
    """Return the data for one song, preferring the local dataset over Spotify.

    Args:
        song: mapping with at least the keys 'name' and 'year'.
        spotify_data: songs DataFrame with 'name' and 'year' columns.

    Returns:
        The first row of spotify_data whose name and year both match, as a
        Series. When no row matches, whatever find_song returns for that name
        and year.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    if len(matches) == 0:
        # Not in the local dataset: fall back to a Spotify lookup.
        return find_song(song['name'], song['year'])

    return matches.iloc[0]
        

def get_mean_vector(song_list, spotify_data):
    """Return the mean feature vector of the songs in song_list.

    Args:
        song_list: iterable of mappings with the keys 'name' and 'year'.
        spotify_data: songs DataFrame passed through to get_song_data.

    Returns:
        A 1-D float array with one entry per column in number_cols, averaged
        over every song that could be resolved.

    Raises:
        ValueError: if none of the songs could be resolved, since there is
            nothing to average.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # A dataset hit is a Series (1-D values) while the Spotify fallback is a
        # one-row DataFrame (2-D values); flatten both to a 1-D float vector so
        # the rows can be stacked into a regular matrix.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs exist in Spotify or in database')

    return np.mean(np.vstack(song_vectors), axis=0)

def flatten_dict_list(dict_list):
    """Turn a list of dicts into a dict of lists, keyed by the dicts' keys.

    Args:
        dict_list: list of dictionaries; may be empty.

    Returns:
        A defaultdict(list) mapping each key to the values found under that key,
        in input order. An empty input yields an empty mapping, and a key that
        only appears in later dictionaries is collected too.
    """
    flattened_dict = defaultdict(list)

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    return flattened_dict
        

def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend the dataset songs closest to the average of the given songs.

    Args:
        song_list: list of mappings with the keys 'name' and 'year'.
        spotify_data: songs DataFrame containing the number_cols feature columns
            plus 'name', 'year' and 'artists'.
        n_songs: number of recommendations to return.

    Returns:
        A list of up to n_songs dicts with the keys 'name', 'year' and
        'artists', ordered from most to least similar (cosine distance on the
        standardized features). Songs whose name appears in song_list are never
        recommended.
    """
    metadata_cols = ['name', 'year', 'artists']
    input_names = {song['name'] for song in song_list}

    song_center = np.asarray(get_mean_vector(song_list, spotify_data), dtype=float)
    # Reuse the scaler fitted with the song clustering pipeline so the dataset
    # and the playlist centre are standardized the same way.
    scaler = song_cluster_pipeline.named_steps['scaler']
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]

    # Exclude the input songs BEFORE truncating to n_songs; filtering after the
    # cut returned fewer recommendations than requested whenever playlist songs
    # were among the nearest neighbours.
    is_candidate = ~spotify_data['name'].isin(input_names).to_numpy()
    ranked = np.argsort(distances)
    top = ranked[is_candidate[ranked]][:n_songs]

    return spotify_data.iloc[top][metadata_cols].to_dict(orient='records')


In [None]:
import pprint

def getPlaylists(username):
    """Return (playlist ID, playlist name) pairs for a user's playlists.

    Only one page of at most 50 playlists is requested from the Spotify API.

    Args:
        username: Spotify username whose playlists are listed.

    Returns:
        A list of (id, name) tuples, in the order the API returned them.
    """
    # Fetch the user object through the Spotify API; the result itself is not used.
    sp.user(username)

    # Fetch the first page (up to 50 entries) of the user's playlists.
    first_page = sp.user_playlists(username, limit=50, offset=0)

    return [(entry['id'], entry['name']) for entry in first_page['items']]


def get_song_list_from_playlist(playlist_id):
    """Return the name and release year of each song in a playlist.

    Args:
        playlist_id: Spotify ID of the playlist.

    Returns:
        A list of dicts, one per usable track, each with the keys 'name' (track
        title) and 'year' (int, the first four characters of the album's release
        date). Entries without a track, a track ID or a release date are skipped.
    """
    results = sp.playlist(playlist_id)

    # list storing dictionaries, one per song, containing the song's information
    song_metadata = []

    for item in results['tracks']['items']:
        track = item.get('track')
        # NOTE(review): playlist entries can presumably carry no track or no
        # track ID (e.g. unavailable or local tracks) -- confirm against the
        # Spotify Web API docs. Skip them instead of failing on the lookup.
        if not track or not track.get('id'):
            continue

        # get song's metadata using Spotify API
        data = sp.track(track['id'])

        release_date = data['album'].get('release_date')
        if not release_date:
            # without a release date the year cannot be derived
            continue

        song_metadata.append({
            'name': data['name'],
            'year': int(release_date[:4]),
        })

    return song_metadata

In [None]:
import pprint

# flow for user to analyze their own playlists

# prompt for the Spotify username whose playlists should be listed
username = input("Enter a Spotify username")

# print the user's playlists by title, numbered from 0; the number is padded to
# two characters so that the '|' separators line up
playlists = getPlaylists(username)
for count, playlist in enumerate(playlists):
    print(str(count).ljust(2) + ' | ' + playlist[1])

'''Have user choose one playlist, and the algorithm will give the user some
song recommendations based on and similar to songs present in that playlist'''

def printRecs(recOut):
    """Print a numbered list of recommendations, one per line.

    Each line has the form '<n>) <name> by <artist>, <artist> (<year>)'.

    Args:
        recOut: iterable of dicts with the keys 'name', 'year' and 'artists'.
            'artists' is either a list of names or the string form of such a
            list (as stored in the songs dataset, e.g. "['Dennis Day']").

    Raises:
        ValueError or SyntaxError: if an 'artists' string is not a valid
            Python literal.
    """
    import ast  # local import: only needed to parse the stringified artist lists

    for count, rec in enumerate(recOut, start=1):
        artists = rec['artists']
        if isinstance(artists, str):
            # literal_eval only accepts literals, so unlike eval it cannot
            # execute code that might be hidden in the data.
            artists = ast.literal_eval(artists)
        print(str(count) + ') ' + rec['name'] + ' by ' + ', '.join(artists)
              + ' (' + str(rec['year']) + ')')

def _ask_playlist_number(prompt):
    """Prompt until the user types a whole number and return it as an int."""
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            print('Please enter a whole number.')


# Keep showing recommendations for the chosen playlist until the user enters a
# negative number.
val = _ask_playlist_number("Enter the number next to the playlist you want to view recommendations based off:")
while val >= 0:
    if val >= len(playlists):
        # Out-of-range choice: report it instead of failing with an IndexError.
        print('There is no playlist numbered ' + str(val) + '.')
    else:
        playlist = playlists[val]
        print()
        print('You have chosen to view recs for ' + playlist[1])
        print()
        song_metadata = get_song_list_from_playlist(playlist[0])

        print()
        print("HERE ARE YOU RECOMMENDATIONS BASED OFF \"" + playlist[1] + "\"")
        printRecs(recommend_songs(song_metadata, spotify_data))
    val = _ask_playlist_number("Enter another number to see recs for another playlist, or enter -1 to quit")

NameError: name 'getPlaylists' is not defined

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=69771c6e-a76e-4b75-8043-409b087b6b70' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>