In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import spotipy
import os
import plotly.express as px
%matplotlib inline

In [6]:
# Read the three CSV exports from the local ./data folder in one pass:
# per-track rows, per-genre aggregates and per-year aggregates (as the
# file names indicate).
spotify_data, genre_data, data_by_year = map(
    pd.read_csv,
    ('./data/data.csv', './data/data_by_genres.csv', './data/data_by_year.csv'),
)
# Last expression of the cell: preview the first ten track rows.
spotify_data.head(10)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920
5,0.996,['Mamie Smith & Her Jazz Hounds'],0.424,198627,0.245,0,3HnrHGLE9u2MjHtdobfWl9,0.799,5,0.235,-11.47,1,Crazy Blues - 78rpm Version,9,1920,0.0397,103.87,0.477,1920
6,0.992,['Mamie Smith'],0.782,195200,0.0573,0,5DlCyqLyX2AOVDTjjkDZ8x,2e-06,5,0.176,-12.453,1,Don't You Advertise Your Man,5,1920,0.0592,85.652,0.487,1920
7,0.996,['Mamie Smith & Her Jazz Hounds'],0.474,186173,0.239,0,02FzJbHtqElixxCmrpSCUa,0.186,9,0.195,-9.712,1,Arkansas Blues,0,1920,0.0289,78.784,0.366,1920
8,0.996,['Francisco Canaro'],0.469,146840,0.238,0,02i59gYdjlhBmbbWhf8YuK,0.96,8,0.149,-18.717,1,La Chacarera - Remasterizado,0,1920-07-08,0.0741,130.06,0.621,1920
9,0.00682,['Meetya'],0.571,476304,0.753,0,06NUxS2XL3efRh0bloxkHm,0.873,8,0.092,-6.943,1,Broken Puppet - Original Mix,0,1920-01-01,0.0446,126.993,0.119,1920


In [7]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the numeric columns, then group the songs into 20 K-Means clusters.
# - `n_jobs` is intentionally not passed: KMeans deprecated it in
#   scikit-learn 0.23 and removed it in 1.0, where it raises TypeError.
# - `random_state` pins the centroid initialisation so cluster labels are
#   reproducible across kernel restarts.
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=20, verbose=2, random_state=42)),
    ],
    verbose=True,
)

# Feature matrix: every numeric column of the track table. `cluster_label`
# (added further down) is dropped first so that re-running this cell does not
# feed the previous run's labels back in as a feature.
# NOTE(review): the displayed head shows an 'Unnamed: 0' column; if that is a
# real CSV column it is just a row index and is being clustered on - confirm
# and drop it if so.
X = spotify_data.drop(columns=['cluster_label'], errors='ignore').select_dtypes(np.number)
# Remembered at module level; the recommendation helpers index rows with it.
number_cols = list(X.columns)

song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
spotify_data['cluster_label'] = song_cluster_labels
from sklearn.decomposition import PCA

# Separate scaler + 2-component PCA pipeline, used only to obtain 2-D
# coordinates for each song. random_state keeps the projection stable when
# PCA's solver choice falls on a randomised method.
pca_pipeline = Pipeline([('scaler', StandardScaler()),
                         ('PCA', PCA(n_components=2, random_state=42))])
song_embedding = pca_pipeline.fit_transform(X)

# One row per song: PCA coordinates plus title and cluster id.
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = spotify_data['name']
projection['cluster'] = spotify_data['cluster_label']

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.1s




Initialization complete
Iteration 0, inertia 1645285.6328622971
Iteration 1, inertia 1224867.160118111
Iteration 2, inertia 1190552.104857106
Iteration 3, inertia 1175921.1641476492
Iteration 4, inertia 1168383.060432393
Iteration 5, inertia 1163370.4599964956
Iteration 6, inertia 1159020.0086025405
Iteration 7, inertia 1154779.7493209122
Iteration 8, inertia 1150904.4783640439
Iteration 9, inertia 1147119.195053228
Iteration 10, inertia 1143369.4142073758
Iteration 11, inertia 1139499.7024702183
Iteration 12, inertia 1135622.183202473
Iteration 13, inertia 1132546.9447171816
Iteration 14, inertia 1130217.319463924
Iteration 15, inertia 1128294.2324696963
Iteration 16, inertia 1126571.4624146926
Iteration 17, inertia 1125102.3635550437
Iteration 18, inertia 1123489.8200416123
Iteration 19, inertia 1121347.2093086313
Iteration 20, inertia 1120135.8144706097
Iteration 21, inertia 1119319.2625483614
Iteration 22, inertia 1118504.3587510416
Iteration 23, inertia 1117638.892102035
Iteration

In [8]:
def get_song_data(song, spotify_data):
    """Look up a single song row by exact name and year.

    Parameters
    ----------
    song : mapping
        Must provide the keys ``'name'`` and ``'year'``.
    spotify_data : pandas.DataFrame
        Table with at least ``'name'`` and ``'year'`` columns.

    Returns
    -------
    pandas.Series or None
        The first row whose name and year both equal those of ``song``,
        or ``None`` when no row matches.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    # No hit: report it with None rather than letting the positional
    # lookup below fail.
    if len(matches) == 0:
        return None

    return matches.iloc[0]
        

def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Each song is looked up with ``get_song_data``; songs that are not found
    are reported with a printed warning and skipped. The feature columns
    are taken from the module-level ``number_cols`` list.

    Parameters
    ----------
    song_list : iterable of mapping
        Each item must provide ``'name'`` and ``'year'``.
    spotify_data : pandas.DataFrame
        Table searched for the songs; must contain every column named in
        ``number_cols``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per column in ``number_cols``: the
        element-wise mean over all songs that were found.

    Raises
    ------
    ValueError
        If none of the songs could be found. Averaging an empty set would
        otherwise yield a NaN scalar and a RuntimeWarning.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # A row pulled out of a mixed-type frame is object-dtype; cast so the
        # mean is computed (and returned) as a plain float array.
        song_vectors.append(song_data[number_cols].values.astype(float))

    if not song_vectors:
        raise ValueError('None of the given songs were found in the dataset')

    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)

def flatten_dict_list(dict_list):
    """Turn a list of dicts into one dict of lists.

    ``[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]`` becomes
    ``{'a': [1, 3], 'b': [2, 4]}``.

    Parameters
    ----------
    dict_list : iterable of dict
        May be empty, and the dicts need not share the same keys.

    Returns
    -------
    collections.defaultdict
        Maps every key seen in any input dict to the list of its values, in
        input order. Because the default factory is ``list``, looking up a
        key that never occurred yields an empty list.
    """
    # defaultdict(list) creates each bucket on first use, so an empty input
    # or a key missing from the first dict no longer raises.
    flattened_dict = defaultdict(list)

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    return flattened_dict
        

def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend the songs closest to the average of the given seed songs.

    The seed songs' mean feature vector (from ``get_mean_vector``) and every
    row of ``spotify_data`` are standardised with the scaler step of the
    module-level ``song_cluster_pipeline``; rows are then ranked by cosine
    distance to that mean. Rows whose name equals a seed song's name are
    excluded.

    Parameters
    ----------
    song_list : list of dict
        Seed songs, each with ``'name'`` and ``'year'``.
    spotify_data : pandas.DataFrame
        Candidate songs; must contain the columns in ``number_cols`` as well
        as ``'name'``, ``'year'`` and ``'artists'``.
    n_songs : int, default 10
        Number of recommendations to return.

    Returns
    -------
    list of dict
        Up to ``n_songs`` records with keys ``'name'``, ``'year'`` and
        ``'artists'``, nearest first. Fewer are returned only when the table
        has fewer non-seed rows than ``n_songs``.

    Raises
    ------
    ValueError
        If none of the seed songs could be found in ``spotify_data``.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    # A 0-d result means no seed was found (mean over nothing); fail clearly
    # here instead of with a feature-count error inside the scaler.
    if np.ndim(song_center) == 0:
        raise ValueError('None of the given songs were found in the dataset')

    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    # cdist returns shape (1, n_rows); keep the single row of distances.
    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]

    # Rank every row, drop the seeds, and only then cut to n_songs, so that
    # seeds appearing among the nearest rows do not shorten the result.
    ranked = np.argsort(distances)
    is_seed = spotify_data['name'].isin(song_dict['name']).values
    index = ranked[~is_seed[ranked]][:n_songs]

    rec_songs = spotify_data.iloc[index]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [9]:
# Example query: five seed songs given as (title, release year) pairs,
# expanded into the {'name', 'year'} dicts that recommend_songs expects.
recommend_songs(
    [
        {'name': title, 'year': released}
        for title, released in (
            ('Come As You Are', 1991),
            ('Smells Like Teen Spirit', 1991),
            ('Lithium', 1992),
            ('All Apologies', 1993),
            ('Stay Away', 1993),
        )
    ],
    spotify_data,
)



[{'name': 'Kiss Me', 'year': 1997, 'artists': "['Sixpence None The Richer']"},
 {'name': 'Hanging By A Moment', 'year': 2000, 'artists': "['Lifehouse']"},
 {'name': "Breakfast At Tiffany's",
  'year': 1995,
  'artists': "['Deep Blue Something']"},
 {'name': 'Wherever You Will Go', 'year': 2001, 'artists': "['The Calling']"},
 {'name': 'Otherside', 'year': 1999, 'artists': "['Red Hot Chili Peppers']"},
 {'name': 'Runaway (U & I)', 'year': 2015, 'artists': "['Galantis']"},
 {'name': 'No Excuses', 'year': 1994, 'artists': "['Alice In Chains']"},
 {'name': "It's Not Living (If It's Not With You)",
  'year': 2018,
  'artists': "['The 1975']"},
 {'name': 'Tongue Tied', 'year': 2011, 'artists': "['Grouplove']"},
 {'name': 'Heart Of Glass (Live from the iHeart Festival)',
  'year': 2020,
  'artists': "['Miley Cyrus']"}]