# Building Recommender System with Spotify Data

### Import Libraries

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spotipy
import os
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
from collections import defaultdict
from scipy.spatial.distance import cdist

## Mainpulating Spotify Dataset

### Reading the Data

In [25]:
spotify_data = pd.read_csv('./data/tracks_features.csv')
spotify_data.head(5)

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,...,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,...,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02


In [3]:
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1204025 entries, 0 to 1204024
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   id                1204025 non-null  object 
 1   name              1204025 non-null  object 
 2   album             1204025 non-null  object 
 3   album_id          1204025 non-null  object 
 4   artists           1204025 non-null  object 
 5   artist_ids        1204025 non-null  object 
 6   track_number      1204025 non-null  int64  
 7   disc_number       1204025 non-null  int64  
 8   explicit          1204025 non-null  bool   
 9   danceability      1204025 non-null  float64
 10  energy            1204025 non-null  float64
 11  key               1204025 non-null  int64  
 12  loudness          1204025 non-null  float64
 13  mode              1204025 non-null  int64  
 14  speechiness       1204025 non-null  float64
 15  acousticness      1204025 non-null  float64
 16  

### Clustering Songs using K-Means

In [None]:
# Remove rows with invalid values
spotify_data.dropna(inplace=True)

cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, verbose=2))], verbose=True)

# Select a subset of columns to use in the clustering process
columns_to_use = ['explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'year']   
X = spotify_data[columns_to_use]
# X = (X - X.mean()) / X.std()

# Fit the model based on spotify data
cluster_pipeline.fit(X)

### Add Cluster Labels to Songs

In [5]:
# Predict what cluster each song belongs to
cluster_labels = cluster_pipeline.predict(X)

# Add cluster labels as a final column to spotify data
spotify_data['cluster_label'] = cluster_labels
spotify_data.head(5)

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,cluster_label
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,6
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,5
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,10
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,...,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,18
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,...,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,6


### Visualize Song Clusters with PCA

In [6]:
# PCA for dimension reduction (faster than t-SNE)
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)

projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = spotify_data['name']
projection['cluster'] = spotify_data['cluster_label']

In [None]:
# Visualize song clusters in a 2D space
fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()

## Building Content-Based Recommender System

### Utility Functions

In [8]:
# Establish spotipy connection
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=os.environ["SPOTIFY_CLIENT_ID"],
                                                           client_secret=os.environ["SPOTIFY_CLIENT_SECRET"]))

# Returns a dataframe with data for a song given the name and release year.
# Uses Spotipy to fetch audio features and metadata for the specified song.
def find_song(name, year):
    
    song_data = defaultdict()
    results = sp.search(q= 'track: {} year: {}'.format(name,
                                                       year), limit=1)
    if results['tracks']['items'] == []:
        return None
    
    results = results['tracks']['items'][0]

    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]
    
    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]
    
    for key, value in audio_features.items():
        song_data[key] = value
    
    return pd.DataFrame(song_data)

In [9]:
# Gets the song data for a specific song
# Song argument is a dictionary with key-value pairs for the name and release year
def get_song_data(song, spotify_data):
    
    # Check if song is in the spotify dataset, otherwise use find_song method
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name']) 
                                & (spotify_data['year'] == song['year'])].iloc[0]
        return song_data
    except IndexError:
        return find_song(song['name'], song['year'])

In [10]:
# Calculates mean vector from a list of songs based on audio/metadata features
def get_mean_vector(song_list, spotify_data):
    
    song_vectors = []
    
    # Add all songs to song_vectors
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vector = song_data[columns_to_use].values
        song_vectors.append(song_vector)
    
    # Convert to numpy array then use np.mean
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)

In [11]:
# Flattens a list of dictionaries.
def flatten_dict_list(dict_list):
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict

### Recommender Function

In [12]:
# Recommends songs based on a list of previous songs that a user has listened to.
def recommend_songs(song_list, spotify_data, n_songs=10):
    
    # Compute average vector of input songs
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[columns_to_use])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))

    # Find closest songs in dataset to the average vector using cosine distance
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    # Recommend corresponding songs from the dataset
    rec_songs = spotify_data.iloc[index]
    song_dict = flatten_dict_list(song_list)
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]

    # Format output
    metadata_cols = ['name', 'year', 'artists']
    return rec_songs[metadata_cols].to_dict(orient='records')


## Testing Recommender

In [24]:
recommend_songs([{'name': 'HiTek Tek', 'year': 2020},
                {'name': 'Ridin Strikers', 'year': 2020},
                {'name': 'One Of My', 'year': 2020},
                {'name': 'Hard To Choose One', 'year': 2020},
                {'name': 'Touch The Sky', 'year': 2020},
                {'name': 'Solitaires (feat. Travis Scott)', 'year': 2020},
                {'name': 'Harlem Shake (feat. Young Thug)', 'year': 2020}], spotify_data, 15)


X does not have valid feature names, but StandardScaler was fitted with feature names



[{'name': 'WIFI LIT', 'year': 2018, 'artists': "['Future']"},
 {'name': 'Walking Ticket', 'year': 2018, 'artists': "['Key Glock']"},
 {'name': 'Vibin Wit My Tribe',
  'year': 2020,
  'artists': "['Shawn7', 'WYZE', 'Deshazer J']"},
 {'name': 'BRINK$',
  'year': 2019,
  'artists': "['Rhomar Jessy', 'Parris Chariz']"},
 {'name': 'What Goes Around Comes Around',
  'year': 2020,
  'artists': "['Key Glock']"},
 {'name': "I'm Not Goin' (feat. Kevin Gates)",
  'year': 2018,
  'artists': "['Gucci Mane', 'Kevin Gates']"},
 {'name': 'Blixky Gang Freestyle', 'year': 2020, 'artists': "['22Gz']"},
 {'name': 'Slang', 'year': 2020, 'artists': "['Just Ray']"},
 {'name': 'So Kool',
  'year': 2020,
  'artists': "['Wolfgang Gartner', 'Walt Anderson']"},
 {'name': 'Find You (New Jerusalem)',
  'year': 2020,
  'artists': "['Peter CottonTale']"},
 {'name': 'Clarity', 'year': 2018, 'artists': "['Don Q']"},
 {'name': 'Press', 'year': 2019, 'artists': "['Cardi B']"},
 {'name': 'Stop Playin', 'year': 2020, 'arti