# Building Recommender System with Spotify Data

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spotipy
import os
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
from collections import defaultdict
from scipy.spatial.distance import cdist

## Manipulating Spotify Dataset

### Reading the Data

In [None]:
# Load the dataset into a DataFrame. The path is relative to the notebook's
# working directory; judging by the file name it presumably holds chart
# tracks joined with their audio features (confirm against the data source).
spotify_data = pd.read_csv('./data/charts_with_audio_features.csv')
# Preview the first five rows.
spotify_data.head(5)

In [None]:
# Summarise the columns (names, dtypes, non-null counts) before cleaning.
spotify_data.info()

### Clustering Songs using K-Means

In [None]:
# Remove rows with invalid values, then renumber the rows so index labels
# match row positions again. Without the reset, anything later aligned by
# label against a fresh 0..n-1 index (for example a DataFrame built from a
# NumPy array) would pair values with the wrong songs.
spotify_data.dropna(inplace=True)
spotify_data.reset_index(drop=True, inplace=True)

# Select a subset of columns to use in the clustering process
columns_to_use = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'valence', 'tempo']
X = spotify_data[columns_to_use]

# Standardise the features before K-Means so that columns on a larger numeric
# scale do not dominate the Euclidean distances. A fixed random_state keeps
# the cluster assignments (and therefore the plot colours) reproducible
# between runs.
cluster_pipeline = Pipeline([('scaler', StandardScaler()),
                             ('kmeans', KMeans(n_clusters=5, random_state=42, verbose=2))],
                            verbose=True)

# Fit the model based on spotify data
cluster_pipeline.fit(X)

### Add Cluster Labels to Songs

In [None]:
# Predict what cluster each song belongs to
# (X is the same feature matrix the pipeline was fitted on, so this yields
# one label per row of spotify_data).
cluster_labels = cluster_pipeline.predict(X)

# Add cluster labels as a final column to spotify data
spotify_data['cluster_label'] = cluster_labels
# Preview the first five rows, now including the new column.
spotify_data.head(5)

### Visualize Song Clusters with PCA

In [None]:
# PCA for dimension reduction (faster than t-SNE)
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)

# Row i of song_embedding corresponds to row i of X, so give the projection
# X's index. The column assignments below align on index labels; with a
# default 0..n-1 index they would be shifted (or NaN) whenever dropna()
# removed rows and left gaps in spotify_data's index.
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding, index=X.index)
projection['title'] = spotify_data['name']
projection['cluster'] = spotify_data['cluster_label']

In [None]:
# Visualize song clusters in a 2D space.
# Cluster ids are category labels, not magnitudes: plot them as strings so
# Plotly assigns one discrete colour (and legend entry) per cluster instead
# of a continuous colour bar. assign() works on a copy, so `projection`
# itself keeps its original dtype.
fig = px.scatter(
    projection.assign(cluster=projection['cluster'].astype(str)),
    x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()

## Building Content-Based Recommender System

### Utility Functions

In [None]:
# Establish spotipy connection
# Credentials are read from the SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET
# environment variables; if either is unset, os.environ[...] raises KeyError
# here rather than later on the first API call.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=os.environ["SPOTIFY_CLIENT_ID"],
                                                           client_secret=os.environ["SPOTIFY_CLIENT_SECRET"]))

def find_song(name, year):
    """Look up a track on Spotify and return its metadata and audio features.

    Args:
        name: Track title to search for.
        year: Release year used to narrow the search.

    Returns:
        A one-row ``pandas.DataFrame`` with ``name``, ``year``, ``explicit``,
        ``duration_ms``, ``popularity`` and every key of the track's audio
        features, or ``None`` when the search has no hit or no audio
        features are available for the track.
    """
    # Spotify field filters are written ``field:value`` with no space after
    # the colon; with a space the words are treated as plain keywords.
    results = sp.search(q='track:{} year:{}'.format(name, year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]

    # audio_features returns one entry per requested id; the entry is None
    # when Spotify has no analysis for that track.
    features_list = sp.audio_features(track['id'])
    audio_features = features_list[0] if features_list else None
    if audio_features is None:
        return None

    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    # Scalar feature values are broadcast against the length-1 lists above
    # when the DataFrame is built; overlapping keys take the feature value.
    song_data.update(audio_features)

    return pd.DataFrame(song_data)

In [None]:
def get_song_data(song, spotify_data):
    """Return the data record for a single song.

    ``song`` is a dictionary with ``'name'`` and ``'year'`` entries. The first
    row of ``spotify_data`` whose name and year both match is returned; when
    no row matches, the lookup is delegated to ``find_song``.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    # Not in the local dataset: fall back to the find_song lookup.
    if len(matches) == 0:
        return find_song(song['name'], song['year'])

    return matches.iloc[0]

In [None]:
def get_mean_vector(song_list, spotify_data):
    """Compute the mean feature vector of a list of songs.

    Args:
        song_list: Iterable of dictionaries, each with ``'name'`` and
            ``'year'`` entries identifying a song.
        spotify_data: DataFrame searched first for each song.

    Returns:
        1-D float ``numpy.ndarray`` with one entry per column in
        ``columns_to_use``, averaged over every song that could be resolved.

    Raises:
        ValueError: If none of the songs could be resolved, since there is
            nothing to average.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data yields a Series for dataset hits but a one-row
        # DataFrame for Spotify lookups; flatten both to a 1-D float vector
        # so the rows stack cleanly and the mean is numeric, not object dtype.
        song_vector = np.asarray(song_data[columns_to_use], dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs could be found; cannot compute a mean vector')

    return np.mean(np.vstack(song_vectors), axis=0)

In [None]:
def flatten_dict_list(dict_list):
    """Flatten a list of dictionaries into one dictionary of lists.

    Each key maps to the list of values found under that key, in input
    order. Keys need not be present in every dictionary, and an empty input
    yields an empty mapping.

    Args:
        dict_list: Iterable of dictionaries.

    Returns:
        A ``defaultdict`` mapping each key to a list of its values. Its
        ``default_factory`` is cleared before returning, so looking up a key
        that never occurred raises ``KeyError`` instead of inserting it.
    """
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    # Keep plain-dict lookup semantics for callers.
    flattened_dict.default_factory = None
    return flattened_dict

### Recommender Function

In [None]:
def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend songs similar to a list of songs a user has listened to.

    The input songs are averaged into one feature vector, which is compared
    (cosine distance on standardised features) against every row of
    ``spotify_data``.

    Args:
        song_list: List of dictionaries with ``'name'`` and ``'year'``
            entries describing the seed songs.
        spotify_data: DataFrame of candidate songs; must contain the columns
            in ``columns_to_use`` plus ``name``, ``year`` and ``artists``.
        n_songs: Number of recommendations to return.

    Returns:
        Up to ``n_songs`` records (dicts with ``name``, ``year``,
        ``artists``), closest first, none of which share a name with a seed
        song.
    """
    # Compute average vector of input songs
    song_center = get_mean_vector(song_list, spotify_data)

    # Reuse the scaler fitted inside the clustering pipeline.
    scaler = cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[columns_to_use])
    # Pass the centre as a DataFrame carrying the feature column names, like
    # the data above, so scikit-learn's feature-name check sees matching input.
    center_frame = pd.DataFrame(np.asarray(song_center, dtype=float).reshape(1, -1),
                                columns=columns_to_use)
    scaled_song_center = scaler.transform(center_frame)

    # Rank the whole dataset by cosine distance to the average vector
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    ranked = spotify_data.iloc[np.argsort(distances[0])]

    # Drop the seed songs BEFORE truncating, so the caller still receives
    # n_songs recommendations when seeds rank among the nearest neighbours.
    song_dict = flatten_dict_list(song_list)
    ranked = ranked[~ranked['name'].isin(song_dict['name'])]
    rec_songs = ranked.head(n_songs)

    # Format output
    metadata_cols = ['name', 'year', 'artists']
    return rec_songs[metadata_cols].to_dict(orient='records')


## Testing Recommender

In [None]:
# Try the recommender: seed it with seven tracks from 2020 and request
# 15 recommendations from spotify_data.
recommend_songs([{'name': 'HiTek Tek', 'year': 2020},
                {'name': 'Ridin Strikers', 'year': 2020},
                {'name': 'One Of My', 'year': 2020},
                {'name': 'Hard To Choose One', 'year': 2020},
                {'name': 'Touch The Sky', 'year': 2020},
                {'name': 'Solitaires (feat. Travis Scott)', 'year': 2020},
                {'name': 'Harlem Shake (feat. Young Thug)', 'year': 2020}], spotify_data, 15)