In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

  import pandas.util.testing as tm


## Data Import

In [2]:
# load the prepared Spotify track table; the first CSV column is used as the row index
dat = pd.read_csv(
    "01_Data/spotify_dat.csv",
    index_col=0,
)

In [3]:
# keep the source label, the track info and the audio features used by the recommender
dat_reco = dat.loc[:, [
    # source label and track info
    "df_label", "name", "album_name", "artist_name", "uri",
    # popularity and track-level audio features
    "popularity", "artist_popularity", "explicit", "acousticness", "danceability", "energy",
    "instrumentalness", "liveness", "loudness", "speechiness", "valence", "tempo", "mode",
    # structure counts and duration
    "num_of_sections", "num_of_keys", "num_of_modes", "num_of_time_signatures", "duration_minutes",
    # section-level statistics
    "section_durations_variance", "section_durations_min", "section_durations_max",
    "section_loudnesses_variance", "section_loudnesses_min", "section_loudnesses_max",
    "section_tempos_variance", "section_tempos_min", "section_tempos_max",
    # label columns
    "album_big_label", "album_medium_label",
    # genre columns
    "genre_alternative metal/rock", "genre_background", "genre_baroque classic", "genre_classical",
    "genre_country", "genre_hip hop", "genre_house", "genre_indie", "genre_mexican", "genre_pop",
    "genre_rap", "genre_rock",
    # time signature columns
    "time_signature_1", "time_signature_3", "time_signature_4", "time_signature_5",
    # key columns
    "overall_key_1", "overall_key_2", "overall_key_3", "overall_key_4", "overall_key_5",
    "overall_key_6", "overall_key_7", "overall_key_8", "overall_key_9", "overall_key_10",
    "overall_key_11",
]]

In [4]:
# collect the names of all songs whose df_label marks them as belonging to "user4"
user_songs = list(
    dat_reco.loc[dat_reco["df_label"] == "user4", "name"].values
)

## Song-Based Recommendation Engine
Goal: Generate themed playlists based on songs a user listened to in the past. We take two approaches for this recommendation:
1. We recommend new songs based on the audio features of a user's single favorite song.
2. We recommend new songs based on the average of the audio features of all songs a user has in their profile.

In [5]:
def playlist_recommendation(song, data, theme, n_songs):
    """
    Recommend a playlist of up to n_songs tracks based on cosine similarity of the songs' features.

    The seed is either a single song or a collection of songs (i.e. all songs in a user profile).
    If a collection is passed, the column-wise average of the features of those songs is used
    as the seed vector.

    Parameters
    ----------
    song : str or list of str
        Name of a single song, or a list (or tuple) of song names. Names are matched against
        data["name"]. For a collection, at least one name must occur in data; names that do
        not occur are ignored when the average is computed.
    data : pandas.DataFrame
        Track table. Must contain 'name', 'artist_name', 'album_name', 'uri' and every
        feature column listed in required_columns below.
    theme : str
        One of "none", "dance", "chill", "discover". Restricts the candidate pool:
        "dance" keeps danceability > 0.75, "chill" keeps tempo < 95 and valence > 0.5,
        "discover" keeps popularity < 60 and artist_popularity < 80, "none" keeps everything.
    n_songs : int
        Length of the playlist. Fewer rows are returned when the candidate pool is smaller;
        values below zero are treated as zero.

    Returns
    -------
    pandas.DataFrame
        Columns "name", "artist_name", "album_name", "uri", ordered from most to least
        similar, with a fresh 0..k-1 index. Rows whose name equals a seed name are never
        recommended.

    Raises
    ------
    Exception
        If required columns are missing, if no seed song is found in data, or if the theme
        is not supported.
    """

    # track info returned to the caller
    info_columns = ['name', 'artist_name', 'album_name', 'uri']

    # required columns for analysis (feature vector used for cosine similarity)
    required_columns = ['popularity', 'artist_popularity', 'explicit', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
                        'valence', 'tempo', 'mode', 'num_of_sections', 'num_of_keys', 'num_of_modes', 'num_of_time_signatures', 'duration_minutes',
                        'section_durations_variance', 'section_durations_min', 'section_durations_max', 'section_loudnesses_variance', 'section_loudnesses_min',
                        'section_loudnesses_max', 'section_tempos_variance', 'section_tempos_min', 'section_tempos_max', 'album_big_label', 'album_medium_label',
                        'genre_alternative metal/rock', 'genre_background', 'genre_baroque classic', 'genre_classical', 'genre_country', 'genre_hip hop', 'genre_house',
                        'genre_indie', 'genre_mexican', 'genre_pop', 'genre_rap', 'genre_rock', 'time_signature_1', 'time_signature_3', 'time_signature_4',
                        'time_signature_5', 'overall_key_1', 'overall_key_2', 'overall_key_3', 'overall_key_4', 'overall_key_5', 'overall_key_6', 'overall_key_7',
                        'overall_key_8', 'overall_key_9', 'overall_key_10', 'overall_key_11']

    # error handling: data must include the track info and feature columns
    # (checked first so that a missing 'name' column gives this message instead of a KeyError)
    if not set(info_columns + required_columns).issubset(data.columns):
        raise Exception("Found an error. Data provided does not contain required columns.")

    # error handling: song must be included in data
    is_collection = isinstance(song, (list, tuple))
    seed_names = list(song) if is_collection else [song]
    if len(seed_names) == 0 or not data["name"].isin(seed_names).any():
        raise Exception("Found an error. Song must be included in data.")

    # error handling: the following themes can be passed to the function
    if theme not in ["none", "dance", "chill", "discover"]:
        raise Exception("Found an error. Supported themes are none, dance, chill, and discover")

    # slice data by theme
    if theme == "dance":
        data_pool = data[data["danceability"] > 0.75]
    elif theme == "chill":
        data_pool = data[(data["tempo"] < 95) & (data["valence"] > 0.5)]
    elif theme == "discover":
        data_pool = data[(data["popularity"] < 60) & (data["artist_popularity"] < 80)]
    else:
        data_pool = data

    # build the seed vector (average over a collection vs. features of a single song)
    if is_collection:
        seed_rows = data[data["name"].isin(seed_names)]
        seed_vector = seed_rows[required_columns].mean(axis=0).to_numpy(dtype=float)
    else:
        # prefer the row that survived the theme filter; fall back to the unfiltered data
        seed_rows = data_pool[data_pool["name"] == song]
        if seed_rows.empty:
            seed_rows = data[data["name"] == song]
        seed_vector = seed_rows[required_columns].iloc[0].to_numpy(dtype=float)

    # the seed songs themselves are never candidates, so no "skip the first hit" is needed
    candidates = data_pool[~data_pool["name"].isin(seed_names)]

    n_songs = max(n_songs, 0)
    if candidates.empty or n_songs == 0:
        return pd.DataFrame(columns=info_columns)

    # calculate cosine similarity of the seed against every candidate (one row, not N x N)
    # NOTE(review): features are used unscaled, so large-magnitude columns presumably
    # dominate the similarity -- confirm whether scaling is wanted.
    candidate_matrix = candidates[required_columns].to_numpy(dtype=float)
    scores = cosine_similarity(seed_vector.reshape(1, -1), candidate_matrix)[0]

    # positions of the most similar candidates, best first (stable for ties)
    top_positions = np.argsort(-scores, kind="stable")[:n_songs]

    # select by position so duplicate names or duplicate index labels cannot pick a wrong row
    return candidates.iloc[top_positions][info_columns].reset_index(drop=True)

In [6]:
# build one themed playlist per theme from the averaged features of the user's songs
chill_playlist_avg = playlist_recommendation(song=user_songs, data=dat_reco, theme="chill", n_songs=10)
discover_playlist_avg = playlist_recommendation(song=user_songs, data=dat_reco, theme="discover", n_songs=10)
dance_playlist_avg = playlist_recommendation(song=user_songs, data=dat_reco, theme="dance", n_songs=10)

# write each profile-based playlist to its own CSV file
chill_playlist_avg.to_csv(path_or_buf="03_Playlists/chill_playlist_avg.csv")
discover_playlist_avg.to_csv(path_or_buf="03_Playlists/discover_playlist_avg.csv")
dance_playlist_avg.to_csv(path_or_buf="03_Playlists/dance_playlist_avg.csv")

In [7]:
# pick a favorite song: rated 5, flagged as top track and flagged as recently played
# NOTE(review): position 1 selects the second matching row, not the first -- confirm this is intended
fav_song = dat.loc[
    (dat["Rating0-5"] == 5) & (dat["top_tracks"] == 1) & (dat["recently_played"] == 1),
    "name",
].iloc[1]

# build one themed playlist per theme from the single favorite song
chill_playlist_fav_song = playlist_recommendation(song=fav_song, data=dat_reco, theme="chill", n_songs=10)
discover_playlist_fav_song = playlist_recommendation(song=fav_song, data=dat_reco, theme="discover", n_songs=10)
dance_playlist_fav_song = playlist_recommendation(song=fav_song, data=dat_reco, theme="dance", n_songs=10)

# write each song-based playlist to its own CSV file
chill_playlist_fav_song.to_csv(path_or_buf="03_Playlists/chill_playlist_fav_song.csv")
discover_playlist_fav_song.to_csv(path_or_buf="03_Playlists/discover_playlist_fav_song.csv")
dance_playlist_fav_song.to_csv(path_or_buf="03_Playlists/dance_playlist_fav_song.csv")