# 2 - Generating a full playlist with your trained Random Forest Model
---------------
In this notebook, we take the trained model in the last notebook to make a new playlist!

In [1]:
import pickle as pkl
import random
from typing import Iterable, Union

from dotenv import dotenv_values
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , roc_auc_score
from sklearn import tree
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Custom packages
import helpers as h
from helpers import Playlist, PlaylistCluster

In [2]:
# Set up Spotify API client credentials
config = dotenv_values('.env')
client_id = config["SPOTIFY_CLIENT_ID"]
client_secret = config["SPOTIFY_CLIENT_SECRET"]

client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
# Load the trained random forest classifier from the previous notebook
with open('./trained_rf.pkl', 'rb') as f:
    rf_classifier = pkl.load(f)

# Load the playlist cluster from the previous notebook
with open('./playlist_cluster.pkl', 'rb') as f:
    PLC = pkl.load(f)

rb_playlist = PLC.playlist_list[0]
lnv_playlist = PLC.playlist_list[1]

Let's ask the random forest classifier if we should add a song to the playlist!

In [4]:
class Song():
    '''Access audio features of a Spotify song.'''

    def __init__(self, song_id:str=None, song_name:str=None, song_dict:dict=None) -> None:
        for ikwarg, kwarg in enumerate([song_id, song_name, song_dict]):
            if kwarg is not None:
                self.input = kwarg
                if ikwarg == 0:
                    self.id = self.input
                    self.attributes = sp.track(self.id)
                    self.name = self.attributes['name']
                elif ikwarg == 2:
                    self.id = self.get_info_from_dict(self.input, 'id')
                    self.name = self.get_info_from_dict(self.input, 'name')

                    
        self.audio_features = sp.audio_features(self.id)
        
        self.data = pd.DataFrame(data=self.audio_features).sort_index(axis='columns')
        # initialize like to none
        self.data['like'] = np.nan
        self.ml_likes = np.nan

        data_multiIndex = pd.MultiIndex.from_frame(pd.DataFrame({'id':[self.id], 'name':[self.name]}))
        self.data.index = data_multiIndex
        
        self.audio_feature_labels = self.data.columns

        ## ML specifically (e.g. random forest)
        self.ml_feature_labels = list(set(self.audio_feature_labels)-\
                {'type', 'id','uri','track_href','analysis_url','like'})
        self.ml_data = self.data.loc[:, self.ml_feature_labels].sort_index(axis='columns')

    def get_info_from_dict(self, track:dict, info_tag:str):
        '''Unnest information from raw_tracks dict.'''

        if info_tag in track['track'].keys():
            attributes = track['track'][info_tag] 
            return attributes

In [5]:
s = Song(song_dict=rb_playlist.raw_tracks[0])
s.data

Unnamed: 0_level_0,Unnamed: 1_level_0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence,like
id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
5awNIWVrh2ISfvPd5IUZNh,PTT (Paint The Town),0.0735,https://api.spotify.com/v1/audio-analysis/5awN...,0.781,201120,0.814,5awNIWVrh2ISfvPd5IUZNh,1.9e-05,8,0.0565,-2.437,1,0.1,124.028,4,https://api.spotify.com/v1/tracks/5awNIWVrh2IS...,audio_features,spotify:track:5awNIWVrh2ISfvPd5IUZNh,0.546,


In [6]:
# Get a song from Spotify and its features
def grab_a_song():
    # Get a random search term or a random track ID
    search_term = random.choice(['love', 'happy', 'dance', 'rock', 'jazz'])
    results = sp.search(q=search_term, type='track', limit=50)

    # Get a random track from the search results
    track = random.choice(results['tracks']['items'])
    song = Song(track['id'])

    return song

In [7]:
s = grab_a_song()
s.ml_data

Unnamed: 0_level_0,Unnamed: 1_level_0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2woGHWgrFfjXAgdUsitbqr,Happy Hour,0.162,0.616,199840,0.835,0,7,0.0977,-4.277,1,0.0354,97.96,4,0.8


In [8]:
rf_classifier.predict(s.ml_data)

array([1.])

We need a way to query a lot of random songs from Spotify to build a full playlist! Luckily Spotipy has a recommendations method that enables us to query Spotify based on target features.

In [10]:
rf_classifier.feature_names_in_

array(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'], dtype=object)

In [11]:
import scipy.stats as stats

In [12]:
stats.norm()

<scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x7f550cb85190>

In [13]:
lnv_playlist.track_artists

[[{'external_urls': {'spotify': 'https://open.spotify.com/artist/20wkVLutqVOYrc0kxFs7rA'},
   'href': 'https://api.spotify.com/v1/artists/20wkVLutqVOYrc0kxFs7rA',
   'id': '20wkVLutqVOYrc0kxFs7rA',
   'name': 'Daniel Caesar',
   'type': 'artist',
   'uri': 'spotify:artist:20wkVLutqVOYrc0kxFs7rA'}],
 [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3MZsBdqDrRTJihTHQrO6Dq'},
   'href': 'https://api.spotify.com/v1/artists/3MZsBdqDrRTJihTHQrO6Dq',
   'id': '3MZsBdqDrRTJihTHQrO6Dq',
   'name': 'Joji',
   'type': 'artist',
   'uri': 'spotify:artist:3MZsBdqDrRTJihTHQrO6Dq'}],
 [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3MZsBdqDrRTJihTHQrO6Dq'},
   'href': 'https://api.spotify.com/v1/artists/3MZsBdqDrRTJihTHQrO6Dq',
   'id': '3MZsBdqDrRTJihTHQrO6Dq',
   'name': 'Joji',
   'type': 'artist',
   'uri': 'spotify:artist:3MZsBdqDrRTJihTHQrO6Dq'},
  {'external_urls': {'spotify': 'https://open.spotify.com/artist/6icQOAFXDZKsumw3YXyusw'},
   'href': 'https://api.spot

In [64]:
import importlib
importlib.reload(h)

<module 'helpers' from '/mnt/c/Users/aarpe/OneDrive - The University of Chicago/AIMEMS/leyden/spotify_recs/tutorial/helpers.py'>

In [67]:
def generate_avg_playlist(nsongs:int, batch_size:int=None, user_id:str=None, new_playlist_name:str=None):
    numbered_ml_data = PLC.ml_data.reset_index()
    liked_songs = numbered_ml_data[PLC.ml_likes.reset_index()['like'] == 1]
    liked_song_features = liked_songs.drop(columns=['id', 'name'])
    if batch_size is None:
        batch_size = int(nsongs*0.20)

    # types of features
    features = liked_song_features.columns.to_list()
    int_features = ['duration_ms', 'key', 'mode', 'time_signature', 'tempo', 'loudness']
    float_features = list(set(features) - set(int_features))

    # First filter songs by the Gaussian statistics of the audio features 
    # in liked playlists
    feature_targets = liked_song_features.mean()

    # take feature_min if it is greater than 0, otherwise take 0
    feature_targets.loc[float_features] = feature_targets.loc[float_features
                                                ].apply(lambda x: min(max(x, 0), 1)).round(3)
    feature_targets.loc[int_features] = feature_targets.loc[int_features
                                                        ].round(0)
    feature_targets.loc['time_signature'] = min(feature_targets.loc['time_signature'], 11)
    feature_targets.loc['key'] = min(max(feature_targets.loc['key'], 0), 11)
    feature_targets.loc['mode'] = min(max(feature_targets.loc['mode'], 0), 1)

    feature_targets.index = 'target_' + feature_targets.index
    
    ft_dict = {**feature_targets.loc[feature_targets.index.str.contains(
                                '|'.join(float_features))].to_dict(),
                 **feature_targets.loc[feature_targets.index.str.contains(
                                '|'.join(int_features))].astype(int).to_dict()}

    # Get seed artists, songs, and tracks
    args = {
        'seed_tracks': liked_songs['id'].to_list()[:4],
    }
    
    if 'artist' in liked_songs.keys():
        args['seed_artists'] = liked_songs['artist'].to_list()[:4]
    if 'genre' in liked_songs.keys():
        args['seed_genres'] = liked_songs['genre'].to_list()[:4]

    for key, val in ft_dict.items():
        print(key, val)

    tracks = sp.recommendations(limit=nsongs,
                                **args,
                                **ft_dict
                                )
    
    #print(tracks)
    """playlist = h.Playlist(playlist=tracks)

    if new_playlist_name is not None:
        if user_id is None:
            user_id = client_id
        sp.user_playlist_create(user_id, new_playlist_name, public=False)
        sp.playlist_add_items(playlist_id=playlist.id, items=playlist.track_ids)"""
        
    return tracks

tracks = generate_avg_playlist(10)
tracks

target_acousticness 0.09
target_danceability 0.823
target_energy 0.787
target_instrumentalness 0.001
target_liveness 0.133
target_speechiness 0.137
target_valence 0.656
target_duration_ms 176282
target_key 6
target_loudness -4
target_mode 1
target_tempo 112
target_time_signature 4


{'tracks': [{'album': {'album_type': 'SINGLE',
    'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/64KEffDW9EtZ1y2vBYgq8T'},
      'href': 'https://api.spotify.com/v1/artists/64KEffDW9EtZ1y2vBYgq8T',
      'id': '64KEffDW9EtZ1y2vBYgq8T',
      'name': 'Marshmello',
      'type': 'artist',
      'uri': 'spotify:artist:64KEffDW9EtZ1y2vBYgq8T'},
     {'external_urls': {'spotify': 'https://open.spotify.com/artist/2p4aN0Uxkk3iT3HK0cJ2cJ'},
      'href': 'https://api.spotify.com/v1/artists/2p4aN0Uxkk3iT3HK0cJ2cJ',
      'id': '2p4aN0Uxkk3iT3HK0cJ2cJ',
      'name': 'Tokischa',
      'type': 'artist',
      'uri': 'spotify:artist:2p4aN0Uxkk3iT3HK0cJ2cJ'}],
    'available_markets': ['AR',
     'AU',
     'AT',
     'BE',
     'BO',
     'BR',
     'BG',
     'CA',
     'CL',
     'CO',
     'CR',
     'CY',
     'CZ',
     'DK',
     'DO',
     'DE',
     'EC',
     'EE',
     'SV',
     'FI',
     'FR',
     'GR',
     'GT',
     'HN',
     'HK',
     'HU',
     'IS

In [69]:
tracks['tracks'][0].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

In [38]:
def generate_gaussian_playlist(nsongs:int, batch_size:int=None):
    numbered_ml_data = PLC.ml_data.reset_index()
    liked_songs = numbered_ml_data[PLC.ml_likes.reset_index()['like'] == 1]
    liked_song_features = liked_songs.drop(columns=['id', 'name'])
    if batch_size is None:
        batch_size = int(nsongs*0.20)

    # 'popularity',
    # types of features
    features = liked_song_features.columns.to_list()
    int_features = ['duration_ms', 'key', 'mode', 'time_signature', 'tempo', 'loudness']
    float_features = list(set(features) - set(int_features))

    # First filter songs by the Gaussian statistics of the audio features 
    # in liked playlists
    feature_mins = liked_song_features.mean() - liked_song_features.std()
    feature_maxs = liked_song_features.mean() + liked_song_features.std()

    # take feature_min if it is greater than 0, otherwise take 0
    feature_mins.loc[float_features] = feature_mins.loc[float_features
                                                        ].apply(lambda x: max(x, 0)).round(3)
    feature_mins.loc[int_features] = feature_mins.loc[int_features
                                                        ].round(0)
    feature_mins.loc['time_signature'] = min(feature_mins.loc['time_signature'], 11)
    feature_mins.loc['key'] = max(feature_mins.loc['key'], 0)
    feature_mins.loc['mode'] = max(feature_mins.loc['mode'], 0)

    # take feature_min if it is less than 1, otherwise take 1
    feature_maxs.loc[float_features] = feature_maxs.loc[float_features
                                                        ].apply(lambda x: min(x, 1)).round(3)
    feature_maxs.loc[int_features] = feature_maxs.loc[int_features
                                                        ].round(0)
    feature_maxs.loc['key'] = min(feature_maxs.loc['key'], 11)
    feature_maxs.loc['mode'] = min(feature_maxs.loc['mode'], 1)

    feature_mins.index = 'min_' + feature_mins.index
    feature_maxs.index = 'max_' + feature_maxs.index
    
    fmin_dict = {**feature_mins.loc[feature_mins.index.str.contains('|'.join(float_features))].to_dict(),
                 **feature_mins.loc[feature_mins.index.str.contains('|'.join(int_features))].astype(int).to_dict()}
    fmax_dict = {**feature_maxs.loc[feature_maxs.index.str.contains('|'.join(float_features))].to_dict(),
                 **feature_maxs.loc[feature_maxs.index.str.contains('|'.join(int_features))].astype(int).to_dict()}

    # Get seed artists, songs, and tracks
    args = {
        'seed_tracks': liked_songs['id'].to_list()[:4],
    }
    
    if 'artist' in liked_songs.keys():
        args['seed_artists'] = liked_songs['artist'].to_list()[:4]
    if 'genre' in liked_songs.keys():
        args['seed_genres'] = liked_songs['genre'].to_list()[:4]

    for key, val in fmin_dict.items():
        print(key, val)

    for key, val in fmax_dict.items():
        print(key, val)

    tracks = sp.recommendations(limit=nsongs,
                                **args,
                                **fmin_dict,
                                **fmax_dict)
    return tracks

tracks = generate_gaussian_playlist(10)

HTTP Error for GET to https://api.spotify.com/v1/recommendations with Params: {'limit': 10, 'min_acousticness': 0.0, 'max_acousticness': 0.229, 'min_danceability': 0.747, 'max_danceability': 0.899, 'min_duration_ms': 143141, 'max_duration_ms': 209424, 'min_energy': 0.689, 'max_energy': 0.884, 'min_instrumentalness': 0.0, 'max_instrumentalness': 0.007, 'min_key': 3, 'max_key': 10, 'min_liveness': 0.034, 'max_liveness': 0.231, 'min_loudness': -5, 'max_loudness': -3, 'min_mode': 0, 'max_mode': 1, 'min_speechiness': 0.054, 'max_speechiness': 0.22, 'min_tempo': 91, 'max_tempo': 134, 'min_time_signature': 4, 'max_time_signature': 4, 'min_valence': 0.49, 'max_valence': 0.822} returned 400 due to invalid request


min_acousticness 0.0
min_danceability 0.747
min_energy 0.689
min_instrumentalness 0.0
min_liveness 0.034
min_speechiness 0.054
min_valence 0.49
min_duration_ms 143141
min_key 3
min_loudness -5
min_mode 0
min_tempo 91
min_time_signature 4
max_acousticness 0.229
max_danceability 0.899
max_energy 0.884
max_instrumentalness 0.007
max_liveness 0.231
max_speechiness 0.22
max_valence 0.822
max_duration_ms 209424
max_key 10
max_loudness -3
max_mode 1
max_tempo 134
max_time_signature 4


SpotifyException: http status: 400, code:-1 - https://api.spotify.com/v1/recommendations?limit=10&min_acousticness=0.0&max_acousticness=0.229&min_danceability=0.747&max_danceability=0.899&min_duration_ms=143141&max_duration_ms=209424&min_energy=0.689&max_energy=0.884&min_instrumentalness=0.0&max_instrumentalness=0.007&min_key=3&max_key=10&min_liveness=0.034&max_liveness=0.231&min_loudness=-5&max_loudness=-3&min_mode=0&max_mode=1&min_speechiness=0.054&max_speechiness=0.22&min_tempo=91&max_tempo=134&min_time_signature=4&max_time_signature=4&min_valence=0.49&max_valence=0.822:
 invalid request, reason: None

In [None]:
tracks = generate_gaussian_playlist(20)
tracks

TypeError: Could not convert ['PTT (Paint The Town)Sistema De PatioBack DoorXT4S1SPING PONGMY BAGBIG MADShut DownBIZCOCHITOGet Into It (Yuh)Boss Bitch212Run BTSMirror MirrorWomanBest Friend (feat. Doja Cat, Jamie & CHANMINA) [Remix]ANTIFRAGILENOBODY LIKE YOUBulletproof - Tiborg RemixSweatProblem'] to numeric

## Scratch

In [None]:
for dt in rf_classifier.estimators_:

    n_nodes = dt.tree_.node_count
    children_left = dt.tree_.children_left
    children_right = dt.tree_.children_right
    feature = dt.tree_.feature
    threshold = dt.tree_.threshold

    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
    while len(stack) > 0:
        # `pop` ensures each node is only visited once
        node_id, depth = stack.pop()
        node_depth[node_id] = depth

        # If the left and right child of a node is not the same we have a split
        # node
        is_split_node = children_left[node_id] != children_right[node_id]
        # If a split node, append left and right children and depth to `stack`
        # so we can loop through them
        if is_split_node:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaves[node_id] = True

    print(
        "The binary tree structure has {n} nodes and has "
        "the following tree structure:\n".format(n=n_nodes)
    )
    for i in range(n_nodes):
        if is_leaves[i]:
            print(
                "{space}node={node} is a leaf node.".format(
                    space=node_depth[i] * "\t", node=i
                )
            )
        else:
            print(
                "{space}node={node} is a split node: "
                "go to node {left} if X[:, {feature}] <= {threshold} "
                "else to node {right}.".format(
                    space=node_depth[i] * "\t",
                    node=i,
                    left=children_left[i],
                    feature=feature[i],
                    threshold=threshold[i],
                    right=children_right[i],
                )
            )

The binary tree structure has 7 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 9] <= 0.07660000026226044 else to node 4.
	node=1 is a split node: go to node 2 if X[:, 3] <= 0.6875 else to node 3.
		node=2 is a leaf node.
		node=3 is a leaf node.
	node=4 is a split node: go to node 5 if X[:, 6] <= 0.096950002014637 else to node 6.
		node=5 is a leaf node.
		node=6 is a leaf node.
The binary tree structure has 7 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 9] <= 0.056050000712275505 else to node 4.
	node=1 is a split node: go to node 2 if X[:, 0] <= 0.021585000562481582 else to node 3.
		node=2 is a leaf node.
		node=3 is a leaf node.
	node=4 is a split node: go to node 5 if X[:, 4] <= 0.35689999908208847 else to node 6.
		node=5 is a leaf node.
		node=6 is a leaf node.
The binary tree structure has 7 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 9] <= 0.056