# 2 - Generating a full playlist with your trained Random Forest Model
---------------
In this notebook, we take the Random Forest model trained in the previous notebook and use it to build a new playlist!

In [3]:
import pickle as pkl
import random
from typing import Iterable, Union

from dotenv import dotenv_values
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , roc_auc_score
from sklearn import tree
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Custom packages
import helpers as h
from helpers import Playlist, PlaylistCluster

In [4]:
# Build the Spotify API client from the credentials stored in the local .env file
# (keeps the client id/secret out of the notebook itself).
config = dotenv_values('.env')
client_id = config["SPOTIFY_CLIENT_ID"]
client_secret = config["SPOTIFY_CLIENT_SECRET"]

# Client-credentials flow: app-level access, no user login involved
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [6]:
# Load the trained random forest classifier from the previous notebook.
# WARNING: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that you produced yourself (here: the output of notebook 1).
with open('./trained_rf.pkl', 'rb') as f:
    rf_classifier = pkl.load(f)

# Load the playlist cluster from the previous notebook (same pickle caveat applies)
with open('./playlist_cluster.pkl', 'rb') as f:
    PLC = pkl.load(f)

# Keep handles on the first two playlists stored in the cluster.
# NOTE(review): which playlist sits at which position is decided by the previous
# notebook -- confirm the order if the cluster is rebuilt.
rb_playlist = PLC.playlist_list[0]
lnv_playlist = PLC.playlist_list[1]

Let's ask the random forest classifier if we should add a song to the playlist!

In [7]:
class Song():
    '''Access audio features of a Spotify song.

    A song can be identified in three ways (at least one is required):

    * ``song_id``   - a Spotify track id; the track is fetched with ``sp.track``.
    * ``song_dict`` - a raw playlist item of the form ``{'track': {...}}``; the
      id and name are read from the nested ``'track'`` dict. When given together
      with ``song_id`` the values from ``song_dict`` win.
    * ``song_name`` - a free-text name; only used when neither ``song_id`` nor
      ``song_dict`` is given, in which case the first hit of a Spotify track
      search is used.

    Attributes set by the constructor:

    * ``id`` / ``name``        - Spotify track id and track name.
    * ``input``                - the last non-None constructor argument.
    * ``attributes``           - full track object (only when looked up by id or name).
    * ``audio_features``       - raw response of ``sp.audio_features``.
    * ``data``                 - one-row DataFrame of all audio-feature columns plus
      an empty ``like`` column, indexed by ``(id, name)``.
    * ``audio_feature_labels`` - columns of ``data``.
    * ``ml_feature_labels``    - sorted list of the columns used as model inputs.
    * ``ml_data``              - ``data`` restricted to ``ml_feature_labels``.
    * ``ml_likes``             - placeholder for a model prediction (NaN).

    Raises ``ValueError`` when no identifier is given, when the song cannot be
    resolved to a track id, or when Spotify returns no audio features for it.
    '''

    # Audio-feature columns that are metadata / labels rather than model inputs
    _NON_ML_COLUMNS = frozenset({'type', 'id','uri','track_href','analysis_url','like'})

    def __init__(self, song_id:str=None, song_name:str=None, song_dict:dict=None) -> None:
        provided = [arg for arg in (song_id, song_name, song_dict) if arg is not None]
        if not provided:
            raise ValueError('Provide at least one of song_id, song_name or song_dict.')
        # Historical behaviour: `input` holds the last non-None argument
        self.input = provided[-1]

        if song_id is None and song_dict is None:
            # Only a name was given: resolve it through a track search
            hits = sp.search(q=song_name, type='track', limit=1)['tracks']['items']
            if not hits:
                raise ValueError("No Spotify track found for song_name={!r}.".format(song_name))
            self.attributes = hits[0]
            self.id = self.attributes['id']
            self.name = self.attributes['name']

        if song_id is not None:
            self.id = song_id
            self.attributes = sp.track(self.id)
            self.name = self.attributes['name']

        if song_dict is not None:
            # A raw playlist item takes precedence over song_id
            self.id = self.get_info_from_dict(song_dict, 'id')
            self.name = self.get_info_from_dict(song_dict, 'name')

        if self.id is None:
            raise ValueError('Could not determine a Spotify track id for this song.')

        self.audio_features = sp.audio_features(self.id)
        # A track without an audio-features record would otherwise build a
        # meaningless DataFrame, so fail loudly here instead.
        if not self.audio_features or self.audio_features[0] is None:
            raise ValueError("Spotify returned no audio features for track {!r}.".format(self.id))

        self.data = pd.DataFrame(data=self.audio_features).sort_index(axis='columns')
        # initialize like to none
        self.data['like'] = np.nan
        self.ml_likes = np.nan

        data_multiIndex = pd.MultiIndex.from_frame(pd.DataFrame({'id':[self.id], 'name':[self.name]}))
        self.data.index = data_multiIndex

        self.audio_feature_labels = self.data.columns

        ## ML specifically (e.g. random forest)
        # Sorted so the label list (and the column order of ml_data) is
        # deterministic instead of depending on set iteration order.
        self.ml_feature_labels = sorted(set(self.audio_feature_labels) - self._NON_ML_COLUMNS)
        self.ml_data = self.data.loc[:, self.ml_feature_labels]

    def get_info_from_dict(self, track:dict, info_tag:str):
        '''Unnest information from raw_tracks dict.

        Returns ``track['track'][info_tag]``, or ``None`` when the nested
        ``'track'`` dict has no such key.
        '''
        return track['track'].get(info_tag)

In [8]:
# Build a Song from the first raw track of the first playlist in the cluster
# and display its full one-row feature table (all audio-feature columns + `like`).
s = Song(song_dict=rb_playlist.raw_tracks[0])
s.data

Unnamed: 0_level_0,Unnamed: 1_level_0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence,like
id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
5awNIWVrh2ISfvPd5IUZNh,PTT (Paint The Town),0.0735,https://api.spotify.com/v1/audio-analysis/5awN...,0.781,201120,0.814,5awNIWVrh2ISfvPd5IUZNh,1.9e-05,8,0.0565,-2.437,1,0.1,124.028,4,https://api.spotify.com/v1/tracks/5awNIWVrh2IS...,audio_features,spotify:track:5awNIWVrh2ISfvPd5IUZNh,0.546,


In [9]:
# Get a song from Spotify and its features
def grab_a_song(search_terms:Iterable[str]=('love', 'happy', 'dance', 'rock', 'jazz')):
    '''Return a random Spotify track, wrapped in a ``Song``.

    One term is drawn at random from ``search_terms``, Spotify is searched for
    up to 50 matching tracks, and one of the hits is picked at random.

    Parameters
    ----------
    search_terms : iterable of str, optional
        Candidate search queries. A single string is treated as one query.
        Defaults to the five generic terms used originally.

    Returns
    -------
    Song
        The randomly selected track.

    Raises
    ------
    ValueError
        If ``search_terms`` is empty.
    IndexError
        If the search returns no usable tracks.
    '''
    # A bare string would otherwise be split into single characters
    if isinstance(search_terms, str):
        search_terms = [search_terms]
    search_terms = list(search_terms)
    if not search_terms:
        raise ValueError('search_terms must contain at least one search term.')

    # Get a random search term and query Spotify with it
    search_term = random.choice(search_terms)
    results = sp.search(q=search_term, type='track', limit=50)

    # Defensively ignore result slots that carry no track id
    candidates = [item for item in results['tracks']['items'] if item and item.get('id')]
    if not candidates:
        raise IndexError("Spotify search for {!r} returned no tracks.".format(search_term))

    # Get a random track from the search results
    track = random.choice(candidates)
    song = Song(track['id'])

    return song

In [10]:
# Pull one random track from a Spotify search and display only the feature
# columns that are fed to the model (metadata columns and `like` removed).
s = grab_a_song()
s.ml_data

Unnamed: 0_level_0,Unnamed: 1_level_0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3cJSJTrEKNfm1TmoswErkW,Happy Birthday,0.005,0.782,355387,0.655,1.2e-05,1,0.0493,-12.464,1,0.0435,116.208,4,0.957


In [11]:
# Ask the trained classifier to label the sampled song from its ML feature columns.
# NOTE(review): presumably 1 means "like" and 0 "dislike" -- confirm against the training notebook.
rf_classifier.predict(s.ml_data)

array([0.])

We need a way to query a lot of random songs from Spotify to build a full playlist! Luckily, Spotipy has a `recommendations` method that lets us query Spotify for tracks based on target audio features.

In [63]:
# Names of the features the classifier saw during fitting
# (fixed typo: the variable is `rf_classifier`, not `rf_classifer`).
rf_classifier.feature_names_in_

array(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'], dtype=object)

In [152]:
import scipy.stats as stats

In [None]:
# Scratch: instantiate scipy's normal distribution with its default parameters.
# The result is only displayed; it is not assigned to a variable.
stats.norm()

In [160]:
# Peek at the artist records of the first few tracks only; displaying the whole
# nested list floods the notebook output.
lnv_playlist.track_artists[:3]

[[{'external_urls': {'spotify': 'https://open.spotify.com/artist/20wkVLutqVOYrc0kxFs7rA'},
   'href': 'https://api.spotify.com/v1/artists/20wkVLutqVOYrc0kxFs7rA',
   'id': '20wkVLutqVOYrc0kxFs7rA',
   'name': 'Daniel Caesar',
   'type': 'artist',
   'uri': 'spotify:artist:20wkVLutqVOYrc0kxFs7rA'}],
 [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3MZsBdqDrRTJihTHQrO6Dq'},
   'href': 'https://api.spotify.com/v1/artists/3MZsBdqDrRTJihTHQrO6Dq',
   'id': '3MZsBdqDrRTJihTHQrO6Dq',
   'name': 'Joji',
   'type': 'artist',
   'uri': 'spotify:artist:3MZsBdqDrRTJihTHQrO6Dq'}],
 [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3MZsBdqDrRTJihTHQrO6Dq'},
   'href': 'https://api.spotify.com/v1/artists/3MZsBdqDrRTJihTHQrO6Dq',
   'id': '3MZsBdqDrRTJihTHQrO6Dq',
   'name': 'Joji',
   'type': 'artist',
   'uri': 'spotify:artist:3MZsBdqDrRTJihTHQrO6Dq'},
  {'external_urls': {'spotify': 'https://open.spotify.com/artist/6icQOAFXDZKsumw3YXyusw'},
   'href': 'https://api.spot

In [None]:
def generate_playlist_from_rf(nsongs:int, batch_size:int=None):
    '''Request Spotify recommendations that resemble the liked songs in ``PLC``.

    The numeric audio features of every liked song (``like == 1``) are
    summarised as mean +/- one standard deviation and sent to
    ``sp.recommendations`` as ``min_<feature>`` / ``max_<feature>`` filters,
    together with a handful of seeds sampled from the liked songs.

    Parameters
    ----------
    nsongs : int
        Number of recommendations to request. Must be positive. The Spotify
        recommendations endpoint returns at most 100 tracks per request, so
        larger values are capped at 100.
    batch_size : int, optional
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    dict
        The raw response of ``sp.recommendations``.

    Raises
    ------
    ValueError
        If ``nsongs`` is not positive, if there are no liked songs, or if no
        seed could be derived from the liked songs.
    '''
    # Spotify accepts at most 5 seeds in total across artists, genres and tracks
    max_seeds = 5
    # Features the Spotify API expects as integers rather than floats
    integer_features = {'duration_ms', 'key', 'mode', 'time_signature'}

    if nsongs < 1:
        raise ValueError('nsongs must be a positive integer.')

    numbered_ml_data = PLC.ml_data.reset_index()
    liked_songs = numbered_ml_data[PLC.ml_likes.reset_index()['like'] == 1]
    if liked_songs.empty:
        raise ValueError('No liked songs available to base recommendations on.')

    # First filter songs by the Gaussian statistics of the audio features
    # in liked playlists. Only numeric columns take part: the id/name columns
    # produced by reset_index() are strings and have no mean or std.
    liked_features = liked_songs.select_dtypes(include='number')
    feature_means = liked_features.mean().dropna()
    # std() is NaN when there is a single liked song; treat that as zero spread
    feature_stds = liked_features.std().reindex(feature_means.index).fillna(0)

    feature_bounds = {}
    for feature_name in feature_means.index:
        lower = feature_means[feature_name] - feature_stds[feature_name]
        upper = feature_means[feature_name] + feature_stds[feature_name]
        if feature_name in integer_features:
            # Round outwards so the integer range still covers mean +/- std
            lower, upper = int(np.floor(lower)), int(np.ceil(upper))
        else:
            lower, upper = float(lower), float(upper)
        feature_bounds['min_' + feature_name] = lower
        feature_bounds['max_' + feature_name] = upper

    # Get seed artists, genres, and tracks as plain lists (spotipy tests the
    # seeds for truthiness, which fails on a pandas Series).
    # NOTE(review): 'artist' and 'genre' are only used when such columns exist
    # in PLC.ml_data; their values are assumed to be Spotify artist ids and
    # genre names respectively -- confirm against the Playlist helpers.
    seed_genres = _pick_seeds(liked_songs, 'genre', 1)
    seed_artists = _pick_seeds(liked_songs, 'artist', 2)
    seed_songs = _pick_seeds(liked_songs, 'id',
                             max_seeds - len(seed_genres) - len(seed_artists))
    if not (seed_songs or seed_artists or seed_genres):
        raise ValueError('Could not derive any seed from the liked songs.')

    tracks = sp.recommendations(seed_artists=seed_artists or None,
                                seed_genres=seed_genres or None,
                                seed_tracks=seed_songs or None,
                                limit=min(nsongs, 100),
                                **feature_bounds)
    return tracks


def _pick_seeds(frame, column:str, max_count:int) -> list:
    '''Return up to ``max_count`` distinct string values sampled at random from ``frame[column]``.'''
    if max_count < 1 or column not in frame.columns:
        return []
    # dict.fromkeys de-duplicates while keeping first-seen order
    distinct_values = list(dict.fromkeys(value for value in frame[column] if isinstance(value, str)))
    return random.sample(distinct_values, min(max_count, len(distinct_values)))

## Scratch

In [68]:
# Print the node-by-node structure of every decision tree in the forest.
for dt in rf_classifier.estimators_:

    # Flat array view of the fitted tree: entry i of each array describes node i
    n_nodes = dt.tree_.node_count
    children_left, children_right = dt.tree_.children_left, dt.tree_.children_right
    feature, threshold = dt.tree_.feature, dt.tree_.threshold

    node_depth = np.zeros(n_nodes, dtype=np.int64)
    is_leaves = np.zeros(n_nodes, dtype=bool)

    # Depth-first walk from the root, tracked as (node id, depth) pairs
    stack = [(0, 0)]
    while stack:
        # Each node is pushed once and popped once, so it is visited exactly once
        node_id, depth = stack.pop()
        node_depth[node_id] = depth

        # Differing left/right children mark a split node; equal ones mark a leaf
        is_split_node = children_left[node_id] != children_right[node_id]
        if not is_split_node:
            is_leaves[node_id] = True
            continue

        # Queue both children one level deeper
        stack.append((children_left[node_id], depth + 1))
        stack.append((children_right[node_id], depth + 1))

    print(
        "The binary tree structure has {n} nodes and has "
        "the following tree structure:\n".format(n=n_nodes)
    )
    for i in range(n_nodes):
        if not is_leaves[i]:
            print(
                "{space}node={node} is a split node: "
                "go to node {left} if X[:, {feature}] <= {threshold} "
                "else to node {right}.".format(
                    space=node_depth[i] * "\t",
                    node=i,
                    left=children_left[i],
                    feature=feature[i],
                    threshold=threshold[i],
                    right=children_right[i],
                )
            )
            continue
        print(
            "{space}node={node} is a leaf node.".format(
                space=node_depth[i] * "\t", node=i
            )
        )

The binary tree structure has 7 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 9] <= 0.07660000026226044 else to node 4.
	node=1 is a split node: go to node 2 if X[:, 3] <= 0.6875 else to node 3.
		node=2 is a leaf node.
		node=3 is a leaf node.
	node=4 is a split node: go to node 5 if X[:, 6] <= 0.096950002014637 else to node 6.
		node=5 is a leaf node.
		node=6 is a leaf node.
The binary tree structure has 7 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 9] <= 0.056050000712275505 else to node 4.
	node=1 is a split node: go to node 2 if X[:, 0] <= 0.021585000562481582 else to node 3.
		node=2 is a leaf node.
		node=3 is a leaf node.
	node=4 is a split node: go to node 5 if X[:, 4] <= 0.35689999908208847 else to node 6.
		node=5 is a leaf node.
		node=6 is a leaf node.
The binary tree structure has 7 nodes and has the following tree structure:

node=0 is a split node: go to node 1 if X[:, 9] <= 0.056