### Here we will retrieve songs for each artist in our giant playlist of songs as well as in my personal library

#### These artists will be the artists we are able to recommend for our artist recommender as we will have all of their features stored

In [1]:
import pandas as pd
import numpy as np
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import os, json

In [2]:
cur_dir = os.getcwd()

In [3]:
df_full = pd.read_csv('MyData/biggest_playlist_ever.csv')
df_pers = pd.read_csv('MyData/songs_for_personal_analysis.csv')

In [4]:
df_pers.head()

Unnamed: 0.1,Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,trackName,artistName
0,0,5Le9sSLxWIaIEPPppZ9EuF,0.33,0.673,254118.0,0.8,0.0,7.0,0.689,-6.253,1.0,0.214,135.997,4.0,0.268,'Round Here,IODONTPLAY
1,1,5F7bIFd3xWuoXmvXFqFl5M,0.0825,0.744,365950.0,0.57,0.0,3.0,0.121,-4.359,0.0,0.0357,75.019,4.0,0.6,(Bonus) Air Canada,RiFF Raff & DOLLABiLLGATES
2,2,2PzU4IB8Dr6mxV3lHuaG34,0.0383,0.723,222813.0,0.863,0.0317,2.0,0.128,-7.89,1.0,0.0338,136.302,4.0,0.931,(I Can't Get No) Satisfaction - Mono Version,The Rolling Stones
3,3,3qLfQNPEE27KI3Hgd9Om8A,0.991,0.295,135653.0,0.0706,0.92,9.0,0.101,-20.157,0.0,0.0439,76.425,1.0,0.139,(prelude),Zachary Bruno
4,4,4txn9qnwK3ILQqv5oq2mO3,0.388,0.519,264213.0,0.809,0.0,1.0,0.275,-6.362,1.0,0.556,146.02,4.0,0.262,03' Adolescence,J. Cole


In [5]:
artists1 = df_full.artist.unique()
artists2 = df_pers.artistName.unique()

In [6]:
artists = np.append(artists1,artists2)

In [7]:
artists = artists.astype('str')

In [58]:
artists = np.unique(artists)

#### Retrieve 50 songs for each artist

In [17]:
CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID')
CLIENT_SECRET = os.environ.get('SPOTIFY_SECRET_ID')

In [18]:
client_credentials_manager = SpotifyClientCredentials(CLIENT_ID, CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [25]:
name = 'Radiohead'

results = sp.search(q='artist:' + name, type='artist')
items = results['artists']['items']

In [26]:
items

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/4Z8W4fKeB5YxbusRsdQVPb'},
  'followers': {'href': None, 'total': 5650121},
  'genres': ['alternative rock',
   'art rock',
   'melancholia',
   'oxford indie',
   'permanent wave',
   'rock'],
  'href': 'https://api.spotify.com/v1/artists/4Z8W4fKeB5YxbusRsdQVPb',
  'id': '4Z8W4fKeB5YxbusRsdQVPb',
  'images': [{'height': 640,
    'url': 'https://i.scdn.co/image/afcd616e1ef2d2786f47b3b4a8a6aeea24a72adc',
    'width': 640},
   {'height': 320,
    'url': 'https://i.scdn.co/image/563754af10b3d9f9f62a3458e699f58c4a02870f',
    'width': 320},
   {'height': 160,
    'url': 'https://i.scdn.co/image/4067ea225d8b42fa6951857d3af27dd07d60f3c6',
    'width': 160}],
  'name': 'Radiohead',
  'popularity': 80,
  'type': 'artist',
  'uri': 'spotify:artist:4Z8W4fKeB5YxbusRsdQVPb'},
 {'external_urls': {'spotify': 'https://open.spotify.com/artist/17mBFWKyCyp506a3n6XUWA'},
  'followers': {'href': None, 'total': 1},
  'genres': [],
  'href': 'ht

In [27]:
items[0]['id']

'4Z8W4fKeB5YxbusRsdQVPb'

In [30]:
def get_artist_ids(artist_names):
    ids = []
    for name in artist_names:
        try:
            results = sp.search(q='artist:' + name, type='artist')
            items = results['artists']['items']
            artist_id = items[0]['id']
            ids.append(artist_id)
        except:
            print(name)
    
    return ids
        

In [31]:
artist_ids = get_artist_ids(artists)

Aamupojat
Ella Lymi
Eternal Flames
Groove Sisters Media
Hotelli Vantaa
IDOLS 2007
Ipanapa
Kauniit & Uhkarohkeat
Linda Vink
Munamies
Profeetat
Puhuva Kone
Pää-äijät
Sueco the Child
Suurlähettiläät
Valvomo
Yung Anime


In [60]:
len(artist_ids)

1945

In [35]:
def get_top_tracks(artist_ids):
    """
    Retrieve features for each artists top 10 tracks
    
    """
    top_10s = {}
    
    for artist in artist_ids:
        top = sp.artist_top_tracks(artist, country='US')['tracks']
        top_tracks = []
        for track in top:
            top_tracks.append(track['id'])
        
        top_10s[artist] = top_tracks
        

    return top_10s

In [36]:
top_10s = get_top_tracks(artist_ids)

In [70]:
len(top_10s.keys())

1760

In [67]:
to_delete = []
for key, value in top_10s.items():
    if len(value)!=10:
        to_delete.append(key)
        
for key in to_delete:
    del top_10s[key]

In [68]:
artist_dic = dict(zip(artist_ids, artists))

#### What we have for our 500+ artists at this point:
- A dictionary of artist IDs and their top 10 track IDs
- A dictionary of artist names and their corresponding IDs

#### What's next
- Get song features for each artists top tracks
- Create a dataframe with columns: artist_id, artist_name, song_id, and all song features
- Aggregate the data frame by artist id to get a mean of song features.

In [51]:
df = pd.DataFrame.from_dict(top_10s, orient='index')

In [52]:
df = df.stack().reset_index()
df

Unnamed: 0,level_0,level_1,0
0,5IbEL2xjRtKsunfmsahLuO,0,0j8ppsOOawdPCJnSTcXgOy
1,5IbEL2xjRtKsunfmsahLuO,1,3oGbHF3Kdwf3AsRCbBjUxu
2,5IbEL2xjRtKsunfmsahLuO,2,706ZrLifsm0nwlucKr4kQg
3,5IbEL2xjRtKsunfmsahLuO,3,3PXi72ZtSqx1PZc40KS0Qj
4,5IbEL2xjRtKsunfmsahLuO,4,4aTyb0MBTzJXE75aHxeGW5
...,...,...,...
17595,1luoTtYQjMoJPSzl9YCO1B,5,4s4nBjnM0Sa7h60zQyMNmL
17596,1luoTtYQjMoJPSzl9YCO1B,6,4twmjsLIFyuqnwhTEHq4HC
17597,1luoTtYQjMoJPSzl9YCO1B,7,4jqvzScw5GpULd7XoAM5Y7
17598,1luoTtYQjMoJPSzl9YCO1B,8,4aUhHjaW2S0eBrPRWNY5N7


In [73]:
df.rename(columns = {'level_0':'artist_id',0:'track_id'}, inplace=True)
df.drop(columns = 'level_1',inplace=True)

In [74]:
df.head()

Unnamed: 0,artist_id,track_id
0,5IbEL2xjRtKsunfmsahLuO,0j8ppsOOawdPCJnSTcXgOy
1,5IbEL2xjRtKsunfmsahLuO,3oGbHF3Kdwf3AsRCbBjUxu
2,5IbEL2xjRtKsunfmsahLuO,706ZrLifsm0nwlucKr4kQg
3,5IbEL2xjRtKsunfmsahLuO,3PXi72ZtSqx1PZc40KS0Qj
4,5IbEL2xjRtKsunfmsahLuO,4aTyb0MBTzJXE75aHxeGW5


In [75]:
df['artist_name'] = df.artist_id.apply(lambda x: artist_dic[x])

In [76]:
df

Unnamed: 0,artist_id,track_id,artist_name
0,5IbEL2xjRtKsunfmsahLuO,0j8ppsOOawdPCJnSTcXgOy,$NOT
1,5IbEL2xjRtKsunfmsahLuO,3oGbHF3Kdwf3AsRCbBjUxu,$NOT
2,5IbEL2xjRtKsunfmsahLuO,706ZrLifsm0nwlucKr4kQg,$NOT
3,5IbEL2xjRtKsunfmsahLuO,3PXi72ZtSqx1PZc40KS0Qj,$NOT
4,5IbEL2xjRtKsunfmsahLuO,4aTyb0MBTzJXE75aHxeGW5,$NOT
...,...,...,...
17595,1luoTtYQjMoJPSzl9YCO1B,4s4nBjnM0Sa7h60zQyMNmL,iann dior
17596,1luoTtYQjMoJPSzl9YCO1B,4twmjsLIFyuqnwhTEHq4HC,iann dior
17597,1luoTtYQjMoJPSzl9YCO1B,4jqvzScw5GpULd7XoAM5Y7,iann dior
17598,1luoTtYQjMoJPSzl9YCO1B,4aUhHjaW2S0eBrPRWNY5N7,iann dior


#### Now we need the song features for each track

In [77]:
def create_feature_df(features):
    df_temp = pd.DataFrame.from_dict(features, orient = 'index')[:11]
    df_temp = df_temp.unstack().to_frame().T[0]
    return df_temp


def get_song_features(songs):
    features = ["danceability", "energy", "key", "loudness", "mode", "speechiness",
                             "instrumentalness", "liveness", "valence", "tempo"]
    df = pd.DataFrame(columns = features)
    for song in songs:
        spot_feats = sp.audio_features(song)[0]
        df_temp = create_feature_df(spot_feats)
        df = df.append(df_temp)
    
    return df

In [None]:
#Splitting df into two for processing as it times out if we do all of the songs at once
df_features1 = get_song_features(df['track_id'][:8800])

In [None]:
df_features2 = get_song_features(df['track_id'][8800:])

In [None]:
df_features1 = df_features.astype('float64')
df_features2 = df_features.astype('float64')

In [None]:
df_features = pd.concat([df_features1,df_features2])

In [None]:
df['tmp'] = range(len(df))
df_features['tmp'] = range(len(df))

df = pd.merge(df, df_features, on=['tmp'])
df = df.drop('tmp', axis=1)

In [None]:
df.head()

In [None]:
df_grouped = df.groupby(['artist_id','artist_name']).mean().reset_index()

In [None]:
df_grouped

#### Before we save the aggregated features, we are going to remove the features we previously found to have no correlation to me enjoying a song

#### Lets also see if we can find similar artists based on cosign similarity 

In [None]:
df_grouped.drop(columns = ['key','mode'], inplace=True)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:
df_grouped.iloc[:,2:]

In [None]:
sd = StandardScaler()
scaled = sd.fit_transform(df_grouped.iloc[:,2:])

In [None]:
df_grouped.iloc[:,2:] = scaled

In [None]:
df_grouped.head()

In [None]:
df_grouped.artist_name[80:100]

In [None]:
cosine_similarity([scaled[80]],[scaled[98]])

In [None]:
cosine_similarity([scaled[80]],[scaled[99]])

#### It works!

We can see that artists like ASAP Rocky and DMX who both rap are more similar than ASAP Rocky and Jack U (edm music)

#### Let's save our artist feature df and our standard scaler so that we can use it to scale artist features the same way in the future

In [None]:
import joblib

In [None]:
scaler_filename = "MyData/artist_feature_scaler.save"
joblib.dump(sd, scaler_filename) 

In [None]:
artist_feature_file_name = 'MyData/artist_features.csv'
df_grouped.to_csv(artist_feature_file_name)