### Here we will retrieve songs for each artist in our giant playlist of songs as well as in my personal library

#### These artists will be the artists we are able to recommend for our artist recommender as we will have all of their features stored

In [1]:
import pandas as pd
import numpy as np
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import os, json

In [2]:
cur_dir = os.getcwd()

In [3]:
df_full = pd.read_csv('MyData/biggest_playlist_ever.csv')
df_pers = pd.read_csv('MyData/songs_for_personal_analysis.csv')

In [4]:
df_pers.head()

Unnamed: 0.1,Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,trackName,artistName
0,0,5Le9sSLxWIaIEPPppZ9EuF,0.33,0.673,254118.0,0.8,0.0,7.0,0.689,-6.253,1.0,0.214,135.997,4.0,0.268,'Round Here,IODONTPLAY
1,1,5F7bIFd3xWuoXmvXFqFl5M,0.0825,0.744,365950.0,0.57,0.0,3.0,0.121,-4.359,0.0,0.0357,75.019,4.0,0.6,(Bonus) Air Canada,RiFF Raff & DOLLABiLLGATES
2,2,2PzU4IB8Dr6mxV3lHuaG34,0.0383,0.723,222813.0,0.863,0.0317,2.0,0.128,-7.89,1.0,0.0338,136.302,4.0,0.931,(I Can't Get No) Satisfaction - Mono Version,The Rolling Stones
3,3,3qLfQNPEE27KI3Hgd9Om8A,0.991,0.295,135653.0,0.0706,0.92,9.0,0.101,-20.157,0.0,0.0439,76.425,1.0,0.139,(prelude),Zachary Bruno
4,4,4txn9qnwK3ILQqv5oq2mO3,0.388,0.519,264213.0,0.809,0.0,1.0,0.275,-6.362,1.0,0.556,146.02,4.0,0.262,03' Adolescence,J. Cole


In [5]:
artists1 = df_full.artist.unique()
artists2 = df_pers.artistName.unique()

In [6]:
artists = np.append(artists1,artists2)

In [7]:
artists = artists.astype('str')

In [8]:
artists = np.unique(artists)

#### Retrieve 50 songs for each artist

In [9]:
CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID')
CLIENT_SECRET = os.environ.get('SPOTIFY_SECRET_ID')

In [10]:
client_credentials_manager = SpotifyClientCredentials(CLIENT_ID, CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [11]:
name = 'Radiohead'

results = sp.search(q='artist:' + name, type='artist')
items = results['artists']['items']

In [12]:
items

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/4Z8W4fKeB5YxbusRsdQVPb'},
  'followers': {'href': None, 'total': 5650121},
  'genres': ['alternative rock',
   'art rock',
   'melancholia',
   'oxford indie',
   'permanent wave',
   'rock'],
  'href': 'https://api.spotify.com/v1/artists/4Z8W4fKeB5YxbusRsdQVPb',
  'id': '4Z8W4fKeB5YxbusRsdQVPb',
  'images': [{'height': 640,
    'url': 'https://i.scdn.co/image/afcd616e1ef2d2786f47b3b4a8a6aeea24a72adc',
    'width': 640},
   {'height': 320,
    'url': 'https://i.scdn.co/image/563754af10b3d9f9f62a3458e699f58c4a02870f',
    'width': 320},
   {'height': 160,
    'url': 'https://i.scdn.co/image/4067ea225d8b42fa6951857d3af27dd07d60f3c6',
    'width': 160}],
  'name': 'Radiohead',
  'popularity': 80,
  'type': 'artist',
  'uri': 'spotify:artist:4Z8W4fKeB5YxbusRsdQVPb'},
 {'external_urls': {'spotify': 'https://open.spotify.com/artist/17mBFWKyCyp506a3n6XUWA'},
  'followers': {'href': None, 'total': 1},
  'genres': [],
  'href': 'ht

In [13]:
items[0]['id']

'4Z8W4fKeB5YxbusRsdQVPb'

In [14]:
def get_artist_ids(artist_names):
    ids = []
    for name in artist_names:
        try:
            results = sp.search(q='artist:' + name, type='artist')
            items = results['artists']['items']
            artist_id = items[0]['id']
            ids.append(artist_id)
        except:
            print(name)
    
    return ids
        

In [15]:
artist_ids = get_artist_ids(artists)

Aamupojat
Ella Lymi
Eternal Flames
Groove Sisters Media
Hotelli Vantaa
IDOLS 2007
Ipanapa
Kauniit & Uhkarohkeat
Linda Vink
Munamies
Profeetat
Puhuva Kone
Pää-äijät
Sueco the Child
Suurlähettiläät
Valvomo
Yung Anime


In [16]:
len(artist_ids)

1945

In [17]:
def get_top_tracks(artist_ids):
    """
    Retrieve features for each artists top 10 tracks
    
    """
    top_10s = {}
    
    for artist in artist_ids:
        top = sp.artist_top_tracks(artist, country='US')['tracks']
        top_tracks = []
        for track in top:
            top_tracks.append(track['id'])
        
        top_10s[artist] = top_tracks
        

    return top_10s

In [18]:
top_10s = get_top_tracks(artist_ids)

In [19]:
len(top_10s.keys())

1909

In [20]:
to_delete = []
for key, value in top_10s.items():
    if len(value)!=10:
        to_delete.append(key)
        
for key in to_delete:
    del top_10s[key]

In [21]:
artist_dic = dict(zip(artist_ids, artists))

#### What we have for our 500+ artists at this point:
- A dictionary of artist IDs and their top 10 track IDs
- A dictionary of artist names and their corresponding IDs

#### What's next
- Get song features for each artists top tracks
- Create a dataframe with columns: artist_id, artist_name, song_id, and all song features
- Aggregate the data frame by artist id to get a mean of song features.

In [22]:
df = pd.DataFrame.from_dict(top_10s, orient='index')

In [23]:
df = df.stack().reset_index()
df

Unnamed: 0,level_0,level_1,0
0,5IbEL2xjRtKsunfmsahLuO,0,0j8ppsOOawdPCJnSTcXgOy
1,5IbEL2xjRtKsunfmsahLuO,1,3oGbHF3Kdwf3AsRCbBjUxu
2,5IbEL2xjRtKsunfmsahLuO,2,706ZrLifsm0nwlucKr4kQg
3,5IbEL2xjRtKsunfmsahLuO,3,3PXi72ZtSqx1PZc40KS0Qj
4,5IbEL2xjRtKsunfmsahLuO,4,4aTyb0MBTzJXE75aHxeGW5
...,...,...,...
17605,1luoTtYQjMoJPSzl9YCO1B,5,4s4nBjnM0Sa7h60zQyMNmL
17606,1luoTtYQjMoJPSzl9YCO1B,6,4twmjsLIFyuqnwhTEHq4HC
17607,1luoTtYQjMoJPSzl9YCO1B,7,4jqvzScw5GpULd7XoAM5Y7
17608,1luoTtYQjMoJPSzl9YCO1B,8,4aUhHjaW2S0eBrPRWNY5N7


In [24]:
df.rename(columns = {'level_0':'artist_id',0:'track_id'}, inplace=True)
df.drop(columns = 'level_1',inplace=True)

In [25]:
df.head()

Unnamed: 0,artist_id,track_id
0,5IbEL2xjRtKsunfmsahLuO,0j8ppsOOawdPCJnSTcXgOy
1,5IbEL2xjRtKsunfmsahLuO,3oGbHF3Kdwf3AsRCbBjUxu
2,5IbEL2xjRtKsunfmsahLuO,706ZrLifsm0nwlucKr4kQg
3,5IbEL2xjRtKsunfmsahLuO,3PXi72ZtSqx1PZc40KS0Qj
4,5IbEL2xjRtKsunfmsahLuO,4aTyb0MBTzJXE75aHxeGW5


In [26]:
df['artist_name'] = df.artist_id.apply(lambda x: artist_dic[x])

In [27]:
df

Unnamed: 0,artist_id,track_id,artist_name
0,5IbEL2xjRtKsunfmsahLuO,0j8ppsOOawdPCJnSTcXgOy,$NOT
1,5IbEL2xjRtKsunfmsahLuO,3oGbHF3Kdwf3AsRCbBjUxu,$NOT
2,5IbEL2xjRtKsunfmsahLuO,706ZrLifsm0nwlucKr4kQg,$NOT
3,5IbEL2xjRtKsunfmsahLuO,3PXi72ZtSqx1PZc40KS0Qj,$NOT
4,5IbEL2xjRtKsunfmsahLuO,4aTyb0MBTzJXE75aHxeGW5,$NOT
...,...,...,...
17605,1luoTtYQjMoJPSzl9YCO1B,4s4nBjnM0Sa7h60zQyMNmL,iann dior
17606,1luoTtYQjMoJPSzl9YCO1B,4twmjsLIFyuqnwhTEHq4HC,iann dior
17607,1luoTtYQjMoJPSzl9YCO1B,4jqvzScw5GpULd7XoAM5Y7,iann dior
17608,1luoTtYQjMoJPSzl9YCO1B,4aUhHjaW2S0eBrPRWNY5N7,iann dior


#### Now we need the song features for each track

In [28]:
def create_feature_df(features):
    df_temp = pd.DataFrame.from_dict(features, orient = 'index')[:11]
    df_temp = df_temp.unstack().to_frame().T[0]
    return df_temp


def get_song_features(songs):
    features = ["danceability", "energy", "key", "loudness", "mode", "speechiness",
                             "instrumentalness", "liveness", "valence", "tempo"]
    df = pd.DataFrame(columns = features)
    for song in songs:
        spot_feats = sp.audio_features(song)[0]
        df_temp = create_feature_df(spot_feats)
        df = df.append(df_temp)
    
    return df

In [29]:
#Splitting df into two for processing as it times out if we do all of the songs at once
df_features1 = get_song_features(df['track_id'][:8800])

In [30]:
df_features2 = get_song_features(df['track_id'][8800:])

In [32]:
df_features1 = df_features1.astype('float64')
df_features2 = df_features2.astype('float64')

In [33]:
df_features = pd.concat([df_features1,df_features2])

In [34]:
df['tmp'] = range(len(df))
df_features['tmp'] = range(len(df))

df = pd.merge(df, df_features, on=['tmp'])
df = df.drop('tmp', axis=1)

In [35]:
df.head()

Unnamed: 0,artist_id,track_id,artist_name,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,acousticness
0,5IbEL2xjRtKsunfmsahLuO,0j8ppsOOawdPCJnSTcXgOy,$NOT,0.847,0.448,0.0,-10.14,1.0,0.416,1.3e-05,0.325,0.488,79.975,0.81
1,5IbEL2xjRtKsunfmsahLuO,3oGbHF3Kdwf3AsRCbBjUxu,$NOT,0.713,0.47,9.0,-8.698,1.0,0.0864,1.3e-05,0.306,0.0825,84.518,0.606
2,5IbEL2xjRtKsunfmsahLuO,706ZrLifsm0nwlucKr4kQg,$NOT,0.902,0.462,7.0,-7.945,1.0,0.0979,2e-06,0.094,0.646,103.984,0.19
3,5IbEL2xjRtKsunfmsahLuO,3PXi72ZtSqx1PZc40KS0Qj,$NOT,0.844,0.72,1.0,-7.418,0.0,0.3,0.0,0.0797,0.631,99.917,0.169
4,5IbEL2xjRtKsunfmsahLuO,4aTyb0MBTzJXE75aHxeGW5,$NOT,0.829,0.619,1.0,-7.258,1.0,0.198,0.000371,0.0975,0.351,96.035,0.29


In [36]:
df_grouped = df.groupby(['artist_id','artist_name']).mean().reset_index()

In [37]:
df_grouped

Unnamed: 0,artist_id,artist_name,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,acousticness
0,0027wHZDQXpRll4ckwDGad,Disco,0.4823,0.8873,7.5,-4.7972,0.4,0.07973,7.599521e-03,0.20245,0.49240,146.5553,0.004361
1,00FQb4jTyendYWaN8pK0wa,La Santa Cecilia,0.4983,0.4583,4.7,-9.4793,0.4,0.04338,3.535402e-02,0.11510,0.21570,107.0478,0.515377
2,00IjdWQ46sSBP4gZYObAMx,Ola,0.6761,0.6972,5.6,-6.5155,0.6,0.06595,3.130000e-04,0.25510,0.69340,117.7095,0.377210
3,00TKPo9MxwZ0j4ooveIxWZ,Little Big,0.5964,0.6456,5.0,-6.6473,0.8,0.07934,8.830000e-07,0.13651,0.47980,122.8356,0.188410
4,01Er12nK5rrnHx8usFPJAs,Deorro,0.4993,0.3076,4.6,-15.8738,0.1,0.03299,8.505000e-01,0.17755,0.49173,127.2996,0.639789
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1756,7yPPzu5UdAK7yagQqjEZQm,PSY,0.6493,0.8460,4.6,-4.3033,0.8,0.13094,1.599634e-01,0.25312,0.47800,140.4023,0.043568
1757,7ypOZKaKGrCf3V6pOuaXiM,Alexander Charles,0.6773,0.7110,6.3,-5.2721,0.3,0.06517,2.658000e-05,0.20243,0.48540,107.9111,0.099248
1758,7z2avKuuiMAT4XZJFv8Rvh,Tina Turner,0.5302,0.5401,5.6,-6.7333,0.8,0.16382,8.134000e-06,0.12267,0.33560,116.3840,0.566260
1759,7zX44fpv6srJt3HfBv0GCn,Pasi ja Anssi,0.5686,0.6862,5.0,-5.7686,0.6,0.04078,1.817175e-03,0.13095,0.33580,114.7642,0.022741


#### Before we save the aggregated features, we are going to remove the features we previously found to have no correlation to me enjoying a song

#### Lets also see if we can find similar artists based on cosign similarity 

In [38]:
df_grouped.drop(columns = ['key','mode'], inplace=True)

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [40]:
df_grouped.iloc[:,2:]

Unnamed: 0,danceability,energy,loudness,speechiness,instrumentalness,liveness,valence,tempo,acousticness
0,0.4823,0.8873,-4.7972,0.07973,7.599521e-03,0.20245,0.49240,146.5553,0.004361
1,0.4983,0.4583,-9.4793,0.04338,3.535402e-02,0.11510,0.21570,107.0478,0.515377
2,0.6761,0.6972,-6.5155,0.06595,3.130000e-04,0.25510,0.69340,117.7095,0.377210
3,0.5964,0.6456,-6.6473,0.07934,8.830000e-07,0.13651,0.47980,122.8356,0.188410
4,0.4993,0.3076,-15.8738,0.03299,8.505000e-01,0.17755,0.49173,127.2996,0.639789
...,...,...,...,...,...,...,...,...,...
1756,0.6493,0.8460,-4.3033,0.13094,1.599634e-01,0.25312,0.47800,140.4023,0.043568
1757,0.6773,0.7110,-5.2721,0.06517,2.658000e-05,0.20243,0.48540,107.9111,0.099248
1758,0.5302,0.5401,-6.7333,0.16382,8.134000e-06,0.12267,0.33560,116.3840,0.566260
1759,0.5686,0.6862,-5.7686,0.04078,1.817175e-03,0.13095,0.33580,114.7642,0.022741


In [41]:
sd = StandardScaler()
scaled = sd.fit_transform(df_grouped.iloc[:,2:])

In [42]:
df_grouped.iloc[:,2:] = scaled

In [43]:
df_grouped.head()

Unnamed: 0,artist_id,artist_name,danceability,energy,loudness,speechiness,instrumentalness,liveness,valence,tempo,acousticness
0,0027wHZDQXpRll4ckwDGad,Disco,-1.034519,1.320228,0.671936,-0.325285,-0.44797,0.376144,0.088414,2.145843,-1.013277
1,00FQb4jTyendYWaN8pK0wa,La Santa Cecilia,-0.918269,-0.746987,-0.195465,-0.809523,-0.340752,-1.09433,-1.585244,-1.021044,0.919824
2,00IjdWQ46sSBP4gZYObAMx,Ola,0.373561,0.404197,0.353605,-0.508856,-0.476118,1.262468,1.30419,-0.166411,0.397158
3,00TKPo9MxwZ0j4ooveIxWZ,Little Big,-0.20551,0.155553,0.329188,-0.330481,-0.477324,-0.733908,0.012201,0.244493,-0.317045
4,01Er12nK5rrnHx8usFPJAs,Deorro,-0.911003,-1.473162,-1.380104,-0.947933,2.808225,-0.04303,0.084361,0.602323,1.390456


In [44]:
df_grouped.artist_name[80:100]

80            Raappana
81        Edvard Grieg
82      Sunrise Avenue
83                ABBA
84           JP Cooper
85    Darnell Williams
86      Ellie Goulding
87           Sir Chloe
88                Sima
89            Aly & AJ
90        Robin Thicke
91       Missy Elliott
92    Carrie Underwood
93       Agustín Amigó
94        Alli Simpson
95        Vincent Boot
96      Eternal Flames
97                dvsn
98          The B-52's
99               Tiagz
Name: artist_name, dtype: object

In [45]:
cosine_similarity([scaled[80]],[scaled[98]])

array([[0.25151238]])

In [46]:
cosine_similarity([scaled[80]],[scaled[99]])

array([[-0.28397049]])

#### It works!

We can see that artists like ASAP Rocky and DMX who both rap are more similar than ASAP Rocky and Jack U (edm music)

#### Let's save our artist feature df and our standard scaler so that we can use it to scale artist features the same way in the future

In [47]:
import joblib

In [48]:
scaler_filename = "MyData/artist_feature_scaler.save"
joblib.dump(sd, scaler_filename) 

['MyData/artist_feature_scaler.save']

In [49]:
artist_feature_file_name = 'MyData/artist_features.csv'
df_grouped.to_csv(artist_feature_file_name)