### Here we retrieve songs for every artist that appears in our giant playlist of songs or in my personal library

#### These are the artists our artist recommender will be able to recommend, because we will have all of their features stored

In [1]:
import pandas as pd
import numpy as np
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import os, json

In [2]:
# Record the kernel's current working directory.
# NOTE(review): `cur_dir` is not referenced by any later cell visible here — confirm it is still needed.
cur_dir = os.getcwd()

In [3]:
# Load the two track tables the artist list is built from. The paths are relative,
# so they are resolved against the kernel's working directory.
# NOTE: `df_full` is rebound to a different (merged) table further down this notebook.
df_full = pd.read_csv('MyData/biggest_playlist_ever.csv')
df_pers = pd.read_csv('MyData/songs_for_personal_analysis.csv')

In [4]:
# Preview the personal-library table; its `artistName` column is used in the next cell.
df_pers.head()

Unnamed: 0.1,Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,trackName,artistName
0,0,5Le9sSLxWIaIEPPppZ9EuF,0.33,0.673,254118.0,0.8,0.0,7.0,0.689,-6.253,1.0,0.214,135.997,4.0,0.268,'Round Here,IODONTPLAY
1,1,5F7bIFd3xWuoXmvXFqFl5M,0.0825,0.744,365950.0,0.57,0.0,3.0,0.121,-4.359,0.0,0.0357,75.019,4.0,0.6,(Bonus) Air Canada,RiFF Raff & DOLLABiLLGATES
2,2,2PzU4IB8Dr6mxV3lHuaG34,0.0383,0.723,222813.0,0.863,0.0317,2.0,0.128,-7.89,1.0,0.0338,136.302,4.0,0.931,(I Can't Get No) Satisfaction - Mono Version,The Rolling Stones
3,3,3qLfQNPEE27KI3Hgd9Om8A,0.991,0.295,135653.0,0.0706,0.92,9.0,0.101,-20.157,0.0,0.0439,76.425,1.0,0.139,(prelude),Zachary Bruno
4,4,4txn9qnwK3ILQqv5oq2mO3,0.388,0.519,264213.0,0.809,0.0,1.0,0.275,-6.362,1.0,0.556,146.02,4.0,0.262,03' Adolescence,J. Cole


In [5]:
# Distinct artist names from each source (the two tables name the column differently).
artists1 = df_full['artist'].unique()
artists2 = df_pers['artistName'].unique()

In [6]:
# Join both name arrays end to end; duplicates are removed a few cells below.
artists = np.concatenate((artists1, artists2))

In [7]:
# Cast every entry to a NumPy string so the combined array has a single dtype.
# NOTE(review): a missing artist (NaN) would become the literal string 'nan' here — confirm
# neither CSV has empty artist cells, otherwise 'nan' is later searched as an artist name.
artists = artists.astype('str')

In [8]:
# Drop names that occur more than once; np.unique also returns them sorted.
artists = np.unique(artists)

#### Retrieve the top 10 tracks for each artist

In [9]:
# Read the Spotify API credentials from the environment rather than hard-coding them.
# `os.environ.get` yields None when a variable is unset, so a missing credential is
# not reported by this cell.
CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID')
CLIENT_SECRET = os.environ.get('SPOTIFY_SECRET_ID')

In [10]:
# Build the Spotify client from the credentials read above. `sp` is the module-level
# client that the API helper functions defined below call directly.
client_credentials_manager = SpotifyClientCredentials(CLIENT_ID, CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [11]:
# Try the artist search on one well-known artist to see what a response looks like.
name = 'Radiohead'

results = sp.search(q=f'artist:{name}', type='artist')
items = results['artists']['items']

In [12]:
# Inspect the raw search hits: a list of artist objects.
items

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/4Z8W4fKeB5YxbusRsdQVPb'},
  'followers': {'href': None, 'total': 5698213},
  'genres': ['alternative rock',
   'art rock',
   'melancholia',
   'oxford indie',
   'permanent wave',
   'rock'],
  'href': 'https://api.spotify.com/v1/artists/4Z8W4fKeB5YxbusRsdQVPb',
  'id': '4Z8W4fKeB5YxbusRsdQVPb',
  'images': [{'height': 640,
    'url': 'https://i.scdn.co/image/afcd616e1ef2d2786f47b3b4a8a6aeea24a72adc',
    'width': 640},
   {'height': 320,
    'url': 'https://i.scdn.co/image/563754af10b3d9f9f62a3458e699f58c4a02870f',
    'width': 320},
   {'height': 160,
    'url': 'https://i.scdn.co/image/4067ea225d8b42fa6951857d3af27dd07d60f3c6',
    'width': 160}],
  'name': 'Radiohead',
  'popularity': 79,
  'type': 'artist',
  'uri': 'spotify:artist:4Z8W4fKeB5YxbusRsdQVPb'},
 {'external_urls': {'spotify': 'https://open.spotify.com/artist/17mBFWKyCyp506a3n6XUWA'},
  'followers': {'href': None, 'total': 1},
  'genres': [],
  'href': 'ht

In [13]:
# The first hit's `id` is the artist ID used for the lookups below.
items[0]['id']

'4Z8W4fKeB5YxbusRsdQVPb'

In [14]:
def get_artist_ids(artist_names):
    """
    Resolve artist names to Spotify artist IDs using the search endpoint.

    For each name the first search hit is taken. The first hit is not always an
    exact match, so two different input names can resolve to the same artist;
    each artist ID is therefore kept only once, paired with the name Spotify
    reports for that artist rather than the name that was searched for.

    Parameters
    ----------
    artist_names : iterable of str
        Names to look up.

    Returns
    -------
    ids : list of str
        Unique Spotify artist IDs, in first-seen order.
    names : list of str
        Spotify's name for each ID, aligned position-by-position with `ids`.

    Names that produce no search hit, or whose lookup fails, are printed and skipped.
    """
    ids = []
    names = []
    seen_ids = set()  # constant-time duplicate check; `ids` preserves the order

    for name in artist_names:
        try:
            results = sp.search(q='artist:' + name, type='artist')
            items = results['artists']['items']
        except Exception:
            # Best effort: report the name and carry on. Catching Exception (not a
            # bare except) means Ctrl-C can still interrupt this long-running loop.
            print(name)
            continue

        if not items:
            # The search returned no artists for this name.
            print(name)
            continue

        # The search hit is already a full artist object, so its `name` field is
        # used directly instead of issuing a second request per artist.
        best_hit = items[0]
        artist_id = best_hit['id']
        if artist_id not in seen_ids:
            seen_ids.add(artist_id)
            ids.append(artist_id)
            names.append(best_hit['name'])

    return ids, names
        

In [15]:
# Resolve every collected name. The names printed under this cell are the ones that
# could not be resolved and were skipped.
artist_ids, artist_names = get_artist_ids(artists)

Aamupojat
Ella Lymi
Eternal Flames
Groove Sisters Media
Hotelli Vantaa
IDOLS 2007
Ipanapa
Kauniit & Uhkarohkeat
Linda Vink
Munamies
Profeetat
Puhuva Kone
Pää-äijät
Sueco the Child
Suurlähettiläät
Valvomo
Yung Anime


In [16]:
# Show the input name array for comparison with the resolved lists.
artists

array(['$NOT', '$uicideBoy$', '$wave', ..., 'will.i.am', 'Ólafur Arnalds',
       'Ēriks Ešenvalds'], dtype='<U46')

In [17]:
# Number of resolved artist names.
len(artist_names)

1910

In [18]:
# Number of resolved artist IDs; should equal the number of names above.
len(artist_ids)

1910

In [19]:
def get_top_tracks(artist_ids):
    """
    Look up each artist's top tracks in the US market.

    Returns a dict that maps every artist ID in `artist_ids` to the list of track
    IDs Spotify reports as that artist's top tracks. Only track IDs are collected
    here; audio features are fetched separately.
    """
    return {
        artist_id: [
            track['id']
            for track in sp.artist_top_tracks(artist_id, country='US')['tracks']
        ]
        for artist_id in artist_ids
    }

In [20]:
# Fetch the top-track IDs for every resolved artist (one API request per artist).
top_10s = get_top_tracks(artist_ids)

In [21]:
# Number of artists we have a top-track list for.
len(top_10s)

1910

In [22]:
# Keep only artists with exactly 10 top tracks. The keys to remove are collected
# first because a dict cannot be resized while it is being iterated.
to_delete = [key for key, value in top_10s.items() if len(value) != 10]

for key in to_delete:
    top_10s.pop(key)

In [23]:
# Map artist ID -> artist name. The two lists are aligned by position because
# get_artist_ids appends to both together.
artist_dic = dict(zip(artist_ids, artist_names))

#### What we have for our 1900+ artists at this point:
- A dictionary of artist IDs and their top 10 track IDs
- A dictionary of artist IDs and their corresponding names

#### What's next
- Get song features for each artist's top tracks
- Create a dataframe with columns: artist_id, artist_name, song_id, and all song features
- Aggregate the data frame by artist id to get a mean of song features.

In [24]:
# Wide layout: one row per artist (index = artist ID), one column per track position.
df = pd.DataFrame.from_dict(top_10s, orient='index')

In [25]:
# Reshape wide -> long: one row per (artist, track). After reset_index the columns are
# `level_0` (artist ID), `level_1` (track position) and `0` (track ID).
# NOTE: this rebinds `df`, so running the cell a second time reshapes the already-long frame.
df = df.stack().reset_index()
df

Unnamed: 0,level_0,level_1,0
0,5IbEL2xjRtKsunfmsahLuO,0,0j8ppsOOawdPCJnSTcXgOy
1,5IbEL2xjRtKsunfmsahLuO,1,3oGbHF3Kdwf3AsRCbBjUxu
2,5IbEL2xjRtKsunfmsahLuO,2,706ZrLifsm0nwlucKr4kQg
3,5IbEL2xjRtKsunfmsahLuO,3,3PXi72ZtSqx1PZc40KS0Qj
4,5IbEL2xjRtKsunfmsahLuO,4,0lB24x4SNbi8lLhUAsdoMO
...,...,...,...
17625,1luoTtYQjMoJPSzl9YCO1B,5,4s4nBjnM0Sa7h60zQyMNmL
17626,1luoTtYQjMoJPSzl9YCO1B,6,4jqvzScw5GpULd7XoAM5Y7
17627,1luoTtYQjMoJPSzl9YCO1B,7,6lbey3UxHRqajxd3OAoIOq
17628,1luoTtYQjMoJPSzl9YCO1B,8,4aUhHjaW2S0eBrPRWNY5N7


In [26]:
# Give the columns meaningful names and discard the track-position column.
df = (
    df.rename(columns={'level_0': 'artist_id', 0: 'track_id'})
      .drop(columns='level_1')
)

In [27]:
# Check the renamed long table: one (artist_id, track_id) pair per row.
df.head()

Unnamed: 0,artist_id,track_id
0,5IbEL2xjRtKsunfmsahLuO,0j8ppsOOawdPCJnSTcXgOy
1,5IbEL2xjRtKsunfmsahLuO,3oGbHF3Kdwf3AsRCbBjUxu
2,5IbEL2xjRtKsunfmsahLuO,706ZrLifsm0nwlucKr4kQg
3,5IbEL2xjRtKsunfmsahLuO,3PXi72ZtSqx1PZc40KS0Qj
4,5IbEL2xjRtKsunfmsahLuO,0lB24x4SNbi8lLhUAsdoMO


In [28]:
# Attach the artist's display name; an ID missing from `artist_dic` raises KeyError.
df['artist_name'] = df['artist_id'].map(lambda artist_id: artist_dic[artist_id])

In [29]:
# Spot-check one artist: all ten of their tracks should carry the same ID and name.
df[df['artist_name'] == "Tina Turner"]

Unnamed: 0,artist_id,track_id,artist_name
15610,1zuJe6b1roixEKMOtyrEak,3Be7CLdHZpyzsVijme39cW,Tina Turner
15611,1zuJe6b1roixEKMOtyrEak,3ErsOxqe2RmXkR65wkygDz,Tina Turner
15612,1zuJe6b1roixEKMOtyrEak,5LNSHyPAr0qWGapTj54VU1,Tina Turner
15613,1zuJe6b1roixEKMOtyrEak,6gJdDnF2TzfA1WPMXuCa3x,Tina Turner
15614,1zuJe6b1roixEKMOtyrEak,6pPWRBubXOBAHnjl5ZIujB,Tina Turner
15615,1zuJe6b1roixEKMOtyrEak,50XXRUFNjs85P0MjCZ1c9X,Tina Turner
15616,1zuJe6b1roixEKMOtyrEak,2TxWkdptNrm2Z0CZfMr3Iq,Tina Turner
15617,1zuJe6b1roixEKMOtyrEak,19DVNifOaTnuP0iq1kTqgW,Tina Turner
15618,1zuJe6b1roixEKMOtyrEak,5JjdJK0uGRUk4skRTuQdUZ,Tina Turner
15619,1zuJe6b1roixEKMOtyrEak,5xGsNXXTu545MWoeuFfjxT,Tina Turner


#### Now we need the song features for each track

In [30]:
def create_feature_df(features):
    """
    Turn one audio-features mapping into a single-row DataFrame.

    Only the first 11 entries of `features` are kept, in the mapping's own order;
    they become the columns of the returned frame, whose single row has index 0.
    NOTE(review): this relies on the numeric audio features being the first 11 keys
    of the mapping — confirm against the API response.
    """
    as_column = pd.DataFrame.from_dict(features, orient='index')
    leading_entries = as_column.iloc[:11]
    # Transposing the single column (labelled 0) yields one row with index 0.
    return leading_entries.T


def get_song_features(songs):
    """
    Fetch Spotify audio features for a sequence of track IDs.

    Parameters
    ----------
    songs : iterable of str
        Spotify track IDs.

    Returns
    -------
    pandas.DataFrame
        One row per input track, in input order (every row keeps index label 0, as
        produced by `create_feature_df`). The columns named in `features` come
        first, followed by any additional column `create_feature_df` returns.
        A track for which the API returns no features yields a row of NaN, so the
        row count always equals the number of input tracks and callers can keep
        aligning the result with their track list by position.
    """
    features = ["danceability", "energy", "key", "loudness", "mode", "speechiness",
                "instrumentalness", "liveness", "valence", "tempo"]

    track_ids = list(songs)
    if not track_ids:
        return pd.DataFrame(columns=features)

    # The audio-features endpoint accepts up to 100 IDs per request; batching
    # replaces one HTTP round trip per track with one per 100 tracks.
    batch_size = 100
    rows = []
    for start in range(0, len(track_ids), batch_size):
        batch = track_ids[start:start + batch_size]
        for spot_feats in sp.audio_features(batch):
            if spot_feats is None:
                # No features for this track: keep its position with a NaN row.
                rows.append(pd.DataFrame(index=[0], columns=features))
            else:
                rows.append(create_feature_df(spot_feats))

    # Concatenate once instead of growing a frame row by row (quadratic, and
    # DataFrame.append no longer exists in pandas 2.x).
    combined = pd.concat(rows)
    extra_columns = [col for col in combined.columns if col not in features]
    return combined.reindex(columns=features + extra_columns)

In [31]:
# The track list is processed in two halves because fetching every song in a single
# call times out. This cell handles the first 8800 rows; the next cell handles the rest.
df_features1 = get_song_features(df['track_id'][:8800])

In [32]:
# Second half of the track list (rows 8800 onward).
df_features2 = get_song_features(df['track_id'][8800:])

In [33]:
# Cast every feature column to float64 so the values can be averaged per artist below.
df_features1 = df_features1.astype('float64')
df_features2 = df_features2.astype('float64')

In [34]:
# Stack the two halves back together in their original order (row labels are not reset).
df_features = pd.concat([df_features1,df_features2])

In [35]:
# Attach each track's audio features to its (artist_id, track_id, artist_name) row.
# The two frames share no key column: row i of `df_features` describes row i of `df`,
# so they are joined purely by position. Refuse to continue if the row counts differ,
# because features would otherwise be paired with the wrong tracks.
if len(df) != len(df_features):
    raise ValueError(
        "df has {} rows but df_features has {}; cannot align them by position".format(
            len(df), len(df_features)
        )
    )

# Resetting both indexes gives matching 0..n-1 labels, so the column-wise concat
# lines the rows up by position without adding a scratch key column to either input.
df_full = pd.concat(
    [df.reset_index(drop=True), df_features.reset_index(drop=True)],
    axis=1,
)

In [36]:
# Average every audio feature over each artist's tracks, giving one row per artist.
# numeric_only=True skips the string `track_id` column; without it pandas >= 2.0 raises
# a TypeError instead of silently dropping non-numeric columns.
df_grouped = (
    df_full.groupby(['artist_id', 'artist_name'])
    .mean(numeric_only=True)
    .reset_index()
)

In [37]:
# Inspect the per-artist feature averages.
df_grouped

Unnamed: 0,artist_id,artist_name,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,acousticness
0,0027wHZDQXpRll4ckwDGad,Disco Ensemble,0.4823,0.8873,7.5,-4.7972,0.4,0.07973,7.599521e-03,0.20245,0.49240,146.5553,0.004361
1,00FQb4jTyendYWaN8pK0wa,Lana Del Rey,0.4983,0.4583,4.7,-9.4793,0.4,0.04338,3.535402e-02,0.11510,0.21570,107.0478,0.515377
2,00IjdWQ46sSBP4gZYObAMx,On The Rocks,0.6761,0.6972,5.6,-6.5155,0.6,0.06595,3.130000e-04,0.25510,0.69340,117.7095,0.377210
3,00TKPo9MxwZ0j4ooveIxWZ,Loote,0.6294,0.6818,5.7,-6.2586,0.8,0.07885,8.830000e-07,0.13791,0.49920,122.1219,0.145210
4,01Er12nK5rrnHx8usFPJAs,Derek Fiechter,0.4993,0.3076,4.6,-15.8738,0.1,0.03299,8.505000e-01,0.17755,0.49173,127.2996,0.639789
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1758,7yPPzu5UdAK7yagQqjEZQm,Party Favor,0.6114,0.8261,5.5,-4.5408,0.8,0.11784,1.601823e-01,0.25483,0.41450,132.8922,0.036414
1759,7ypOZKaKGrCf3V6pOuaXiM,Alexander Oscar,0.6773,0.7110,6.3,-5.2721,0.3,0.06517,2.658000e-05,0.20243,0.48540,107.9111,0.099248
1760,7z2avKuuiMAT4XZJFv8Rvh,Tom Walker,0.5302,0.5401,5.6,-6.7333,0.8,0.16382,8.134000e-06,0.12267,0.33560,116.3840,0.566260
1761,7zX44fpv6srJt3HfBv0GCn,Pete Parkkonen,0.5686,0.6862,5.0,-5.7686,0.6,0.04078,1.817175e-03,0.13095,0.33580,114.7642,0.022741


#### Before we save the aggregated features, we are going to remove the features we previously found to have no correlation with whether I enjoy a song

#### Let's also see if we can find similar artists based on cosine similarity

In [38]:
# Remove `mode` from the per-artist features. The drop is not done in place and
# tolerates the column already being absent, so the cell can be re-run safely.
df_grouped = df_grouped.drop(columns=['mode'], errors='ignore')

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [40]:
# Feature columns only: the first two columns are artist_id and artist_name.
df_grouped.iloc[:,2:]

Unnamed: 0,danceability,energy,key,loudness,speechiness,instrumentalness,liveness,valence,tempo,acousticness
0,0.4823,0.8873,7.5,-4.7972,0.07973,7.599521e-03,0.20245,0.49240,146.5553,0.004361
1,0.4983,0.4583,4.7,-9.4793,0.04338,3.535402e-02,0.11510,0.21570,107.0478,0.515377
2,0.6761,0.6972,5.6,-6.5155,0.06595,3.130000e-04,0.25510,0.69340,117.7095,0.377210
3,0.6294,0.6818,5.7,-6.2586,0.07885,8.830000e-07,0.13791,0.49920,122.1219,0.145210
4,0.4993,0.3076,4.6,-15.8738,0.03299,8.505000e-01,0.17755,0.49173,127.2996,0.639789
...,...,...,...,...,...,...,...,...,...,...
1758,0.6114,0.8261,5.5,-4.5408,0.11784,1.601823e-01,0.25483,0.41450,132.8922,0.036414
1759,0.6773,0.7110,6.3,-5.2721,0.06517,2.658000e-05,0.20243,0.48540,107.9111,0.099248
1760,0.5302,0.5401,5.6,-6.7333,0.16382,8.134000e-06,0.12267,0.33560,116.3840,0.566260
1761,0.5686,0.6862,5.0,-5.7686,0.04078,1.817175e-03,0.13095,0.33580,114.7642,0.022741


In [41]:
# Fit a min-max scaler on the feature columns (everything after artist_id and
# artist_name) and rescale them in one step. The fitted scaler is kept so it can be saved.
normalizer = MinMaxScaler()
scaled = normalizer.fit_transform(df_grouped.iloc[:, 2:])

In [42]:
# Same layout as df_grouped, with the feature columns replaced by their scaled values.
# Working on a copy leaves the unscaled averages in df_grouped untouched.
gp_scaled = df_grouped.copy()
gp_scaled.iloc[:,2:] = scaled

In [43]:
# Preview the scaled per-artist features.
gp_scaled.head()

Unnamed: 0,artist_id,artist_name,danceability,energy,key,loudness,speechiness,instrumentalness,liveness,valence,tempo,acousticness
0,0027wHZDQXpRll4ckwDGad,Disco Ensemble,0.44927,0.916454,0.797468,0.933256,0.122197,0.007989404,0.254524,0.517554,0.76587,0.004168
1,00FQb4jTyendYWaN8pK0wa,Lana Del Rey,0.469411,0.469555,0.443038,0.811524,0.037283,0.03716781,0.08952,0.202043,0.42259,0.517811
2,00IjdWQ46sSBP4gZYObAMx,On The Rocks,0.693228,0.718422,0.556962,0.888581,0.090007,0.000329058,0.35398,0.746747,0.51523,0.378934
3,00TKPo9MxwZ0j4ooveIxWZ,Loote,0.634441,0.70238,0.56962,0.895261,0.120141,9.283011e-07,0.132608,0.525308,0.553569,0.145742
4,01Er12nK5rrnHx8usFPJAs,Derek Fiechter,0.47067,0.312567,0.43038,0.645269,0.013012,0.8941337,0.207488,0.51679,0.598558,0.642863


In [44]:
# List artist names with their row positions, to pick rows for the similarity checks below.
gp_scaled.artist_name[10:50]

10                                  Weiland
11                                   G-Eazy
12                                   Strobe
13    Nathaniel Rateliff & The Night Sweats
14                          Spencer Barnett
15                           J. Karjalainen
16                        Takeharu Ishimoto
17                             Portion Boys
18                            Austin Mahone
19                              Landon Cube
20                                 Maroon 5
21                                   Grimes
22                                   Spekti
23                             Taylor Swift
24                                MettaForm
25                                 Owl City
26                                     KISS
27                             Jason Derulo
28                                 KING SOL
29                                will.i.am
30                               Happoradio
31                      Yusuf / Cat Stevens
32                            Fl

In [45]:
# Spot-check: cosine similarity between the scaled feature vectors at row positions 37 and 38.
cosine_similarity([scaled[37]],[scaled[38]])

array([[0.95856794]])

In [46]:
# Second spot-check, comparing row position 26 with row position 37.
cosine_similarity([scaled[26]],[scaled[37]])

array([[0.86139882]])

In [47]:
import joblib

In [48]:
# Persist the fitted scaler to two locations: the notebook's data folder and the
# Flask app's data folder.
# NOTE(review): presumably both directories already exist — confirm, since nothing here creates them.
scaler_filename = "MyData/artist_feature_scaler.save"
scaler_filename_app = "flask_app/MyData/artist_feature_scaler.save"
joblib.dump(normalizer, scaler_filename) 
joblib.dump(normalizer, scaler_filename_app) 

['flask_app/MyData/artist_feature_scaler.save']

In [49]:
# Write the scaled per-artist features to the same two locations as the scaler.
# NOTE(review): to_csv also writes the row index as an unnamed first column by default;
# a reader that does not pass index_col=0 will see it as 'Unnamed: 0'. Confirm what the
# consumers of these files expect before changing the format.
artist_feature_filename = 'MyData/artist_features.csv'
artist_feature_filename_app = 'flask_app/MyData/artist_features.csv'
gp_scaled.to_csv(artist_feature_filename)
gp_scaled.to_csv(artist_feature_filename_app)