### Here we will retrieve songs for each artist in our giant playlist of songs as well as in my personal library

#### These artists will be the artists we are able to recommend for our artist recommender as we will have all of their features stored

In [27]:
import pandas as pd
import numpy as np
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import os, json

In [4]:
cur_dir = os.getcwd()

In [5]:
df_full = pd.read_csv('MyData/biggest_playlist_ever.csv')
df_pers = pd.read_csv('MyData/songs_for_personal_analysis.csv')

In [32]:
df_pers.head()

Unnamed: 0.1,Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,trackName,artistName
0,0,5Le9sSLxWIaIEPPppZ9EuF,0.33,0.673,254118.0,0.8,0.0,7.0,0.689,-6.253,1.0,0.214,135.997,4.0,0.268,'Round Here,IODONTPLAY
1,1,5F7bIFd3xWuoXmvXFqFl5M,0.0825,0.744,365950.0,0.57,0.0,3.0,0.121,-4.359,0.0,0.0357,75.019,4.0,0.6,(Bonus) Air Canada,RiFF Raff & DOLLABiLLGATES
2,2,2PzU4IB8Dr6mxV3lHuaG34,0.0383,0.723,222813.0,0.863,0.0317,2.0,0.128,-7.89,1.0,0.0338,136.302,4.0,0.931,(I Can't Get No) Satisfaction - Mono Version,The Rolling Stones
3,3,3qLfQNPEE27KI3Hgd9Om8A,0.991,0.295,135653.0,0.0706,0.92,9.0,0.101,-20.157,0.0,0.0439,76.425,1.0,0.139,(prelude),Zachary Bruno
4,4,4txn9qnwK3ILQqv5oq2mO3,0.388,0.519,264213.0,0.809,0.0,1.0,0.275,-6.362,1.0,0.556,146.02,4.0,0.262,03' Adolescence,J. Cole


In [13]:
artist1 = df_full.artist.unique()
artists2 = df_pers.artistName.unique()

In [18]:
artists = np.append(artists1,artists2)

In [21]:
artists = np.unique(artists)

In [23]:
len(artists)

673

#### Retrieve 50 songs for each artist

In [29]:
CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID')
CLIENT_SECRET = os.environ.get('SPOTIFY_SECRET_ID')

In [30]:
client_credentials_manager = SpotifyClientCredentials(CLIENT_ID, CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [60]:
name = 'Radiohead'

results = sp.search(q='artist:' + name, type='artist')
items = results['artists']['items']

In [61]:
items

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/4Z8W4fKeB5YxbusRsdQVPb'},
  'followers': {'href': None, 'total': 5647505},
  'genres': ['alternative rock',
   'art rock',
   'melancholia',
   'oxford indie',
   'permanent wave',
   'rock'],
  'href': 'https://api.spotify.com/v1/artists/4Z8W4fKeB5YxbusRsdQVPb',
  'id': '4Z8W4fKeB5YxbusRsdQVPb',
  'images': [{'height': 640,
    'url': 'https://i.scdn.co/image/afcd616e1ef2d2786f47b3b4a8a6aeea24a72adc',
    'width': 640},
   {'height': 320,
    'url': 'https://i.scdn.co/image/563754af10b3d9f9f62a3458e699f58c4a02870f',
    'width': 320},
   {'height': 160,
    'url': 'https://i.scdn.co/image/4067ea225d8b42fa6951857d3af27dd07d60f3c6',
    'width': 160}],
  'name': 'Radiohead',
  'popularity': 80,
  'type': 'artist',
  'uri': 'spotify:artist:4Z8W4fKeB5YxbusRsdQVPb'},
 {'external_urls': {'spotify': 'https://open.spotify.com/artist/17mBFWKyCyp506a3n6XUWA'},
  'followers': {'href': None, 'total': 1},
  'genres': [],
  'href': 'ht

In [52]:
items[0]['id']

'4Z8W4fKeB5YxbusRsdQVPb'

In [70]:
def get_artist_ids(artist_names):
    ids = []
    for name in artist_names:
        results = sp.search(q='artist:' + name, type='artist')
        items = results['artists']['items']
        artist_id = items[0]['id']
        ids.append(artist_id)
    
    return ids
        

In [71]:
artist_ids = get_artist_ids(artists)

In [72]:
artist_ids[0:5]

['17lzZA2AlOHwCwFALHttmp',
 '1URnnhqYAYcrqrcwql10ft',
 '0Je74SitssvJg1w4Ra2EK7',
 '3q7HBObVc0L8jNeTe5Gofh',
 '31W5EY0aAly4Qieq6OFu6I']

In [85]:
def get_top_tracks(artist_ids):
    """
    Retrieve features for each artists top 10 tracks
    
    """
    top_10s = {}
    
    for artist in artist_ids:
        top = sp.artist_top_tracks(artist, country='US')['tracks']
        top_tracks = []
        for track in top:
            top_tracks.append(track['id'])
        
        top_10s[artist] = top_tracks
        
    #Delete any artists that don't have 10 top tracks.
    for key, value in top_10s.items():
        if len(value)<10:
            del top_10s[key]

    return top_10s

In [90]:
top_10s = get_top_tracks(artist_ids)

In [91]:
top_10s

{'17lzZA2AlOHwCwFALHttmp': ['6H0AwSQ20mo62jGlPGB8S6',
  '00QyLmjxaSEE8qIZQjBXBj',
  '6jzW19SQaLj1kLArvipGQZ',
  '1nX9KhK3Fff27SnrIor2Yb',
  '4LmAnpjlhWTahvRkYR8xJa',
  '4JfCA7yaiEORC7NcKBS9nk',
  '2nBI3iWLhupR7LyAJ5GGkE',
  '7oN7mBg1r39crVwTK5PrWt',
  '1GxoFQZxD7tFhQFUD9F0dD',
  '5S1IUPueD0xE0vj4zU3nSf'],
 '1URnnhqYAYcrqrcwql10ft': ['5SWnsxjhdcEDc7LJjq9UHk',
  '4Q34FP1AT7GEl9oLgNtiWj',
  '6pcywuOeGGWeOQzdUyti6k',
  '2t8yVaLvJ0RenpXUIAC52d',
  '2fQrGHiQOvpL9UgPvtYy6G',
  '3ruoIF2UnoXdzK8mR61ebq',
  '3CDVMejYHnB1SkEEx0T1N4',
  '7fxSLnklnayJNBIKntXedZ',
  '50a8bKqlwDEqeiEknrzkTO',
  '59JWp4PjZ9TRM8cmtaDYB1'],
 '0Je74SitssvJg1w4Ra2EK7': ['0jWgAnTrNZmOGmqgvHhZEm',
  '2AiUFAeAsSR9lcpGxdc2Sj',
  '5sEpqYUOx5y6TH8EeQVWKa',
  '2m4qhLcXkiYHT4ttRRajop',
  '48Qp9fQyumixKBLsi73xUg',
  '4JwOBVIlajGrkE0LXhAFvZ',
  '478fXlKeUafcUo0rSD2WDt',
  '0BEiMf0IidmdEeY2Kp1nUA',
  '4iTQ3u6nnYgrrbj13aJBLg',
  '4oNw03N6mKIFCIa59Ip9NF'],
 '3q7HBObVc0L8jNeTe5Gofh': ['5D2mYZuzcgjpchVY1pmTPh',
  '4RY96Asd9IefaL3X4LOLZ8

In [112]:
to_delete = []
for key, value in top_10s.items():
    if len(value)<10:
        to_delete.append(key)
        
for key in to_delete:
    del top_10s[key]

In [149]:
artist_dic = dict(zip(artist_ids, artists))

In [150]:
artist_dic

{'17lzZA2AlOHwCwFALHttmp': '2 Chainz',
 '1URnnhqYAYcrqrcwql10ft': '21 Savage',
 '0Je74SitssvJg1w4Ra2EK7': '4 Non Blondes',
 '3q7HBObVc0L8jNeTe5Gofh': '50 Cent',
 '31W5EY0aAly4Qieq6OFu6I': 'Boogie',
 '5P3qoSf6XGEggz0Zayqc27': 'A Jackson Sound',
 '09hVIj6vWgoCDtT03h8ZCa': 'A Tribe Called Quest',
 '5dHt1vcEm9qb8fCyLcB3HL': 'A$AP Ferg',
 '13ubrt8QOOCPljQ2FL1Kca': 'A$AP Rocky',
 '7LrPg3JeBn5t0IvaWjuW46': 'A&G',
 '3Pc4rifKK1cGDjk2bSNe8U': 'A-Game',
 '0pkwrPVI8UyXtPkavyJoZ4': 'A.CHAL',
 '3W1hStTokvA9xevaphlEUI': 'A.K. Tribe',
 '711MCceyCBcFnzjGY4Q7Un': 'AC/DC',
 '1y9a1IcgzjFM6hf0DZpBCD': 'ANTHM',
 '0nJvyjVTb8sAULPYyA1bqU': 'Aaron Copland',
 '62olK5zZHSgFUXGDykgBL8': 'Ace Frehley',
 '7BMccF0hQFBpP6417k1OtQ': 'Action Bronson',
 '0bnPfchFpM2qLv1xrCK727': 'Adrian Kwiatkowski',
 '7Ey4PD4MYsKc5I2dolUwbH': 'Aerosmith',
 '6lO3fSdhsdpeOcrbqAJsRU': 'Aero Chord',
 '4Icvbp9RDt5aY2TWDOVDsr': 'Afroman',
 '3hUFjtgMr2bvq6E6tY7yQB': 'Agustín Amigó',
 '0z4gvV4rjIZ9wHck67ucSV': 'Akon',
 '2pRLtdlkVCLhj205LHW7Ne'

#### What we have for our 500+ artists at this point:
- A dictionary of artist IDs and their top 10 track IDs
- A dictionary of artist names and their corresponding IDs

#### What's next
- Get song features for each artists top tracks
- Create a dataframe with columns: artist_id, artist_name, song_id, and all song features
- Aggregate the data frame by artist id to get a mean of song features.

In [304]:
df = pd.DataFrame.from_dict(top_10s, orient='index')

In [305]:
df = df.stack().reset_index()
df

Unnamed: 0,level_0,level_1,0
0,17lzZA2AlOHwCwFALHttmp,0,6H0AwSQ20mo62jGlPGB8S6
1,17lzZA2AlOHwCwFALHttmp,1,00QyLmjxaSEE8qIZQjBXBj
2,17lzZA2AlOHwCwFALHttmp,2,6jzW19SQaLj1kLArvipGQZ
3,17lzZA2AlOHwCwFALHttmp,3,1nX9KhK3Fff27SnrIor2Yb
4,17lzZA2AlOHwCwFALHttmp,4,4LmAnpjlhWTahvRkYR8xJa
...,...,...,...
6445,1luoTtYQjMoJPSzl9YCO1B,5,4s4nBjnM0Sa7h60zQyMNmL
6446,1luoTtYQjMoJPSzl9YCO1B,6,4twmjsLIFyuqnwhTEHq4HC
6447,1luoTtYQjMoJPSzl9YCO1B,7,4jqvzScw5GpULd7XoAM5Y7
6448,1luoTtYQjMoJPSzl9YCO1B,8,5KCHfR2RGdpieZoQ5NQd5g


In [306]:
df.rename(columns = {'level_0':'artist_id',0:'track_id'}, inplace=True)
df.drop(columns = 'level_1',inplace=True)

In [307]:
df.head()

Unnamed: 0,artist_id,track_id
0,17lzZA2AlOHwCwFALHttmp,6H0AwSQ20mo62jGlPGB8S6
1,17lzZA2AlOHwCwFALHttmp,00QyLmjxaSEE8qIZQjBXBj
2,17lzZA2AlOHwCwFALHttmp,6jzW19SQaLj1kLArvipGQZ
3,17lzZA2AlOHwCwFALHttmp,1nX9KhK3Fff27SnrIor2Yb
4,17lzZA2AlOHwCwFALHttmp,4LmAnpjlhWTahvRkYR8xJa


In [308]:
df['artist_name'] = df.artist_id.apply(lambda x: artist_dic[x])

In [309]:
df

Unnamed: 0,artist_id,track_id,artist_name
0,17lzZA2AlOHwCwFALHttmp,6H0AwSQ20mo62jGlPGB8S6,2 Chainz
1,17lzZA2AlOHwCwFALHttmp,00QyLmjxaSEE8qIZQjBXBj,2 Chainz
2,17lzZA2AlOHwCwFALHttmp,6jzW19SQaLj1kLArvipGQZ,2 Chainz
3,17lzZA2AlOHwCwFALHttmp,1nX9KhK3Fff27SnrIor2Yb,2 Chainz
4,17lzZA2AlOHwCwFALHttmp,4LmAnpjlhWTahvRkYR8xJa,2 Chainz
...,...,...,...
6445,1luoTtYQjMoJPSzl9YCO1B,4s4nBjnM0Sa7h60zQyMNmL,Ēriks Ešenvalds
6446,1luoTtYQjMoJPSzl9YCO1B,4twmjsLIFyuqnwhTEHq4HC,Ēriks Ešenvalds
6447,1luoTtYQjMoJPSzl9YCO1B,4jqvzScw5GpULd7XoAM5Y7,Ēriks Ešenvalds
6448,1luoTtYQjMoJPSzl9YCO1B,5KCHfR2RGdpieZoQ5NQd5g,Ēriks Ešenvalds


#### Now we need the song features for each track

In [280]:
def create_feature_df(features):
    df_temp = pd.DataFrame.from_dict(features, orient = 'index')[:11]
    df_temp = df_temp.unstack().to_frame().T[0]
    return df_temp


def get_song_features(songs):
    features = ["danceability", "energy", "key", "loudness", "mode", "speechiness",
                             "instrumentalness", "liveness", "valence", "tempo"]
    df = pd.DataFrame(columns = features)
    for song in songs:
        spot_feats = sp.audio_features(song)[0]
        df_temp = create_feature_df(spot_feats)
        df = df.append(df_temp)
    
    return df

In [262]:
df_features = get_song_features(df['track_id'])

In [310]:
df_features = df_features.astype('float64')

In [311]:
df['tmp'] = range(len(df))
df_features['tmp'] = range(len(df))

df = pd.merge(df, df_features, on=['tmp'])
df = df.drop('tmp', axis=1)

In [312]:
df.head()

Unnamed: 0,artist_id,track_id,artist_name,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,acousticness
0,17lzZA2AlOHwCwFALHttmp,6H0AwSQ20mo62jGlPGB8S6,2 Chainz,0.822,0.502,7.0,-7.38,1.0,0.148,0.000887,0.114,0.525,73.003,0.0312
1,17lzZA2AlOHwCwFALHttmp,00QyLmjxaSEE8qIZQjBXBj,2 Chainz,0.554,0.899,8.0,-4.573,1.0,0.408,0.0,0.0568,0.552,171.966,0.0521
2,17lzZA2AlOHwCwFALHttmp,6jzW19SQaLj1kLArvipGQZ,2 Chainz,0.897,0.767,11.0,-4.544,1.0,0.0959,0.000364,0.129,0.675,134.913,0.0679
3,17lzZA2AlOHwCwFALHttmp,1nX9KhK3Fff27SnrIor2Yb,2 Chainz,0.796,0.5,1.0,-7.21,1.0,0.425,0.0,0.155,0.227,75.012,0.118
4,17lzZA2AlOHwCwFALHttmp,4LmAnpjlhWTahvRkYR8xJa,2 Chainz,0.768,0.471,2.0,-8.406,1.0,0.259,0.0,0.268,0.405,131.023,0.0201


In [331]:
df_grouped = df.groupby(['artist_id','artist_name']).mean().reset_index()

In [332]:
df_grouped

Unnamed: 0,artist_id,artist_name,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,acousticness
0,00IjdWQ46sSBP4gZYObAMx,On The Rocks,0.6761,0.697200,5.6,-6.5155,0.6,0.06595,3.130000e-04,0.25510,0.69340,117.7095,0.377210
1,01Er12nK5rrnHx8usFPJAs,Derek Fiechter,0.4993,0.307600,4.6,-15.8738,0.1,0.03299,8.505000e-01,0.17755,0.49173,127.2996,0.639789
2,01QTIT5P1pFP3QnnFSdsJf,Lupe Fiasco,0.6240,0.786500,6.3,-5.4355,0.4,0.20002,1.750000e-07,0.22270,0.51310,104.1452,0.093690
3,01hRNr3yF5bYnPq4wZ88iI,Tom & Jame,0.5667,0.571800,7.2,-10.7439,0.7,0.06290,3.232507e-02,0.27634,0.68100,106.4643,0.179940
4,02kJSzxNuaWGqwubyUba0Z,G-Eazy,0.7119,0.689800,3.6,-5.5622,0.6,0.18437,1.540000e-07,0.14845,0.39950,129.0344,0.055819
...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,7wa9AhhXtpXyN7huAtsgiC,Alan Shoesmith,0.6133,0.126560,6.3,-17.5602,0.7,0.06410,8.946000e-01,0.11389,0.24714,115.3648,0.977000
641,7wss9YIazFrVYmOYMGp2af,Bobby Macavelli,0.7062,0.532800,4.8,-10.4738,0.9,0.22084,4.100000e-07,0.15715,0.52180,101.5754,0.305620
642,7xTcuBOIAAIGDOSvwYFPzk,Daniel Powter,0.5525,0.660000,4.1,-6.3999,0.9,0.03225,3.000749e-02,0.21609,0.41900,126.5473,0.472545
643,7y97mc3bZRFXzT2szRM4L4,Frédéric Chopin,0.3324,0.013654,2.3,-29.6338,0.6,0.04557,9.101000e-01,0.10633,0.09823,89.5586,0.990700


#### Before we save the aggregated features, we are going to remove the features we previously found to have no correlation to me enjoying a song

#### Lets also see if we can find similar artists based on cosign similarity 

In [333]:
df_grouped.drop(columns = ['key','mode','valence'], inplace=True)

In [334]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [354]:
df_grouped.iloc[:,2:]

Unnamed: 0,danceability,energy,loudness,speechiness,instrumentalness,liveness,tempo,acousticness
0,0.6761,0.697200,-6.5155,0.06595,3.130000e-04,0.25510,117.7095,0.377210
1,0.4993,0.307600,-15.8738,0.03299,8.505000e-01,0.17755,127.2996,0.639789
2,0.6240,0.786500,-5.4355,0.20002,1.750000e-07,0.22270,104.1452,0.093690
3,0.5667,0.571800,-10.7439,0.06290,3.232507e-02,0.27634,106.4643,0.179940
4,0.7119,0.689800,-5.5622,0.18437,1.540000e-07,0.14845,129.0344,0.055819
...,...,...,...,...,...,...,...,...
640,0.6133,0.126560,-17.5602,0.06410,8.946000e-01,0.11389,115.3648,0.977000
641,0.7062,0.532800,-10.4738,0.22084,4.100000e-07,0.15715,101.5754,0.305620
642,0.5525,0.660000,-6.3999,0.03225,3.000749e-02,0.21609,126.5473,0.472545
643,0.3324,0.013654,-29.6338,0.04557,9.101000e-01,0.10633,89.5586,0.990700


In [355]:
sd = StandardScaler()
scaled = sd.fit_transform(df_grouped.iloc[:,2:])

In [376]:
df_grouped.iloc[:,2:] = scaled

In [377]:
df_grouped.head()

Unnamed: 0,artist_id,artist_name,danceability,energy,loudness,speechiness,instrumentalness,liveness,tempo,acousticness
0,00IjdWQ46sSBP4gZYObAMx,On The Rocks,0.546376,0.69441,0.615833,-0.513913,-0.674312,1.42936,0.085562,-0.030344
1,01Er12nK5rrnHx8usFPJAs,Derek Fiechter,-0.487032,-0.812143,-0.630002,-0.919692,1.708837,0.128562,0.772016,0.73323
2,01QTIT5P1pFP3QnnFSdsJf,Lupe Fiasco,0.241848,1.039726,0.759609,1.136661,-0.675189,0.885893,-0.885364,-0.854814
3,01hRNr3yF5bYnPq4wZ88iI,Tom & Jame,-0.093075,0.209498,0.052922,-0.551462,-0.58458,1.785633,-0.719364,-0.604001
4,02kJSzxNuaWGqwubyUba0Z,G-Eazy,0.755629,0.665795,0.742742,0.943989,-0.675189,-0.359552,0.896192,-0.964943


In [370]:
df_grouped.artist_name[80:100]

80          A$AP Rocky
81          Gucci Mane
82          Chief Keef
83               Lorde
84       Maurice Ravel
85            2 Chainz
86          Raz Simone
87    SIIMBA SELASSIIE
88     Marlowe Carruth
89       Roberto Diana
90      Jules Massenet
91         Manuel Zito
92              Honors
93          Nik Davies
94                 Jet
95        Mick Jenkins
96             OutKast
97          Atmosphere
98                 DMX
99              Jack Ü
Name: artist_name, dtype: object

In [371]:
cosine_similarity([scaled[80]],[scaled[98]])

array([[0.80649321]])

In [372]:
cosine_similarity([scaled[80]],[scaled[99]])

array([[0.70500269]])

#### It works!

We can see that artists like ASAP Rocky and DMX who both rap are more similar than ASAP Rocky and Jack U (edm music)

#### Let's save our artist feature df and our standard scaler so that we can use it to scale artist features the same way in the future

In [374]:
import joblib

In [375]:
scaler_filename = "MyData/artist_feature_scaler.save"
joblib.dump(sd, scaler_filename) 

['MyData/artist_feature_scaler.save']

In [379]:
artist_feature_file_name = 'MyData/artist_features.csv'
df_grouped.to_csv(artist_feature_file_name)