# Recommender Engine: Spotify Data Extraction with Tekore

In [1]:
# Import Libraries
import os
import numpy as np
import pandas as pd
import dotenv
import tekore as tk

## Authentication steps

In [2]:
# Load the Spotify app credentials from the environment
# (dotenv.load_dotenv() is called first so a local .env file can supply them).
dotenv.load_dotenv()
client_id = os.getenv('ClientID')
client_secret = os.getenv('ClientSecret')
redirect_uri = 'http://localhost:8888/callback'

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, which would otherwise be passed straight to the token request.
missing_credentials = [
    name
    for name, value in (('ClientID', client_id), ('ClientSecret', client_secret))
    if not value
]
if missing_credentials:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(missing_credentials)}"
    )

# Generate the app token used to create the API client
app_token = tk.request_client_token(client_id, client_secret)

In [3]:
# Create the API client from the app token, with chunking switched on
# and maximised limits switched off.
spotify = tk.Spotify(
    app_token,
    max_limits_on=False,
    chunked_on=True,
)

In [5]:
# Request a user token (needed for user data); this prompts for the
# redirect URL after the browser login. Every available scope is requested.
requested_scope = tk.scope.every
user_token = tk.prompt_for_user_token(
    client_id, client_secret, redirect_uri, scope=requested_scope
)

Opening browser for Spotify login...


Please paste redirect URL:  http://localhost:8888/callback?code=AQDrefXkvW2Wt0NYQAPFNmp92IHl8Yip7jYVQwEQuW-vhfR8le1pOUqN1VqVFVKvyQHVruKLmI9eKvXBfiCvo3lraNRGlQd9Nc_A8Y0dhMVV24No-0ubWgFESRgcYDjTrwrkzVVBo087kaha0CWkQc0Z66zX1stELaAB3W1DstH_scRhTQdfT10DtqqC4uiXb4P_jJnPuZbFTR6n-LNZf8-swQ9hb0UqxFXO26MJ6P0SfBZrukl373-I1VmiTbqcEfSgHqy73S9heQylWlBWJpOW_VMcVq5yXwxmxm3I6uFNjrspjVArVhM4zJ-R0uje-m2r2SaCC0gWzBhVwFz1_IusZ2D-B1IYWFurkmKA4b1QmC_bQs3jRkjNnpvePz0VjiRdzspd_5K6dJU7P73Fl4c3SdeP1w0kfEqDf1KQZ6tBedyriyEUsh1UhIAyV6R0isKI7-4C41WE8bEkzHyAwm_EpOitKnHIDWCGwfYX6nK-NcxjgpAf-y_Y7nmcxK7L8nFKRkc2TKtSh4ldkzPCGRK6dc5vJMKUXMTy2-41hF2V5eKojLxivpEEEC8ZGG4f1_2-adKthr_z6Ux5lDjTS4OFSUU0xOhrAvhmDUyBUNtY78rdCzCJow55f7OzQ3ZCTrZJXYjQdXipD6Jheny_Ikdol8ASC1wREKDPzOatzkKPcs7eKg&state=eMlLc8EjPYNVQJ44p5StO0k-2F8iggPGi86tHoDHUJQ


In [6]:
# Replace the client's app token with the user token so the following
# calls run on behalf of the logged-in user.
spotify.token = user_token

## Data Extraction

In [7]:
# Fetch the current user's top tracks (20 requested)
tracks = spotify.current_user_top_tracks(limit=20)

# Show each top track's name followed by its Spotify URI
for track in tracks.items:
    print(f"{track.name} {track.uri}")

Whoever You Are spotify:track:5Yq2IBzA24Kex9RQ2By7MQ
Everywhere I'll Ever Be spotify:track:10mQ8GRpBfQuySHPvJoWHL
On the Cusp spotify:track:6F3iddkEZ25Tm6eF00Uoha
ME vs ME spotify:track:3NCGvnHRoedGXaqR42fxmc
In a Minute spotify:track:7k1c4N0lvcP5RtDBKTHoTW
Astrology Joint spotify:track:1jcGbrSCZ32Q2vyvOpOZzg
White Gloves spotify:track:4AKUOaCRcoKTFnVI9LtsrN
Straight Talker spotify:track:6Gq99VdfbBlsw9qlAZp6x4
Say You Want Me - Single Version spotify:track:1xTYy3K3hiTcoZfoD7culn
POF spotify:track:1NY9g5OFUoezXeTveFYWSM
ENERGY (feat. Beam) spotify:track:0314PeD1sQNonfVWix3B2K
HEATED spotify:track:1w7cgGZR86yWz1pA2puVJD
NEON PEACH feat. Tyler, The Creator spotify:track:5PF2WtSZV4EtmGx4oER1zt
ALIEN SUPERSTAR spotify:track:1Hohk6AufHZOrrhMXZppax
Where I Go (feat. H.E.R.) spotify:track:3MlQPB0wJuopo3NTZGlMpI
After Last Night (with Thundercat & Bootsy Collins) spotify:track:6jGAh1bFnXt1Muj9zeHveZ
Cómo Me Quieres spotify:track:11AURg9Kbju7LOcwQnR17f
Sidepiece spotify:track:3o0zYn7dQtUZ1FRevTR

In [8]:
## Collect every item of the user's saved-tracks ("liked songs") library
## by exhausting the paged result into a list.
songs = list(spotify.all_items(spotify.saved_tracks()))


In [9]:
## Pull the per-track fields out of the saved-track items

# Each saved item wraps the actual track object in `.track`
saved_track_objs = [saved_item.track for saved_item in songs]

# Track-level fields (one entry per saved track, in library order)
track_duration = [trk.duration_ms for trk in saved_track_objs]
track_id = [trk.id for trk in saved_track_objs]
track_name = [trk.name for trk in saved_track_objs]
track_popularity = [trk.popularity for trk in saved_track_objs]
track_uri = [trk.uri for trk in saved_track_objs]

# Artist fields: only the first listed artist of each track is kept
artist_name = [trk.artists[0].name for trk in saved_track_objs]
artist_id = [trk.artists[0].id for trk in saved_track_objs]

# Album fields, including the album's release date
album_id = [trk.album.id for trk in saved_track_objs]
album_name = [trk.album.name for trk in saved_track_objs]
track_releaseDate = [trk.album.release_date for trk in saved_track_objs]

In [10]:
# Assemble the user's library table: one row per saved track, one column
# per extracted field (column order is the order listed here).
library_columns = {
    'track_id': track_id,
    'track_name': track_name,
    'track_duration': track_duration,
    'track_popularity': track_popularity,
    'track_releaseDate': track_releaseDate,
    'track_uri': track_uri,
    'artist_name': artist_name,
    'artist_id': artist_id,
    'album_id': album_id,
    'album_name': album_name,
}
user_library = pd.DataFrame(library_columns)

In [11]:
# Request the audio features of every saved track in one chunked call.
# The result is stored as the single element of a list because later
# cells read it as track_audioFeatures[0].
with spotify.chunked(True):
    all_track_features = spotify.tracks_audio_features(track_id)
track_audioFeatures = [all_track_features]

In [12]:
# Attributes copied from each audio-features object
audio_feature_names = (
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'key', 'liveness', 'loudness', 'mode',
    'speechiness', 'tempo', 'time_signature', 'valence',
)

# Build one list per attribute, in the order of track_audioFeatures[0].
# An entry that is None (no feature object available for that track)
# contributes None to every list instead of raising AttributeError, so the
# lists keep one entry per requested track and stay row-aligned.
feature_values = {
    name: [
        getattr(features, name) if features is not None else None
        for features in track_audioFeatures[0]
    ]
    for name in audio_feature_names
}

# Expose the same module-level lists that later cells use
acousticness = feature_values['acousticness']
danceability = feature_values['danceability']
energy = feature_values['energy']
instrumentalness = feature_values['instrumentalness']
key = feature_values['key']
liveness = feature_values['liveness']
loudness = feature_values['loudness']
mode = feature_values['mode']
speechiness = feature_values['speechiness']
tempo = feature_values['tempo']
time_signature = feature_values['time_signature']
valence = feature_values['valence']


In [13]:
# Fetch the full album and artist objects for every saved track.
# Each result is wrapped in a one-element list because later cells read
# album_info[0] and artist_info[0].
with spotify.chunked():
    fetched_albums = spotify.albums(album_id)
    fetched_artists = spotify.artists(artist_id)
album_info = [fetched_albums]
artist_info = [fetched_artists]

Response contains unknown attribute: `album_group`
  return post_func(*args, **kwargs)


In [14]:
# Extract genres and popularity from the album and artist objects
album_models = album_info[0]
artist_models = artist_info[0]
row_positions = range(len(album_models))  # the album list drives the row count

album_genre = [album_models[pos].genres for pos in row_positions]
album_popularity = [album_models[pos].popularity for pos in row_positions]
artist_popularity = [artist_models[pos].popularity for pos in row_positions]
artist_genres = [artist_models[pos].genres for pos in row_positions]


In [16]:
# Table of audio features plus album/artist attributes, one row per track
feature_columns = {
    'acousticness': acousticness,
    'danceability': danceability,
    'energy': energy,
    'instrumentalness': instrumentalness,
    'key': key,
    'liveness': liveness,
    'loudness': loudness,
    'mode': mode,
    'speechiness': speechiness,
    'tempo': tempo,
    'time_signature': time_signature,
    'valence': valence,
    'album_genre': album_genre,
    'album_popularity': album_popularity,
    'artist_popularity': artist_popularity,
    'artist_genres': artist_genres,
}
track_feats = pd.DataFrame(feature_columns)

# Put the library columns and the feature columns side by side (joined on the row index)
track_data = pd.concat([user_library, track_feats], axis=1)

## Data Cleaning

In [17]:
# track_releaseDate: split into year / month columns.
# The '-' split is done once. reindex(columns=[0, 1]) guarantees that both
# part columns exist: if no date in the column has a month part, the split
# produces only column 0 and selecting [1] directly would raise KeyError;
# with reindex the month column is simply filled with missing values.
release_parts = (
    track_data.track_releaseDate
    .str.split('-', expand=True)
    .reindex(columns=[0, 1])
)
track_data['track_releaseYear'] = release_parts[0]
track_data['track_releaseMonth'] = release_parts[1]

# The raw date string is no longer needed once year and month are extracted
track_data = track_data.drop('track_releaseDate', axis=1)

In [18]:
# key column: cast to string, then one-hot encode into key_<value> columns
key_as_text = track_data['key'].astype(str)
track_data = pd.get_dummies(
    track_data.assign(key=key_as_text),
    columns=['key'],
    prefix='key',
)

In [19]:
# Remove the album_genre column (noted as carrying no data)
track_data = track_data.drop(columns=['album_genre'])

In [20]:
# artist genre: flatten every track's genre list into one long list
# (repeats kept) so genre frequencies can be counted in a later cell.
# A single-pass comprehension replaces repeated `list + list`, which
# re-copied the growing list on every iteration.
artist_genre = [
    genre
    for track_genres in track_data.artist_genres
    for genre in track_genres
]

In [21]:
# Initialize an empty DataFrame that will hold the one-hot encoded genres
genre_df=pd.DataFrame()

In [25]:
# Number of most frequent genres to keep as one-hot columns
top_genre_count = 100

# Rank genres by how often they occur across the library and keep the top ones
genre_counts = (
    pd.DataFrame({'artist_genre': artist_genre})
    .value_counts()
    .sort_values(ascending=False)
)
genre_list = list(genre_counts[0:top_genre_count].reset_index()['artist_genre'])

# One-hot encode: one column per top genre, 1.0 where the track's artist has
# that genre and 0.0 otherwise. All columns are built in one go instead of
# assigning cell by cell with .loc, which enlarges the frame on every new
# row/column and is very slow. Values are stored as floats so the exported
# data keeps a 1.0/0.0 representation.
track_genre_sets = [set(track_genres) for track_genres in track_data.artist_genres]
genre_df = pd.DataFrame(
    {
        genre: [float(genre in genre_set) for genre_set in track_genre_sets]
        for genre in genre_list
    },
    index=track_data.index,
)


In [26]:
# Attach the one-hot genre columns to the track table (joined on the row index)
user_data = pd.concat((track_data, genre_df), axis='columns')

In [27]:
# The raw genre lists are now redundant with the one-hot columns; remove them
user_data = user_data.drop(columns=['artist_genres'])

In [28]:
# Preview the ten most frequent genres
pd.DataFrame({'genres': genre_list}).head(10)

Unnamed: 0,genres
0,r&b
1,pop
2,rap
3,hip hop
4,neo soul
5,dance pop
6,alternative r&b
7,urban contemporary
8,indie soul
9,indie r&b


## Export File

In [29]:
# Export the final table to a CSV file in the working directory.
# The row index is written as well (no index argument is passed).
user_data.to_csv('angie_spotify_data.csv')