In [None]:
# references:
"""
https://support.spotify.com/us/article/understanding-my-data/
https://developer.spotify.com/dashboard/applications/07e518e5e13b469d81f8d41e7227fbc7
https://stmorse.github.io/journal/spotify-api.html
https://towardsdatascience.com/visualizing-spotify-data-with-python-tableau-687f2f528cdd
Ironhack notes
https://developer.spotify.com/documentation/web-api/reference/#/
https://developer.spotify.com/documentation/web-api/guides/rate-limits/
https://towardsdatascience.com/visualizing-spotify-songs-with-python-an-exploratory-data-analysis-fc3fae3c2c09


"""

In [1]:
import pandas as pd
import numpy as np
import requests
import json

In [2]:
# imported behavioral data from spotify (streamings)
stream0 = pd.read_json('StreamingHistory0.json')
stream1 = pd.read_json('StreamingHistory1.json')
df_stream = pd.concat([stream0, stream1])
df_stream['endTime'] = pd.to_datetime(df_stream['endTime'])
df_stream.sample(3)

Unnamed: 0,endTime,artistName,trackName,msPlayed
1361,2020-11-27 14:28:00,Bloodhound Gang,Foxtrot Uniform Charlie Kilo,171973
1798,2020-12-04 14:27:00,Bloc Party,Signs,278674
5884,2021-08-11 21:02:00,Jack Johnson,Washing Dishes,203947


In [3]:
# Get dates and time in separated columns
df_stream['year'] = pd.DatetimeIndex((df_stream.endTime)).year
df_stream['month'] = pd.DatetimeIndex((df_stream.endTime)).month
df_stream['day'] = pd.DatetimeIndex((df_stream.endTime)).day
df_stream['hour'] = pd.DatetimeIndex((df_stream.endTime)).hour

In [4]:
# transformed miliseconds to minutes for better understanding
df_stream['minutesPlayed'] = round(df_stream['msPlayed']/60000,2)

In [5]:
### cleaned json library ####

In [6]:
library0 = open('YourLibrary.json')
library1 = json.load(library0)
df_library = pd.DataFrame(library1['tracks'])

In [7]:
df_library['uri'] = df_library.uri.str.split('spotify:track:', expand=True)[1]

In [8]:
df_library.sample(3) #cleaned uri column to iterate with API afterwards

Unnamed: 0,artist,album,track,uri
530,Dean Evenson,Chakra Meditations & Tones,Throat Meditation,4takZfQSv4g8HKq9ZRJfJP
18,El Mató a un Policía Motorizado,La Sintesis O'Konor,El Mundo Extraño,7F5jWzjyyTqN73HwhL9swo
214,The O'Neill Brothers Group,Happy Instrumental Guitar Songs: Under the Sea,Colors of the Wind,48lj1kwH4J95fpWyJ6mr8Y


In [9]:
###Merge streamings + uri's from library

In [10]:
#df_stream, artistName, trackName
#df_library, artist, track
df_stream = df_stream.merge(df_library, how='inner', 
                              left_on=['artistName', 'trackName'],
                              right_on=['artist', 'track'])

In [11]:
df_stream = df_stream[['endTime', 'year', 'month', 'day', 'hour', 'artist', 'album', 'track', 'minutesPlayed', 'uri']]
df_stream.sample(3) #final database

Unnamed: 0,endTime,year,month,day,hour,artist,album,track,minutesPlayed,uri
259,2021-03-04 15:55:00,2021,3,4,15,NOFX,Punk In Drublic,Perfect Government,2.05,0pe2du168zXMGZJlMcyyIb
424,2021-05-01 00:39:00,2021,5,1,0,Frank Sinatra,Music From The Motion Picture Ocean's Thirteen,This Town,0.05,55b3OAHcjBm8KgISfcNaJl
805,2021-10-19 14:35:00,2021,10,19,14,Screeching Weasel,Bark Like a Dog,Cool Kids,2.22,2XTggG0CO00DkQy3U9rj1e


In [12]:
df_stream.info() # change date format?

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1719 entries, 0 to 1718
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   endTime        1719 non-null   datetime64[ns]
 1   year           1719 non-null   int64         
 2   month          1719 non-null   int64         
 3   day            1719 non-null   int64         
 4   hour           1719 non-null   int64         
 5   artist         1719 non-null   object        
 6   album          1719 non-null   object        
 7   track          1719 non-null   object        
 8   minutesPlayed  1719 non-null   float64       
 9   uri            1719 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(4), object(4)
memory usage: 147.7+ KB


In [13]:
### Spotify API Credentials #### Testing the API

In [14]:
client_ID = '07e518e5e13b469d81f8d41e7227fbc7'
client_secret = 'ce06fa16de144e48868fe95cde6db023'
auth_url = 'https://accounts.spotify.com/api/token'
auth_response = requests.post(auth_url, {
    'grant_type': 'client_credentials',
    'client_id': client_ID,
    'client_secret': client_secret
})
auth_response_data = auth_response.json()
access_token = auth_response_data['access_token']
headers = {'Authorization': 'Bearer {token}'.format(token=access_token)}

In [15]:
base_url = 'https://api.spotify.com/v1/'
track_id = '02wrYv8k1VgsZI0WD1xh7r' ### iterate per track ID

In [16]:
# audio features end point
# create audio features df
r = requests.get(base_url + 'audio-features/' + track_id, headers=headers).json()

In [17]:
"""
Some other interesting endpoints:
-audio-analysis
-audio-features
-tracks
-artists/

"""

'\nSome other interesting endpoints:\n-audio-analysis\n-audio-features\n-tracks\n-artists/\n\n'

In [18]:
uris = list(df_library['uri'].unique())
len(uris)

686

In [19]:
def song_attributes (endPoint, uri):
    dummyList = []
    for i in uri:
        base_url = 'https://api.spotify.com/v1/'
        r = requests.get(base_url + endPoint + i, headers=headers).json()
        dummyList.append(r)
    return pd.DataFrame(dummyList)

In [20]:
df_features = song_attributes('audio-features/', uris)
df_features.sample(3)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
117,0.624,0.713,11,-12.121,0,0.0294,0.0187,0.000549,0.063,0.792,114.76,audio_features,1jSSzUufGxJCzhSuJQaTrx,spotify:track:1jSSzUufGxJCzhSuJQaTrx,https://api.spotify.com/v1/tracks/1jSSzUufGxJC...,https://api.spotify.com/v1/audio-analysis/1jSS...,212840,4
483,0.187,0.129,6,-17.636,1,0.047,0.984,0.875,0.0976,0.0358,121.989,audio_features,5mhP9OIwGYag3tIGVk954B,spotify:track:5mhP9OIwGYag3tIGVk954B,https://api.spotify.com/v1/tracks/5mhP9OIwGYag...,https://api.spotify.com/v1/audio-analysis/5mhP...,236107,4
59,0.449,0.375,0,-9.329,1,0.0286,0.302,2e-06,0.144,0.466,133.845,audio_features,6Y6UBWhifUnkJIO2mdy0S3,spotify:track:6Y6UBWhifUnkJIO2mdy0S3,https://api.spotify.com/v1/tracks/6Y6UBWhifUnk...,https://api.spotify.com/v1/audio-analysis/6Y6U...,181413,4


In [21]:
df_features = df_features[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'id']]

In [22]:
# pull artists IDs from track IDs to get genres

In [23]:
df_tracks = song_attributes('tracks/', uris)
df_tracks.head(3)

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,name,popularity,preview_url,track_number,type,uri
0,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,[],1,168120,False,{'isrc': 'GBAYE0601510'},{'spotify': 'https://open.spotify.com/track/5e...,https://api.spotify.com/v1/tracks/5e8uc7f0v5jp...,5e8uc7f0v5jpY5SF1emxHl,False,Getting Better - Remastered,0,,4,track,spotify:track:5e8uc7f0v5jpY5SF1emxHl
1,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",1,250160,True,{'isrc': 'USTB10250051'},{'spotify': 'https://open.spotify.com/track/62...,https://api.spotify.com/v1/tracks/62h3HmtlAmot...,62h3HmtlAmot4A9zwKVRSq,False,"Mama, I'm in Love Wit a Gangsta - Mix",26,https://p.scdn.co/mp3-preview/7bfced09b64662fe...,3,track,spotify:track:62h3HmtlAmot4A9zwKVRSq
2,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",1,291440,False,{'isrc': 'ES5150103002'},{'spotify': 'https://open.spotify.com/track/0H...,https://api.spotify.com/v1/tracks/0HmQkmfYXRPw...,0HmQkmfYXRPweIg1ycw1R8,False,Morena mía,67,https://p.scdn.co/mp3-preview/3164e42e1f32003f...,8,track,spotify:track:0HmQkmfYXRPweIg1ycw1R8


In [24]:
#artist ID + track ID
artist_track_id_dict = {}

for i in range(len(df_tracks.artists)):
    x = df_tracks.artists[i][0]['id']
    y = df_tracks.id[i]
    artist_track_id_dict[x] = y
    
art_track_ids = pd.DataFrame([artist_track_id_dict]).transpose()
art_track_ids = art_track_ids.reset_index()
art_track_ids = art_track_ids.rename(columns={'index': 'artist_id', 0:'track_id'})
art_track_ids.head(3)

Unnamed: 0,artist_id,track_id
0,3WrFJ7ztbogyGnTHbHJFl2,19aATlcb67bbIdjcMA0rOa
1,3y24n3XhZ96wgwRXjvS17T,5QMTmFY0KC0RfyRbQBEzzE
2,7mWCSSOYqm4E9mB7V4ot6S,2kpHaB82qVWd4ccYXGt5BE


In [25]:
art_ids = art_track_ids.artist_id.to_list()

In [26]:
df_artists = song_attributes('artists/', art_ids)
df_artists.head(3)

Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri
0,{'spotify': 'https://open.spotify.com/artist/3...,"{'href': None, 'total': 20973124}","[beatlesque, british invasion, classic rock, m...",https://api.spotify.com/v1/artists/3WrFJ7ztbog...,3WrFJ7ztbogyGnTHbHJFl2,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",The Beatles,88,artist,spotify:artist:3WrFJ7ztbogyGnTHbHJFl2
1,{'spotify': 'https://open.spotify.com/artist/3...,"{'href': None, 'total': 526482}","[g funk, gangster rap, hip hop, rap, west coas...",https://api.spotify.com/v1/artists/3y24n3XhZ96...,3y24n3XhZ96wgwRXjvS17T,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Coolio,72,artist,spotify:artist:3y24n3XhZ96wgwRXjvS17T
2,{'spotify': 'https://open.spotify.com/artist/7...,"{'href': None, 'total': 1783513}","[latin, latin pop, mexican pop, rock en espano...",https://api.spotify.com/v1/artists/7mWCSSOYqm4...,7mWCSSOYqm4E9mB7V4ot6S,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Miguel Bosé,72,artist,spotify:artist:7mWCSSOYqm4E9mB7V4ot6S


In [27]:
df_genres = df_artists[['id', 'genres']]
df_genres = df_genres.rename(columns={'id': 'artist_id'})
df_genres.head(3) #explode?

Unnamed: 0,artist_id,genres
0,3WrFJ7ztbogyGnTHbHJFl2,"[beatlesque, british invasion, classic rock, m..."
1,3y24n3XhZ96wgwRXjvS17T,"[g funk, gangster rap, hip hop, rap, west coas..."
2,7mWCSSOYqm4E9mB7V4ot6S,"[latin, latin pop, mexican pop, rock en espano..."


In [28]:
### Consolidate: csv + features + artist_ID + genre
"""
1) keys = csv uri + features uri
2) keys = csv uri + artist_id (art_track_ids)
3) keys = 

"""

'\n1) keys = csv uri + features uri\n2) keys = csv uri + artist_id (art_track_ids)\n3) keys = \n\n'

In [29]:
df_stream = df_stream.merge(df_features, how='inner', 
                              left_on=['uri'],
                              right_on=['id'])
df_stream = df_stream.merge(art_track_ids, how='inner',
                           left_on=['uri'],
                           right_on=['track_id'])
df_stream = df_stream.merge(df_genres, how='inner',
                           left_on='artist_id',
                           right_on='artist_id')

In [39]:
df_stream = df_stream[['endTime',
 'year',
 'month',
 'day',
 'hour',
 'artist',
 'album',
 'track',
'genres',
 'minutesPlayed',
 'uri',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo']]

In [52]:
df_stream.sample(3)

Unnamed: 0,endTime,year,month,day,hour,artist,album,track,genres,minutesPlayed,...,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
803,2021-03-28 00:10:00,2021,3,28,0,Joey Pecoraro,Novice Juggler,Novice Juggler,"[chillhop, lo-fi beats]",3.19,...,0.419,9,-5.672,1,0.0341,0.505,0.644,0.112,0.224,88.021
651,2021-10-29 15:00:00,2021,10,29,15,Pennywise,About Time,Same Old Story,"[melodic hardcore, punk, skate punk, socal pop...",2.72,...,0.978,0,-3.653,1,0.128,0.0258,0.0,0.212,0.45,95.585
347,2020-12-03 15:39:00,2020,12,3,15,NoMBe,They Might've Even Loved Me,Milk & Coffee,"[electropop, indie poptimism, indie soul, mode...",3.1,...,0.805,7,-4.672,1,0.0315,0.00103,5e-06,0.488,0.727,90.999


In [55]:
df_genres = df_stream[['artist', 'genres']].explode('genres')
df_genres #assess using 'df_stream.genres.apply(pd.Series)'

Unnamed: 0,artist,genres
0,The Outfield,album rock
0,The Outfield,mellow gold
0,The Outfield,new romantic
0,The Outfield,new wave pop
0,The Outfield,power pop
...,...,...
1109,Hot Chip,new rave
1110,Tal National,afro-funk
1110,Tal National,afropop
1111,Charles Bradley,funk
