In [1]:
#Load data manipulation and http library 
import pandas as pd
import numpy as np
import requests
import os

### Read my json files

In [2]:
df_stream0 = pd.read_json('Data/StreamingHistory0.json')
df_stream1 = pd.read_json('Data/StreamingHistory1.json')

df_stream1

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2021-05-15 02:09,$uicideboy$,My Flaws Burn Through My Skin Like Demonic Fla...,3622
1,2021-05-15 02:09,Coldplay,Higher Power,10379
2,2021-05-15 02:12,Lil Peep,Awful Things,214426
3,2021-05-15 02:17,ZHU,Good4U,260699
4,2021-05-15 02:19,Kamaiyah,Fuck It Up (feat. YG),141844
...,...,...,...,...
7880,2021-10-15 23:43,Kanye West,New Slaves,212532
7881,2021-10-15 23:46,Nadine Lustre,Ivory (feat. MANILA GREY) [azel north Remix],182785
7882,2021-10-15 23:50,Nadine Lustre,Complicated Love (feat. James Reid) [azel nort...,183982
7883,2021-10-15 23:54,H.E.R.,"Slide (Remix) (feat. Pop Smoke, A Boogie Wit d...",281982


## Merge both into one dataframe & create unique identifier column

In [3]:
df_combined = pd.concat([df_stream0, df_stream1])

df_combined.head()

#Create a unique identifier column by concatenating artistName and trackName 

df_combined['Unique_ID'] = df_combined['artistName'] + ": " + df_combined['trackName']

df_combined.head()


Unnamed: 0,endTime,artistName,trackName,msPlayed,Unique_ID
0,2020-10-15 00:12,THEY.,What I Know Now (feat. Wiz Khalifa),3715,THEY.: What I Know Now (feat. Wiz Khalifa)
1,2020-10-15 00:12,Sam Smith,My Oasis (feat. Burna Boy),24725,Sam Smith: My Oasis (feat. Burna Boy)
2,2020-10-15 00:12,Sam Smith,My Oasis (feat. Burna Boy),1207,Sam Smith: My Oasis (feat. Burna Boy)
3,2020-10-15 00:15,Cheat Codes,Balenciaga,173706,Cheat Codes: Balenciaga
4,2020-10-15 00:16,ARTY,Rebound - Radio Edit,85740,ARTY: Rebound - Radio Edit


## Read in myLibrary file  

In [4]:
#Read my Library json file into a pandas df. Had to manually edit the json file in Atom. Took out "albums" dict
df_lib = pd.read_json('Data/YourLibrary1.json')

df_lib.head()

#Add an unique identifier column 
df_lib['Unique_ID'] = df_lib['artist'] + ": " + df_lib['track']

df_lib.head()


Unnamed: 0,artist,album,track,uri,Unique_ID
0,Slim,Love's Crazy,So Fly (feat. Yung Joc),spotify:track:2wmBKUx62Px6my6U1mYw18,Slim: So Fly (feat. Yung Joc)
1,JACKBOYS,JACKBOYS,JACKBOYS,spotify:track:62zKJrpbLxz6InR3tGyr7o,JACKBOYS: JACKBOYS
2,Calvin Harris,18 Months,Thinking About You (feat. Ayah Marar),spotify:track:1KtD0xaLAikgIt5tPbteZQ,Calvin Harris: Thinking About You (feat. Ayah ...
3,Foo Fighters,Best Of You,Best of You,spotify:track:4nUM7pGcTUK2pY1d2LybrT,Foo Fighters: Best of You
4,Laidback Luke,Bae (feat. Gina Turner),Bae,spotify:track:4zxhd49IbXASiDuIXmWm5o,Laidback Luke: Bae


## Split the URI column: separate "spotify:track"

In [5]:
uri = df_lib['uri'].str.split(":", expand=True)

df_lib['track_uri'] = uri[2]

df_lib.head()

Unnamed: 0,artist,album,track,uri,Unique_ID,track_uri
0,Slim,Love's Crazy,So Fly (feat. Yung Joc),spotify:track:2wmBKUx62Px6my6U1mYw18,Slim: So Fly (feat. Yung Joc),2wmBKUx62Px6my6U1mYw18
1,JACKBOYS,JACKBOYS,JACKBOYS,spotify:track:62zKJrpbLxz6InR3tGyr7o,JACKBOYS: JACKBOYS,62zKJrpbLxz6InR3tGyr7o
2,Calvin Harris,18 Months,Thinking About You (feat. Ayah Marar),spotify:track:1KtD0xaLAikgIt5tPbteZQ,Calvin Harris: Thinking About You (feat. Ayah ...,1KtD0xaLAikgIt5tPbteZQ
3,Foo Fighters,Best Of You,Best of You,spotify:track:4nUM7pGcTUK2pY1d2LybrT,Foo Fighters: Best of You,4nUM7pGcTUK2pY1d2LybrT
4,Laidback Luke,Bae (feat. Gina Turner),Bae,spotify:track:4zxhd49IbXASiDuIXmWm5o,Laidback Luke: Bae,4zxhd49IbXASiDuIXmWm5o


## Merge our library and streaming dataframes

In [6]:
#Create a copy of the stream df 

df_tab = df_combined.copy()

#Check if the streamed song is in the library 
df_tab['In_Library'] = np.where(df_tab['Unique_ID'].isin(df_combined['Unique_ID'].tolist()),1,0)

#Merge with a left join on df_lib to get album and track_uri

df_tab = pd.merge(df_tab, df_lib[['album', 'Unique_ID','track_uri']], how='left', on=['Unique_ID'])

df_tab.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,Unique_ID,In_Library,album,track_uri
0,2020-10-15 00:12,THEY.,What I Know Now (feat. Wiz Khalifa),3715,THEY.: What I Know Now (feat. Wiz Khalifa),1,What I Know Now (feat. Wiz Khalifa),3wImXdE4bliIVSEl3lVue1
1,2020-10-15 00:12,Sam Smith,My Oasis (feat. Burna Boy),24725,Sam Smith: My Oasis (feat. Burna Boy),1,My Oasis (feat. Burna Boy),2KoHxhRyWxJzA0VafWd5Nk
2,2020-10-15 00:12,Sam Smith,My Oasis (feat. Burna Boy),1207,Sam Smith: My Oasis (feat. Burna Boy),1,My Oasis (feat. Burna Boy),2KoHxhRyWxJzA0VafWd5Nk
3,2020-10-15 00:15,Cheat Codes,Balenciaga,173706,Cheat Codes: Balenciaga,1,Balenciaga,5k6ioThKIxeklkIt3MNJjC
4,2020-10-15 00:16,ARTY,Rebound - Radio Edit,85740,ARTY: Rebound - Radio Edit,1,Rebound,6UFQ1l5PHnBm09KxO5fmea


## Create a new project on Spotify to obtain credentials

In [7]:
#These credentials will generate an access token so we can pull data from Spotify's API
#https://stmorse.github.io/journal/spotify-api.html

client_id = ''
client_secret = ''

In [8]:
AUTH_URL = 'https://accounts.spotify.com/api/token'

#POST a request with our client credentials 

auth_resp = requests.post(AUTH_URL, {
    'grant_type': 'client_credentials',
    'client_id' : client_id,
    'client_secret' : client_secret 
})

#Convert the response to JSON

auth_resp_data = auth_resp.json()

#Save access token; we will use this to access the API 

access_token = auth_resp_data['access_token']

## Send GET request to the API server

In [9]:
headers = {'Authorization': 'Bearer {token}'.format(token=access_token)}

In [10]:
#base URL for all Spotify API endpoints 

base_url = 'https://api.spotify.com/v1/'

## Pull the artists and genre associated with each track_uri in our library to add to a dictionary 

In [11]:
#Blank dictionary to store, track_uri, artist uri, and genre 

genre_dict = {}

#convert track_uri col to a list

track_uris = df_lib['track_uri'].to_list() 

#using the API, loop through track_uri and pull artist uri 

for t_uri in track_uris:
    genre_dict[t_uri] = {'artist_uri ': "", "genres": []}
    
    r = requests.get(base_url + "tracks/" + t_uri, headers = headers)
    
    r = r.json()
    
    artist_uri = r['artists'][0]['uri'].split(":")[2]
    genre_dict[t_uri]['artist_uri'] = artist_uri
    
    s = requests.get(base_url + "artists/" + artist_uri, headers=headers)
    
    s = s.json()
    genre_dict[t_uri]['genres']= s['genres']

In [14]:
# convert dictionary into dataframe with track_uri as the first column

df_genres = pd.DataFrame.from_dict(genre_dict, orient='index')

df_genres.insert(0, 'track_uri', df_genres.index)
df_genres.reset_index(inplace=True, drop=True)

df_genres.head()

Unnamed: 0,track_uri,artist_uri,genres,artist_uri.1
0,2wmBKUx62Px6my6U1mYw18,,[],33nrND6ODgesoa1Qmr5DbD
1,62zKJrpbLxz6InR3tGyr7o,,"[rap, trap]",7A8S43ryYdbWpJKeHRZRcq
2,1KtD0xaLAikgIt5tPbteZQ,,"[dance pop, edm, electro house, house, pop, po...",7CajNmpbOovFoOoasH2HaY
3,4nUM7pGcTUK2pY1d2LybrT,,"[alternative metal, alternative rock, modern r...",7jy3rLJdDQY21OgRLCZ9sD
4,4zxhd49IbXASiDuIXmWm5o,,"[big room, dance pop, dutch edm, dutch house, ...",53cQZtWDwDJwVCNZlfJ6Qk


In [16]:
#The explode() function is used to transform each element of a list-like to a row, replicating the index values.

df_genres_expand = df_genres.explode('genres')
df_genres_expand.head()

Unnamed: 0,track_uri,artist_uri,genres,artist_uri.1
0,2wmBKUx62Px6my6U1mYw18,,,33nrND6ODgesoa1Qmr5DbD
1,62zKJrpbLxz6InR3tGyr7o,,rap,7A8S43ryYdbWpJKeHRZRcq
1,62zKJrpbLxz6InR3tGyr7o,,trap,7A8S43ryYdbWpJKeHRZRcq
2,1KtD0xaLAikgIt5tPbteZQ,,dance pop,7CajNmpbOovFoOoasH2HaY
2,1KtD0xaLAikgIt5tPbteZQ,,edm,7CajNmpbOovFoOoasH2HaY


## Save df_tab and df_genre_expand to csv files to import 