In [1]:
import pathlib
from typing import List

import spotipy
from spotipy import SpotifyClientCredentials
import api_setup
import pandas as pd

In [2]:
# Data folders are resolved relative to the parent of the current working
# directory, so the notebook must be launched from its own folder.
_data_root = pathlib.Path.cwd().parent / "data"
raw_data_dir = _data_root / "raw" / "jojo"
intermediate_data_dir = _data_root / "intermediate" / "jojo"

In [3]:
# Load the three audio-feature JSON files for my music and stack them into a
# single DataFrame (it is written out as a CSV further down).
library_info0, library_info1, library_info2 = (
	pd.read_json(intermediate_data_dir / f"features_{part}_eh.json")
	for part in range(3)
)
music_features = pd.concat([library_info0, library_info1, library_info2])

In [4]:
# Merge the seven extended-history files (endsong_0.json … endsong_6.json)
# into one DataFrame and save it as a single CSV.
extended_history_dir = raw_data_dir / "extended_history"
history_parts = [
	pd.read_json(extended_history_dir / f"endsong_{idx}.json")
	for idx in range(7)
]
streaming_history = pd.concat(history_parts)
streaming_history.to_csv(intermediate_data_dir / "streaming_history.csv", index=False)

In [5]:
# Add artist + album name to the track features ("all_listened_to_features").
# NOTE(review): the original comment said "song name", but the column selected
# here is the album name — confirm which one is wanted.
metadata_columns = ["master_metadata_album_artist_name", "master_metadata_album_album_name"]
uri_artist_track = streaming_history[["spotify_track_uri", *metadata_columns]]
# Rows without a track URI can never match a feature row, and a NaN URI would
# break the string split below, so drop on the URI column specifically.
uri_artist_track = uri_artist_track.dropna(subset=["spotify_track_uri"])
# Keep only the last ':'-separated segment of the URI so it lines up with `id`.
uri_artist_track = uri_artist_track.assign(
	spotify_track_uri=uri_artist_track["spotify_track_uri"].str.split(":").str[-1]
)
# The streaming history can list the same track many times; keep the first
# occurrence so the join below does not multiply feature rows.
uri_artist_track = uri_artist_track.drop_duplicates(subset=["spotify_track_uri"])
music_features = (
	music_features
	# Removing metadata columns left by an earlier run keeps this cell re-runnable
	# (a second join would otherwise fail on overlapping column names).
	.drop(columns=metadata_columns, errors="ignore")
	.drop_duplicates(subset=['id'])
	.dropna(subset=['id'])
	.join(uri_artist_track.set_index("spotify_track_uri"), on="id")
)

In [6]:
# Save the track features (now carrying artist/album names) as a CSV.
all_features_csv = intermediate_data_dir / "all_listened_to_features.csv"
music_features.to_csv(all_features_csv, index=False)

In [70]:
# Set up an authenticated Spotify API client; it is used below to look up
# genre information.
# Credentials come from the "api-keys" path in the parent of the working
# directory (presumably a key/value file — see api_setup.parse_api_kvs).
api_keys_path = pathlib.Path.cwd().parent / "api-keys"
env_vars = api_setup.parse_api_kvs(api_keys_path)
auth_manager = SpotifyClientCredentials(
	client_id=env_vars['client_id'],
	client_secret=env_vars['client_secret'],
)
spotify = spotipy.Spotify(client_credentials_manager=auth_manager, backoff_factor=2)

In [21]:
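# Helpers for fetching track and artist details from the Spotify API so that
# artist genres can be appended to the music features.
# The API calls are made sequentially, one batch (sublist) of IDs at a time.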
def split_into_sublists(input_list, chunk_size=50):
	"""Split a sequence into consecutive chunks of at most ``chunk_size`` items.

	Every element of ``input_list`` appears in exactly one chunk, in order.
	The final chunk holds the remainder and may be shorter than the others.

	Args:
		input_list: Any sized, sliceable sequence (list, tuple, NumPy array, ...).
		chunk_size: Maximum number of items per chunk; must be at least 1.

	Returns:
		A list of slices of ``input_list``; an empty list for empty input.

	Raises:
		ValueError: If ``chunk_size`` is smaller than 1.
	"""
	if chunk_size < 1:
		raise ValueError("chunk_size must be a positive integer")
	# Slicing past the end is safe, so the last chunk simply comes out shorter.
	# (The previous implementation sliced the tail with [upto:-1], which
	# silently dropped the final element of the input.)
	return [
		input_list[start:start + chunk_size]
		for start in range(0, len(input_list), chunk_size)
	]

def get_tracks_info(track_uris: List[List[str]]) -> List:
	"""Fetch track details from Spotify, one request per batch of track IDs.

	Uses the module-level ``spotify`` client and prints each batch before
	requesting it.

	Args:
		track_uris: Batches of track IDs/URIs; each batch is sent in one call
			to ``spotify.tracks``.

	Returns:
		The raw responses, one per batch and in the same order as the input.
	"""
	responses = []
	for batch in track_uris:
		print(f"trying with sublist {batch}")
		responses.append(spotify.tracks(batch))
	return responses

In [24]:
# Collect every non-null track ID and group the IDs into batches for the API.
id_column = music_features['id']
track_uris = id_column[id_column.notna()].values
chunked_track_uris = split_into_sublists(input_list=track_uris)
print(chunked_track_uris)

[array(['1Qpyt7U1yu4tfuJc4LKuzF', '0hB42rb9ZbfJN231AbDnfe',
       '6AwKyBU2ymb7zJ6PeUmKdo', '0CsR9Y9SnC6ZWmekmVqSHz',
       '5JtPGzRgrWxkXX9LoROq3d', '5N3hv6QgFViEOoZMmIvS8j',
       '54FgTS1O5kbZLaPjsCFY6x', '5bltTpT0HIaEQzlGvxabWy',
       '4fzsfWzRhPawzqhX8Qt9F3', '3eYAdEgUyFQEmQgofs2NEt',
       '6VLTel8dIRU2Jr5qat1TJG', '2Ch7LmS7r2Gy2kc64wv3Bz',
       '4YMqbFcDIFiCBd02PzUBcM', '15DFeFyhOGyqbSVveuE3wp',
       '7bsrxWD78NPHkQGdWDtVKA', '0r5u9i2GzrqzU2Pv0eIvVq',
       '6DOep8Fgl0BUeOeUi8Ug7w', '26dfLcCVJUdscPvmUmkuSI',
       '4Q5deMn3CXLzjsHgj0AnYq', '20oko5N4xL2LZK1Mj5BfMk',
       '2kJqNHHGOzLNahukdvlDWN', '3P4gLPPOHIRQ5j6PC1hNa5',
       '34vfgWxlPpSXghcRusmFrZ', '27vJ34Stsi20C79zSw39E7',
       '54Cv78I7DxfvXjPL0I2r3d', '5dJjpT4Z6ozSfzN0r1FdGP',
       '6ViHIEWmHqTt4Ov9cBORCb', '2W0Wz0NzO7u5kcb7dGvv5w',
       '5eFco20Tz7a2xiDbPPjOUD', '7pu8Kq3nIMhBjbnEPkz2QD',
       '0QkWikH5Z3U0f79T9iuF6c', '18ARUyYMqmBfXbp1EX1K7f',
       '4KTtYhxFtFL7mBwnjkKfLm', '5mIOsPuQdXchVY0jB5NO9

In [25]:
# Fetch the Spotify track details for every batch of track IDs.
tracks_info = get_tracks_info(track_uris=chunked_track_uris)

trying with sublist ['1Qpyt7U1yu4tfuJc4LKuzF' '0hB42rb9ZbfJN231AbDnfe'
 '6AwKyBU2ymb7zJ6PeUmKdo' '0CsR9Y9SnC6ZWmekmVqSHz'
 '5JtPGzRgrWxkXX9LoROq3d' '5N3hv6QgFViEOoZMmIvS8j'
 '54FgTS1O5kbZLaPjsCFY6x' '5bltTpT0HIaEQzlGvxabWy'
 '4fzsfWzRhPawzqhX8Qt9F3' '3eYAdEgUyFQEmQgofs2NEt'
 '6VLTel8dIRU2Jr5qat1TJG' '2Ch7LmS7r2Gy2kc64wv3Bz'
 '4YMqbFcDIFiCBd02PzUBcM' '15DFeFyhOGyqbSVveuE3wp'
 '7bsrxWD78NPHkQGdWDtVKA' '0r5u9i2GzrqzU2Pv0eIvVq'
 '6DOep8Fgl0BUeOeUi8Ug7w' '26dfLcCVJUdscPvmUmkuSI'
 '4Q5deMn3CXLzjsHgj0AnYq' '20oko5N4xL2LZK1Mj5BfMk'
 '2kJqNHHGOzLNahukdvlDWN' '3P4gLPPOHIRQ5j6PC1hNa5'
 '34vfgWxlPpSXghcRusmFrZ' '27vJ34Stsi20C79zSw39E7'
 '54Cv78I7DxfvXjPL0I2r3d' '5dJjpT4Z6ozSfzN0r1FdGP'
 '6ViHIEWmHqTt4Ov9cBORCb' '2W0Wz0NzO7u5kcb7dGvv5w'
 '5eFco20Tz7a2xiDbPPjOUD' '7pu8Kq3nIMhBjbnEPkz2QD'
 '0QkWikH5Z3U0f79T9iuF6c' '18ARUyYMqmBfXbp1EX1K7f'
 '4KTtYhxFtFL7mBwnjkKfLm' '5mIOsPuQdXchVY0jB5NO9Q'
 '1011y7A6PTOuitZSPlN5an' '2sHwSBNDZN2yjCtgwStYt1'
 '4euxYgIl5XEqUj5WB9lHNq' '4Q3N4Ct4zCuIHuZ65E3BD4'
 '0dqJjKKxu

In [64]:
# Map each track's first-listed album artist name to that artist's Spotify URI.
# If two artists share a name, the one seen last wins.
artist_to_uri = {}
for tracks in tracks_info:
	for track in tracks['tracks']:
		# The tracks endpoint can return a null entry for an ID it cannot
		# resolve; skip those, and any album with no listed artist, instead
		# of crashing on the subscript.
		if not track or not track['album']['artists']:
			continue
		primary_artist = track['album']['artists'][0]
		artist_to_uri[primary_artist['name']] = primary_artist['uri']

In [65]:
# Display the artist-name -> artist-URI mapping built in the previous cell.
artist_to_uri

{'Sharon Van Etten': 'spotify:artist:2wJ4vsxWd7df7dRU4KcoDe',
 'Goreshit': 'spotify:artist:2UyOnfaE8nWXKPwYQyp2La',
 'Capcom Sound Team': 'spotify:artist:3w1Q754jb31h5CXQCcnLNL',
 'Doja Cat': 'spotify:artist:5cj0lLjcoR7YOSnhnX0Po5',
 'Phoenix': 'spotify:artist:1xU878Z1QtBldR7ru9owdU',
 'mt. fujitive': 'spotify:artist:2V9zpugQCHRiu2lPjsUM6d',
 'Shou': 'spotify:artist:73YFrf7gXIkvp3pxaqtOpZ',
 'Lady Gaga': 'spotify:artist:1HY2Jd0NmPuamShAr6KMms',
 'Kanye West': 'spotify:artist:5K4W6rqBFWDnAN6FQUkS6x',
 'Sofya Wang': 'spotify:artist:4axDBKx7Segq3j5P2VVSjx',
 'Stalking Gia': 'spotify:artist:3VTJqPiHqHHIrx1FL7avMY',
 'The Weeknd': 'spotify:artist:1Xyo4u8uXC1ZmMpatF05PJ',
 'Macklemore & Ryan Lewis': 'spotify:artist:5BcAKTbp20cv7tC5VqPFoC',
 'Various Artists': 'spotify:artist:0LyfQWJT6nXafLPZqxe9Of',
 'South North': 'spotify:artist:7z19cN47vHnay3CoShIp1b',
 'Alicia Keys': 'spotify:artist:3DiDSECUqqY1AuBP8qtaIa',
 'Deniz Akbulut': 'spotify:artist:4zzGjrG42gu70uM9eJ5oQ5',
 'Danny Brown': 'spoti

In [71]:
def get_artists_info(artist_uris: List[List[str]]) -> List:
	"""Fetch artist details from Spotify, one request per batch of artist URIs.

	Uses the module-level ``spotify`` client.

	Args:
		artist_uris: Batches of artist URIs; each batch is sent in one call to
			``spotify.artists``.

	Returns:
		The raw responses, one per batch and in the same order as the input.
	"""
	return [spotify.artists(batch) for batch in artist_uris]

# Look up every collected artist URI, batching the requests.
artist_uris = [*artist_to_uri.values()]
artist_uri_batches = split_into_sublists(artist_uris)
artist_info = get_artists_info(artist_uri_batches)

In [115]:
# Flatten the batched artist responses, then index genres and popularity by
# artist name (for repeated names the last entry wins).
all_artists = [artist for batch in artist_info for artist in batch['artists']]
artist_to_genre = {artist['name']: artist['genres'] for artist in all_artists}
artist_to_popularity = {artist['name']: artist['popularity'] for artist in all_artists}

In [122]:
# Build a per-artist table (index: artist name) with a list-valued `genres`
# column and an `artist_popularity` column, then save it as a CSV.
artist_to_popularity_df = pd.DataFrame.from_dict(
	artist_to_popularity, orient="index", columns=["artist_popularity"]
)
genres_frame = pd.DataFrame.from_dict(artist_to_genre, orient="index")
# from_dict spreads each genre list across numbered columns; attach the
# original lists as a single column and keep only that one.
genres_frame['genres'] = list(artist_to_genre.values())
artist_to_genre_df = genres_frame[['genres']].join(artist_to_popularity_df)
artist_to_genre_df.to_csv(intermediate_data_dir / "artist_genres.csv")

In [106]:
# Inspect the first batch of artist responses to see which fields are available.
artist_info[0]

{'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/2wJ4vsxWd7df7dRU4KcoDe'},
   'followers': {'href': None, 'total': 468055},
   'genres': ['art pop',
    'chamber pop',
    'dream pop',
    'indie pop',
    'indie rock',
    'melancholia',
    'new jersey indie',
    'singer-songwriter'],
   'href': 'https://api.spotify.com/v1/artists/2wJ4vsxWd7df7dRU4KcoDe',
   'id': '2wJ4vsxWd7df7dRU4KcoDe',
   'images': [{'height': 640,
     'url': 'https://i.scdn.co/image/ab6761610000e5eb3f701caa98c66d99d038d28d',
     'width': 640},
    {'height': 320,
     'url': 'https://i.scdn.co/image/ab676161000051743f701caa98c66d99d038d28d',
     'width': 320},
    {'height': 160,
     'url': 'https://i.scdn.co/image/ab6761610000f1783f701caa98c66d99d038d28d',
     'width': 160}],
   'name': 'Sharon Van Etten',
   'popularity': 56,
   'type': 'artist',
   'uri': 'spotify:artist:2wJ4vsxWd7df7dRU4KcoDe'},
  {'external_urls': {'spotify': 'https://open.spotify.com/artist/2UyOnfaE8nWXKPwY

In [131]:
# Attach each track's artist genres and popularity, matching on the artist name.
# If `music_features` already has columns with the same names (e.g. this cell
# was run before), the existing ones are renamed with the "dropme" suffix.
artist_name_column = "master_metadata_album_artist_name"
music_features = music_features.join(
	other=artist_to_genre_df,
	on=artist_name_column,
	lsuffix="dropme",
)
music_features.columns

  music_features = music_features.join(artist_to_genre_df, on="master_metadata_album_artist_name", lsuffix="dropme")


Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature', 'error', 'master_metadata_album_artist_name',
       'master_metadata_album_album_name', 'genresaisdhu', '0dropme', '0',
       'artist_popularitydropme', 'artist_popularitydropme', 'genresdropme',
       'artist_popularitydropme', 'genres', 'artist_popularity'],
      dtype='object')

In [132]:
# Remove every leftover "...dropme" column produced by the suffixing join, then
# save the final feature table.
# Selecting by suffix (rather than dropping "genresdropme" by name) works on a
# fresh kernel, where no such column exists and a named drop would raise
# KeyError, and after re-runs, where several suffixed or duplicated columns can
# accumulate.
is_stale = music_features.columns.astype(str).str.endswith("dropme")
music_features = music_features.loc[:, ~is_stale]
music_features.to_csv(intermediate_data_dir / "all_listened_to_features.csv", index=False)