The data augmented in this file is from:
- https://www.kaggle.com/datasets/tomigelo/spotify-audio-features
- https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset

our data has the following columns:
danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,error,master_metadata_album_artist_name,master_metadata_album_album_name,genres,artist_popularity

In [19]:
import pandas as pd
import pathlib
from typing import List
REPO_ROOT = pathlib.Path.cwd().parent

# Standardizing data from tomigelo

In [69]:
import spotipy
import api_setup
from spotipy import SpotifyClientCredentials

# Getting genre information from Spotify API
# API Auth
env_vars = api_setup.parse_api_kvs(pathlib.Path.cwd().parent / "api-keys")
auth_manager = SpotifyClientCredentials(env_vars['client_id'], env_vars['client_secret'])
spotify = spotipy.Spotify(client_credentials_manager=auth_manager, backoff_factor=2)

In [21]:
# Reading data
tomigelo_data = pd.concat([pd.read_csv(REPO_ROOT / "data/song_data/tomigelo/SpotifyAudioFeaturesApril2019.csv"), pd.read_csv(REPO_ROOT / "data/song_data/tomigelo/SpotifyAudioFeaturesNov2018.csv")])
tomigelo_data

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.005820,0.743,238373,0.339,0.0000,1,0.0812,-7.678,1,0.4090,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.024400,0.846,214800,0.557,0.0000,8,0.2860,-7.259,1,0.4570,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025000,0.603,138913,0.723,0.0000,9,0.0824,-5.890,0,0.0454,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.029400,0.800,125381,0.579,0.9120,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,0.000035,0.783,124016,0.792,0.8780,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116367,Roma Atmosphere,5R9ukMX7BDIy1WJYsAhBD6,Sette Oche,0.978000,0.103,245714,0.086,0.9630,9,0.0987,-20.715,0,0.0399,57.545,4,0.037,55
116368,Philthy Rich,6LlTJUQPhqvZuvOmgkLGYh,"Off Safety (feat. Yhung T.O., Mozzy & Ziggy)",0.099200,0.674,221934,0.628,0.0000,9,0.3740,-8.277,0,0.1890,93.053,4,0.613,46
116369,Nef The Pharaoh,0B2LhMYcGR9Gmi6BQLdzlO,Ludacris,0.027700,0.792,183771,0.747,0.0000,11,0.1170,-8.753,0,0.3420,94.498,4,0.584,47
116370,The Ting Tings,0yzA9b21pJgnlLQDirsxAm,Estranged,0.067500,0.548,311267,0.506,0.0244,10,0.1050,-8.251,1,0.0286,163.967,4,0.271,50


In [22]:
# Remove duplicates (by track_id)
tomigelo_data = tomigelo_data.drop_duplicates(subset="track_id", keep='first')

In [23]:
# remove popularity (we'll compute this on our own, since they seem to have used track popularity instead of artist, and these things change over time)
tomigelo_data = tomigelo_data.drop(columns=["popularity"])

In [37]:
# adding artist popularity + genre data
def split_into_sublists(input_list, chunk_size=50):
	upto = 0
	output_lists = []
	while True:
		if upto + chunk_size >= len(input_list):
			output_lists.append(input_list[upto:-1])
			break
		else:
			output_lists.append(input_list[upto:upto+chunk_size])
			upto += chunk_size
	return output_lists

def get_tracks_info(track_uris: List[List[str]]) -> List:
	out=[]
	for idx, sublist in enumerate(track_uris):
		result = spotify.tracks(sublist)
		out.append(result)
		print(f"TRACK INFO: Completed chunk {idx + 1} / {len(track_uris)}")
	return out

def get_artists_info(artist_uris_chunked: List[List[str]]) -> List:
	out=[]
	for idx, sublist in enumerate(artist_uris_chunked):
		result = spotify.artists(sublist)
		out.append(result)
		print(f"ARTIST INFO: Completed chunk {idx + 1} / {len(artist_uris_chunked)}")
	return out

In [25]:
track_uris = tomigelo_data['track_id'].dropna().values
tracks_info = get_tracks_info(split_into_sublists(track_uris))

TRACK INFO: Completed chunk 1 / 2620
TRACK INFO: Completed chunk 2 / 2620
TRACK INFO: Completed chunk 3 / 2620
TRACK INFO: Completed chunk 4 / 2620
TRACK INFO: Completed chunk 5 / 2620
TRACK INFO: Completed chunk 6 / 2620
TRACK INFO: Completed chunk 7 / 2620
TRACK INFO: Completed chunk 8 / 2620
TRACK INFO: Completed chunk 9 / 2620
TRACK INFO: Completed chunk 10 / 2620
TRACK INFO: Completed chunk 11 / 2620
TRACK INFO: Completed chunk 12 / 2620
TRACK INFO: Completed chunk 13 / 2620
TRACK INFO: Completed chunk 14 / 2620
TRACK INFO: Completed chunk 15 / 2620
TRACK INFO: Completed chunk 16 / 2620
TRACK INFO: Completed chunk 17 / 2620
TRACK INFO: Completed chunk 18 / 2620
TRACK INFO: Completed chunk 19 / 2620
TRACK INFO: Completed chunk 20 / 2620
TRACK INFO: Completed chunk 21 / 2620
TRACK INFO: Completed chunk 22 / 2620
TRACK INFO: Completed chunk 23 / 2620
TRACK INFO: Completed chunk 24 / 2620
TRACK INFO: Completed chunk 25 / 2620
TRACK INFO: Completed chunk 26 / 2620
TRACK INFO: Completed

In [38]:
artist_to_uri = {}
for tracks in tracks_info:
	itracks = tracks['tracks']
	for track in itracks:
		artist_to_uri[track['album']['artists'][0]['name']] = track['album']['artists'][0]['uri']


In [39]:
artist_uris = list(set(artist_to_uri.values()))
slists = split_into_sublists(artist_uris)
print(len(slists))

635


In [40]:
artist_info = get_artists_info(slists)

ARTIST INFO: Completed chunk 1 / 635
ARTIST INFO: Completed chunk 2 / 635
ARTIST INFO: Completed chunk 3 / 635
ARTIST INFO: Completed chunk 4 / 635
ARTIST INFO: Completed chunk 5 / 635
ARTIST INFO: Completed chunk 6 / 635
ARTIST INFO: Completed chunk 7 / 635
ARTIST INFO: Completed chunk 8 / 635
ARTIST INFO: Completed chunk 9 / 635
ARTIST INFO: Completed chunk 10 / 635
ARTIST INFO: Completed chunk 11 / 635
ARTIST INFO: Completed chunk 12 / 635
ARTIST INFO: Completed chunk 13 / 635
ARTIST INFO: Completed chunk 14 / 635
ARTIST INFO: Completed chunk 15 / 635
ARTIST INFO: Completed chunk 16 / 635
ARTIST INFO: Completed chunk 17 / 635
ARTIST INFO: Completed chunk 18 / 635
ARTIST INFO: Completed chunk 19 / 635
ARTIST INFO: Completed chunk 20 / 635
ARTIST INFO: Completed chunk 21 / 635
ARTIST INFO: Completed chunk 22 / 635
ARTIST INFO: Completed chunk 23 / 635
ARTIST INFO: Completed chunk 24 / 635
ARTIST INFO: Completed chunk 25 / 635
ARTIST INFO: Completed chunk 26 / 635
ARTIST INFO: Complete

In [47]:
artist_to_genre = {}
artist_to_popularity = {}
for artists in artist_info:
	for artist in artists['artists']:
		artist_to_genre[artist['name']] = artist['genres']
		artist_to_popularity[artist['name']] = artist['popularity']

In [48]:
artist_to_genre_df = pd.DataFrame.from_dict(artist_to_genre, orient="index")
artist_to_popularity_df = pd.DataFrame.from_dict(artist_to_popularity, orient="index", columns=["artist_popularity"])
artist_to_genre_df['genres'] = list(artist_to_genre.values())
artist_to_genre_df = pd.DataFrame(artist_to_genre_df['genres'])
artist_to_genre_df = artist_to_genre_df.join(artist_to_popularity_df)
artist_to_genre_df.columns
tomigelo_clean = tomigelo_data.join(artist_to_genre_df, on='artist_name')

In [50]:
tomigelo_clean
tomigelo_clean.to_csv(REPO_ROOT / "data/song_data/tomigelo_clean.csv")

# Standardizing data from maharshipabdya

In [64]:
maharshipabdya_data = pd.read_csv(REPO_ROOT / "data/song_data/maharshipabdya/dataset.csv")
maharshipabdya_data

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.4610,...,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.420,0.1660,...,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.3590,...,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.4430,...,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,4,acoustic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113995,113995,2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,21,384999,False,0.172,0.2350,...,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,5,world-music
113996,113996,1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,22,385000,False,0.174,0.1170,...,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,4,world-music
113997,113997,6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,Best Of,Miss Perfumado,22,271466,False,0.629,0.3290,...,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,4,world-music
113998,113998,2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,Change Your World,Friends,41,283893,False,0.587,0.5060,...,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,4,world-music


In [65]:
# Make each track have only a single artist
maharshipabdya_data['artists'] = maharshipabdya_data['artists'].apply(lambda x: str(x).split(';')[0])

In [67]:
# Remove duplicates (by track ID)
maharshipabdya_data = maharshipabdya_data.drop_duplicates(subset="track_id", keep='first')
# drop unnecessary columns
maharshipabdya_data = maharshipabdya_data.drop(columns=["Unnamed: 0", "popularity", "explicit"])

KeyError: "['Unnamed: 0', 'popularity', 'explicit'] not found in axis"

In [70]:
# more data augmentation
track_uris = maharshipabdya_data['track_id'].dropna().values
tracks_info = get_tracks_info(split_into_sublists(track_uris))

TRACK INFO: Completed chunk 1 / 1795
TRACK INFO: Completed chunk 2 / 1795
TRACK INFO: Completed chunk 3 / 1795
TRACK INFO: Completed chunk 4 / 1795
TRACK INFO: Completed chunk 5 / 1795
TRACK INFO: Completed chunk 6 / 1795
TRACK INFO: Completed chunk 7 / 1795
TRACK INFO: Completed chunk 8 / 1795
TRACK INFO: Completed chunk 9 / 1795
TRACK INFO: Completed chunk 10 / 1795
TRACK INFO: Completed chunk 11 / 1795
TRACK INFO: Completed chunk 12 / 1795
TRACK INFO: Completed chunk 13 / 1795
TRACK INFO: Completed chunk 14 / 1795
TRACK INFO: Completed chunk 15 / 1795
TRACK INFO: Completed chunk 16 / 1795
TRACK INFO: Completed chunk 17 / 1795
TRACK INFO: Completed chunk 18 / 1795
TRACK INFO: Completed chunk 19 / 1795
TRACK INFO: Completed chunk 20 / 1795
TRACK INFO: Completed chunk 21 / 1795
TRACK INFO: Completed chunk 22 / 1795
TRACK INFO: Completed chunk 23 / 1795
TRACK INFO: Completed chunk 24 / 1795
TRACK INFO: Completed chunk 25 / 1795
TRACK INFO: Completed chunk 26 / 1795
TRACK INFO: Completed

In [71]:
artist_to_uri = {}
for tracks in tracks_info:
    itracks = tracks['tracks']
    for track in itracks:
        artist_to_uri[track['album']['artists'][0]['name']] = track['album']['artists'][0]['uri']

In [72]:
artist_uris = list(set(artist_to_uri.values()))
slists = split_into_sublists(artist_uris)
artist_info = get_artists_info(slists)

ARTIST INFO: Completed chunk 1 / 334
ARTIST INFO: Completed chunk 2 / 334
ARTIST INFO: Completed chunk 3 / 334
ARTIST INFO: Completed chunk 4 / 334
ARTIST INFO: Completed chunk 5 / 334
ARTIST INFO: Completed chunk 6 / 334
ARTIST INFO: Completed chunk 7 / 334
ARTIST INFO: Completed chunk 8 / 334
ARTIST INFO: Completed chunk 9 / 334
ARTIST INFO: Completed chunk 10 / 334
ARTIST INFO: Completed chunk 11 / 334
ARTIST INFO: Completed chunk 12 / 334
ARTIST INFO: Completed chunk 13 / 334
ARTIST INFO: Completed chunk 14 / 334
ARTIST INFO: Completed chunk 15 / 334
ARTIST INFO: Completed chunk 16 / 334
ARTIST INFO: Completed chunk 17 / 334
ARTIST INFO: Completed chunk 18 / 334
ARTIST INFO: Completed chunk 19 / 334
ARTIST INFO: Completed chunk 20 / 334
ARTIST INFO: Completed chunk 21 / 334
ARTIST INFO: Completed chunk 22 / 334
ARTIST INFO: Completed chunk 23 / 334
ARTIST INFO: Completed chunk 24 / 334
ARTIST INFO: Completed chunk 25 / 334
ARTIST INFO: Completed chunk 26 / 334
ARTIST INFO: Complete

In [73]:
artist_to_genre = {}
artist_to_popularity = {}
for artists in artist_info:
    for artist in artists['artists']:
        artist_to_genre[artist['name']] = artist['genres']
        artist_to_popularity[artist['name']] = artist['popularity']

In [74]:
artist_to_genre_df = pd.DataFrame.from_dict(artist_to_genre, orient="index")
artist_to_popularity_df = pd.DataFrame.from_dict(artist_to_popularity, orient="index", columns=["artist_popularity"])
artist_to_genre_df['genres'] = list(artist_to_genre.values())
artist_to_genre_df = pd.DataFrame(artist_to_genre_df['genres'])
artist_to_genre_df = artist_to_genre_df.join(artist_to_popularity_df)
artist_to_genre_df.columns

Index(['genres', 'artist_popularity'], dtype='object')

In [78]:
maharshipabdya_clean = maharshipabdya_data.join(artist_to_genre_df, on='artists')
maharshipabdya_clean = maharshipabdya_clean.drop(columns="track_genre")

In [83]:
print(maharshipabdya_clean.columns)
print(tomigelo_clean.columns)
print([column for column in maharshipabdya_clean.columns if column not in tomigelo_clean.columns])
print([column for column in tomigelo_clean.columns if column not in maharshipabdya_clean.columns])

Index(['track_id', 'artists', 'album_name', 'track_name', 'duration_ms',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'genres', 'artist_popularity'],
      dtype='object')
Index(['artist_name', 'track_id', 'track_name', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
       'genres', 'artist_popularity'],
      dtype='object')
['artists', 'album_name']
['artist_name']


In [89]:
maharshipabdya_clean = maharshipabdya_clean.rename(columns={"artists": "artist_name"})
#maharshipabdya_clean = maharshipabdya_clean.drop(columns="album_name")

In [90]:
print([column for column in maharshipabdya_clean.columns if column not in tomigelo_clean.columns])
print([column for column in tomigelo_clean.columns if column not in maharshipabdya_clean.columns])

[]
[]


In [93]:
# Concatenating and throwing them together
dataset = pd.concat([maharshipabdya_clean, tomigelo_clean])
dataset = dataset.drop_duplicates(subset="track_id")
dataset.to_csv(REPO_ROOT / "data/song_data/song_dataset.csv")