In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

In [2]:
import pandas as pd
import numpy as np

In [3]:
import warnings

warnings.filterwarnings("ignore", message="DataFrame is highly fragmented.", category=pd.errors.PerformanceWarning)

In [4]:
import pandas as pd
import numpy as np
import requests
from urllib.parse import urlencode
from dotenv import load_dotenv
import os
import math
from itertools import chain

pd.options.display.max_columns=999

## 1. Authorization Flow

In [5]:
from src.data_download.api_utils import authorize_user

In [6]:
authorize_user()

Go to the following URL and authorize access: https://accounts.spotify.com/authorize?client_id=70dabab8a4f44e8c9014cc7b8f86d086&response_type=code&redirect_uri=https%3A%2F%2Foauth.pstmn.io%2Fv1%2Fbrowser-callback&scope=user-library-read


'https://accounts.spotify.com/authorize?client_id=70dabab8a4f44e8c9014cc7b8f86d086&response_type=code&redirect_uri=https%3A%2F%2Foauth.pstmn.io%2Fv1%2Fbrowser-callback&scope=user-library-read'

In [7]:
auth_code="AQAgh3w7foaU1tVvm2ffSfQU3UDi-2SwybyNwLrUKSFc-kIizd0oAm6nZIRBaEOMzIupnGWTX7QGGaja7uldjsDiBgDm7n1KHD5-VEmKSMH91073LmlFXSvSSMOkPzL0gvZDXae2EAQN0Xp5fJMC7oM1IrHCVnhh17UwUZqyB_eu2WGX4LcUqvSOrb7hhlhjZomqTBXh3_Keqix83wDS790"

In [8]:
from src.data_download.api_utils import SpotifyAPI

In [9]:
spotify_api = SpotifyAPI(auth_code=auth_code)

In [10]:
spotify_api.get_access_token()

## 2. Making API Requests

In [11]:
track_id = "1BSq0hYijZbubyR6waNyQn"

endpoint = f"audio-features/{track_id}"
response = spotify_api.execute_api_request(endpoint)

if response.status_code != 200:
    raise HTTPError(response.status_code)

In [12]:
response.status_code

200

In [13]:
response.json().keys()

dict_keys(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature'])

In [14]:
response.json()['danceability']

0.784

In [15]:
response.json()

{'danceability': 0.784,
 'energy': 0.402,
 'key': 9,
 'loudness': -9.719,
 'mode': 0,
 'speechiness': 0.324,
 'acousticness': 0.0287,
 'instrumentalness': 0.0071,
 'liveness': 0.0963,
 'valence': 0.402,
 'tempo': 166.678,
 'type': 'audio_features',
 'id': '1BSq0hYijZbubyR6waNyQn',
 'uri': 'spotify:track:1BSq0hYijZbubyR6waNyQn',
 'track_href': 'https://api.spotify.com/v1/tracks/1BSq0hYijZbubyR6waNyQn',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/1BSq0hYijZbubyR6waNyQn',
 'duration_ms': 335800,
 'time_signature': 4}

## 3. Get Library Tracks

In [16]:
tracks = spotify_api.get_user_library_tracks()

In [17]:
type(tracks)

list

In [18]:
tracks[0].keys()

dict_keys(['added_at', 'track'])

In [19]:
tracks[0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

In [22]:
def get_entire_user_library():
    """
    Retrieves the entire library of tracks saved 
    in the authenticated user's library.

    This function iterates through all pages of the user's 
    library tracks, retrieving tracks in batches of 50 
    until all tracks have been retrieved.

    Returns:
    list: A list of track objects representing the user's entire library.
    """
    tracks = []
    offset=0
    length = 1
    while length > 0:
        new_tracks = spotify_api.get_user_library_tracks(offset=offset)
        length = len(new_tracks) # if no tracks returned, break the while loop
        tracks += new_tracks
        offset += 50
    return tracks

In [23]:
library = get_entire_user_library()
len(library)

5770

In [24]:
library[0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

In [27]:
def process_library_tracks(library):
    """
    Processes a list of track information from the user's library 
    and converts it into a pandas DataFrame.

    This function extracts relevant track and album details 
    from the provided library data and organizes
    them into a structured DataFrame for further analysis or manipulation.

    Parameters:
    library (list): A list of dictionaries where each dictionary contains 
                    information about a track, as returned by the Spotify API.

    Returns:
    pandas.DataFrame: A DataFrame containing the processed track information.
    """
    df_dict = {
        'id':[],
        'date_added':[],
        'name':[],
        'popularity':[],
        'is_local':[],
        'is_explicit':[],
        'duration_ms':[],
        'album_id':[],
        'album_name':[],
        'album_release_date':[],
        'album_release_date_precision':[],
        'artist_id':[],
        'artist_name':[]
    }

    for item in library:
        df_dict['id'].append(item['track']['id']),
        df_dict['date_added'].append(item['added_at']),
        df_dict['name'].append(item['track']['name']),
        df_dict['popularity'].append(item['track']['popularity']),
        df_dict['is_local'].append(item['track']['is_local']),
        df_dict['is_explicit'].append(item['track']['explicit']),
        df_dict['duration_ms'].append(item['track']['duration_ms']),
        df_dict['album_id'].append(item['track']['album']['id']),
        df_dict['album_name'].append(item['track']['album']['name']),
        df_dict['album_release_date'].append(item['track']['album']['release_date']),
        df_dict['album_release_date_precision'].append(item['track']['album']['release_date_precision']),
        df_dict['artist_id'].append(item['track']['artists'][0]['id']),
        df_dict['artist_name'].append(item['track']['artists'][0]['name'])

    return pd.DataFrame(df_dict)

In [28]:
df1 = process_library_tracks(library)
print(df1.shape)
df1.head()

(5770, 13)


Unnamed: 0,id,date_added,name,popularity,is_local,is_explicit,duration_ms,album_id,album_name,album_release_date,album_release_date_precision,artist_id,artist_name
0,5gciWzZRdQHiWkyBaHPSdB,2024-05-21T11:43:18Z,Kanou Dan Yen,35,False,False,214293,1N7ckIzs97RiLEXWgEGgli,Fenfo,2018-05-25,day,4G5ZJny3HvX6Il7eHVfnNC,Fatoumata Diawara
1,0Ty1I0eAkYpSGd7MJWXM2n,2024-05-20T12:30:49Z,Gentle Persuasion,27,False,False,405520,1RUBBim9ey8E6Kznw4UIkM,My Name is Doug Hream Blunt: Featuring the Hit...,2015-10-16,day,19f168OQu5y06r89NVmS55,Doug Hream Blunt
2,1YN3aermJfsy53lU90Nssf,2024-05-20T08:55:04Z,Life's Gone Down Low,40,False,False,297162,5W7bPvhB7XDg4RcHKSWsQU,Danger,1976-01-01,day,1iElGdidl4zFXOpaaem4wZ,The Lijadu Sisters
3,6BkeuHmO4P69Ln2BSonrXi,2024-05-20T08:18:51Z,Jumpin' Jack Flash,40,False,False,215866,24R9CyPLFa0CJrSZ9whlT3,Ananda Shankar (US Internet Release),1970,year,5eNWwEF0woj5E5Fnu9qXaQ,Ananda Shankar
4,2YfRKF4sWeV1t1NqBoUgxH,2024-05-20T08:06:55Z,Anchin Kfu Ayinkash,51,False,False,324880,5i6yMY4F7UoOuJD6wRV6NN,Wede Harer Guzo,2016-06-17,day,0rsN9DKQhTCvkgbByOOBIm,Hailu Mergia


### Get Audio Features

In [29]:
track_ids = "3CqaTHbiU2nBy3Ar9RnDHN,1dN55SR4aP4EpGdYcQpPb3"
response_json = spotify_api.get_audio_features_several_tracks(track_ids)

In [30]:
response_json

[{'danceability': 0.652,
  'energy': 0.877,
  'key': 6,
  'loudness': -6.281,
  'mode': 0,
  'speechiness': 0.0339,
  'acousticness': 0.00307,
  'instrumentalness': 0.699,
  'liveness': 0.186,
  'valence': 0.562,
  'tempo': 124.024,
  'type': 'audio_features',
  'id': '3CqaTHbiU2nBy3Ar9RnDHN',
  'uri': 'spotify:track:3CqaTHbiU2nBy3Ar9RnDHN',
  'track_href': 'https://api.spotify.com/v1/tracks/3CqaTHbiU2nBy3Ar9RnDHN',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/3CqaTHbiU2nBy3Ar9RnDHN',
  'duration_ms': 222316,
  'time_signature': 4},
 {'danceability': 0.736,
  'energy': 0.603,
  'key': 9,
  'loudness': -7.763,
  'mode': 0,
  'speechiness': 0.0276,
  'acousticness': 0.79,
  'instrumentalness': 0.509,
  'liveness': 0.141,
  'valence': 0.187,
  'tempo': 123.957,
  'type': 'audio_features',
  'id': '1dN55SR4aP4EpGdYcQpPb3',
  'uri': 'spotify:track:1dN55SR4aP4EpGdYcQpPb3',
  'track_href': 'https://api.spotify.com/v1/tracks/1dN55SR4aP4EpGdYcQpPb3',
  'analysis_url': 'https://a

In [31]:
def get_library_audio_features(df):
    """
    Retrieves audio features for tracks in a library DataFrame 
    from the Spotify API.

    This function iterates through the rows of the provided DataFrame,
    fetching audio features for tracks in batches of 100 to comply 
    with Spotify API limitations. It then compiles all
    the fetched audio features into a list of dictionaries.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing track information,
                           typically obtained from the 
                           `process_library_tracks` function.

    Returns:
    list: A list of dictionaries containing the audio features
          for each track. If the request fails,
          an error message is returned instead.
    """
    dict_list = []
    
    for n in range(0, (math.ceil(len(df) / 100.0) * 100), 100): # loop through library df rows, 100 at a time
        track_ids = (",").join(df[n:n+100]['id'].to_list()) # get the track ids into a comma-separated string
        dict_list += spotify_api.get_audio_features_several_tracks(track_ids)
        
    return dict_list

In [32]:
audio_features_dicts = get_library_audio_features(df1)

In [33]:
audio_features_dicts[0]

{'danceability': 0.763,
 'energy': 0.451,
 'key': 4,
 'loudness': -10.693,
 'mode': 0,
 'speechiness': 0.0397,
 'acousticness': 0.376,
 'instrumentalness': 0.117,
 'liveness': 0.211,
 'valence': 0.173,
 'tempo': 110.001,
 'type': 'audio_features',
 'id': '5gciWzZRdQHiWkyBaHPSdB',
 'uri': 'spotify:track:5gciWzZRdQHiWkyBaHPSdB',
 'track_href': 'https://api.spotify.com/v1/tracks/5gciWzZRdQHiWkyBaHPSdB',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/5gciWzZRdQHiWkyBaHPSdB',
 'duration_ms': 214293,
 'time_signature': 4}

In [34]:
audio_features = pd.DataFrame(audio_features_dicts)
print(audio_features.shape)
audio_features.head()

(5770, 18)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.763,0.451,4,-10.693,0,0.0397,0.376,0.117,0.211,0.173,110.001,audio_features,5gciWzZRdQHiWkyBaHPSdB,spotify:track:5gciWzZRdQHiWkyBaHPSdB,https://api.spotify.com/v1/tracks/5gciWzZRdQHi...,https://api.spotify.com/v1/audio-analysis/5gci...,214293,4
1,0.594,0.635,1,-8.069,0,0.0411,0.067,0.782,0.093,0.709,138.624,audio_features,0Ty1I0eAkYpSGd7MJWXM2n,spotify:track:0Ty1I0eAkYpSGd7MJWXM2n,https://api.spotify.com/v1/tracks/0Ty1I0eAkYpS...,https://api.spotify.com/v1/audio-analysis/0Ty1...,405520,4
2,0.607,0.428,0,-8.809,0,0.072,0.518,7e-06,0.201,0.597,153.24,audio_features,1YN3aermJfsy53lU90Nssf,spotify:track:1YN3aermJfsy53lU90Nssf,https://api.spotify.com/v1/tracks/1YN3aermJfsy...,https://api.spotify.com/v1/audio-analysis/1YN3...,297163,4
3,0.523,0.722,6,-6.533,1,0.0307,0.074,0.00336,0.122,0.506,129.424,audio_features,6BkeuHmO4P69Ln2BSonrXi,spotify:track:6BkeuHmO4P69Ln2BSonrXi,https://api.spotify.com/v1/tracks/6BkeuHmO4P69...,https://api.spotify.com/v1/audio-analysis/6Bke...,215867,4
4,0.471,0.551,5,-10.484,0,0.0291,0.04,0.147,0.345,0.692,104.791,audio_features,2YfRKF4sWeV1t1NqBoUgxH,spotify:track:2YfRKF4sWeV1t1NqBoUgxH,https://api.spotify.com/v1/tracks/2YfRKF4sWeV1...,https://api.spotify.com/v1/audio-analysis/2YfR...,324880,4


In [35]:
audio_features.to_csv(os.path.abspath('..') + "/data/" + "audio_features.csv", index=False)

In [36]:
right = audio_features[[
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'id'
]]

df2 = pd.merge(df1, right, on='id')
print(df2.shape)
df2.head()

(5770, 24)


Unnamed: 0,id,date_added,name,popularity,is_local,is_explicit,duration_ms,album_id,album_name,album_release_date,album_release_date_precision,artist_id,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,5gciWzZRdQHiWkyBaHPSdB,2024-05-21T11:43:18Z,Kanou Dan Yen,35,False,False,214293,1N7ckIzs97RiLEXWgEGgli,Fenfo,2018-05-25,day,4G5ZJny3HvX6Il7eHVfnNC,Fatoumata Diawara,0.763,0.451,4,-10.693,0,0.0397,0.376,0.117,0.211,0.173,110.001
1,0Ty1I0eAkYpSGd7MJWXM2n,2024-05-20T12:30:49Z,Gentle Persuasion,27,False,False,405520,1RUBBim9ey8E6Kznw4UIkM,My Name is Doug Hream Blunt: Featuring the Hit...,2015-10-16,day,19f168OQu5y06r89NVmS55,Doug Hream Blunt,0.594,0.635,1,-8.069,0,0.0411,0.067,0.782,0.093,0.709,138.624
2,1YN3aermJfsy53lU90Nssf,2024-05-20T08:55:04Z,Life's Gone Down Low,40,False,False,297162,5W7bPvhB7XDg4RcHKSWsQU,Danger,1976-01-01,day,1iElGdidl4zFXOpaaem4wZ,The Lijadu Sisters,0.607,0.428,0,-8.809,0,0.072,0.518,7e-06,0.201,0.597,153.24
3,6BkeuHmO4P69Ln2BSonrXi,2024-05-20T08:18:51Z,Jumpin' Jack Flash,40,False,False,215866,24R9CyPLFa0CJrSZ9whlT3,Ananda Shankar (US Internet Release),1970,year,5eNWwEF0woj5E5Fnu9qXaQ,Ananda Shankar,0.523,0.722,6,-6.533,1,0.0307,0.074,0.00336,0.122,0.506,129.424
4,2YfRKF4sWeV1t1NqBoUgxH,2024-05-20T08:06:55Z,Anchin Kfu Ayinkash,51,False,False,324880,5i6yMY4F7UoOuJD6wRV6NN,Wede Harer Guzo,2016-06-17,day,0rsN9DKQhTCvkgbByOOBIm,Hailu Mergia,0.471,0.551,5,-10.484,0,0.0291,0.04,0.147,0.345,0.692,104.791


### Get Album Info

In [37]:
response_json = spotify_api.get_several_albums("2fDJNwprKYSK4ovmRzcoe4,7GK7EmnHe9MdYTDSKHqMg7")

In [38]:
response_json[0].keys()

dict_keys(['album_type', 'artists', 'available_markets', 'copyrights', 'external_ids', 'external_urls', 'genres', 'href', 'id', 'images', 'label', 'name', 'popularity', 'release_date', 'release_date_precision', 'total_tracks', 'tracks', 'type', 'uri'])

In [39]:
response_json[1]['popularity']

33

In [40]:
def get_library_albums(df):
    df_dict = {
        'album_id':[],
        'album_popularity':[],
    }
    album_id_list = df.album_id.unique()
    for n in range(0, (math.ceil(len(album_id_list) / 20) * 20), 20):
        album_ids = (",").join(album_id_list[n:n+20])
        albums = spotify_api.get_several_albums(album_ids)
        for album in albums:
            df_dict['album_id'].append(album['id'])
            df_dict['album_popularity'].append(album['popularity'])
    return  pd.DataFrame(df_dict)

In [41]:
albums = get_library_albums(df2)
print(albums.shape)
albums.head()

(3804, 2)


Unnamed: 0,album_id,album_popularity
0,1N7ckIzs97RiLEXWgEGgli,37
1,1RUBBim9ey8E6Kznw4UIkM,16
2,5W7bPvhB7XDg4RcHKSWsQU,30
3,24R9CyPLFa0CJrSZ9whlT3,30
4,5i6yMY4F7UoOuJD6wRV6NN,43


In [42]:
albums.to_csv(os.path.abspath('..') + "/data/" + "albums.csv", index=False)

In [43]:
df3 = pd.merge(df2, albums, on='album_id')
print(df3.shape)
df3.head()

(5770, 25)


Unnamed: 0,id,date_added,name,popularity,is_local,is_explicit,duration_ms,album_id,album_name,album_release_date,album_release_date_precision,artist_id,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,album_popularity
0,5gciWzZRdQHiWkyBaHPSdB,2024-05-21T11:43:18Z,Kanou Dan Yen,35,False,False,214293,1N7ckIzs97RiLEXWgEGgli,Fenfo,2018-05-25,day,4G5ZJny3HvX6Il7eHVfnNC,Fatoumata Diawara,0.763,0.451,4,-10.693,0,0.0397,0.376,0.117,0.211,0.173,110.001,37
1,0Ty1I0eAkYpSGd7MJWXM2n,2024-05-20T12:30:49Z,Gentle Persuasion,27,False,False,405520,1RUBBim9ey8E6Kznw4UIkM,My Name is Doug Hream Blunt: Featuring the Hit...,2015-10-16,day,19f168OQu5y06r89NVmS55,Doug Hream Blunt,0.594,0.635,1,-8.069,0,0.0411,0.067,0.782,0.093,0.709,138.624,16
2,1YN3aermJfsy53lU90Nssf,2024-05-20T08:55:04Z,Life's Gone Down Low,40,False,False,297162,5W7bPvhB7XDg4RcHKSWsQU,Danger,1976-01-01,day,1iElGdidl4zFXOpaaem4wZ,The Lijadu Sisters,0.607,0.428,0,-8.809,0,0.072,0.518,7e-06,0.201,0.597,153.24,30
3,6BkeuHmO4P69Ln2BSonrXi,2024-05-20T08:18:51Z,Jumpin' Jack Flash,40,False,False,215866,24R9CyPLFa0CJrSZ9whlT3,Ananda Shankar (US Internet Release),1970,year,5eNWwEF0woj5E5Fnu9qXaQ,Ananda Shankar,0.523,0.722,6,-6.533,1,0.0307,0.074,0.00336,0.122,0.506,129.424,30
4,2YfRKF4sWeV1t1NqBoUgxH,2024-05-20T08:06:55Z,Anchin Kfu Ayinkash,51,False,False,324880,5i6yMY4F7UoOuJD6wRV6NN,Wede Harer Guzo,2016-06-17,day,0rsN9DKQhTCvkgbByOOBIm,Hailu Mergia,0.471,0.551,5,-10.484,0,0.0291,0.04,0.147,0.345,0.692,104.791,43


### Get Artist Info

In [44]:
response_json = spotify_api.get_several_artists("2MPHBxznH1fj59jbOWY38u,7A0awCXkE1FtSU8B0qwOJQ")

In [45]:
response_json[0].keys()

dict_keys(['external_urls', 'followers', 'genres', 'href', 'id', 'images', 'name', 'popularity', 'type', 'uri'])

In [46]:
response_json[1]['genres']

['electronica', 'future garage', 'indie soul', 'indietronica']

In [47]:
def get_library_artists(df):
    df_dict = {
        'artist_id':[],
        'artist_popularity':[],
        'artist_genres':[],
    }
    artist_id_list = df.artist_id.unique()
    for n in range(0, (math.ceil(len(artist_id_list) / 50) * 50), 50):
        artist_ids = (",").join(artist_id_list[n:n+50])
        artists = spotify_api.get_several_artists(artist_ids)
        for artist in artists:
            df_dict['artist_id'].append(artist['id'])
            df_dict['artist_popularity'].append(artist['popularity'])
            df_dict['artist_genres'].append(artist['genres'])
    return  pd.DataFrame(df_dict)

In [48]:
artists = get_library_artists(df3)
print(artists.shape)
artists.head()

(2199, 3)


Unnamed: 0,artist_id,artist_popularity,artist_genres
0,4G5ZJny3HvX6Il7eHVfnNC,51,"[afropop, desert blues, malian blues, mande po..."
1,19f168OQu5y06r89NVmS55,16,[rare groove]
2,1iElGdidl4zFXOpaaem4wZ,37,"[afrobeat, afropop, world]"
3,5eNWwEF0woj5E5Fnu9qXaQ,25,"[raga rock, sitar, world fusion]"
4,0rsN9DKQhTCvkgbByOOBIm,39,[ethio-jazz]


In [49]:
artists.to_csv(os.path.abspath('..') + "/data/" + "artists.csv", index=False)

In [50]:
df4 = pd.merge(df3, artists, on='artist_id')
print(df4.shape)
df4.head()

(5770, 27)


Unnamed: 0,id,date_added,name,popularity,is_local,is_explicit,duration_ms,album_id,album_name,album_release_date,album_release_date_precision,artist_id,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,album_popularity,artist_popularity,artist_genres
0,5gciWzZRdQHiWkyBaHPSdB,2024-05-21T11:43:18Z,Kanou Dan Yen,35,False,False,214293,1N7ckIzs97RiLEXWgEGgli,Fenfo,2018-05-25,day,4G5ZJny3HvX6Il7eHVfnNC,Fatoumata Diawara,0.763,0.451,4,-10.693,0,0.0397,0.376,0.117,0.211,0.173,110.001,37,51,"[afropop, desert blues, malian blues, mande po..."
1,0Ty1I0eAkYpSGd7MJWXM2n,2024-05-20T12:30:49Z,Gentle Persuasion,27,False,False,405520,1RUBBim9ey8E6Kznw4UIkM,My Name is Doug Hream Blunt: Featuring the Hit...,2015-10-16,day,19f168OQu5y06r89NVmS55,Doug Hream Blunt,0.594,0.635,1,-8.069,0,0.0411,0.067,0.782,0.093,0.709,138.624,16,16,[rare groove]
2,1YN3aermJfsy53lU90Nssf,2024-05-20T08:55:04Z,Life's Gone Down Low,40,False,False,297162,5W7bPvhB7XDg4RcHKSWsQU,Danger,1976-01-01,day,1iElGdidl4zFXOpaaem4wZ,The Lijadu Sisters,0.607,0.428,0,-8.809,0,0.072,0.518,7e-06,0.201,0.597,153.24,30,37,"[afrobeat, afropop, world]"
3,6BkeuHmO4P69Ln2BSonrXi,2024-05-20T08:18:51Z,Jumpin' Jack Flash,40,False,False,215866,24R9CyPLFa0CJrSZ9whlT3,Ananda Shankar (US Internet Release),1970,year,5eNWwEF0woj5E5Fnu9qXaQ,Ananda Shankar,0.523,0.722,6,-6.533,1,0.0307,0.074,0.00336,0.122,0.506,129.424,30,25,"[raga rock, sitar, world fusion]"
4,2YfRKF4sWeV1t1NqBoUgxH,2024-05-20T08:06:55Z,Anchin Kfu Ayinkash,51,False,False,324880,5i6yMY4F7UoOuJD6wRV6NN,Wede Harer Guzo,2016-06-17,day,0rsN9DKQhTCvkgbByOOBIm,Hailu Mergia,0.471,0.551,5,-10.484,0,0.0291,0.04,0.147,0.345,0.692,104.791,43,39,[ethio-jazz]


In [51]:
df4.to_csv(os.path.abspath('..') + "/data/" + "liked_songs_raw.csv", index=False)

## 4. Cleaning the Data

In [52]:
df = pd.read_csv(os.path.abspath('..') + "/data/" + "liked_songs_raw.csv")
df.head()

Unnamed: 0,id,date_added,name,popularity,is_local,is_explicit,duration_ms,album_id,album_name,album_release_date,album_release_date_precision,artist_id,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,album_popularity,artist_popularity,artist_genres
0,5gciWzZRdQHiWkyBaHPSdB,2024-05-21T11:43:18Z,Kanou Dan Yen,35,False,False,214293,1N7ckIzs97RiLEXWgEGgli,Fenfo,2018-05-25,day,4G5ZJny3HvX6Il7eHVfnNC,Fatoumata Diawara,0.763,0.451,4,-10.693,0,0.0397,0.376,0.117,0.211,0.173,110.001,37,51,"['afropop', 'desert blues', 'malian blues', 'm..."
1,0Ty1I0eAkYpSGd7MJWXM2n,2024-05-20T12:30:49Z,Gentle Persuasion,27,False,False,405520,1RUBBim9ey8E6Kznw4UIkM,My Name is Doug Hream Blunt: Featuring the Hit...,2015-10-16,day,19f168OQu5y06r89NVmS55,Doug Hream Blunt,0.594,0.635,1,-8.069,0,0.0411,0.067,0.782,0.093,0.709,138.624,16,16,['rare groove']
2,1YN3aermJfsy53lU90Nssf,2024-05-20T08:55:04Z,Life's Gone Down Low,40,False,False,297162,5W7bPvhB7XDg4RcHKSWsQU,Danger,1976-01-01,day,1iElGdidl4zFXOpaaem4wZ,The Lijadu Sisters,0.607,0.428,0,-8.809,0,0.072,0.518,7e-06,0.201,0.597,153.24,30,37,"['afrobeat', 'afropop', 'world']"
3,6BkeuHmO4P69Ln2BSonrXi,2024-05-20T08:18:51Z,Jumpin' Jack Flash,40,False,False,215866,24R9CyPLFa0CJrSZ9whlT3,Ananda Shankar (US Internet Release),1970,year,5eNWwEF0woj5E5Fnu9qXaQ,Ananda Shankar,0.523,0.722,6,-6.533,1,0.0307,0.074,0.00336,0.122,0.506,129.424,30,25,"['raga rock', 'sitar', 'world fusion']"
4,2YfRKF4sWeV1t1NqBoUgxH,2024-05-20T08:06:55Z,Anchin Kfu Ayinkash,51,False,False,324880,5i6yMY4F7UoOuJD6wRV6NN,Wede Harer Guzo,2016-06-17,day,0rsN9DKQhTCvkgbByOOBIm,Hailu Mergia,0.471,0.551,5,-10.484,0,0.0291,0.04,0.147,0.345,0.692,104.791,43,39,['ethio-jazz']


In [54]:
df.date_added = pd.to_datetime(df.date_added)
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month
df['time_added'] = df['date_added'].dt.time

df['day_of_week_added'] = df['date_added'].dt.dayofweek

days_of_week_mapping = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

df['day_of_week_added'] = df['day_of_week_added'].apply(lambda x: days_of_week_mapping[x])

month_mapping = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December'
}


df['month_added'] = df['month_added'].apply(lambda x: month_mapping[x])

df['date_added'] = df['date_added'].dt.date

In [55]:
df['duration_ms'] = df['duration_ms'] / 1000
df = df.rename(columns={'duration_ms':'duration_s'})
df['duration_min'] = df['duration_s'] / 60

In [56]:
df = df[['id',
 'name',
 'popularity',
 'is_local',
 'is_explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'date_added',
 'year_added',
 'month_added',
 'day_of_week_added',
 'time_added',
 'duration_s',
 'duration_min',
 'album_id',
 'album_name',
 'album_popularity',
 'album_release_date',
 'album_release_date_precision',
 'artist_id',
 'artist_name',
 'artist_popularity',
 'artist_genres']]
print(df.shape)
df.head()

(5770, 32)


Unnamed: 0,id,name,popularity,is_local,is_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,date_added,year_added,month_added,day_of_week_added,time_added,duration_s,duration_min,album_id,album_name,album_popularity,album_release_date,album_release_date_precision,artist_id,artist_name,artist_popularity,artist_genres
0,5gciWzZRdQHiWkyBaHPSdB,Kanou Dan Yen,35,False,False,0.763,0.451,4,-10.693,0,0.0397,0.376,0.117,0.211,0.173,110.001,2024-05-21,2024,May,Tuesday,11:43:18,214.293,3.57155,1N7ckIzs97RiLEXWgEGgli,Fenfo,37,2018-05-25,day,4G5ZJny3HvX6Il7eHVfnNC,Fatoumata Diawara,51,"['afropop', 'desert blues', 'malian blues', 'm..."
1,0Ty1I0eAkYpSGd7MJWXM2n,Gentle Persuasion,27,False,False,0.594,0.635,1,-8.069,0,0.0411,0.067,0.782,0.093,0.709,138.624,2024-05-20,2024,May,Monday,12:30:49,405.52,6.758667,1RUBBim9ey8E6Kznw4UIkM,My Name is Doug Hream Blunt: Featuring the Hit...,16,2015-10-16,day,19f168OQu5y06r89NVmS55,Doug Hream Blunt,16,['rare groove']
2,1YN3aermJfsy53lU90Nssf,Life's Gone Down Low,40,False,False,0.607,0.428,0,-8.809,0,0.072,0.518,7e-06,0.201,0.597,153.24,2024-05-20,2024,May,Monday,08:55:04,297.162,4.9527,5W7bPvhB7XDg4RcHKSWsQU,Danger,30,1976-01-01,day,1iElGdidl4zFXOpaaem4wZ,The Lijadu Sisters,37,"['afrobeat', 'afropop', 'world']"
3,6BkeuHmO4P69Ln2BSonrXi,Jumpin' Jack Flash,40,False,False,0.523,0.722,6,-6.533,1,0.0307,0.074,0.00336,0.122,0.506,129.424,2024-05-20,2024,May,Monday,08:18:51,215.866,3.597767,24R9CyPLFa0CJrSZ9whlT3,Ananda Shankar (US Internet Release),30,1970,year,5eNWwEF0woj5E5Fnu9qXaQ,Ananda Shankar,25,"['raga rock', 'sitar', 'world fusion']"
4,2YfRKF4sWeV1t1NqBoUgxH,Anchin Kfu Ayinkash,51,False,False,0.471,0.551,5,-10.484,0,0.0291,0.04,0.147,0.345,0.692,104.791,2024-05-20,2024,May,Monday,08:06:55,324.88,5.414667,5i6yMY4F7UoOuJD6wRV6NN,Wede Harer Guzo,43,2016-06-17,day,0rsN9DKQhTCvkgbByOOBIm,Hailu Mergia,39,['ethio-jazz']


### Genres

In [57]:
lists_of_genres = df['artist_genres'].to_list()
genres = sorted(set(list(chain.from_iterable(lists_of_genres))))

In [58]:
genres[:5]

[' ', '"', '&', "'", '+']

In [59]:
genres_df = df[['artist_id', 'artist_genres']].copy()
genres_df.head()

Unnamed: 0,artist_id,artist_genres
0,4G5ZJny3HvX6Il7eHVfnNC,"['afropop', 'desert blues', 'malian blues', 'm..."
1,19f168OQu5y06r89NVmS55,['rare groove']
2,1iElGdidl4zFXOpaaem4wZ,"['afrobeat', 'afropop', 'world']"
3,5eNWwEF0woj5E5Fnu9qXaQ,"['raga rock', 'sitar', 'world fusion']"
4,0rsN9DKQhTCvkgbByOOBIm,['ethio-jazz']


In [60]:
for genre in genres:
    genres_df[genre] = genres_df['artist_genres'].apply(lambda x: 1 if genre in x else 0)
genres_df.drop(columns='artist_genres', inplace=True)
genres_df.drop_duplicates(inplace=True)
print(genres_df.shape)
genres_df.head()

(2199, 38)


Unnamed: 0,artist_id,Unnamed: 2,"""",&,',+,",",-,2,:,[,],a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z
0,4G5ZJny3HvX6Il7eHVfnNC,1,0,0,1,0,1,0,0,0,1,1,1,1,0,1,1,1,0,0,1,0,0,1,1,1,1,1,0,1,1,1,1,0,1,0,0,0
1,19f168OQu5y06r89NVmS55,1,0,0,1,0,0,0,0,0,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0
2,1iElGdidl4zFXOpaaem4wZ,1,0,0,1,0,1,0,0,0,1,1,1,1,0,1,1,1,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0
3,5eNWwEF0woj5E5Fnu9qXaQ,1,0,0,1,0,1,0,0,0,1,1,1,0,1,1,0,1,1,0,1,0,1,1,0,1,1,0,0,1,1,1,1,0,1,0,0,0
4,0rsN9DKQhTCvkgbByOOBIm,0,0,0,1,0,0,1,0,0,1,1,1,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1


In [61]:
df_clean = pd.merge(df, genres_df, on='artist_id', how='inner')
df_clean.drop(columns='artist_genres', inplace=True)
print(df_clean.shape)
df_clean.head()

(5770, 68)


Unnamed: 0,id,name,popularity,is_local,is_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,date_added,year_added,month_added,day_of_week_added,time_added,duration_s,duration_min,album_id,album_name,album_popularity,album_release_date,album_release_date_precision,artist_id,artist_name,artist_popularity,Unnamed: 32,"""",&,',+,",",-,2,:,[,],a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z
0,5gciWzZRdQHiWkyBaHPSdB,Kanou Dan Yen,35,False,False,0.763,0.451,4,-10.693,0,0.0397,0.376,0.117,0.211,0.173,110.001,2024-05-21,2024,May,Tuesday,11:43:18,214.293,3.57155,1N7ckIzs97RiLEXWgEGgli,Fenfo,37,2018-05-25,day,4G5ZJny3HvX6Il7eHVfnNC,Fatoumata Diawara,51,1,0,0,1,0,1,0,0,0,1,1,1,1,0,1,1,1,0,0,1,0,0,1,1,1,1,1,0,1,1,1,1,0,1,0,0,0
1,0Ty1I0eAkYpSGd7MJWXM2n,Gentle Persuasion,27,False,False,0.594,0.635,1,-8.069,0,0.0411,0.067,0.782,0.093,0.709,138.624,2024-05-20,2024,May,Monday,12:30:49,405.52,6.758667,1RUBBim9ey8E6Kznw4UIkM,My Name is Doug Hream Blunt: Featuring the Hit...,16,2015-10-16,day,19f168OQu5y06r89NVmS55,Doug Hream Blunt,16,1,0,0,1,0,0,0,0,0,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0
2,1YN3aermJfsy53lU90Nssf,Life's Gone Down Low,40,False,False,0.607,0.428,0,-8.809,0,0.072,0.518,7e-06,0.201,0.597,153.24,2024-05-20,2024,May,Monday,08:55:04,297.162,4.9527,5W7bPvhB7XDg4RcHKSWsQU,Danger,30,1976-01-01,day,1iElGdidl4zFXOpaaem4wZ,The Lijadu Sisters,37,1,0,0,1,0,1,0,0,0,1,1,1,1,0,1,1,1,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0
3,6BkeuHmO4P69Ln2BSonrXi,Jumpin' Jack Flash,40,False,False,0.523,0.722,6,-6.533,1,0.0307,0.074,0.00336,0.122,0.506,129.424,2024-05-20,2024,May,Monday,08:18:51,215.866,3.597767,24R9CyPLFa0CJrSZ9whlT3,Ananda Shankar (US Internet Release),30,1970,year,5eNWwEF0woj5E5Fnu9qXaQ,Ananda Shankar,25,1,0,0,1,0,1,0,0,0,1,1,1,0,1,1,0,1,1,0,1,0,1,1,0,1,1,0,0,1,1,1,1,0,1,0,0,0
4,2YfRKF4sWeV1t1NqBoUgxH,Anchin Kfu Ayinkash,51,False,False,0.471,0.551,5,-10.484,0,0.0291,0.04,0.147,0.345,0.692,104.791,2024-05-20,2024,May,Monday,08:06:55,324.88,5.414667,5i6yMY4F7UoOuJD6wRV6NN,Wede Harer Guzo,43,2016-06-17,day,0rsN9DKQhTCvkgbByOOBIm,Hailu Mergia,39,0,0,0,1,0,0,1,0,0,1,1,1,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1
