# Data pre-processing

# Goal

The goal of this notebook is to obtain necessary data to begin our project.
- Training dataset: Obtain audio features for songs in the playlist and categorize genres based on playlists
- Testing dataset: Obtain songs from other playlists that are similar or related to the songs in your selected playlists.

In [1]:
%cd ../recommendation_systems

/Users/alexandermichaeltjhin/Everything/Repos/recommendation_systems


In [2]:
import json
import pandas as pd
from utils import *
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Set up credentials

In [26]:
def spotify(client_id, client_secret):
    # Authenticate
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    return sp

with open('config.json') as config_file:
    config = json.load(config_file)

# private credentials hidden
client_id = config['SPOTIFY_CLIENT_ID']['0']
client_secret = config['SPOTIFY_CLIENT_SECRET']['0']

sp = spotify(client_id, client_secret)

# Identify genres by grouping playlists

In [31]:
genre_dict = {
    'Rap': ['Its time', 'Idfk2', 'idfk'],
    'RnB': ['idek', 'R&B ish'],
    'Classical': ['Classical', 'Classical 2'],
    'Covers': ['Covers', 'Covers 2.0'],
    'EDM': ['Drop'],
    'Old': ['Old'],
    'Easy': ['Easy', 'Easy 2']
}
t = []
username = 'byv1tdsf0wr3gpo2hkjkfd0tk'
def get_tracks_from_username(username, sp):
    playlists = sp.user_playlists(username)
    for playlist in playlists['items']:
        if playlist['owner']['id'] == username:
            name = playlist['name']
            genre = [key for key in genre_dict if name in genre_dict[key]]
            if len(genre) < 0:
                print(f"Playlist {name} does not belong in any genre, skipping this playlist")
                continue
            elif len(genre) < 1:
                print(f"Playlist {name} is in multiple genres, skipping this playlist")
                continue
            genre = genre[0]
            results = sp.playlist(playlist['id'], fields="tracks,next")
            tracks = results['tracks']

            for i, item in enumerate(tracks['items']):
                t.append((name, item['track']['id'], item['track']['name'], genre))
    id_df = pd.DataFrame(t, columns=['playlist', 'id', 'name', 'genre'])
    return id_df
id_df = calculate_time(get_tracks_from_username, username, sp)

Time taken is 3.73 seconds


# Extract audio features for each song

In [37]:
def get_audio_features(list_of_id):
    l = [sp.audio_features(track_id)[0] for track_id in list_of_id]
    return pd.DataFrame(l)
features_df = calculate_time(get_audio_features, id_df.id)
features_df.head()

Time taken is 19.45 seconds


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.517,0.367,10,-12.639,0,0.234,0.504,0.000405,0.6,0.363,69.073,audio_features,4yreExU3eRNTe2iJz6X6k3,spotify:track:4yreExU3eRNTe2iJz6X6k3,https://api.spotify.com/v1/tracks/4yreExU3eRNT...,https://api.spotify.com/v1/audio-analysis/4yre...,63416,4
1,0.652,0.806,5,-5.707,0,0.302,0.122,1.1e-05,0.842,0.779,159.947,audio_features,2FoahzOSxJnalPA8aBUme3,spotify:track:2FoahzOSxJnalPA8aBUme3,https://api.spotify.com/v1/tracks/2FoahzOSxJna...,https://api.spotify.com/v1/audio-analysis/2Foa...,198293,4
2,0.818,0.512,5,-9.056,0,0.0884,0.0963,0.0371,0.11,0.153,131.974,audio_features,6x9pCndnXEoea0CMcfjs9W,spotify:track:6x9pCndnXEoea0CMcfjs9W,https://api.spotify.com/v1/tracks/6x9pCndnXEoe...,https://api.spotify.com/v1/audio-analysis/6x9p...,143719,4
3,0.918,0.681,1,-4.705,1,0.201,0.263,0.0,0.0615,0.706,139.057,audio_features,2FDTHlrBguDzQkp7PVj16Q,spotify:track:2FDTHlrBguDzQkp7PVj16Q,https://api.spotify.com/v1/tracks/2FDTHlrBguDz...,https://api.spotify.com/v1/audio-analysis/2FDT...,229133,4
4,0.701,0.653,1,-4.695,1,0.105,0.479,2.8e-05,0.306,0.505,82.984,audio_features,5KI7I4mEtulXcv5VQJaV35,spotify:track:5KI7I4mEtulXcv5VQJaV35,https://api.spotify.com/v1/tracks/5KI7I4mEtulX...,https://api.spotify.com/v1/audio-analysis/5KI7...,231338,4


In [38]:
merged_df = pd.merge(id_df, features_df, how='inner', left_on='id', right_on='id')
merged_df = merged_df.iloc[1:].reset_index(drop=True)

# Save to csv

In [39]:
merged_df.to_csv('my_songs.csv')