<a href="https://colab.research.google.com/github/apanand/UChicago-MSADS/blob/main/Spotify%20Recommender/Data_Acquisition_and_Wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Acquisition and Wrangling

This notebook acquires and wrangles the Spotify data that is eventually loaded into the simple neural network in the Modeling.ipynb file. It relies on Spotify developer credentials that I created specifically for this project, so it may not run for you. Do not run this notebook to evaluate the Modeling.ipynb file: instead, load the datasets this notebook creates, which I have added to the "data" folder in GitHub, into the Modeling file so that it runs more efficiently.

In [None]:
import collections
import itertools
import json
import math
import os
import warnings
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
from numpy import argmax

# Spotify
! pip3 install spotipy
import requests  # HTTP library underneath spotipy; the retry handlers catch requests.exceptions.RequestException
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyOAuth

warnings.filterwarnings("ignore")

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.4-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.0/252.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, spotipy
Successfully installed redis-5.0.4 spotipy-2.23.0


# **Wrangling my Listening Data**

This section is to load and wrangle my Spotify listening history from the past year, which I acquired by requesting directly from Spotify.

In [None]:
# Colab-only helper for moving files between the local machine and the runtime.
from google.colab import files

# Prompt for the Spotify export files. The next cell opens them by their
# original names (StreamingHistory_music_0.json ... _3.json), so they must be
# uploaded under exactly those names.
uploaded = files.upload()

Saving StreamingHistory_music_0.json to StreamingHistory_music_0.json
Saving StreamingHistory_music_1.json to StreamingHistory_music_1.json
Saving StreamingHistory_music_2.json to StreamingHistory_music_2.json
Saving StreamingHistory_music_3.json to StreamingHistory_music_3.json


In [None]:
def load_and_merge_json_files(file_paths):
    """Read several JSON files and stack their records into one DataFrame.

    Args:
        file_paths: Iterable of paths to UTF-8 encoded JSON files. Each file's
            parsed content is handed directly to ``pd.DataFrame``.

    Returns:
        A single DataFrame holding the rows of every file, in the order the
        paths were given, with a fresh 0..n-1 index.

    Raises:
        ValueError: Propagated from ``pd.concat`` when ``file_paths`` is empty.
    """
    def _read_frame(path):
        # One file -> one frame; the handle is closed as soon as it is parsed.
        with open(path, 'r', encoding='utf-8') as handle:
            return pd.DataFrame(json.load(handle))

    frames = [_read_frame(path) for path in file_paths]
    return pd.concat(frames, ignore_index=True)

# The uploaded export is split across four numbered parts (0 through 3).
file_paths = [f'StreamingHistory_music_{part}.json' for part in range(4)]
merged_df = load_and_merge_json_files(file_paths)
print(merged_df)

                endTime      artistName  \
0      2023-05-18 00:02     JACK MARLOW   
1      2023-05-18 00:03         Kartoon   
2      2023-05-18 02:04          salute   
3      2023-05-18 03:05    TECH IT DEEP   
4      2023-05-18 03:10  Shaolin Cowboy   
...                 ...             ...   
33437  2024-05-18 23:56    Flipp Dinero   
33438  2024-05-18 23:56          Future   
33439  2024-05-18 23:56    Lil Uzi Vert   
33440  2024-05-18 23:56         J. Cole   
33441  2024-05-18 23:57           Gunna   

                                       trackName  msPlayed  
0                                     Flyin Baby    192000  
1      Soundboy Surrender - Crossy Remix - Mixed     61714  
2                                          Peach    345310  
3                                    Maria Maria     31151  
4                        Réveiller (feat. manon)    289787  
...                                          ...       ...  
33437                             Leave Me Alone      32

In [None]:
merged_df

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2023-05-18 00:02,JACK MARLOW,Flyin Baby,192000
1,2023-05-18 00:03,Kartoon,Soundboy Surrender - Crossy Remix - Mixed,61714
2,2023-05-18 02:04,salute,Peach,345310
3,2023-05-18 03:05,TECH IT DEEP,Maria Maria,31151
4,2023-05-18 03:10,Shaolin Cowboy,Réveiller (feat. manon),289787
...,...,...,...,...
33437,2024-05-18 23:56,Flipp Dinero,Leave Me Alone,3200
33438,2024-05-18 23:56,Future,Mask Off (feat. Kendrick Lamar) - Remix,5116
33439,2024-05-18 23:56,Lil Uzi Vert,New Patek,3480
33440,2024-05-18 23:56,J. Cole,MIDDLE CHILD,1200


In [None]:
merged_df.value_counts()

endTime           artistName          trackName                        msPlayed
2023-05-18 00:02  JACK MARLOW         Flyin Baby                       192000      1
2024-02-28 18:48  Magic City Hippies  Ghost On The Mend - veggi remix  88526       1
2024-02-28 16:07  Unknown T           Goodums - Sammy Virji Remix      9088        1
                  Sammy Virji         Find My Way Home                 6816        1
                  RAC                 Passion                          3455        1
                                                                                  ..
2023-10-24 19:06  Oppidan             Rosalina                         169411      1
2023-10-24 19:03  JEV                 DREAMER                          133846      1
2023-10-24 19:01  MPH                 Spend The Night                  210909      1
2023-10-24 18:58  Conducta            Alone - Mixed                    112923      1
2024-05-18 23:57  Gunna               Drip or Drown                   

In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33442 entries, 0 to 33441
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   endTime     33442 non-null  object
 1   artistName  33442 non-null  object
 2   trackName   33442 non-null  object
 3   msPlayed    33442 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.0+ MB


In [None]:
# Collapse repeat plays so that each (trackName, artistName) pair appears only
# once; the later ID lookup then runs once per distinct song instead of once
# per play. The surviving rows keep their original index labels (the index is
# not reset here).
merged_df = merged_df.drop_duplicates(subset=['trackName', 'artistName'])
# Display the de-duplicated frame.
merged_df


Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2023-05-18 00:02,JACK MARLOW,Flyin Baby,192000
1,2023-05-18 00:03,Kartoon,Soundboy Surrender - Crossy Remix - Mixed,61714
2,2023-05-18 02:04,salute,Peach,345310
3,2023-05-18 03:05,TECH IT DEEP,Maria Maria,31151
4,2023-05-18 03:10,Shaolin Cowboy,Réveiller (feat. manon),289787
...,...,...,...,...
33416,2024-05-18 23:52,m$sha,OOPS,900
33418,2024-05-18 23:52,CALIsthenics,"Paper Thick (feat. E-40, Casual & Del the Funk...",1330
33421,2024-05-18 23:52,Chris Mula,Go To Work,620
33428,2024-05-18 23:56,Travis Scott,Oh My Dis Side (feat. Quavo),940


# Connecting to Spotify API to get Song IDs and Features

This section is where I connect to the Spotify API using the "Spotipy" package. I use this below to extract song metadata for my data as well as the Billboard Top 100 playlist as of May 18, 2024.

In [None]:
#@title Spotify Developer Credentials { run: "auto", vertical-output: true, display-mode: "form" }
# SECURITY: credentials must never be committed with the notebook. Enter them
# in the Colab form fields below (clear them again before saving/sharing), or
# leave the fields empty and export SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET
# in the runtime environment. Any secret that was previously committed to
# version control should be rotated in the Spotify developer dashboard.
SPOTIFY_CLIENT_ID = "" #@param {type:"string"}
SPOTIFY_CLIENT_SECRET = "" #@param {type:"string"}

# Form value wins; otherwise fall back to the environment.
SPOTIFY_CLIENT_ID = SPOTIFY_CLIENT_ID or os.environ.get("SPOTIPY_CLIENT_ID", "")
SPOTIFY_CLIENT_SECRET = SPOTIFY_CLIENT_SECRET or os.environ.get("SPOTIPY_CLIENT_SECRET", "")

SPOTIFY_REDIRECT_URI = 'https://www.google.com/'
# Scopes requested for the OAuth token (this is the value passed to SpotifyOAuth).
SCOPE = "playlist-modify-public playlist-read-collaborative playlist-modify-private"
# NOTE(review): lower-case `scope` is never passed to the auth manager below;
# it is kept only so that the notebook's global names stay unchanged.
scope = "user-read-recently-played"
assert len(SPOTIFY_CLIENT_ID)>0, 'Please provide a spotify client id'
assert len(SPOTIFY_CLIENT_SECRET)>0, 'Please provide a spotify client secret code'

# Shared API client used by every later cell. open_browser=False makes the
# OAuth flow print the authorisation URL instead of launching a browser.
sp = spotipy.Spotify(
    auth_manager=spotipy.SpotifyOAuth(
        client_id=SPOTIFY_CLIENT_ID,
        client_secret=SPOTIFY_CLIENT_SECRET,
        redirect_uri=SPOTIFY_REDIRECT_URI,
        scope=SCOPE, open_browser=False),
        requests_timeout=20, retries=10)

# Smoke test: one cheap authenticated request, so bad credentials fail here
# rather than part-way through the long-running fetch loops.
form_conn = sp.artist('spotify:artist:3jOstUTkEu2JkjvRdBA5Gu')

print('Authorization Sucessful!')

Authorization Sucessful!


In [None]:
import time
from tqdm import tqdm

def get_track_id(artist_name, track_name, max_retries=5):
    """Look up the Spotify track ID for one artist/title pair.

    Uses the notebook-level ``sp`` client to run a field-filtered search
    (``artist:... track:...``) and takes the first hit.

    Args:
        artist_name: Artist name as it appears in the listening history.
        track_name: Track title as it appears in the listening history.
        max_retries: How many times a retriable failure (HTTP 429 rate limit
            or a network-level ``requests`` error) is retried before giving
            up. Bounding this stops the function from spinning forever when
            the connection is down.

    Returns:
        The ID of the first matching track, or ``None`` when nothing matches,
        a non-retriable error occurs, or the retry budget is exhausted.
    """
    query = f"artist:{artist_name} track:{track_name}"
    failures = 0
    while True:
        try:
            result = sp.search(q=query, type='track', limit=1)
            tracks = result.get('tracks', {}).get('items', [])
            return tracks[0]['id'] if tracks else None
        except spotipy.SpotifyException as e:
            if e.http_status != 429:  # anything but a rate limit is not retried
                print(f"SpotifyException: {e}")
                return None
            try:
                delay = int(e.headers.get('Retry-After', 60))
            except (TypeError, ValueError):
                delay = 60  # header present but not an integer number of seconds
            problem = "Rate limit exceeded."
            progress = f"Rate limit exceeded. Sleeping for {delay} seconds."
        except requests.exceptions.RequestException as e:
            delay = 5
            problem = progress = f"RequestException: {e}"
        except Exception as e:
            print(f"Unexpected error: {e}")
            return None

        # Only retriable failures reach this point.
        failures += 1
        if failures > max_retries:
            print(f"{problem} Giving up on '{query}' after {max_retries} retries.")
            return None
        print(progress)
        time.sleep(delay)


# Looking up an ID costs one API call plus a 0.5 s pause per row, so the
# result is cached on disk. The following cell reloads merged_df from this
# same file, so it has to exist after this cell has run.
TRACK_ID_CACHE = 'merged_df.csv'

if os.path.exists(TRACK_ID_CACHE):
    # A previous run (or an uploaded copy) already holds the IDs: reuse it
    # instead of repeating thousands of search requests.
    merged_df = pd.read_csv(TRACK_ID_CACHE)
    track_ids = merged_df['track_id'].tolist()
else:
    track_ids = []

    for _, row in tqdm(merged_df.iterrows(), total=merged_df.shape[0], desc="Fetching track IDs"):
        track_id = get_track_id(row['artistName'], row['trackName'])
        track_ids.append(track_id)
        time.sleep(0.5)  # Add a delay to avoid hitting the rate limit too quickly

    # assign() builds a new frame rather than writing into the result of
    # drop_duplicates(), which avoids pandas' chained-assignment pitfalls.
    merged_df = merged_df.assign(track_id=track_ids)
    merged_df.to_csv(TRACK_ID_CACHE, index=False)


In [None]:
merged_df = pd.read_csv('merged_df.csv')

In [None]:
merged_df

Unnamed: 0,endTime,artistName,trackName,msPlayed,track_id
0,2023-05-18 00:02,JACK MARLOW,Flyin Baby,192000,597vP6rhNkEpSHpCyriFnf
1,2023-05-18 00:03,Kartoon,Soundboy Surrender - Crossy Remix - Mixed,61714,3cEvwcgCxyMKIKFp37LGwS
2,2023-05-18 02:04,salute,Peach,345310,5cGZN0P1QnSfhCFBCHtp2N
3,2023-05-18 03:05,TECH IT DEEP,Maria Maria,31151,4wP6mzWZC94dYmU6OYeljf
4,2023-05-18 03:10,Shaolin Cowboy,Réveiller (feat. manon),289787,51oib6a2dRu2f6x8OMMphQ
...,...,...,...,...,...
8375,2024-05-18 23:52,m$sha,OOPS,900,4UBIs1W680L7JR7oqkx3zq
8376,2024-05-18 23:52,CALIsthenics,"Paper Thick (feat. E-40, Casual & Del the Funk...",1330,4uSsR7XGqmC4F8ngY65I2M
8377,2024-05-18 23:52,Chris Mula,Go To Work,620,3AwQtBfYQgDLfMnQ8ZNkEt
8378,2024-05-18 23:56,Travis Scott,Oh My Dis Side (feat. Quavo),940,2rMFawCg4BW65jzbwztXAV


In [None]:
track_ids = merged_df['track_id']

In [None]:
track_ids

0       597vP6rhNkEpSHpCyriFnf
1       3cEvwcgCxyMKIKFp37LGwS
2       5cGZN0P1QnSfhCFBCHtp2N
3       4wP6mzWZC94dYmU6OYeljf
4       51oib6a2dRu2f6x8OMMphQ
                 ...          
8375    4UBIs1W680L7JR7oqkx3zq
8376    4uSsR7XGqmC4F8ngY65I2M
8377    3AwQtBfYQgDLfMnQ8ZNkEt
8378    2rMFawCg4BW65jzbwztXAV
8379    6nO3tr47nr2P7f3hXb8JIo
Name: track_id, Length: 8380, dtype: object

In [None]:
import sys
from tqdm import tqdm



def get_track_details_and_features(sp, track_ids, max_retries=5):
    """Fetch metadata and audio features for a list of Spotify track IDs.

    IDs are processed in batches of 50. A batch contributes rows only when
    both the track lookup and the audio-feature lookup succeed, so a retried
    batch never yields duplicate rows.

    Args:
        sp: Client object exposing ``tracks(ids)`` and ``audio_features(ids)``
            (the notebook passes its ``spotipy.Spotify`` instance).
        track_ids: Iterable of track ID strings.
        max_retries: Retries allowed per batch for rate-limit (HTTP 429) and
            network errors before the batch is skipped.

    Returns:
        A list with one dict per track that has both metadata and audio
        features. Tracks lacking either, and batches that fail, are omitted
        (failures are reported on stderr).
    """
    track_ids = list(track_ids)
    details_list = []

    for i in tqdm(range(0, len(track_ids), 50), desc="Fetching track details and features"):
        details_list.extend(_fetch_batch_details(sp, track_ids[i:i + 50], max_retries))
    return details_list


def _rate_limit_delay(error, attempt):
    """Return seconds to wait after a 429: Retry-After if usable, else capped exponential backoff."""
    header = (getattr(error, 'headers', None) or {}).get('Retry-After')
    try:
        # The server-provided wait is honoured in full; sleeping for less
        # only provokes another 429.
        return max(int(header), 0)
    except (TypeError, ValueError):
        return min(2 ** attempt, 60)


def _fetch_batch_details(sp, batch, max_retries):
    """Return the detail rows for one batch of IDs, retrying retriable failures up to max_retries times."""
    attempt = 0
    while True:
        try:
            tracks = sp.tracks(batch)['tracks']
            features_list = sp.audio_features(batch) or []
            return [
                {
                    'id': track['id'],
                    'title': track['name'],
                    'all_artists': ', '.join([artist['name'] for artist in track['artists']]),
                    'popularity': track['popularity'],
                    'release_date': track['album']['release_date'],
                    'danceability': features['danceability'],
                    'energy': features['energy'],
                    'key': features['key'],
                    'loudness': features['loudness'],
                    'mode': features['mode'],
                    'acousticness': features['acousticness'],
                    'instrumentalness': features['instrumentalness'],
                    'liveness': features['liveness'],
                    'valence': features['valence'],
                    'tempo': features['tempo'],
                    'duration_ms': features['duration_ms'],
                    'time_signature': features['time_signature']
                }
                for track, features in zip(tracks, features_list)
                if track and features  # Ensure there are track details and features
            ]
        except spotipy.SpotifyException as e:
            if e.http_status != 429:  # only rate limiting is worth retrying
                print(f"SpotifyException: {e}", file=sys.stderr)
                return []
            delay = _rate_limit_delay(e, attempt + 1)
            problem = "Rate limit exceeded."
            progress = f"Rate limit exceeded. Sleeping for {delay} seconds. Retry #{attempt + 1}"
        except requests.exceptions.RequestException as e:
            delay = 5  # Wait for a short time before retrying
            problem = progress = f"RequestException: {e}"
        except Exception as e:
            print(f"Unexpected error: {e}", file=sys.stderr)
            return []

        # Only retriable failures reach this point.
        attempt += 1
        if attempt > max_retries:
            print(f"{problem} Skipping a batch of {len(batch)} tracks after {max_retries} retries.",
                  file=sys.stderr)
            return []
        print(progress, file=sys.stderr)
        time.sleep(delay)

In [None]:
# get_track_id returns None when a search finds nothing, so some rows have no
# ID; drop those and hand the rest over as a plain list.
track_ids = merged_df['track_id'].dropna().tolist()

# Batched API lookup of metadata and audio features for every remaining ID.
track_details = get_track_details_and_features(sp, track_ids)

# One row per track for which both metadata and audio features came back.
details_df = pd.DataFrame(track_details)


Fetching track details and features: 100%|██████████| 163/163 [01:09<00:00,  2.33it/s]


In [None]:
details_df

Unnamed: 0,id,title,all_artists,popularity,release_date,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,597vP6rhNkEpSHpCyriFnf,Flyin Baby,JACK MARLOW,39,2023-04-21,0.826,0.576,9,-10.317,0,0.0165,0.747000,0.1340,0.670,129.980,192000,4
1,3cEvwcgCxyMKIKFp37LGwS,Soundboy Surrender - Crossy Remix - Mixed,"Kartoon, Crossy",55,2022-07-22,0.530,0.856,10,-2.220,0,0.0176,0.351000,0.2450,0.718,87.513,61714,4
2,5cGZN0P1QnSfhCFBCHtp2N,Peach,"salute, Sammy Virji",55,2023-03-31,0.866,0.668,1,-7.472,1,0.0115,0.889000,0.2380,0.657,135.009,345311,4
3,4wP6mzWZC94dYmU6OYeljf,Maria Maria,TECH IT DEEP,69,2023-02-08,0.723,0.685,0,-6.790,1,0.0992,0.000601,0.2440,0.386,127.987,161802,4
4,51oib6a2dRu2f6x8OMMphQ,Réveiller (feat. manon),"Shaolin Cowboy, manon",52,2021-03-26,0.842,0.418,8,-9.602,0,0.7650,0.022800,0.1170,0.448,128.009,289787,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8095,4UBIs1W680L7JR7oqkx3zq,OOPS,"m$sha, Jay Critch",42,2023-03-17,0.790,0.652,3,-6.882,0,0.0325,0.000041,0.0769,0.253,140.399,117447,4
8096,4uSsR7XGqmC4F8ngY65I2M,"Paper Thick (feat. E-40, Casual & Del the Funk...","CALIsthenics, E-40, Casual, Del The Funky Homo...",32,2024-03-29,0.770,0.770,11,-6.348,0,0.0177,0.005410,0.3220,0.261,104.105,124824,4
8097,3AwQtBfYQgDLfMnQ8ZNkEt,Go To Work,Chris Mula,66,2023-10-17,0.509,0.540,9,-13.219,1,0.5210,0.000000,0.4570,0.152,67.770,116286,4
8098,2rMFawCg4BW65jzbwztXAV,Oh My Dis Side (feat. Quavo),"Travis Scott, Quavo",66,2015-09-04,0.459,0.623,1,-4.450,1,0.3520,0.000000,0.1310,0.205,126.014,351253,4


In [None]:
# Persist the enriched listening history without the index column.
details_df.to_csv('listening_history.csv', index=False)

# Colab-only: send the CSV from the runtime to the local machine.
from google.colab import files
files.download("listening_history.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Get Billboard Top 100 Data and Features

In [None]:
def get_playlist_tracks(sp, playlist_id):
    """Collect every track object from a playlist, following pagination.

    Args:
        sp: Client exposing ``playlist_tracks(playlist_id)`` and ``next(page)``.
        playlist_id: Identifier of the playlist to read.

    Returns:
        A list of the ``'track'`` entries of all playlist items, in playlist
        order; items whose ``'track'`` value is falsy are left out.
    """
    collected = []
    page = sp.playlist_tracks(playlist_id)
    while page:
        for entry in page['items']:
            track = entry['track']
            if track:
                collected.append(track)
        # A falsy 'next' marks the last page.
        if not page['next']:
            break
        page = sp.next(page)
    return collected


def get_track_details_and_features(sp, tracks):
    """Combine playlist track objects with their Spotify audio features.

    NOTE(review): this shares its name with the ID-based helper defined
    earlier in the notebook and replaces it once this cell has run; re-running
    the earlier fetch cell afterwards would call this version instead.

    Args:
        sp: Client exposing ``audio_features(ids)``.
        tracks: Iterable of track dicts (as returned inside playlist items),
            each carrying ``id``, ``name``, ``artists``, ``popularity`` and
            ``album.release_date``.

    Returns:
        A list with one dict per track that has an ID and for which audio
        features were returned. Tracks without an ID or without features are
        skipped.
    """
    # Filter first and zip against the filtered list: the features come back
    # in the order of the IDs that were sent, so zipping against the
    # unfiltered list would shift every row after a track that has no ID.
    tracks_with_id = [track for track in tracks if track and track.get('id')]

    details_list = []
    # The audio-features endpoint accepts a limited number of IDs per call
    # (100), so longer playlists are sent in chunks.
    for start in range(0, len(tracks_with_id), 100):
        chunk = tracks_with_id[start:start + 100]
        features_list = sp.audio_features([track['id'] for track in chunk]) or []
        for track, features in zip(chunk, features_list):
            if features:  # Ensure there are features
                details = {
                    'id': track['id'],
                    'title': track['name'],
                    'all_artists': ', '.join([artist['name'] for artist in track['artists']]),
                    'popularity': track['popularity'],
                    'release_date': track['album']['release_date'],
                    'danceability': features['danceability'],
                    'energy': features['energy'],
                    'key': features['key'],
                    'loudness': features['loudness'],
                    'mode': features['mode'],
                    'acousticness': features['acousticness'],
                    'instrumentalness': features['instrumentalness'],
                    'liveness': features['liveness'],
                    'valence': features['valence'],
                    'tempo': features['tempo'],
                    'duration_ms': features['duration_ms'],
                    'time_signature': features['time_signature']
                }
                details_list.append(details)
    return details_list

# Spotify playlist ID used as the source of the Billboard tracks.
billboard_playlist_id = '1G8IpkZKobrIlXcVPoSIuf'
# Every track object in the playlist (all pages).
billboard_tracks = get_playlist_tracks(sp, billboard_playlist_id)
# Note: this rebinds `track_details`, which earlier held the listening-history rows.
track_details = get_track_details_and_features(sp, billboard_tracks)


# One row per playlist track that has audio features.
billboard_features_df = pd.DataFrame(track_details)
# Preview only the first rows.
billboard_features_df.head()

In [None]:
# Persist the Billboard feature table without the index column.
billboard_features_df.to_csv('billboard_features_df.csv', index=False)

# Colab-only: send the CSV from the runtime to the local machine.
from google.colab import files
files.download("billboard_features_df.csv")