In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import time 
import numpy as np

## Download data from Spotify API

In [2]:
# Load the Spotify API credentials from a local CSV instead of hardcoding
# them, so the client id and client secret are not shared in the notebook.
# Later code takes the id from row 0 / column 0 and the secret from
# row 0 / column 1 of this table.
spotify_client_info = pd.read_csv('../data/spotify_client_info.csv')

In [3]:
# The first row of the credentials table holds the client id (column 0)
# and the client secret (column 1).
client_id, client_secret = (
    spotify_client_info.iloc[0, column] for column in (0, 1)
)

# Build the client-credentials manager and the Spotify client that every
# API helper below uses through the module-level name `sp`.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [4]:
# my spotify username and playlist ids 
# on playlist page, click on "..." -> then on "Share" -> then "Copy Spotify URI"
def getTrackIDs(user, playlist_id):
    """Return the Spotify track ids of the tracks in a playlist.

    Parameters
    ----------
    user : str
        Spotify user that owns the playlist.
    playlist_id : str
        Playlist id or URI, passed straight to ``sp.user_playlist``.

    Returns
    -------
    list
        Track ids in playlist order. Entries without a track object or
        without an id are skipped, because they cannot be looked up with
        ``sp.track`` / ``sp.audio_features`` afterwards.

    Notes
    -----
    Uses the module-level client ``sp``. Playlist items are paged by the
    API, so every page is followed instead of reading only the first one.
    """
    ids = []
    playlist = sp.user_playlist(user, playlist_id)
    page = playlist['tracks']
    while page:
        for item in page['items']:
            track = item.get('track')
            # Skip entries with no track object or no id.
            if track is None or track.get('id') is None:
                continue
            ids.append(track['id'])
        # `next` holds the URL of the following page, or is empty on the last one.
        page = sp.next(page) if page.get('next') else None
    return ids

In [5]:
# Get spotify singular song data from these locations
# https://developer.spotify.com/documentation/web-api/reference/#/operations/get-track
# https://developer.spotify.com/documentation/web-api/reference/#/operations/get-audio-features
def getTrackFeatures(id):
    """Collect metadata and audio features for a single track.

    Parameters
    ----------
    id : str
        Spotify track id, passed to ``sp.track`` and ``sp.audio_features``.

    Returns
    -------
    list
        19 values in this order: name, album, artist, release_date,
        length (``duration_ms``), popularity, acousticness, danceability,
        energy, instrumentalness, liveness, loudness, speechiness, tempo,
        valence, time_signature, key, mode, uri.

    Notes
    -----
    * ``artist`` is the first artist listed on the track's *album*, not on
      the track itself.
    * ``sp.audio_features`` returns a list; only its first entry is used.
      When that entry is missing, the audio-feature fields are returned as
      ``None`` so the row keeps its 19-column shape, and ``uri`` falls back
      to the value in the track metadata.
    * Uses the module-level client ``sp``.
    """
    meta = sp.track(id)
    features = sp.audio_features(id)

    # meta
    album = meta['album']
    track = [
        meta['name'],
        album['name'],
        album['artists'][0]['name'],
        album['release_date'],
        meta['duration_ms'],
        meta['popularity'],
    ]

    # features, in the same order as the output columns
    feature_keys = (
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'speechiness', 'tempo',
        'valence', 'time_signature', 'key', 'mode',
    )
    audio = features[0] if features else None
    if audio is None:
        # No audio analysis available: keep the row aligned with None values.
        track.extend([None] * len(feature_keys))
        track.append(meta.get('uri'))
    else:
        track.extend(audio[key] for key in feature_keys)
        track.append(audio['uri'])
    return track

In [6]:
# loop over track ids to get all songs in playlist
def loop_playist(playlist_ids, delay=.2):
    """Fetch the feature row of every track id, pausing between requests.

    Parameters
    ----------
    playlist_ids : iterable
        Track ids to look up; any iterable is accepted, not only sequences.
    delay : float, optional
        Seconds to sleep before each lookup (default 0.2, the value that
        was previously hard-coded). Presumably this throttles the API
        calls; confirm the limit before lowering it.

    Returns
    -------
    list
        One ``getTrackFeatures`` result per id, in input order. An empty
        input yields an empty list.
    """
    tracks = []
    for track_id in playlist_ids:
        time.sleep(delay)
        tracks.append(getTrackFeatures(track_id))
    return tracks

In [7]:
# turn data into dataframe
def get_spotify_df(tracks, year):
    """Turn a list of track rows into a labelled DataFrame.

    Parameters
    ----------
    tracks : list of list
        Rows of 19 values each, ordered like the column names below.
    year : int
        Accepted for the callers' convenience; it is not used here.

    Returns
    -------
    pandas.DataFrame
        One row per track with the 19 named columns.
    """
    column_names = [
        'name', 'album', 'artist', 'release_date',
        'length', 'popularity', 'acousticness', 'danceability',
        'energy', 'instrumentalness', 'liveness', 'loudness',
        'speechiness', 'tempo', 'valence', 'time_signature',
        'key', 'mode', 'uri',
    ]
    return pd.DataFrame(data=tracks, columns=column_names)

### Add release year

In [8]:
def get_years(df):
    """Add a ``release_year`` column derived from ``release_date``.

    Dates containing a ``-`` contribute the text before the first ``-``;
    any other value is copied unchanged. The column is written onto the
    frame that was passed in, and that same frame is returned.
    """
    df['release_year'] = [
        release.split('-')[0] if '-' in release else release
        for release in df['release_date'].values
    ]
    return df

### Get data

In [9]:
# Table of Spotify users and their playlists. The cells below read the user
# from column 0 and the playlist id from column 1, taking rows 0-5 for Adam
# and rows 6-11 for Brenda (one row per year, 2016 through 2021).
spotify_users_and_playlists = pd.read_csv('../data/spotify_users_and_playlists.csv')

### Adam

In [10]:
# Adam's user value sits in column 0 of the first row.
adam_user = spotify_users_and_playlists.iloc[0, 0]

# Rows 0-5 of column 1 are his playlist ids for 2016 through 2021, in order.
(adam_playlist_2016, adam_playlist_2017, adam_playlist_2018,
 adam_playlist_2019, adam_playlist_2020, adam_playlist_2021) = (
    spotify_users_and_playlists.iloc[row, 1] for row in range(6)
)

In [11]:
# Look up the track ids of each of Adam's yearly playlists, oldest first.
(ids_2016, ids_2017, ids_2018,
 ids_2019, ids_2020, ids_2021) = (
    getTrackIDs(adam_user, playlist)
    for playlist in (adam_playlist_2016, adam_playlist_2017, adam_playlist_2018,
                     adam_playlist_2019, adam_playlist_2020, adam_playlist_2021)
)

In [12]:
# takes between 4 and 5 minutes on my laptop
# Download the feature rows for every track of Adam's six playlists.
# loop_playist sleeps 0.2 s before each track lookup, which is a large part
# of the run time (the author measured between 4 and 5 minutes in total).
# The playlists are fetched one statement at a time, so results that were
# already downloaded stay bound if a later call fails.
time_start = time.time()
ids_2016_playlist_loop = loop_playist(ids_2016)
ids_2017_playlist_loop = loop_playist(ids_2017)
ids_2018_playlist_loop = loop_playist(ids_2018)
ids_2019_playlist_loop = loop_playist(ids_2019)
ids_2020_playlist_loop = loop_playist(ids_2020)
ids_2021_playlist_loop = loop_playist(ids_2021)
time_end = time.time()
# Elapsed wall-clock time, converted from seconds to minutes.
print((time_end - time_start)/60)

4.632485683759054


In [13]:
# Build one DataFrame per year from the downloaded track rows.
(df_2016, df_2017, df_2018,
 df_2019, df_2020, df_2021) = (
    get_spotify_df(rows, year)
    for rows, year in (
        (ids_2016_playlist_loop, 2016),
        (ids_2017_playlist_loop, 2017),
        (ids_2018_playlist_loop, 2018),
        (ids_2019_playlist_loop, 2019),
        (ids_2020_playlist_loop, 2020),
        (ids_2021_playlist_loop, 2021),
    )
)

In [14]:
# Add the release_year column to each of Adam's yearly frames.
(df_2016, df_2017, df_2018,
 df_2019, df_2020, df_2021) = (
    get_years(frame)
    for frame in (df_2016, df_2017, df_2018, df_2019, df_2020, df_2021)
)

In [15]:
# save dataframes
dfs = [df_2016,df_2017,df_2018,df_2019,df_2020,df_2021]
names = ['Adam_2016', 'Adam_2017', 'Adam_2018', 'Adam_2019', 'Adam_2020', 'Adam_2021']
for df, name in zip(dfs, names):
    df.to_csv(f'../data/{name}_Top_Songs.csv', index=False)

In [16]:
# Reload Adam's saved frames from disk. Every file is bound to the variable
# of the matching year; in particular the 2017 file goes into df_2017, so
# df_2016 keeps the 2016 data.
df_2016 = pd.read_csv('../data/Adam_2016_Top_Songs.csv')
df_2017 = pd.read_csv('../data/Adam_2017_Top_Songs.csv')
df_2018 = pd.read_csv('../data/Adam_2018_Top_Songs.csv')
df_2019 = pd.read_csv('../data/Adam_2019_Top_Songs.csv')
df_2020 = pd.read_csv('../data/Adam_2020_Top_Songs.csv')
df_2021 = pd.read_csv('../data/Adam_2021_Top_Songs.csv')

In [17]:
# df_2021

### Brenda

In [18]:
# Brenda's user value sits in column 0 of row 6.
brenda_user = spotify_users_and_playlists.iloc[6, 0]

# Rows 6-11 of column 1 are her playlist ids for 2016 through 2021, in order.
(brenda_playlist_2016, brenda_playlist_2017, brenda_playlist_2018,
 brenda_playlist_2019, brenda_playlist_2020, brenda_playlist_2021) = (
    spotify_users_and_playlists.iloc[row, 1] for row in range(6, 12)
)

In [19]:
# Look up the track ids of each of Brenda's yearly playlists, oldest first.
(brenda_ids_2016, brenda_ids_2017, brenda_ids_2018,
 brenda_ids_2019, brenda_ids_2020, brenda_ids_2021) = (
    getTrackIDs(brenda_user, playlist)
    for playlist in (brenda_playlist_2016, brenda_playlist_2017, brenda_playlist_2018,
                     brenda_playlist_2019, brenda_playlist_2020, brenda_playlist_2021)
)

In [20]:
# takes between 4 and 5 minutes on my laptop
# Download the feature rows for every track of Brenda's six playlists.
# loop_playist sleeps 0.2 s before each track lookup, which is a large part
# of the run time (the author measured between 4 and 5 minutes in total).
# The playlists are fetched one statement at a time, so results that were
# already downloaded stay bound if a later call fails.
time_start = time.time()
brenda_ids_2016_playlist_loop = loop_playist(brenda_ids_2016)
brenda_ids_2017_playlist_loop = loop_playist(brenda_ids_2017)
brenda_ids_2018_playlist_loop = loop_playist(brenda_ids_2018)
brenda_ids_2019_playlist_loop = loop_playist(brenda_ids_2019)
brenda_ids_2020_playlist_loop = loop_playist(brenda_ids_2020)
brenda_ids_2021_playlist_loop = loop_playist(brenda_ids_2021)
time_end = time.time()
# Elapsed wall-clock time, converted from seconds to minutes.
print((time_end - time_start)/60)

4.60126409928004


In [21]:
# Build one DataFrame per year from Brenda's downloaded track rows.
(brenda_df_2016, brenda_df_2017, brenda_df_2018,
 brenda_df_2019, brenda_df_2020, brenda_df_2021) = (
    get_spotify_df(rows, year)
    for rows, year in (
        (brenda_ids_2016_playlist_loop, 2016),
        (brenda_ids_2017_playlist_loop, 2017),
        (brenda_ids_2018_playlist_loop, 2018),
        (brenda_ids_2019_playlist_loop, 2019),
        (brenda_ids_2020_playlist_loop, 2020),
        (brenda_ids_2021_playlist_loop, 2021),
    )
)

In [22]:
# NOTE(review): this cell rebuilds the same six frames, from the same
# inputs, as the cell directly before it. Re-running is harmless, but the
# cell is redundant and is a candidate for removal.
brenda_rows_by_year = [
    (brenda_ids_2016_playlist_loop, 2016),
    (brenda_ids_2017_playlist_loop, 2017),
    (brenda_ids_2018_playlist_loop, 2018),
    (brenda_ids_2019_playlist_loop, 2019),
    (brenda_ids_2020_playlist_loop, 2020),
    (brenda_ids_2021_playlist_loop, 2021),
]
(brenda_df_2016, brenda_df_2017, brenda_df_2018,
 brenda_df_2019, brenda_df_2020, brenda_df_2021) = [
    get_spotify_df(rows, year) for rows, year in brenda_rows_by_year
]

In [23]:
# Add the release_year column to each of Brenda's yearly frames.
(brenda_df_2016, brenda_df_2017, brenda_df_2018,
 brenda_df_2019, brenda_df_2020, brenda_df_2021) = (
    get_years(frame)
    for frame in (brenda_df_2016, brenda_df_2017, brenda_df_2018,
                  brenda_df_2019, brenda_df_2020, brenda_df_2021)
)

In [24]:
# save dataframes
dfs = [brenda_df_2016, brenda_df_2017, brenda_df_2018, brenda_df_2019, brenda_df_2020, brenda_df_2021]
names = ['Brenda_2016', 'Brenda_2017', 'Brenda_2018', 'Brenda_2019', 'Brenda_2020', 'Brenda_2021']
for df, name in zip(dfs, names):
    df.to_csv(f'../data/{name}_Top_Songs.csv', index=False)

In [25]:
# Reload Brenda's saved frames from disk, one CSV per year, oldest first.
(brenda_df_2016, brenda_df_2017, brenda_df_2018,
 brenda_df_2019, brenda_df_2020, brenda_df_2021) = (
    pd.read_csv(f'../data/Brenda_{year}_Top_Songs.csv')
    for year in range(2016, 2022)
)

In [26]:
# brenda_df_2021