### 1) Use the Spotify API to create a dataframe of songs and their respective attributes from a large playlist

### 2) Use the Spotify API to find the features for the songs I have listened to this past year

-------------------------

### Useful links

#### Retrieving spotify data from spotify dev api
https://towardsdatascience.com/how-to-create-large-music-datasets-using-spotipy-40e7242cc6a6

https://github.com/MaxHilsdorf/introduction_to_spotipy

#### Retrieving your own personal spotify data
https://towardsdatascience.com/get-your-spotify-streaming-history-with-python-d5a208bbcbd3

#### Spotify API 
https://spotipy.readthedocs.io/en/latest/#more-examples

https://developer.spotify.com/documentation/web-api/

https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-features/



In [1]:
import json
import os

import numpy as np
import pandas as pd

In [2]:
import spotipy
import spotipy.util as util

#### TO GET THE BELOW ID AND SECRET, YOU NEED TO REGISTER FOR AN APP FROM DEVELOPER.SPOTIFY.COM

In [98]:
# Spotify app credentials. They are read from the environment variables that
# spotipy itself recognises, so real values never need to be saved in the
# notebook. The masked placeholders remain as a fallback and must be replaced
# (or the variables exported) before the API calls below will succeed.
CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID", "*****")
CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET", "*****")

In [99]:
# Authenticate with the Client Credentials flow (app-level access, no user login).
token = spotipy.oauth2.SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
# Hand the credentials manager to the client instead of a single pre-fetched
# access token: the client can then request a fresh token whenever the current
# one expires, which matters because the downloads below make thousands of calls.
sp = spotipy.Spotify(client_credentials_manager=token)

  


In [12]:
# Id of the playlist to download (a 5000-song playlist from Spotify)
playlist = '4rnleEAOdmFAbRcNCgZMpY'
# User name passed to the API as the owner of that playlist
playlist_creator = "spotify"

In [45]:
#Source: https://stackoverflow.com/questions/39086287/spotipy-how-to-read-more-than-100-tracks-from-a-playlist
#Follows the paging links so playlists longer than a single page are read completely
def get_playlist_tracks(username,playlist_id):
    """Return every item of a playlist, following the API's paging links.

    Args:
        username: User name of the playlist owner.
        playlist_id: Spotify id of the playlist.

    Returns:
        A list with the 'items' entries of all result pages, in page order.
    """
    page = sp.user_playlist_tracks(username, playlist_id)
    collected = page['items']
    while True:
        # A falsy 'next' link marks the last page
        if not page['next']:
            return collected
        page = sp.next(page)
        collected += page['items']

In [14]:
#Build the playlist dataframe: track metadata from get_playlist_tracks plus Spotify audio features

def create_playlist_df(username,playlist_id):
    """Return one row per playlist track with its metadata and audio features.

    Args:
        username: User name of the playlist owner.
        playlist_id: Spotify id of the playlist.

    Returns:
        A DataFrame with the columns listed in ``playlist_features_list``.
        Playlist entries without a track object or track id, and tracks for
        which no audio features are returned, are skipped. An empty playlist
        yields an empty DataFrame that still has all columns.
    """
    playlist_features_list = ["artist", "album", "track_name", "track_id", 
                             "danceability", "energy", "key", "loudness", "mode", "speechiness",
                             "instrumentalness", "liveness", "valence", "tempo", "duration_ms", "time_signature"]
    # Everything after the four metadata columns is copied from the audio features
    audio_columns = playlist_features_list[4:]
    # The audio-features endpoint accepts at most 100 ids per request
    batch_size = 100

    # Collect the metadata first so the audio features can be requested in batches
    metadata_rows = []
    for item in get_playlist_tracks(username, playlist_id):
        track = item.get("track")
        if not track or not track.get("id"):
            # No track object / no id: nothing to look audio features up with
            continue
        metadata_rows.append({
            # NOTE: this is the first artist of the *album*, not of the track
            "artist": track["album"]["artists"][0]["name"],
            "album": track["album"]["name"],
            "track_name": track["name"],
            "track_id": track["id"],
        })

    # One request per batch of ids instead of one request per track
    records = []
    for start in range(0, len(metadata_rows), batch_size):
        batch = metadata_rows[start:start + batch_size]
        batch_features = sp.audio_features([row["track_id"] for row in batch])
        for row, audio_features in zip(batch, batch_features):
            if audio_features is None:
                # Spotify has no audio features for this track
                continue
            for feature in audio_columns:
                row[feature] = audio_features[feature]
            records.append(row)

    # Build the frame once; concatenating inside the loop copies it on every iteration
    return pd.DataFrame(records, columns = playlist_features_list)

In [55]:
# Download the whole playlist into a dataframe (slow: many API requests)
a = create_playlist_df(username=playlist_creator, playlist_id=playlist)

In [56]:
# Number of rows in the playlist dataframe (one row per downloaded track)
len(a)

4778

In [57]:
# Preview the first rows to check the metadata and audio-feature columns
a.head()

Unnamed: 0,artist,album,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Hozier,Hozier (Deluxe),Take Me To Church,7dS5EaCoMnN7DzlpT6aRn2,0.566,0.664,4,-5.303,0,0.0464,0.0,0.116,0.437,128.945,241688,4
1,Mike Posner,31 Minutes to Takeoff,Cooler Than Me - Single Mix,2V4bv1fNWfTcyRJKmej6Sj,0.768,0.82,7,-4.63,0,0.0475,0.0,0.704,0.625,129.969,213293,4
2,"Tyler, The Creator",Flower Boy,See You Again (feat. Kali Uchis),7KA4W4McWYRpgf0fWsJZWB,0.558,0.559,6,-9.222,1,0.0959,7.49e-06,0.109,0.62,78.558,180387,4
3,Bastille,Bad Blood,Pompeii,3gbBpTdY8lnQwqxNCcf795,0.679,0.715,9,-6.383,1,0.0407,0.0,0.271,0.571,127.435,214148,4
4,Shakira,"Oral Fixation, Vol. 2 (Expanded Edition)",Hips Don't Lie (feat. Wyclef Jean),3ZFTkvIE7kyPt6Nu3PEa7V,0.778,0.824,10,-5.892,0,0.0712,0.0,0.405,0.756,100.024,218093,4


In [58]:
# Save the playlist dataframe to the current working directory.
# The row index is written too (pandas default), as an unnamed first column.
a.to_csv('biggest_playlist_ever.csv')

### Now time to bring in my personal song data

In [158]:
# Load the personal streaming history export.
# The context manager closes the file as soon as the JSON has been parsed
# (the handle was previously left open for the rest of the session).
with open(r'C:\Users\charl\Python\GitHub Projects\Spotify_Project\MyData\StreamingHistory0.json',encoding="utf8") as f:
    # Parsed content of the JSON file
    data = json.load(f)

In [159]:
# Turn the parsed streaming history into a dataframe
hist = pd.DataFrame(data)

In [160]:
# Preview the streaming history
hist.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2019-09-20 18:05,Elvis Presley,From a Jack to a King,147973
1,2019-09-20 18:08,Elvis Presley,From a Jack to a King,147973
2,2019-09-20 18:26,Marty Robbins,Devil Woman,130966
3,2019-09-21 17:49,Marty Robbins,Devil Woman,6843
4,2019-09-21 17:49,Jimi Hendrix,May This Be Love,39967


In [161]:
# Save the streaming history to the current working directory (row index included)
hist.to_csv('streaming_history.csv')

### Get features for all songs that I listened to in the past year (2019 to 2020)

In [162]:
# Load the per-track summary file (track name, artist name and skip/play/like columns).
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
my_songs = pd.read_csv(r'C:\Users\charl\Python\GitHub Projects\Spotify_Project\MyData\my_2019_tracks.csv')

In [163]:
# Preview the per-track summary
my_songs.head()

Unnamed: 0.1,Unnamed: 0,trackName,artistName,skip,play,like
0,0,"""C"" is for Cookie",Cookie Monster,1,0,0
1,1,$ave Dat Money (feat. Fetty Wap & Rich Homie Q...,Lil Dicky,1,1,0
2,2,'Round Here,IODONTPLAY,0,2,1
3,3,'Till I Collapse,Eminem,1,1,0
4,4,(Bonus) Air Canada,RiFF Raff & DOLLABiLLGATES,0,2,1


In [164]:
#Keep only the songs that were played more than once or skipped more than once.
#drop=True discards the old row labels instead of storing them in an extra 'index'
#column, so re-running this cell neither piles up columns nor eventually fails.
my_songs = my_songs[(my_songs['play']>1)|(my_songs['skip']>1)].reset_index(drop=True)

In [165]:
# Number of songs left after the filter
len(my_songs)

1837

In [166]:
#Pair every track name with its artist name: one (track, artist) tuple per song
artists = my_songs['artistName']
songs = my_songs['trackName']

combined = [*zip(songs, artists)]
combined[0]

("'Round Here", 'IODONTPLAY')

In [167]:
def get_track_id(zipped):
    """Look up the Spotify track id for one (track name, artist name) pair.

    Args:
        zipped: Pair whose first element is the track name and whose second
            element is the artist name.

    Returns:
        The id of the first search hit, or an empty string when the search
        returns no usable hit.
    """
    track = zipped[0]
    artist = zipped[1]
    # Only the best match is used, so a single result is requested
    response = sp.search(q='artist:' + artist + ' track:' + track, type='track', limit=1)
    items = response['tracks']['items']

    # No hit for this artist/track combination
    if not items or not items[0]:
        return ''

    # Read the id field directly rather than cutting it out of the
    # stringified result, which depended on the order of the keys
    return items[0].get('id') or ''

In [168]:
#Resolve the track id of every track I have listened to in the past year
def get_song_ids(combined_songs_artist):
    """Return the result of get_track_id for every (track, artist) pair, in input order."""
    return [get_track_id(pair) for pair in combined_songs_artist]

In [169]:
# One search request per song, so this cell takes a while
song_ids = get_song_ids(combined_songs_artist=combined)

### Now create a dataframe with these columns, append all the track info, then delete the unnecessary columns

In [170]:
def create_audio_df(song_ids):
    """Fetch the audio features of every id and stack them into one dataframe.

    Args:
        song_ids: Iterable of Spotify track ids; each id is requested separately.

    Returns:
        A DataFrame with the rows of all responses in input order. The row
        labels are not renumbered. An id for which the API returns no features
        presumably contributes a row of missing values plus a column labelled
        0 (the notebook drops that column later). An empty input returns an
        empty DataFrame.
    """
    # One small frame per id, combined in a single concat call:
    # DataFrame.append no longer exists in pandas 2.x and re-copied the
    # accumulated frame on every iteration.
    frames = [pd.DataFrame(sp.audio_features(tracks = song_id)) for song_id in song_ids]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [171]:
# Audio features for every resolved track id (one API request per id)
my_songs_full = create_audio_df(song_ids=song_ids)

In [172]:
# Row count of the audio-feature frame; expected to equal len(song_ids)
len(my_songs_full)

1837

In [173]:
# Number of track ids that were looked up, for comparison with the row count above
len(song_ids)

1837

In [174]:
# Remove the link/metadata columns that are not needed for the analysis.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: a second run no longer raises KeyError for columns already gone.
my_songs_full = my_songs_full.drop(columns = ['analysis_url','track_href','type','uri'], errors='ignore')

In [175]:
# Renumber the rows 0..n-1 and move 'id' to the front as the first column
my_songs_full = my_songs_full.reset_index(drop=True)
my_songs_full = my_songs_full[['id'] + [name for name in my_songs_full.columns if name != 'id']]

In [176]:
# Remove the helper column labelled 0. It is only present when at least one
# lookup returned no audio features, so its absence must not raise
# (errors='ignore'); rebinding also keeps the cell safe to re-run.
my_songs_full = my_songs_full.drop(columns = 0, errors='ignore')

In [177]:
# Check the last rows: the index should now run up to len(my_songs_full) - 1
my_songs_full.tail()

Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
1832,1Ci4wASMY4xtKVMeHA6Sd5,0.36,0.744,286236.0,0.589,0.0,5.0,0.119,-1.218,0.0,0.0359,127.051,4.0,0.242
1833,2Zy7XVdxyZQB8xp5xbpJdl,0.222,0.396,272501.0,0.554,0.0,0.0,0.612,-5.048,1.0,0.0488,117.384,4.0,0.17
1834,1cZlBZwnwGPtYeRIeQcoFh,0.0737,0.769,216946.0,0.569,0.0,1.0,0.0571,-5.591,0.0,0.0786,146.062,4.0,0.612
1835,7pQi9lwATNWDfNwtyMXUtm,0.0572,0.844,222573.0,0.488,0.0,4.0,0.209,-7.834,0.0,0.255,102.011,4.0,0.298
1836,1gU5jNTPxkrF52O7vUTWzR,0.995,0.484,152933.0,0.0111,0.878,0.0,0.0556,-36.334,1.0,0.0384,121.708,4.0,0.2


In [178]:
# Attach the song and artist names. The values are matched on the row index,
# so both frames must be numbered 0..n-1 in the same song order.
my_songs_full['trackName'] = my_songs['trackName']
my_songs_full['artistName'] = my_songs['artistName']

In [179]:
# Preview the final frame: audio features plus track and artist names
my_songs_full.head()

Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,trackName,artistName
0,5Le9sSLxWIaIEPPppZ9EuF,0.33,0.673,254118.0,0.8,0.0,7.0,0.689,-6.253,1.0,0.214,135.997,4.0,0.268,'Round Here,IODONTPLAY
1,5F7bIFd3xWuoXmvXFqFl5M,0.0825,0.744,365950.0,0.57,0.0,3.0,0.121,-4.359,0.0,0.0357,75.019,4.0,0.6,(Bonus) Air Canada,RiFF Raff & DOLLABiLLGATES
2,2PzU4IB8Dr6mxV3lHuaG34,0.0383,0.723,222813.0,0.863,0.0317,2.0,0.128,-7.89,1.0,0.0338,136.302,4.0,0.931,(I Can't Get No) Satisfaction - Mono Version,The Rolling Stones
3,3qLfQNPEE27KI3Hgd9Om8A,0.991,0.295,135653.0,0.0706,0.92,9.0,0.101,-20.157,0.0,0.0439,76.425,1.0,0.139,(prelude),Zachary Bruno
4,4txn9qnwK3ILQqv5oq2mO3,0.388,0.519,264213.0,0.809,0.0,1.0,0.275,-6.362,1.0,0.556,146.02,4.0,0.262,03' Adolescence,J. Cole


In [182]:
# Save the final dataset for the personal analysis (row index included).
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
my_songs_full.to_csv(r'C:\Users\charl\Python\GitHub Projects\Spotify_Project\MyData\songs_for_personal_analysis.csv')