# Import Statements

In [19]:
import datetime
import base64
from urllib.parse import urlencode
import json
import time

import requests
import numpy as np
import pandas as pd
from datetime import datetime

from spotifyClient import SpotifyAPI

# Reading in the Data

In [20]:
# Read in the hot 100 by year web scrape
df = pd.read_csv('../data/hot100_60-99.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Song,Artist(s),year
0,0,El Paso,Marty Robbins,1960
1,1,Running Bear,Johnny Preston,1960
2,2,Teen Angel,Mark Dinning,1960
3,3,Theme from A Summer Place,Percy Faith,1960
4,4,Stuck on You,Elvis Presley,1960


In [21]:
df.shape

(828, 4)

# Data Cleaning

In [22]:
# Drop the Unnamed column
df.drop('Unnamed: 0', axis=1, inplace=True)

In [23]:
# Remapping names of columns
hot100_col_names = {
    "Song": "song",
    "Artist(s)": "artist",
    "year": "year"
}

df.rename(columns=hot100_col_names, inplace=True)

In [24]:
df.head()

Unnamed: 0,song,artist,year
0,El Paso,Marty Robbins,1960
1,Running Bear,Johnny Preston,1960
2,Teen Angel,Mark Dinning,1960
3,Theme from A Summer Place,Percy Faith,1960
4,Stuck on You,Elvis Presley,1960


# Exploring Spotify WebAPI Data

## Setting up Spotify Client

In [7]:
spotify = SpotifyAPI()

## Testing Track Search Validation

In [8]:
# Running a loop on one song and getting the JSON search data for that 
# track name.

for song in df['song'][:1]:
    track_search = spotify.search({"track": str(song)}, search_type="track")

# JSON data as type dict
track_search

q=track%3AEl+Paso&type=track


{'tracks': {'href': 'https://api.spotify.com/v1/search?query=track%3AEl+Paso&type=track&offset=0&limit=20',
  'items': [{'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0Xi59sEw38vRvwleSAVqoo'},
       'href': 'https://api.spotify.com/v1/artists/0Xi59sEw38vRvwleSAVqoo',
       'id': '0Xi59sEw38vRvwleSAVqoo',
       'name': 'Marty Robbins',
       'type': 'artist',
       'uri': 'spotify:artist:0Xi59sEw38vRvwleSAVqoo'}],
     'available_markets': ['AD',
      'AE',
      'AG',
      'AL',
      'AM',
      'AO',
      'AR',
      'AT',
      'AU',
      'AZ',
      'BA',
      'BB',
      'BD',
      'BE',
      'BF',
      'BG',
      'BH',
      'BI',
      'BJ',
      'BN',
      'BO',
      'BR',
      'BS',
      'BT',
      'BW',
      'BY',
      'BZ',
      'CA',
      'CH',
      'CI',
      'CL',
      'CM',
      'CO',
      'CR',
      'CV',
      'CW',
      'CY',
      'CZ',
      'DE',
      'DJ',
      'DK'

In [9]:
# Items available from my search
track_search.keys()

dict_keys(['tracks'])

In [10]:
# print the number of keys available
print(len(track_search['tracks']))

# list all keys from track serach
track_search["tracks"].keys()

7


dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

I now need to get an idea of the number of tracks that appear in the search, as well as the keys available. I will need to find a way to check and validate each track that came up in search with the exact track that I'm searching for. 

The **'href'** key gets the query url used to generate the search.

The **'items'** key gets a list of dictionaries containing information on one of the tracks from the search. For my test track the first item from my search ended up being correct song, however this needs to be adjusted for the instances when the track isn't the first in search.

The **'limit'** key shows the imposed limit on number of tracks for this search. This could need to be changed if the needed track isn't showing up. 

The **'next'** key shows the next url address needed for the same search offset by 20. This could be useful if a specific song isn't showing up in the first 20 tracks.

The **'offset'** key shows the number of tracks the search has been offset by.

The **'previous'** key is empty here, but I would assume this would be similar to next, however showing the url for the search results before being offset.

The **'total'** key shows the total number of search results available.

In [110]:
# look at the output from the correct track that matches test search
track_search['tracks']['items'][0]

{'album': {'album_type': 'album',
  'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0Xi59sEw38vRvwleSAVqoo'},
    'href': 'https://api.spotify.com/v1/artists/0Xi59sEw38vRvwleSAVqoo',
    'id': '0Xi59sEw38vRvwleSAVqoo',
    'name': 'Marty Robbins',
    'type': 'artist',
    'uri': 'spotify:artist:0Xi59sEw38vRvwleSAVqoo'}],
  'available_markets': ['AD',
   'AE',
   'AG',
   'AL',
   'AM',
   'AO',
   'AR',
   'AT',
   'AU',
   'AZ',
   'BA',
   'BB',
   'BD',
   'BE',
   'BF',
   'BG',
   'BH',
   'BI',
   'BJ',
   'BN',
   'BO',
   'BR',
   'BS',
   'BT',
   'BW',
   'BY',
   'BZ',
   'CA',
   'CH',
   'CI',
   'CL',
   'CM',
   'CO',
   'CR',
   'CV',
   'CW',
   'CY',
   'CZ',
   'DE',
   'DJ',
   'DK',
   'DM',
   'DO',
   'DZ',
   'EC',
   'EE',
   'EG',
   'ES',
   'FI',
   'FJ',
   'FM',
   'FR',
   'GA',
   'GB',
   'GD',
   'GE',
   'GH',
   'GM',
   'GN',
   'GQ',
   'GR',
   'GT',
   'GW',
   'GY',
   'HK',
   'HN',
   'HR',
   'HT',
   'HU',
   'ID'

In [111]:
# Getting the available features of the track from search. 
track_search['tracks']['items'][0].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

In [18]:
# validating that there were was a limit of 20 tracks on this search
print(f"Total tracks from search - {track_search['tracks']['total']}")
print(f"Tracks from this search - {len(track_search['tracks']['items'])}")

Total tracks from search - 7656
Tracks from this search - 20


In [113]:
# validate search with song names
print(track_search['tracks']['items'][0]['name'])
df['song'][0] == track_search['tracks']['items'][0]['name']

El Paso


True

In [114]:
# validate search with song's artist name
print(track_search['tracks']['items'][0]['artists'][0]['name'])
df['artist'][0] == track_search['tracks']['items'][0]['artists'][0]['name']

Marty Robbins


True

In [115]:
# validate search checking release year is the same year 
# or earlier then hot 100 first apperance.
print(track_search['tracks']['items'][0]['album']['release_date'])
df['year'][0] >= int(track_search['tracks']['items'][0]['album']['release_date'])

1959


True

One way to validate the search is to make sure the name of the song in the Hot100 dataframe is the same as the track from the search. As mentioned before, many songs have been originally written and performed by one artist and later covered by another with both versions appearing on the Hot100. Beyond that, some songs just share the same name but are entirely different songs. As such, this will only be a first pass to validate the song but I will need more validation steps.

Another way to validate the search is to make sure the name of the artist from the Hot100 dataframe is the same as the artist from the search. Some problems may arise here due to artists names being recorded differently in the Wikipedia data compared with the Spotify WebAPI. I do not see this being a major issue, as I can check that the artist name from Wiki is contained within the API data rather then perfectly matching. Following this I can perform some data cleaning to fix any special cases that arise.

The last way I'll want to validate the search is to make sure the release date of the track was equal to or earlier then the Hot100 version. The release date data from Spotify is based on the album release rather then the individual song (this may not be the case where singles are involved, could be worth looking into further if problems start arising). As the recommender is going to be based off of the Billboard Hot 100 songs, I will be using the year the song hit the Hot 100 as the songs reference year. Given this, I will validate that the songs album was released in the same year or earlier.

# Search & Data Pull for Hot 100 Songs from Spotify WebAPI
## Initial Song Search for Standard Data

Now that I have an understanding of the layout of Spotify's JSON data and a plan for song search validation, I can start my data pull. I will need to work with the WebAPI in 3 steps:

1. Initial pull of basic song data song (name, artist, album, popularity, explicit, and id). This will also be where search validation happens which ensures the following steps will be working with the correct song id's.
2. Song Feature data pull for each song to gather metadata such as danceability, energy, key, etc.
3. Song Analysis data pull for each song to gather metadata such as number of samples, duration, end of fade in, start of fade out, etc.

I will save out to a .csv file after every successful data pull to create an artificial save point in the data pull process. I will also be using a counter variable *(c1)* during each pull in case of any connection issues with the Spotify WebAPI. If there are any, the loop will be able to pick back up where it left off retaining all data that was previously pulled.

In [None]:
# Create an empty list to store track song_info list
# this is stored in a different cell then the main loop incase there
# is an error that pops up during the loop - list retains what was pulled
# prior to error and doesn't get reset when the cell with loop is run
track_list = []

# create an empty list to store any missing songs
missing_songs = []

# start counter at 0 - increase by 1 after every song
# if there is an error during pull c1 will be used to determine where
# error occured and as the starting point for the loop
c1 = 0

In [140]:
# Reauthenticate Spotify Client
spotify = SpotifyAPI()

# Test build of a search loop to find the correct item for the first 5 songs
# in the hot100 dataframe.
# the x-variable is a counter to loop through only 5 songs
# the c-variable is a counter for the hot100 dataframe to keep track of the
# current song being searched
for song in df['song'][c1:]:
    # Collect search query for each song in the loop
    track_search = spotify.search({"track": str(song)}, search_type="track")
    
    if len(track_search['tracks']['items']) > 0:
        # The limit for each search query is 20 so the i counter will 
        # loop through a range of 20 to check each item in the query against 
        # the song from hot100.
        for i in range(len(track_search['tracks']['items'])):  

            song_info = []

            # Check if hot100 song matches query items song name
            # replace method was added due to name issues with
            # 'Theme from A Summer Place'. Based on this more cleaning
            # may be needed as I go through the rest of the hot100 dataframe.
            if song in track_search['tracks']['items'][i]['name'].replace('"', ''):

                # Check if hot100 song's artist matches the name of the
                # query items artist as well as the artist listed under
                # the query items album. Some items match artist name but are
                # from compilation albums which are listed as having various artists.
                if (df['artist'][c1] in track_search['tracks']['items'][i]['artists'][0]['name']) and\
                (df['artist'][c1] in track_search['tracks']['items'][i]['album']['artists'][0]['name']):

                    # Check to see if query item was released in the 
                    # same year or earlier as the hot100 song. Some albums get
                    # reissued at later dates so this check helps filter those out
                    if df['year'][c1] <= int(track_search['tracks']['items'][i]['album']['release_date'][:4]):
                        song_info.append(track_search['tracks']['items'][i]['name'])
                        song_info.append(track_search['tracks']['items'][i]['album']['name'])
                        song_info.append(track_search['tracks']['items'][i]['album']['artists'][0]['name'])
                        song_info.append(track_search['tracks']['items'][i]['popularity'])
                        song_info.append(track_search['tracks']['items'][i]['id'])
                        song_info.append(track_search['tracks']['items'][i]['explicit'])

                        track_list.append(song_info)
                        
    # if song could not be found append song to missing_songs list
    else:
        missing_songs.append(song)
        
    # Add 1 to c as long as it is still smaller then x.
    # Limits the number of loops and keeps track of hot100 song index
    c1 += 1
    time.sleep(3)
        
song_df_col = [
    'artist_name',
    'album_name',
    'song_name',
    'popularity',
    'track_id',
    'track_explicit'
]

song_df = pd.DataFrame(track_list, columns=song_df_col)

q=track%3ALet%27s+Stay+Together&type=track
q=track%3AWithout+You&type=track
q=track%3AHeart+of+Gold&type=track
q=track%3AA+Horse+with+No+Name&type=track
q=track%3AThe+First+Time+Ever+I+Saw+Your+Face&type=track
q=track%3AOh+Girl&type=track
q=track%3AI%27ll+Take+You+There&type=track
q=track%3AThe+Candy+Man&type=track
q=track%3ASong+Sung+Blue&type=track
q=track%3ALean+on+Me&type=track
q=track%3AAlone+Again+%28Naturally%29&type=track
q=track%3ABrandy+%28You%27re+a+Fine+Girl%29&type=track
q=track%3ABlack+and+White&type=track
q=track%3ABaby%2C+Don%27t+Get+Hooked+on+Me&type=track
q=track%3ABen&type=track
q=track%3AMy+Ding-a-Ling&type=track
q=track%3AI+Can+See+Clearly+Now&type=track
q=track%3APapa+Was+a+Rollin%27+Stone&type=track
q=track%3AI+Am+Woman&type=track
q=track%3AMe+and+Mrs.+Jones&type=track
q=track%3AYou%27re+So+Vain&type=track
q=track%3ASuperstition&type=track
q=track%3ACrocodile+Rock&type=track
q=track%3AKilling+Me+Softly+with+His+Song&type=track
q=track%3ALove+Train&type=track
q=tr

q=track%3ARing+My+Bell&type=track
q=track%3ABad+Girls&type=track
q=track%3AGood+Times&type=track
q=track%3AMy+Sharona&type=track
q=track%3ASad+Eyes&type=track
q=track%3ADon%27t+Stop+%27Til+You+Get+Enough&type=track
q=track%3ARise&type=track
q=track%3APop+Muzik&type=track
q=track%3AHeartache+Tonight&type=track
q=track%3AStill&type=track
q=track%3ANo+More+Tears+%28Enough+Is+Enough%29&type=track
q=track%3ABabe&type=track
q=track%3AEscape+%28The+Pi%C3%B1a+Colada+Song%29&type=track
q=track%3APlease+Don%27t+Go&type=track
q=track%3ARock+with+You&type=track
q=track%3ADo+That+to+Me+One+More+Time&type=track
q=track%3ACrazy+Little+Thing+Called+Love&type=track
q=track%3AAnother+Brick+in+the+Wall%2C+Part+II&type=track
q=track%3ACall+Me&type=track
q=track%3AFunkytown&type=track
q=track%3AComing+Up&type=track
q=track%3AIt%27s+Still+Rock+and+Roll+to+Me&type=track
q=track%3AMagic&type=track
q=track%3ASailing&type=track
q=track%3AUpside+Down&type=track
q=track%3AAnother+One+Bites+the+Dust&type=track
q=t

q=track%3ALook+Away&type=track
q=track%3AEvery+Rose+Has+Its+Thorn&type=track
q=track%3AMy+Prerogative&type=track
q=track%3ATwo+Hearts&type=track
q=track%3AWhen+I%27m+with+You&type=track
q=track%3AStraight+Up&type=track
q=track%3ALost+in+Your+Eyes&type=track
q=track%3AThe+Living+Years&type=track
q=track%3AEternal+Flame&type=track
q=track%3AThe+Look&type=track
q=track%3AShe+Drives+Me+Crazy&type=track
q=track%3ALike+a+Prayer&type=track
q=track%3AI%27ll+Be+There+for+You&type=track
q=track%3AForever+Your+Girl&type=track
q=track%3ARock+On&type=track
q=track%3AWind+Beneath+My+Wings&type=track
q=track%3AI%27ll+Be+Loving+You+%28Forever%29&type=track
q=track%3ASatisfied&type=track
q=track%3ABaby+Don%27t+Forget+My+Number&type=track
q=track%3AGood+Thing&type=track
q=track%3AIf+You+Don%27t+Know+Me+by+Now&type=track
q=track%3AToy+Soldiers&type=track
q=track%3ABatdance&type=track
q=track%3ARight+Here+Waiting&type=track
q=track%3ACold+Hearted&type=track
q=track%3AHangin%27+Tough&type=track
q=track%3AD

In [141]:
# write song_df to a .csv file to save inital spotify pull
song_df.to_csv('first_spotify_pull.csv')

In [151]:
len(song_df['song_name'].unique())

958

In [None]:
# get the shape of song_df to validate future pulls against
song_df.shape

In [None]:
song_df.head()

## Song Feature Data Pull

After a successful first pull, I can now move onto pulling the song feature data. I am interested in getting all of the features available here, as I'm predicting these features will end up being the most useful moving forward with my recommendations.

I will be utilizing a counter variable *(c2)* in the same fashion as I have above. After the pull has finished I will print out the shape of the song_feat_df to make sure it is the same as the song_df before the merge. I will finish this part by saving out a .csv for a second time. 

In [None]:
# create an empty list to store song feature list
# this is stored in a different cell then the main loop incase there
# is an error that pops up during the loop - list retains what was pulled
# prior to error and doesn't get reset when the cell with loop is run
song_feat_list = []

# start counter at 0 - increase by 1 after every song
# if there is an error during pull c2 will be used to determine where
# error occured and as the starting point for the loop
c2 = 0

In [153]:
# Reauthenticating Client
spotify = SpotifyAPI()

# loop through song_df using the track_id's 
# for each song that was gathered in the first step
for song in song_df['track_id'][c2:]:
    
    # pull song feature data
    song_feat = spotify.get_features(song)
    
    song_feat_list.append([v for k,v in song_feat.items()])
    
    c2 += 1
    time.sleep(3)
    
# all features available for all songs 
# so can use the col names from any song feature search
# as col names for dataframe
song_feat_df_col = [k for k,v in song_feat.items()]

# create a song feature dataframe
song_feat_df = pd.DataFrame(song_feat_list, columns=song_feat_df_col)

# validate the shape is the same as song_df
print(song_feat_df.shape)
song_feat_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.597,0.4720,0,-11.721,1,0.0342,0.828000,0.000002,0.1440,0.561,107.590,audio_features,4CIaUS9qVxS6RsQBnC37EU,spotify:track:4CIaUS9qVxS6RsQBnC37EU,https://api.spotify.com/v1/tracks/4CIaUS9qVxS6...,https://api.spotify.com/v1/audio-analysis/4CIa...,251773,4
1,0.644,0.3520,4,-10.562,1,0.0358,0.874000,0.000000,0.1400,0.643,96.896,audio_features,2jqx9Oq9ZErm5ywDblnvHi,spotify:track:2jqx9Oq9ZErm5ywDblnvHi,https://api.spotify.com/v1/tracks/2jqx9Oq9ZErm...,https://api.spotify.com/v1/audio-analysis/2jqx...,498387,3
2,0.762,0.4190,5,-9.312,0,0.0653,0.744000,0.000000,0.1440,0.765,119.964,audio_features,0x6gSfnYA91AHPLvULn5NK,spotify:track:0x6gSfnYA91AHPLvULn5NK,https://api.spotify.com/v1/tracks/0x6gSfnYA91A...,https://api.spotify.com/v1/audio-analysis/0x6g...,158213,4
3,0.772,0.2970,5,-14.679,0,0.0530,0.854000,0.000008,0.1250,0.822,119.987,audio_features,1RYznli2VNO7FCbW1Hq4KM,spotify:track:1RYznli2VNO7FCbW1Hq4KM,https://api.spotify.com/v1/tracks/1RYznli2VNO7...,https://api.spotify.com/v1/audio-analysis/1RYz...,158200,4
4,0.584,0.0863,0,-15.537,1,0.0403,0.775000,0.000000,0.2120,0.460,101.493,audio_features,3PymNAkWROfyEVeYq6XtjD,spotify:track:3PymNAkWROfyEVeYq6XtjD,https://api.spotify.com/v1/tracks/3PymNAkWROfy...,https://api.spotify.com/v1/audio-analysis/3Pym...,158200,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609,0.648,0.6220,7,-6.063,1,0.0428,0.001520,0.000060,0.1090,0.510,88.684,audio_features,0BUoLE4o9eVahDHvTqak67,spotify:track:0BUoLE4o9eVahDHvTqak67,https://api.spotify.com/v1/tracks/0BUoLE4o9eVa...,https://api.spotify.com/v1/audio-analysis/0BUo...,278067,4
1610,0.660,0.6190,7,-6.295,1,0.0428,0.000919,0.000253,0.1310,0.572,88.676,audio_features,6iFP92AND2nfZrEQyY0mN8,spotify:track:6iFP92AND2nfZrEQyY0mN8,https://api.spotify.com/v1/tracks/6iFP92AND2nf...,https://api.spotify.com/v1/audio-analysis/6iFP...,240067,4
1611,0.648,0.6360,7,-6.054,1,0.0401,0.001100,0.000064,0.1250,0.532,88.692,audio_features,3rkXtjSan4H20nXkY5HKML,spotify:track:3rkXtjSan4H20nXkY5HKML,https://api.spotify.com/v1/tracks/3rkXtjSan4H2...,https://api.spotify.com/v1/audio-analysis/3rkX...,278000,4
1612,0.660,0.6280,7,-6.270,1,0.0434,0.001370,0.000275,0.1200,0.547,88.668,audio_features,5rPEBtAvWfa0WN3rtcrgUq,spotify:track:5rPEBtAvWfa0WN3rtcrgUq,https://api.spotify.com/v1/tracks/5rPEBtAvWfa0...,https://api.spotify.com/v1/audio-analysis/5rPE...,240533,4


In [154]:
# merge song_df with song_feature_df
song_df = song_df.merge(song_feat_df, how='left', left_on='track_id', right_on='id')

In [155]:
# write out soung_df to .csv file as a
# save point with basic song data and song features
song_df.to_csv('second_spotify_pull.csv')

## Song Analysis Data Pull

Now that I have basic and feature data for each song, I will pull song analysis data, merge with song_df, and export a final .csv file. Seeing as some of the features in the song analysis have already been pulled (key, tempo, mode, etc.) I will specify which features I want data for. I will be utilizing a counter variable *(c3)* in the same fashion as I have above. 

In [165]:
# create an empty list to store s_list (song analysis data)
# this is stored in a different cell then the main loop incase there
# is an error that pops up during the loop - list retains what was pulled
# prior to error and doesn't get reset when the cell with loop is run
song_analysis_list = []

# start counter at 0 - increase by 1 after every song
# if there is an error during pull c3 will be used to determine where
# error occured and as the starting point for the loop
c3 = 0

In [174]:
# Reauthenticating Client
spotify = SpotifyAPI()

# Only need to grab the features of interest
# certian features from analysis have already been pulled in feature pull
# keeping 'id' in for merge at the end
analysis_feat = [
    'num_samples',
    'duration',
    'analysis_sample_rate',
    'end_of_fade_in',
    'start_of_fade_out',
    'tempo_confidence',
    'time_signature_confidence',
    'key_confidence',
    'mode_confidence',
    'id'
]


# loop through song_df using the track_id's 
# for each song that was gathered in the first step
for song in song_df['track_id'][c:]:
    
    # pull song analysis data
    song_analysis = spotify.get_analysis(song)
    
    s_list = [v for k,v in song_analysis['track'].items() if k in analysis_feat]
    
    # append song 'track_id' back to s_list to 
    # use later when merging dataframes
    s_list.append(song)
    song_analysis_list.append(s_list)
    
    c3 += 1
    time.sleep(3)
    
song_analysis_df = pd.DataFrame(song_analysis_list, columns=analysis_feat)

In [176]:
# validate the shape is the same as song_df
print(song_analysis_df.shape)
song_analysis_df.head()

(1624, 15)


Unnamed: 0,num_samples,duration,analysis_sample_rate,end_of_fade_in,start_of_fade_out,loudness,tempo,tempo_confidence,time_signature,time_signature_confidence,key,key_confidence,mode,mode_confidence,id
0,5551602,251.77333,22050,0.11723,244.99956,-11.721,107.59,0.767,4,0.708,0,0.914,1,0.788,4CIaUS9qVxS6RsQBnC37EU
1,10989426,498.38666,22050,0.24721,492.39075,-10.562,96.896,0.378,3,0.607,4,0.89,1,0.714,2jqx9Oq9ZErm5ywDblnvHi
2,3488604,158.21333,22050,0.0,151.93398,-9.312,119.964,0.639,4,1.0,5,0.766,0,0.602,0x6gSfnYA91AHPLvULn5NK
3,3488310,158.2,22050,1.13624,151.88753,-14.679,119.987,0.7,4,1.0,5,0.686,0,0.585,1RYznli2VNO7FCbW1Hq4KM
4,3488310,158.2,22050,0.30227,151.19093,-15.537,101.493,0.438,4,0.767,0,0.886,1,0.725,3PymNAkWROfyEVeYq6XtjD


In [177]:
# merge song_df with analysis dataframe to 
# capture all needed data in one dataframe
song_df = song_df.merge(song_analysis_df, how='left',\
                        left_on='track_id', right_on='id')

In [178]:
print(song_df.shape)
song_df.head()

Unnamed: 0,artist_name,album_name,song_name,popularity,track_id,track_explicit,danceability,energy,key_x,loudness_x,...,loudness_y,tempo_y,tempo_confidence,time_signature_y,time_signature_confidence,key_y,key_confidence,mode_y,mode_confidence,id_y
0,Marty Robbins,The Essential Marty Robbins 1951-1982,El Paso City,40,4CIaUS9qVxS6RsQBnC37EU,False,0.597,0.472,0,-11.721,...,-11.721,107.59,0.767,4,0.708,0,0.914,1,0.788,4CIaUS9qVxS6RsQBnC37EU
1,Marty Robbins,The Drifter,Faleena (from El Paso),37,2jqx9Oq9ZErm5ywDblnvHi,False,0.644,0.352,4,-10.562,...,-10.562,96.896,0.378,3,0.607,4,0.89,1,0.714,2jqx9Oq9ZErm5ywDblnvHi
2,Johnny Preston,Golden Selection (Remastered),Running Bear - Remastered,32,0x6gSfnYA91AHPLvULn5NK,False,0.762,0.419,5,-9.312,...,-9.312,119.964,0.639,4,1.0,5,0.766,0,0.602,0x6gSfnYA91AHPLvULn5NK
3,Johnny Preston,Greatest Hits,Running Bear,36,1RYznli2VNO7FCbW1Hq4KM,False,0.772,0.297,5,-14.679,...,-14.679,119.987,0.7,4,1.0,5,0.686,0,0.585,1RYznli2VNO7FCbW1Hq4KM
4,Mark Dinning,The Lovin' Touch,Teen Angel,29,3PymNAkWROfyEVeYq6XtjD,False,0.584,0.0863,0,-15.537,...,-15.537,101.493,0.438,4,0.767,0,0.886,1,0.725,3PymNAkWROfyEVeYq6XtjD


In [179]:
# write out song_df to .csv file for the final time with all needed data
song_df.to_csv('final_spotify_pull.csv')