# Data ETL using billboard and spotify data
*****
## Step 1: collect billboard data
Songs date as far back as 1958, where I found the first available chart for Hot 100.

In [None]:
# pip install billboard.py

# in this part we want to build up a data frame of all available charts on billboard
import billboard
import pandas as pd

Hot_10 = pd.DataFrame(columns = ['date','title','artist','rank','weeks','change_in_rank','spotifyID'
    ])


chart = billboard.ChartData('hot-100',date = '2010-12-25') # this breaks sometimes so have to rerun and change date
date = chart.date
prev_year,prev_month = date.split('-')[0],date.split('-')[1]

while chart.previousDate:
    year,month = date.split('-')[0],date.split('-')[1]
    if prev_month != month:
        print 'chart year:{},month:{}'.format(year,month)
    for i in range(10):
        song = chart[i]
        Hot_10 = Hot_10.append([(date,song.title,song.artist,song.rank,song.weeks,song.change,song.spotifyID)])
    chart = billboard.ChartData('hot-100',chart.previousDate)
    prev_year,prev_month = year,month
    date = chart.date
    no += 1


In [None]:
# the scrapping breaks sometimes due to website control so have to rerun and change date
chart = billboard.ChartData('hot-100',date = '1958-08-09') 
date = chart.date
prev_year,prev_month = date.split('-')[0],date.split('-')[1]

while chart.previousDate:
    year,month = date.split('-')[0],date.split('-')[1]
    if prev_month != month:
        print 'chart year:{},month:{}'.format(year,month)
    for i in range(10):
        song = chart[i]
        Hot_10 = Hot_10.append([(date,song.title,song.artist,song.rank,song.weeks,song.change,song.spotifyID)])
    chart = billboard.ChartData('hot-100',chart.previousDate)
    prev_year,prev_month = year,month
    date = chart.date
    no += 1

In [None]:
Hot_10_dropped = Hot_10.drop(Hot_10.columns.values[7:],axis=1)
Hot_10_dropped.columns = ['date','title','artist','rank','weeks','change_in_rank','spotifyID']
Hot_10_dropped.to_pickle('Billboard_data')

In [37]:
import pandas as pd
top10 = pd.read_pickle('Billboard_data')
top10_spotify = top10[top10.spotifyID != u'']
top10_spotify.head(2)

Unnamed: 0,date,title,artist,rank,weeks,change_in_rank,spotifyID
0,2017-03-18,Shape Of You,Ed Sheeran,1.0,8.0,0,0FE9t6xYkqWXU2ahLh6D8X
0,2017-03-18,Bad And Boujee,Migos Featuring Lil Uzi Vert,2.0,16.0,0,1FvU97lrWOG2NRxErh6OZz


*****
## Step 2: collect spotify data

In [56]:
from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session
import time,json
import pandas as pd
import requests
import re
import random
import urllib

In [96]:
def get_token():
    auth_url = u'https://accounts.spotify.com/api/token'

    ##### private information #####
    client_id = u'c7cdefa717bb477c8d04cb65978e7003'
    client_secret = u'f3571dfd22c040dd84959f858e5c482d'
    ##### wipe if publish in public #####
    client = BackendApplicationClient(client_id=client_id)
    oauth = OAuth2Session(client=client)
    token = oauth.fetch_token(token_url=auth_url, client_id=client_id,
        client_secret=client_secret)

    return token

def get_spotify_json(url,token=None):
    if not token or token['expires_at']-time.time() < 100: # if token less than 100 seconds until expiration
        token = get_token()

    access_token = token['access_token']
    header = {u'Authorization':u'Bearer '+access_token}

    r = requests.get(url,headers=header)
    return r.json(),token

def single_audio_features(track_id,token=None):
    url_head = u'https://api.spotify.com/v1/audio-features/'

    if type(track_id) == str:
        url = url_head + track_id
    else:
        raise TypeError("Track_id should be str!")

    r,token = get_spotify_json(url,token)
    table = pd.DataFrame([r])
    table.drop(['analysis_url','id','track_href','type'],axis=1,inplace=True)
    return table,token
    
def multiple_audio_features(track_ids,token=None):
    url_head = u'https://api.spotify.com/v1/audio-features?ids='

    url = url_head + ','.join(track_ids)

    rs,token = get_spotify_json(url,token)
    table = pd.DataFrame()
    for r in rs['audio_features']:
        row = pd.DataFrame([r])
        row.drop(['analysis_url','id','track_href','type'],axis=1,inplace=True)
        table = table.append(row)

    return table,token

def chunk_100(series):
    for i in xrange(0,len(series),100):
        yield series[i:(i+100)]
    
def search_uri(title,artist,token=None):
    '''
    title eg.: 'never gonna give you up'
    '''
    url_head = u'https://api.spotify.com/v1/search?q='
    name = '+'.join(map(urllib.quote,unicode(title,errors='ignore').split(' ')))

    url = url_head+name+'&type=track&market=US&limit=20'
    
    rs,token = get_spotify_json(url,token)
    
    if 'tracks' in rs and 'items' in rs['tracks']:
        for result in rs['tracks']['items']:
            if result['album']['artists'][0]['name'].lower() == artist.lower():
                return result['id']
    return None


In [39]:
title_artist_spotify_set = set(zip(map(lambda x:x.lower(),top10_spotify.title.values),map(lambda x:x.lower(),top10_spotify.artist.values)))
print len(title_artist_spotify_set)

4314


In [100]:
# select until 4000 songs from MSD dataset which has spotify uri for each track_artist pair

def get_MSD_tracks(title_artist_set):
    MSD_tracks = []
    with open('./MSD/MillionSongSubset/AdditionalFiles/subset_unique_tracks.txt') as f:
        for i,line in enumerate(f):
            if len(MSD_tracks) % 100 == 0:
                print i,len(MSD_tracks)
            title,artist = re.split('<SEP>',line.replace('\n',''))[::-1][:2]
            pair = (title.lower(),artist.lower())

            if len(MSD_tracks) <= 4000:
                if pair not in title_artist_set:
                    uri = search_uri(title,artist)
                    if uri:
                        MSD_tracks.append((title,artist,uri))
            else:
                return MSD_tracks
        return MSD_tracks

MSD_tracks = get_MSD_tracks(title_artist_spotify_set)

0 0




211 100
212 100
475 200
476 200
730 300
731 300
732 300
979 400
1200 500
1421 600
1422 600
1423 600
1424 600
1691 700
1692 700
1693 700
1942 800
2198 900
2464 1000
2465 1000
2466 1000
2699 1100
2939 1200
2940 1200
2941 1200
2942 1200
2943 1200
3214 1300
3215 1300
3216 1300
3484 1400
3739 1500
4023 1600
4024 1600
4257 1700
4523 1800
4785 1900
4786 1900
5003 2000
5004 2000
5005 2000
5227 2100
5488 2200
5489 2200
5716 2300
5966 2400
5967 2400
6226 2500
6227 2500
6508 2600
6509 2600
6747 2700
6748 2700
6749 2700
6750 2700
6751 2700
6752 2700
6753 2700
6754 2700
6755 2700
6999 2800
7000 2800
7001 2800
7002 2800
7003 2800
7004 2800
7005 2800
7006 2800
7235 2900
7236 2900
7461 3000
7462 3000
7726 3100
7727 3100
7988 3200
8259 3300
8260 3300
8498 3400
8499 3400
8500 3400
8501 3400
8502 3400
8503 3400
8504 3400
8505 3400
8506 3400
8781 3500
8782 3500
9042 3600
9043 3600
9044 3600
9259 3700
9522 3800
9800 3900


In [104]:
MSD_track_ids = map(lambda x:x[2],MSD_tracks)
import os.path

audio_data = pd.DataFrame()
if not os.path.isfile('MSD_audio_features'):
    for i,track_ids in enumerate(chunk_100(MSD_track_ids)):
        audio_features,token = multiple_audio_features(track_ids)
        audio_data = audio_data.append(audio_features)
        
    assert len(audio_data)==len(MSD_track_ids)
    audio_data.to_pickle('MSD_audio_features')

else:
    audio_data = pd.read_pickle('MSD_audio_features')