# Data ETL using Billboard and Spotify data


In [15]:
# all packages
import billboard
import pandas as pd
import os

from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session
import time,json
import requests
import re
import random
import urllib
import os.path

*****
## Step 1: collect billboard data
The songs date back to 1958, the year of the first available Hot 100 chart.

In [1]:
# pip install billboard.py

# in this part we want to build up a data frame of all available charts on billboard

# Scrape the top 10 of every weekly Hot 100 chart, walking backwards in time
# from the start date. Skipped entirely once the 'Billboard_data' pickle exists.
if not os.path.isfile('Billboard_data'):
    Hot_10 = pd.DataFrame(columns = ['date','title','artist','rank','weeks','change_in_rank','spotifyID'
        ])


    chart = billboard.ChartData('hot-100',date = '2010-12-25') # this breaks sometimes so have to rerun and change date
    date = chart.date
    prev_year,prev_month = date.split('-')[0],date.split('-')[1]
    # Number of weekly charts processed. The loop increments it, so it must
    # exist first; without this line the first iteration died with NameError.
    no = 0

    # NOTE(review): the loop stops as soon as a chart has no previousDate, so
    # that last chart's own rows are never appended -- confirm this is intended.
    while chart.previousDate:
        year,month = date.split('-')[0],date.split('-')[1]
        if prev_month != month:
            # progress: printed when the month differs from the previous chart's
            print('chart year:{},month:{}'.format(year,month))
        for i in range(10):
            song = chart[i]
            # NOTE(review): appending a list holding one tuple does not line the
            # values up with the named columns; the drop + rename done before
            # pickling presumably relies on that layout -- confirm before changing.
            Hot_10 = Hot_10.append([(date,song.title,song.artist,song.rank,song.weeks,song.change,song.spotifyID)])
        chart = billboard.ChartData('hot-100',chart.previousDate)
        prev_year,prev_month = year,month
        date = chart.date
        no += 1


In [2]:
# Resume the backwards walk from a later start date after the scrape broke,
# then clean up the collected rows and cache them as 'Billboard_data'.
# Relies on Hot_10 already holding the rows collected so far.
if not os.path.isfile('Billboard_data'):
    # the scrapping breaks sometimes due to website control so have to rerun and change date
    chart = billboard.ChartData('hot-100',date = '1958-08-09') 
    date = chart.date
    prev_year,prev_month = date.split('-')[0],date.split('-')[1]
    # Number of weekly charts processed in this run. The loop increments it, so
    # it must exist first; otherwise the first iteration died with NameError.
    no = 0

    # NOTE(review): the loop stops as soon as a chart has no previousDate, so
    # that last chart's own rows are never appended -- confirm this is intended.
    while chart.previousDate:
        year,month = date.split('-')[0],date.split('-')[1]
        if prev_month != month:
            # progress: printed when the month differs from the previous chart's
            print('chart year:{},month:{}'.format(year,month))
        for i in range(10):
            song = chart[i]
            Hot_10 = Hot_10.append([(date,song.title,song.artist,song.rank,song.weeks,song.change,song.spotifyID)])
        chart = billboard.ChartData('hot-100',chart.previousDate)
        prev_year,prev_month = year,month
        date = chart.date
        no += 1
        
    # Keep the first seven columns and give them their proper names.
    # NOTE(review): which seven columns come first depends on how append()
    # ordered the mixed column labels -- verify the kept columns hold the data.
    Hot_10_dropped = Hot_10.drop(Hot_10.columns.values[7:],axis=1)
    Hot_10_dropped.columns = ['date','title','artist','rank','weeks','change_in_rank','spotifyID']
    Hot_10_dropped.to_pickle('Billboard_data')

In [3]:

# Load the scraped chart rows and keep only those that carry a Spotify id.
top10 = pd.read_pickle('Billboard_data')
top10_spotify = top10.loc[top10.spotifyID != u'']
top10_spotify.head(2)

Unnamed: 0,date,title,artist,rank,weeks,change_in_rank,spotifyID
0,2017-03-18,Shape Of You,Ed Sheeran,1.0,8.0,0,0FE9t6xYkqWXU2ahLh6D8X
0,2017-03-18,Bad And Boujee,Migos Featuring Lil Uzi Vert,2.0,16.0,0,1FvU97lrWOG2NRxErh6OZz


*****
## Step 2: collect spotify data

In [6]:
def get_token():
    '''
    Request an access token from the Spotify accounts service using the
    client id / client secret pair below.

    out: the token returned by OAuth2Session.fetch_token; callers in this
         file read its 'access_token' and 'expires_at' entries
    '''
    ##### private information #####
    client_id = u'client_id_string_here'
    client_secret = u'client_secret_string_here'
    ##### wipe if publish in public #####

    session = OAuth2Session(client=BackendApplicationClient(client_id=client_id))
    return session.fetch_token(
        token_url=u'https://accounts.spotify.com/api/token',
        client_id=client_id,
        client_secret=client_secret)

def get_spotify_json(url,token=None):
    '''
    GET `url` with a bearer token and return the decoded JSON body.

    in:  url: full request URL
         token: token from get_token(); a new one is requested when it is
                missing or has less than 100 seconds left before expiring
    out: (decoded JSON, token that was used) -- hand the token back in on the
         next call so it can be reused
    '''
    still_valid = bool(token) and token['expires_at']-time.time() >= 100
    if not still_valid:
        token = get_token()

    headers = {u'Authorization':u'Bearer '+token['access_token']}
    response = requests.get(url,headers=headers)
    return response.json(),token

def single_audio_features(track_id,token=None):
    '''
    Fetch the audio features of one track as a one-row DataFrame.

    in:  track_id: Spotify track id, eg. u'0FE9t6xYkqWXU2ahLh6D8X'
         token: optional token from get_token(), forwarded to get_spotify_json
    out: (table, token) -- table is built from the JSON response with the
         'analysis_url', 'id', 'track_href' and 'type' columns removed;
         token is the one returned by get_spotify_json
    raises: TypeError if track_id is not a string
    '''
    # Accept byte strings and unicode strings alike. The previous check,
    # type(track_id) == str, rejected unicode ids (u'...') on Python 2.
    if not isinstance(track_id,(str,type(u''))):
        raise TypeError("Track_id should be str!")

    url = u'https://api.spotify.com/v1/audio-features/' + track_id
    r,token = get_spotify_json(url,token)
    table = pd.DataFrame([r])
    table.drop(['analysis_url','id','track_href','type'],axis=1,inplace=True)
    return table,token
    
def multiple_audio_features(track_ids,token=None):
    '''
    Fetch the audio features of a batch of tracks with a single request.

    in:  track_ids: sequence of Spotify track ids (callers in this file pass
                    the chunks produced by chunk_100)
         token: optional token from get_token(), forwarded to get_spotify_json
    out: (table, token) -- table has one row per entry of the response's
         'audio_features' list that is not null, without the 'analysis_url',
         'id', 'track_href' and 'type' columns; every row keeps index 0.
         An empty DataFrame is returned when there is nothing to keep.
    '''
    url = u'https://api.spotify.com/v1/audio-features?ids=' + ','.join(track_ids)
    rs,token = get_spotify_json(url,token)

    rows = []
    for r in rs['audio_features']:
        if r is None:
            # Per the Spotify API reference an id that is not found comes back
            # as null; it used to crash the column drop below. Callers that
            # compare row counts against the ids sent will still notice.
            continue
        row = pd.DataFrame([r])
        row.drop(['analysis_url','id','track_href','type'],axis=1,inplace=True)
        rows.append(row)

    if not rows:
        return pd.DataFrame(),token
    # One concat instead of growing the frame with append() on every row.
    return pd.concat(rows),token

def chunk_100(series,size=100):
    '''
    Yield consecutive slices of `series` holding at most `size` items each.

    in:  series: anything supporting len() and slicing (callers in this file
                 pass lists and numpy arrays)
         size: maximum chunk length, 100 by default
    out: generator of slices; the last one may be shorter
    '''
    # range() instead of the Python-2-only xrange(): the few chunk offsets
    # are cheap to build either way and the function now runs on Python 3 too.
    for start in range(0,len(series),size):
        yield series[start:start+size]
    
def search_uri(title,artist,token=None):
    '''
    Search Spotify (US market, first 20 track hits) for `title` and return
    the id of the first hit whose first album artist equals `artist`,
    compared case-insensitively.

    title eg.: 'never gonna give you up'
    out: track id, or None when no hit matches or the response has no
         'tracks'/'items' entry
    '''
    # Only the title goes into the query; undecodable bytes are dropped.
    words = unicode(title,errors='ignore').split(' ')
    query = '+'.join(urllib.quote(word) for word in words)
    url = u'https://api.spotify.com/v1/search?q=' + query + '&type=track&market=US&limit=20'

    rs,token = get_spotify_json(url,token)

    if 'tracks' not in rs or 'items' not in rs['tracks']:
        return None
    for hit in rs['tracks']['items']:
        album_artist = hit['album']['artists'][0]['name']
        if album_artist.lower() == artist.lower():
            return hit['id']
    return None


## Get audio features for billboard songs

In [9]:
# Download (or load from cache) the audio features of every distinct
# billboard track that has a Spotify id.
top10_spotify = top10[top10.spotifyID != u'']
distinct_trackids = list(set(top10_spotify.spotifyID.values))

audio_data = pd.DataFrame()
if not os.path.isfile('Spotify_audio_features'):
    # `token` was read here before anything assigned it. Starting from None
    # makes get_spotify_json request one on the first call; after that the
    # returned token is handed back in and reused.
    token = None
    for i,track_ids in enumerate(chunk_100(distinct_trackids)):
        audio_features,token = multiple_audio_features(track_ids,token)
        audio_data = audio_data.append(audio_features)
        
    # every id must have produced exactly one row
    assert len(audio_data)==len(distinct_trackids)
    audio_data.to_pickle('Spotify_audio_features')

else:
    audio_data = pd.read_pickle('Spotify_audio_features')
    # Be aware that the spotifyID given on billboard.com does not always point
    # to the song in its original form. E.g. for Smooth Criminal by Michael
    # Jackson the given spotifyID is the Alien Ant Farm version, which is
    # recreated in another style.
    # This should not be common in this dataset: no more than 700 songs.
    # The figure of 700 comes from the billboard_crosscheck function and also
    # includes naming differences and copyright issues, e.g. Taylor Swift and
    # Adele don't have their songs available on Spotify, and Beyonce is encoded
    # as beyonc/xe9 in the Spotify database.

## Get audio features for 4000 MSD songs

In [10]:
# Lower-cased (title, artist) pairs of the billboard songs; used to keep
# billboard hits out of the MSD sample.
title_artist_spotify_set = {
    (title.lower(),artist.lower())
    for title,artist in zip(top10_spotify.title.values,top10_spotify.artist.values)
}
print(len(title_artist_spotify_set))

4314


In [19]:
def get_MSD_tracks(title_artist_set,limit=4000,
                   path='./MSD/MillionSongSubset/AdditionalFiles/subset_unique_tracks.txt'):
    '''
    Collect tracks from the Million Song Dataset listing that are not in
    `title_artist_set` and that search_uri can find on Spotify.

    in:  title_artist_set: lower-cased (title, artist) pairs to leave out
         limit: maximum number of tracks to collect (4000 by default)
         path: listing file with one track per line, fields separated by
               '<SEP>', artist and title being the last two fields
    out: list of (title, artist, spotify track id) tuples, at most `limit` long
    '''
    MSD_tracks = []
    with open(path) as f:
        for i,line in enumerate(f):
            # progress: line number and number of tracks collected so far
            if len(MSD_tracks) % 100 == 0:
                print('{} {}'.format(i,len(MSD_tracks)))
            # Stop at exactly `limit` tracks. The previous test,
            # len(MSD_tracks) <= 4000, let a 4001st track in.
            if len(MSD_tracks) >= limit:
                break

            fields = line.replace('\n','').split('<SEP>')
            if len(fields) < 2:
                continue # blank or malformed line: no artist/title to read
            title,artist = fields[-1],fields[-2]

            if (title.lower(),artist.lower()) in title_artist_set:
                continue
            uri = search_uri(title,artist)
            if uri:
                MSD_tracks.append((title,artist,uri))
    return MSD_tracks

# Build the MSD comparison sample once and cache it; later runs load the cache.
if not os.path.isfile('MSD_tracks'):
    MSD_tracks = get_MSD_tracks(title_artist_spotify_set)
    MSD = pd.DataFrame(MSD_tracks,columns=['title','artist','uri'])
    MSD.to_pickle('MSD_tracks')
else:
    MSD = pd.read_pickle('MSD_tracks')
# The audio-feature download needs the ids whichever branch ran. This used to
# be set only when the cache was loaded, so a first run failed later on.
MSD_track_ids = MSD.uri.values

In [21]:
# Show the first two rows of the MSD sample.
MSD.head(2)

Unnamed: 0,title,artist,uri
0,I Didn't Mean To,Casual,01TR6aAKrA2cI3Z0gnCOsu
1,Soul Deep,The Box Tops,4yPl1mK1oluIrCwI4HInPR


In [13]:
# Download (or load from cache) the audio features of the MSD sample.
audio_data = pd.DataFrame()
if not os.path.isfile('MSD_audio_features'):
    # Reuse one token across the chunks, as the other audio-feature cells do.
    # Without passing it in, every chunk requested a new token.
    token = None
    for i,track_ids in enumerate(chunk_100(MSD_track_ids)):
        audio_features,token = multiple_audio_features(track_ids,token)
        audio_data = audio_data.append(audio_features)
        
    # every id must have produced exactly one row
    assert len(audio_data)==len(MSD_track_ids)
    audio_data.to_pickle('MSD_audio_features')

else:
    audio_data = pd.read_pickle('MSD_audio_features')

*****
## Step 3 Get lyrics from lyrics.wikia.com

My original plan was to get the lyrics from Musixmatch, but its free version only provides 30% of the lyrics per song and has a quota of 2000 songs per day. So I looked for other resources, found the website below, and scraped the lyrics from its HTML code.
link to the website:
http://lyrics.wikia.com/

Since this is a long process, I moved it into a separate notebook. Please check '1.5 get lyrics.ipynb' for details.

*****
## Step 4 Look for other songs by same billboard artists

From Step 3 I found that only 46% of the MSD songs have lyrics available. The songs without lyrics turned out to be very unpopular ones, some of them even in foreign languages (details can be found in '1.5 get lyrics.ipynb').

Therefore I decided to replace the MSD songs with a list of other songs by Billboard artists. That way I can achieve two goals:
- 1 By limiting the artists to those who once had a Billboard hit song, I can compare songs with their production companies as a controlled variable, i.e. the influence of the production companies is limited and under control because of the way the data is sourced.
- 2 I can still get 4000 more non-Billboard songs, get their lyrics, and build the prediction model.

In [2]:
# One row (title, artist, spotifyID) per distinct Spotify track among the
# billboard songs; the first occurrence of each id is kept.
top10_spotify = pd.read_pickle('Billboard_data')
top10_spotify = top10_spotify[top10_spotify.spotifyID != '']
unique = (top10_spotify.loc[:,['title','artist','spotifyID']]
          .drop_duplicates(subset='spotifyID',keep='first'))
len(unique) # should be 4264

In [103]:
def chunk_50(series,size=50): # requesting track info needs smaller size of requests
    '''
    Yield consecutive slices of `series` holding at most `size` items each.

    in:  series: anything supporting len() and slicing
         size: maximum chunk length, 50 by default
    out: generator of slices; the last one may be shorter
    '''
    # range() instead of the Python-2-only xrange(): the few chunk offsets
    # are cheap to build either way and the function now runs on Python 3 too.
    for start in range(0,len(series),size):
        yield series[start:start+size]

def get_track_info(chunked_spotifyIDs,token=None):
    '''
    Fetch title, track number and artist/album URIs for a batch of tracks.

    in:  chunked_spotifyIDs: list of track ids (callers in this file pass the
                             chunks produced by chunk_50)
         token: optional token from get_token(), forwarded to get_spotify_json
    out: DataFrame with the columns title, track_uri, track_number, artist
         (URI of the first listed artist) and album_uri, one row per track
         entry that is not null; None when the response has no 'tracks' entry
    raises: TypeError if chunked_spotifyIDs is not a list
    '''
    if not isinstance(chunked_spotifyIDs,list):
        raise TypeError("IDs should be list!")

    url = u'https://api.spotify.com/v1/tracks?ids=' + ','.join(chunked_spotifyIDs)
    rs,token = get_spotify_json(url,token)
    if 'tracks' not in rs:
        return None

    results = []
    for r in rs['tracks']:
        if r is None:
            # Per the Spotify API reference an id that is not found comes back
            # as null; indexing it used to raise TypeError.
            continue
        title,uri,pos,artist,album = r['name'],r['uri'],r['track_number'],r['artists'][0]['uri'],r['album']['uri']
        results.append((title,uri,pos,artist,album))
    return pd.DataFrame(results,columns = ['title','track_uri','track_number','artist','album_uri'])

def get_album_tracks(album_id,token=None):
    '''
    List the tracks of one album (US market).

    in: album_id: Spotify album id, eg. u'7oJa8bPFKVbq4c7NswXHw8'
    out: [track_uri1,...] taken from the response's 'items', or None when the
         response has no 'items' entry
    '''
    url = u'https://api.spotify.com/v1/albums/' + album_id + '/tracks?market=US'
    rs,token = get_spotify_json(url,token)

    if 'items' not in rs:
        return None
    return [item['uri'] for item in rs['items']] #track_uri

def get_same_album_tracks(spotifyIDs,chosen_songs,token=None):
    '''
    For every album pick the first of its tracks that has not been chosen yet.

    in:  spotifyIDs: album IDs; an album listed several times is visited once
                     per occurrence and can contribute one track each time
         chosen_songs: dict of chosen songs keyed by track id, initiated with
                       the billboard songs; it is updated in place
         token: optional token from get_token(), forwarded to get_album_tracks
    out: (list of the newly chosen track ids, chosen_songs)
    '''
    spotifyIDs = list(spotifyIDs) # len() is needed for the progress message
    # Progress is reported against the real number of albums; it used to be
    # hard-coded to 4264.
    total = float(len(spotifyIDs))

    results = [] # stores track ids
    for n,album_id in enumerate(spotifyIDs):
        if n % 100 == 0:
            print('Processing {:{prec}} of total albums'.format(n/total,prec='.2'))
        tracks = get_album_tracks(album_id,token)
        if not tracks:
            # get_album_tracks returns None when the response has no 'items';
            # skip the album instead of failing on len(None).
            continue

        for track_uri in tracks:
            track_id = track_uri.split(':')[-1]
            if track_id not in chosen_songs: # choose a song
                chosen_songs[track_id] = True
                results.append(track_id)
                break
    return results,chosen_songs

In [110]:
# Track details for every distinct billboard track, cached on disk as
# 'Billboard_track_info'.
if os.path.isfile('Billboard_track_info'):
    billboard_track_info = pd.read_pickle('Billboard_track_info')
else:
    # ids are requested 50 at a time
    unique_ids = chunk_50(unique.spotifyID.tolist())
    billboard_track_info = pd.DataFrame()
    for ids in unique_ids:
        billboard_track_info = billboard_track_info.append(get_track_info(ids))

    billboard_track_info.to_pickle('Billboard_track_info')

[u'0FE9t6xYkqWXU2ahLh6D8X', u'1FvU97lrWOG2NRxErh6OZz']


In [106]:
# if not os.path.isfile('same_album_tracks.p'):
    
#     pickle.dump(same_album_tracks, open('same_album_tracks.p', 'wb')) 
# else:
#     same_album_tracks = pickle.load(open('same_album_tracks.p','rb'))
    
# For each billboard track pick one other track from the same album and look
# up its details; cached on disk as 'Same_album_track_info'.
if os.path.isfile('Same_album_track_info'):
    same_track_info = pd.read_pickle('Same_album_track_info')
    # NOTICE: a track id can be listed under one album in the album's track
    # list but under another album in its own track info. E.g. the track
    # 4Km5HrUvYTaSUfiSGPJeQR is contained in album 3tWaxq6QGN4jkrrXHcvNBA, but
    # looking it up on Spotify lists it under album 2AvupjUeMnSffKEV05x222.
    # So title + artist have to be used for filtering.
else:
    unique_ids = unique.spotifyID.tolist()
    # the billboard songs themselves count as already chosen
    chosen_songs = dict.fromkeys(unique_ids,True)
    billboard_album_ids = list(album_uri.split(':')[-1] for album_uri in billboard_track_info['album_uri'])
    # takes about 30 mins
    same_album_tracks,added_chosen_songs = get_same_album_tracks(billboard_album_ids,chosen_songs)

    same_chunked = chunk_50(same_album_tracks)
    same_track_info = pd.DataFrame()
    for ids in same_chunked:
        same_track_info = same_track_info.append(get_track_info(ids))

    same_track_info.to_pickle('Same_album_track_info')

In [107]:
# Show the first rows of the extra same-album tracks.
same_track_info.head()

Unnamed: 0,title,track_uri,track_number,artist,album_uri
0,Bad and Boujee (feat. Lil Uzi Vert),spotify:track:4Km5HrUvYTaSUfiSGPJeQR,4,spotify:artist:6oMuImdp5ZcFhWP0ESe6mG,spotify:album:2AvupjUeMnSffKEV05x222
1,I Don’t Wanna Live Forever (Fifty Shades Darker),spotify:track:2y5aJvzXhHPA94U5GFAcXe,1,spotify:artist:5ZsFI1h6hIdQRw2ti0hz81,spotify:album:5VML6S956h4YfoYPooqLEi
2,24K Magic,spotify:track:6b8Be6ljOzmkOmFslEb23P,1,spotify:artist:0du5cEVh5yTK9QJze8zA0C,spotify:album:4PgleR09JVnm3zY1fW3XBA
3,Consideration,spotify:track:0WgBb7XgdtbUW0GlYel9mH,1,spotify:artist:5pKCCKE2ajJHZ9KAiaK11H,spotify:album:3Q149ZH46Z0f3oDR7vlDYV
4,Bounce Back,spotify:track:0SGkqnVQo9KPytSri1H6cF,3,spotify:artist:0c173mlxpT3dSFRgMO8XPh,spotify:album:0XAIjjN5qxViVS0Y5fYkar


In [111]:
# Show the first rows of the billboard tracks for comparison.
billboard_track_info.head()

Unnamed: 0,title,track_uri,track_number,artist,album_uri
0,Shape of You,spotify:track:0FE9t6xYkqWXU2ahLh6D8X,1,spotify:artist:6eUKZXaKkcviH0Ku9w2n3V,spotify:album:7oJa8bPFKVbq4c7NswXHw8
1,Bad and Boujee (feat. Lil Uzi Vert),spotify:track:1FvU97lrWOG2NRxErh6OZz,1,spotify:artist:6oMuImdp5ZcFhWP0ESe6mG,spotify:album:3tWaxq6QGN4jkrrXHcvNBA
2,I Don’t Wanna Live Forever (Fifty Shades Darker),spotify:track:6yIdwnpDHufLWgQcveTPMk,1,spotify:artist:5ZsFI1h6hIdQRw2ti0hz81,spotify:album:2i54PoQ1ENZSYsZZE8haRr
3,That's What I Like,spotify:track:0KKkJNfGyhkQ5aFogxQAPU,4,spotify:artist:0du5cEVh5yTK9QJze8zA0C,spotify:album:4PgleR09JVnm3zY1fW3XBA
4,Love On The Brain,spotify:track:7HRHMuUTY7Dk4mw7CjS36i,11,spotify:artist:5pKCCKE2ajJHZ9KAiaK11H,spotify:album:2hNdpum500dG6mDXs87nbc


In [149]:
# check duplicate tracks 
# Walk both tables in step. When the artists match, the extra track is the
# one picked for this billboard row, and it is a duplicate if the titles match
# too. When they differ, only the billboard pointer moves on.
i = 0 # billboard
j = 0 # extra
duplicate = []
# Stop when either table is exhausted. Without the bound on j, running out of
# extra tracks before billboard rows raised IndexError.
while i < len(billboard_track_info) and j < len(same_track_info):
    row_i = billboard_track_info.iloc[i,:]
    row_j = same_track_info.iloc[j,:]
    title_i,artist_i = row_i.title,row_i.artist
    title_j,artist_j = row_j.title,row_j.artist
    if artist_i == artist_j:
        if title_i == title_j:
            duplicate.append(j)
        j += 1
    i += 1

# Renumber the rows 0..n-1 so the positions collected above can be dropped.
# The first positional argument of reset_index is `level`, so the list of row
# numbers that used to be passed there was not a valid argument.
# NOTE(review): drop=False is kept, so the old index is added as a column.
same_track_info = same_track_info.reset_index()
same_track_info.drop(same_track_info.index[duplicate],inplace=True)

In [154]:
# Download (or load from cache) the audio features of the extra same-album
# tracks that are left after removing duplicates.
if os.path.isfile('Same_album_track_audio_features'):
    audio_data = pd.read_pickle('Same_album_track_audio_features')
else:
    audio_data = pd.DataFrame()
    # bare track ids: the last part of 'spotify:track:<id>'
    distinct_trackids = list(track_uri.split(':')[-1] for track_uri in same_track_info.track_uri)
    token = get_token()
    for i,track_ids in enumerate(chunk_100(distinct_trackids)):
        audio_features,token = multiple_audio_features(track_ids,token)
        audio_data = audio_data.append(audio_features)

    # every id must have produced exactly one row
    assert len(audio_data)==len(distinct_trackids)
    audio_data.to_pickle('Same_album_track_audio_features')

In [155]:
# Show the first two rows of the audio features.
audio_data.head(2)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,uri,valence
0,0.034,0.818,225983,0.803,0.0,1,0.153,-4.282,1,0.0797,106.97,4,spotify:track:6b8Be6ljOzmkOmFslEb23P,0.618
0,0.0362,0.886,161067,0.58,2e-06,7,0.0882,-2.144,0,0.0513,144.988,4,spotify:track:0WgBb7XgdtbUW0GlYel9mH,0.806
