In [27]:
import ast
import csv
import datetime
import json
import os
import re
import time
from datetime import timedelta

import pandas as pd
import requests
from pyquery import PyQuery as pq

#This notebook fetches the title, artist, initial rank, and rank variation of the songs that first appeared on the Billboard weekly ranking between 2006-01-03 and 2015-12-26
#Author: Bochao(James) Zhan
#Date: 2019-9-12


#rankinfo object->[rank, date]
class RankInfo:
    """One chart position of a song: the rank plus the chart date as text."""

    def __init__(self, rank, date):
        # The date is kept as a 'YYYY-MM-DD' string rather than a date object.
        self.date = format(date, "%Y-%m-%d")
        self.rank = rank

#song object->[title, artist, rankinfo list]
class Song:
    """A charting song: its title, its artist string and its rank history."""

    def __init__(self, title, artist, initialRankInfo):
        self.title = title
        self.artist = artist
        # The history starts with the record of the first observed week.
        self.rankInfos = [initialRankInfo]

    def addRankInfo(self, rankInfo):
        """Append one more rank record to the end of the history."""
        self.rankInfos += [rankInfo]

#Intended to mark the songs listed on the end date (the crawl keeps going past the end date until their rank history is complete).
#Not referenced by the code visible in this notebook.
SongListedInEndDate = {}

#Complete song info collection: dictionary key -> Song
SongInfoSet = {}

#Longest rank history (in chart weeks) recorded for any song so far
maxDuration = 1

#First and last chart dates of the collection window
StartDate = datetime.date(year=2006, month=1, day=7)
EndDate = datetime.date(year=2015, month=12, day=26)

#Shorter window for test runs
TestEndDate = datetime.date(year=2006, month=12, day=23)
TestLogicalEndDate = datetime.date(year=2007, month=5, day=7)

#Charts keep being fetched until 2016-12-31 so that songs which entered the board late in 2015 still get their complete rank variation record
LogicalEndDate = datetime.date(year=2016, month=12, day=31)

#Crawlera api attributes (source: https://support.scrapinghub.com/support/solutions/articles/22000203567-using-crawlera-with-python)
baseUrl = "https://www.billboard.com/charts/hot-100/"
proxyHost = "proxy.crawlera.com"
proxyPort = "8010"

#SECURITY: the API key is a credential and must not be stored in the notebook.
#Export it as the CRAWLERA_API_KEY environment variable before starting the kernel.
#The key that used to be hard-coded here has been published with the notebook and should be revoked.
proxyAuth = os.environ.get("CRAWLERA_API_KEY", "") + ":" # the trailing ':' is required

proxies = {"https": "https://{}@{}:{}/".format(proxyAuth, proxyHost, proxyPort),
          "http": "http://{}@{}:{}/".format(proxyAuth, proxyHost, proxyPort)}


#get html doc of each weekly ranking record
def getHTMLDoc(currentDate, silence = False, timeout = None):
    """Download one weekly chart page and return it as a PyQuery document.

    Parameters:
        currentDate: chart date; formatted as YYYY-MM-DD and appended to baseUrl.
        silence: when false, a summary of the request and response is printed.
        timeout: forwarded to requests.get; None (the default) waits without
            a limit, which is what the function did before this parameter existed.

    Returns:
        pq(r.text), i.e. the response body parsed by PyQuery. The HTTP status
        code is not checked here.
    """
    url = baseUrl + currentDate.strftime("%Y-%m-%d")
    # verify=False switches TLS certificate verification off for this request.
    # NOTE(review): presumably needed because the traffic goes through the proxy - confirm.
    r = requests.get(url, proxies = proxies, verify=False, timeout = timeout)
    if not silence:
        # The template has six placeholders; the response body used to be passed
        # as a seventh argument and was silently dropped by str.format.
        print("""
            Requesting [{}]
            through proxy [{}]
            Request Headers:{}
            Response Time: {}
            Response Code: {}
            Response Headers:{}
        """.format(url, proxyHost, r.request.headers, r.elapsed.total_seconds(), r.status_code, r.headers))
    return pq(r.text)

#convert html items into string list
def toList(target):
    """Collect the text of every element yielded by target.items(), in iteration order."""
    texts = []
    for node in target.items():
        texts.append(node.text())
    return texts

#check parsed info
def checkStatus(a,b,c,d,date):
    """Report whether a parsed weekly chart is complete.

    a, b, c and d are the title, artist, rank and duration lists of one chart.
    Returns True when each of them holds exactly 100 entries; otherwise prints
    the four lengths and returns False.
    """
    counts = (len(a), len(b), len(c), len(d))

    if all(count == 100 for count in counts):
        print()
        print("data on weekly ranking of",date,"is successfully proceeded.")
        print()
        return True

    print("***missing data on weekly ranking of",date,"***")
    for label, count in zip(("titles:", "artists:", "ranks:", "durations:"), counts):
        print(label,count)
    print("*************************************************")
    return False

#add song info into SongInfoSet
def _normalizeArtist(artist):
    """Replace the ' Featuring ' and ' & ' separators of an artist string with commas."""
    return artist.replace(" Featuring ", ",").replace(" & ", ",")


def _matchSong(title, duration):
    """Find the stored song that a chart row with this title and duration continues.

    Different songs sharing a title are stored under the keys title, title_1,
    title_2, ... The row belongs to the first of those entries whose history
    holds exactly duration - 1 records.

    Returns (song, key). song is None when no stored entry matches; key is then
    the first unused key of the sequence (the plain title when it is unknown).
    """
    key = title
    suffix = 0
    while key in SongInfoSet:
        candidate = SongInfoSet[key]
        if len(candidate.rankInfos) == duration - 1:
            return candidate, key
        suffix += 1
        key = title + "_" + str(suffix)
    return None, key


def _recordWeek(titles, artists, ranks, durations, currentDate, allowNewSongs):
    """Merge one weekly chart into SongInfoSet; shared body of appendData and addRecords.

    Rows are taken pairwise from the four lists (callers pass 100 of each).
    A row that continues a stored song gets a new RankInfo appended and may
    raise maxDuration. A row that matches nothing is stored as a new Song only
    when allowNewSongs is true and its duration is at most 1; otherwise it is
    ignored, because its earlier weeks were never recorded.
    """
    global maxDuration
    print(currentDate)
    for title, artist, rank, duration in zip(titles, artists, ranks, durations):
        rank = int(rank)
        duration = int(duration)

        songInfo, key = _matchSong(title, duration)
        if songInfo is not None:
            songInfo.addRankInfo(RankInfo(rank, currentDate))
            if maxDuration < len(songInfo.rankInfos):
                maxDuration = len(songInfo.rankInfos)
        elif allowNewSongs and duration <= 1:
            #the Song keeps the original title even when it is stored under a renamed key
            SongInfoSet[key] = Song(title, _normalizeArtist(artist), RankInfo(rank, currentDate))


def appendData(titles, artists, ranks, durations, currentDate):
    """Add one weekly chart to SongInfoSet, creating entries for songs new on the board."""
    _recordWeek(titles, artists, ranks, durations, currentDate, True)


#only add rank record into SongInfoSet
def addRecords(titles, artists, ranks, durations, currentDate):
    """Add one weekly chart to SongInfoSet for already known songs only; new songs are ignored."""
    _recordWeek(titles, artists, ranks, durations, currentDate, False)

                
def dataFrameCreation(SongInfoSet):
    """Flatten the collected songs into one wide DataFrame, one row per song.

    Columns are title, artist and then date1, rank1, date2, rank2, ... up to
    the longest rank history found in SongInfoSet (at least one date/rank pair,
    even for an empty collection). Songs with a shorter history leave the
    remaining cells missing.

    Parameters:
        SongInfoSet: mapping whose values expose title, artist and rankInfos
            (a sequence of objects with date and rank attributes).

    Returns:
        pandas.DataFrame with object-typed columns.
    """
    songs = list(SongInfoSet.values())

    # The column count comes from the data itself, so the result does not
    # depend on the module-level maxDuration counter being in sync.
    weeks = max([len(song.rankInfos) for song in songs] + [1])

    attributesName = ['title', 'artist']
    for week in range(1, weeks + 1):
        attributesName.append('date'+str(week))
        attributesName.append('rank'+str(week))

    # Build all rows first and create the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas 2.x.
    records = []
    for song in songs:
        record = {'title': song.title, 'artist': song.artist}
        for week, rankInfo in enumerate(song.rankInfos, start=1):
            record['date'+str(week)] = rankInfo.date
            record['rank'+str(week)] = rankInfo.rank
        records.append(record)

    # dtype=object keeps ranks as the stored integers even in columns with gaps.
    return pd.DataFrame(records, columns=attributesName, dtype=object)
def splitArtist(artist):
    """Build the tunebat search keyword for an artist string.

    The result is the full artist string followed by a space and the first
    artist only (the text before the first separator such as ' Featuring ',
    ' & ' or ' + '); in both parts every space is replaced by '+'.
    """
    splitSymArtist=[' Featuring ',', ',' & ',' / ',' x ',' + ',' Duet With ',' X ',' With ']
    # The separators are literal text, so they must be escaped before being
    # joined into a regex: unescaped, ' + ' means "two or more spaces" and
    # never matches a real plus sign.
    separatorPattern='|'.join(re.escape(symbol) for symbol in splitSymArtist)
    leadArtist=re.split(separatorPattern,artist)[0]
    return artist.replace(' ','+')+' '+leadArtist.replace(' ','+')
def getTunebat(title,artist,filepath='tunebat.csv',maxRetries=None):
    """Look a song up on tunebat.com and append its audio features to a CSV file.

    The search page is requested through the proxy until it answers with
    HTTP 200. The first track of the search result is written as one row to
    filepath (a header row is written first when the file is empty or new).
    When the search returns no track, 'title,artist' is appended to the file
    'not_found_songs' instead.

    Parameters:
        title: song title, copied into the output row.
        artist: artist string; the search keyword is splitArtist(artist).
        filepath: CSV file the row is appended to.
        maxRetries: maximum number of requests before giving up; None (the
            default) keeps retrying without a limit, as before.

    Raises:
        RuntimeError: maxRetries requests were made without an HTTP 200 answer.
    """
    featureNames=['Key','Camelot','Duration','BPM','Popularity','ReleaseDate','AlbumName','IsExplicit','Label','Energy','Danceability','Happiness','Loudness','Acousticness','Instrumentalness','Liveness','Speechiness']
    tunebat_url='https://tunebat.com/'
    # NOTE(review): the keyword is built from the artist only, so the first hit
    # is not necessarily the requested title - confirm whether the title was
    # meant to be part of the query.
    keyword=splitArtist(artist)

    attempts=0
    while True:
        r = requests.get(tunebat_url+'Search?q='+keyword, proxies=proxies,verify=False)
        attempts+=1
        if r.status_code==200:
            break
        if maxRetries is not None and attempts>=maxRetries:
            raise RuntimeError('tunebat search for {!r} returned HTTP {} after {} attempts'.format(keyword,r.status_code,attempts))
        time.sleep(1)#short pause so a failing proxy is not hit in a tight loop

    info_html=pq(r.text)
    info=info_html('script[src="/js/ReactUIComponents/SearchResults.jsx?v=1.10"]').next().text()#get data from script
    #strip the JavaScript call around the payload so that only the object literal is left
    info=re.sub(re.escape('ReactDOM.hydrate(React.createElement(SearchResultContainer, '),'',info)
    info=re.sub(r'\), document\.getElementById\("react_.*"\)\);','',info)
    #SECURITY: this text comes from a remote page. It used to be passed to eval(),
    #which would run any code the page contained; literal_eval only accepts literals.
    info=ast.literal_eval(info)
    trackItems=json.loads(info['data'])['TrackItems']

    if trackItems:
        info=trackItems[0]# get first search result
        row=[title,artist]+[info[name] for name in featureNames]
        #csv.writer quotes fields that contain commas (e.g. "Lean Wit It, Rock Wit It")
        with open(filepath,'a',encoding='utf-8',newline='') as file:
            writer=csv.writer(file,lineterminator='\n')
            if file.tell()==0:#empty or newly created file: start with the header
                writer.writerow(['Title','Artist']+featureNames)
            writer.writerow(row)
    else:
        with open('not_found_songs','a',encoding='utf-8',newline='') as f:#put songs not found into another file
            csv.writer(f,lineterminator='\n').writerow([title,artist])
    print('success')

In [21]:
#main logic
currentDate = StartDate
print('start')

#The crawl runs in two phases over consecutive weekly charts:
#  1. up to EndDate, appendData records every chart and registers songs that are new on the board;
#  2. up to LogicalEndDate, addRecords only extends the songs already registered, so that
#     songs which entered the board late in 2015 get their remaining rank records.
#currentDate carries over from the first phase into the second.
for phaseEndDate, processWeek in ((EndDate, appendData), (LogicalEndDate, addRecords)):
    while currentDate <= phaseEndDate:
        rank_doc = getHTMLDoc(currentDate, True)
        SongInfo =rank_doc('div[class="chart-details "]')

        #extract song info from html document
        titles = toList(SongInfo.find('span[class="chart-list-item__title-text"]'))
        ranks = toList(SongInfo.find('div[class="chart-list-item__rank "]'))
        artists = toList(SongInfo.find('div[class="chart-list-item__artist"]'))
        durations = toList(SongInfo.find('div[class="chart-list-item__weeks-on-chart"]'))

        #the rank of the last song is wrapped in a different element, so the last rank is added manually
        ranks.append(100)

        #check status and skip data processing if any item is missing
        if checkStatus(titles,artists,ranks,durations,currentDate):
            #process data
            processWeek(titles,artists,ranks,durations,currentDate)

        #get the release date of next week
        currentDate += timedelta(days=7)

print("complete")

start


ProxyError: HTTPSConnectionPool(host='www.billboard.com', port=443): Max retries exceeded with url: /charts/hot-100/2006-01-07 (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response')))

In [9]:
#turn the songs recorded in SongInfoSet into one table, show it, and save a copy as CSV
df = dataFrameCreation(SongInfoSet)
print(df)
df.to_csv(path_or_buf="song_rank_info.csv", encoding="UTF8")

                                     title  \
0             Who I Am Hates Who I've Been   
1                                     Dare   
2                                  My Hood   
3                         The Ghost Of You   
4                              Temperature   
5                 Lean Wit It, Rock Wit It   
6                            Ever The Same   
7                Get Drunk And Be Somebody   
8                                Walk Away   
9                       We Belong Together   
10                             Upside Down   
11                                Cheatin'   
12                  Living In Fast Forward   
13              Who Says You Can't Go Home   
14                  Oh Yes (aka 'Postman')   
15                           Nobody But Me   
16                       Lights And Sounds   
17                                    Love   
18                           Breaking Free   
19                           Ms. New Booty   
20                                

In [28]:
#read the exported ranking table back and look every song up on tunebat
#the file was written as UTF-8, so it is read as UTF-8; the with-block closes it again
with open('song_rank_info.csv',encoding='utf-8',newline='') as rankFile:
    dataLists=list(csv.DictReader(rankFile))
for data in dataLists:
    getTunebat(data['title'],data['artist'])

ProxyError: HTTPSConnectionPool(host='tunebat.com', port=443): Max retries exceeded with url: /Search?q=Relient+K%20Relient+K (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response')))