In [2]:
from datetime import datetime,timedelta

### Small Demo Snippet of getting the correct weekly billboard date

Billboard's Top Album Sales charts are weekly: each chart week starts on a Sunday and ends on a Saturday, and the chart is always labelled with that Saturday's date. For example, the chart for the week of 2019-09-28 is dated on a Saturday. The URL for each week's top 100 albums uses the same Saturday date.

In [3]:
# Most recent chart week to generate, as YYYY-MM-DD (2019-10-05 is a Saturday).
start_date = '2019-10-05'

In [4]:
end_date = '2019-09-19' ## Cutoff date (a Thursday); the last chart week generated should be the week of 2019-09-21

In [10]:
# Parse the YYYY-MM-DD boundary strings into datetime objects.
start_dt = datetime.strptime(start_date, '%Y-%m-%d')
end_dt = datetime.strptime(end_date, '%Y-%m-%d')

chart_week_dates = []
# Chart weeks are labelled by the Saturday that ends them, so move the start
# forward to that Saturday (no change when start_date already is one).
# datetime.weekday() numbers Monday as 0, which makes Saturday 5.
current_dt = start_dt + timedelta(days=(5 - start_dt.weekday()) % 7)
# Step back one week at a time, newest first. The comparison is inclusive so
# that an end_date falling exactly on a chart Saturday keeps that week.
while current_dt >= end_dt:
    chart_week_dates.append(str(current_dt.date()))
    current_dt = current_dt - timedelta(days=7)
    # the loop exits as soon as the current date is earlier than the end date


In [11]:
# Show the generated dates: expect three Saturdays, newest first, ending with 2019-09-21.
print(chart_week_dates)

['2019-10-05', '2019-09-28', '2019-09-21']


### Now generate the correct list of dates for our dataset
The end date is the publication date of the first Pitchfork review, '1999-01-05', which can be verified with `reviews.pub_date.min()`.

In [15]:
def build_chart_week_dates(start_date, end_date):
    """Return Billboard chart-week dates between two dates, newest first.

    Chart weeks are labelled by the Saturday that ends them. The start date is
    therefore moved forward to the Saturday of its week (unchanged if it
    already is a Saturday), and the result then steps back seven days at a
    time for as long as the Saturday is on or after the end date.

    Args:
        start_date: Most recent date to cover, as a 'YYYY-MM-DD' string.
        end_date: Earliest date to cover, as a 'YYYY-MM-DD' string.

    Returns:
        A list of 'YYYY-MM-DD' strings, one per chart week, newest first.
        Empty when the aligned start falls before end_date.

    Raises:
        ValueError: If either string does not match the 'YYYY-MM-DD' format.
    """
    first_dt = datetime.strptime(start_date, '%Y-%m-%d')
    last_dt = datetime.strptime(end_date, '%Y-%m-%d')
    # datetime.weekday() numbers Monday as 0, which makes Saturday 5.
    week_dt = first_dt + timedelta(days=(5 - first_dt.weekday()) % 7)
    week_dates = []
    # Inclusive comparison: an end date that is itself a chart Saturday
    # still belongs to the week it closes.
    while week_dt >= last_dt:
        week_dates.append(str(week_dt.date()))
        week_dt -= timedelta(days=7)
    return week_dates


start_date = '2019-10-05' # Week before final's week of module 1
end_date = '1999-01-05' # Publication date of the first Pitchfork review

start_dt = datetime.strptime(start_date, '%Y-%m-%d')
end_dt = datetime.strptime(end_date, '%Y-%m-%d')

chart_week_dates = build_chart_week_dates(start_date, end_date)

### Now time to fetch the billboard charts data
We will store everything in a dictionary first, with each {key: value} pair being {(artist, album_name): peak_position}. The key is a tuple of (artist, album_name) and the value is the album's peak position in the charts. When updating the dictionary, we check whether the currently stored peak position is worse (a larger number) than this week's peak position and, if so, replace it.

In [113]:
import billboard
import pandas as pd
from collections import defaultdict
import random
import time

In [75]:
# One chart identifier per week: the chart slug followed by the week's date.
api_urls = ['/'.join(('top-album-sales', week)) for week in chart_week_dates]

This takes a while to run because it scrapes the website; we also pause for a short random interval between requests to avoid HTTP 429 (Too Many Requests) errors.

In [None]:
# Download one chart per entry in api_urls and keep them all in `charts`.
charts = []
for url in api_urls:
    chart = billboard.ChartData(url)
    charts.append(chart)
    # Pause a random 0.5-5.0 s (in 0.5 s steps) before the next request. The
    # range starts at 1 rather than 0 so that no iteration skips the pause,
    # which would send back-to-back requests and risk a 429 response.
    sleep_sec = random.randrange(1, 11) * 0.5
    time.sleep(sleep_sec)

In [31]:
# Best (numerically smallest) peak position seen for each (artist, album) pair.
# Unseen albums default to 666, which is worse than any real position on a
# 100-place chart, so the first observed peak always replaces it.
album_ranks = defaultdict(lambda: 666)

# Walk every downloaded weekly chart, not only the last one that was fetched.
for weekly_chart in charts:
    for entry in weekly_chart:
        artist_name = entry.artist
        album_name = entry.title
        peak_chart_ranking = entry.peakPos
        key = (artist_name, album_name)
        # Keep whichever is better: the stored position or this week's peak.
        album_ranks[key] = min(album_ranks[key], peak_chart_ranking)

In [48]:
# Start from an empty frame that only declares the three output columns.
billboard_chart_ranks = pd.DataFrame(
    data=None,
    columns=['artist', 'title', 'peak_chart_ranking'],
)

In [55]:
import numpy as np

In [68]:
# Flatten the {(artist, album): peak} mapping into one record per album, with
# the names lower-cased and stripped of surrounding whitespace.
rank_records = [
    (artist_name.lower().strip(), album_name.lower().strip(), peak_chart_ranking)
    for (artist_name, album_name), peak_chart_ranking in album_ranks.items()
]
# Build the frame in a single call: enlarging a frame one row at a time with
# .loc copies it on every insert, which gets slow for a large mapping.
billboard_chart_ranks = pd.DataFrame(
    rank_records, columns=['artist', 'title', 'peak_chart_ranking']
)

In [71]:
# Persist the rankings to chart_rankings.csv (path is relative to the working directory).
# With to_csv's default settings the row index is written too, as an unnamed first column.
billboard_chart_ranks.to_csv('chart_rankings.csv')