In [2]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as soup
from datetime import date, timedelta
import csv
import time
import os

In [3]:
def get_hot_100_for_week(ymdtime, timeout=30):
    '''
    Scrape the Billboard Hot 100 chart page for a single week.

    ymdtime : string (eg: '2021-03-06'), appended to the chart URL. Weeks are
              marked by their Saturday; pass a different date to get another
              week's data.
    timeout : number of seconds to wait on the HTTP request before giving up
              (default 30). Without it a stalled connection would block forever.

    return  : list of rows, one per chart entry found on the page, in page
              order. Each row is
              [rank, artist, song, last_week, peak_position, weeks_on_chart, ymdtime]
              where rank is an int counted from 1 and the scraped fields are
              the raw text of their <span> elements. The list is empty when no
              chart entries match.

    raises  : whatever urlopen raises on a failed request, and AttributeError
              if a chart entry lacks one of the expected spans (find() returns
              None in that case).
    '''
    url = 'https://www.billboard.com/charts/hot-100/' + ymdtime

    # The request is sent with a browser-like User-Agent header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
    req = Request(url=url, headers=headers)

    # The context manager closes the connection even when read() fails, so
    # repeated calls do not pile up open sockets.
    with urlopen(req, timeout=timeout) as response:
        page_html = response.read()

    # HTML parsing
    page_soup = soup(page_html, "html.parser")

    # One <li> per chart entry
    containers = page_soup.find_all(
        'li', {'class': 'chart-list__element display--flex'})

    # CSS classes of the spans to read, in the order they appear in a row:
    # artist, song, position last week, peak position, weeks on chart.
    span_classes = (
        'chart-element__information__artist',
        'chart-element__information__song',
        'chart-element__meta text--center color--secondary text--last',
        'chart-element__meta text--center color--secondary text--peak',
        'chart-element__meta text--center color--secondary text--week',
    )

    tbr_data = []
    # Rank is the 1-based position of the entry on the page.
    for rank, container in enumerate(containers, start=1):
        values = [container.find('span', {'class': css_class}).text
                  for css_class in span_classes]
        tbr_data.append([rank, *values, ymdtime])

    return tbr_data

In [4]:
# Column names written as the header row of every per-year CSV file.
fields=['rank', 'artist', 'song', 'rank_last_week', 'peak_position', 'weeks_on_chart', 'date']

def writeToFile(l, yr, out_dir='../data/raw/charts', delay=5):
    '''
    Append chart rows to the CSV file for one year, then pause.

    l       : iterable of rows (each a sequence of values in the order given
              by `fields`) to append.
    yr      : year used in the file name, bb_hot_100_<yr>.csv (passed through str()).
    out_dir : directory holding the CSV files; created if it does not exist.
    delay   : seconds to sleep after writing (default 5). The caller's scrape
              loop relies on this pause between consecutive requests.

    The header row (`fields`) is written only when the file does not exist yet.
    '''
    path = os.path.join(out_dir, 'bb_hot_100_' + str(yr) + '.csv')
    # An empty out_dir means the current directory; makedirs('') would raise.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Decide before opening: opening in append mode creates the file.
    write_header = not os.path.isfile(path)

    # newline='' lets the csv module control line endings (otherwise Windows
    # gets an extra blank line per row); utf-8 keeps non-ASCII artist and
    # song names independent of the platform's locale encoding.
    with open(path, 'a', newline='', encoding='utf-8') as myfile:
        writer = csv.writer(myfile)
        if write_header:
            writer.writerow(fields)
        writer.writerows(l)
    time.sleep(delay)

In [None]:
# Scrape one Hot 100 chart per week and append it to that year's CSV file.
# Start on 1960-01-02, the first Saturday of 1960 (the original note says
# Billboard's data goes back to 1958, but the scrape begins in 1960).
start_date = date(1960, 1, 2)
# Last day to consider, inclusive: the end of 2020.
end_date = date(2020, 12, 31)
# Consecutive charts are seven days apart.
td = timedelta(days=7)
while not start_date > end_date:
    # ISO form 'YYYY-MM-DD' is what the chart URL takes.
    week_day = start_date.isoformat()
    bbHot100 = get_hot_100_for_week(week_day)
    # The first four characters are the year, which selects the output file.
    writeToFile(bbHot100, week_day[:4])
    start_date = start_date + td