In [61]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import httpx
import asyncio
import datetime as dt
import time
import pylint
import lxml
import fastparquet

In [47]:
def date_generator(start_dt,end_dt,num_charts):
    """Yield chart dates one week apart, walking backwards from ``end_dt``.

    Args:
        start_dt: Exclusive lower bound; only consulted when ``num_charts``
            is ``None``.
        end_dt: First date yielded.
        num_charts: Number of dates to yield, or ``None`` to keep going for
            as long as the current date is strictly later than ``start_dt``.

    Yields:
        ``end_dt``, then ``end_dt`` minus one week, minus two weeks, ...
    """
    one_week = dt.timedelta(weeks=1)
    cursor = end_dt

    if num_charts is None:
        # Open-ended mode: stop once the cursor reaches or passes start_dt,
        # so start_dt itself is never produced.
        while cursor > start_dt:
            yield cursor
            cursor = cursor - one_week
        return

    # Fixed-count mode: start_dt is ignored entirely.
    for _ in range(num_charts):
        yield cursor
        cursor = cursor - one_week

In [48]:
async def fetch_url(date,client):
    """Download the chart page for ``date`` and return it paired with the date.

    Args:
        date: Object with ``strftime`` (a ``datetime``/``date``); formatted as
            ``YYYY-MM-DD/`` to build the request path.
        client: Object with an awaitable ``get(path)`` method, e.g. an
            ``httpx.AsyncClient`` whose ``base_url`` is the chart root.

    Returns:
        Tuple ``(date, response)``.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an unsuccessful
        status (``httpx.HTTPStatusError`` when ``client`` is an httpx client).
    """
    r = await client.get(date.strftime('%Y-%m-%d')+"/")
    # Fail here, with the status in the error, instead of letting an error
    # page reach the HTML parser and blow up with an unrelated AttributeError.
    r.raise_for_status()
    return date, r

In [49]:
def scrape(soup):
    """Extract one chart row from its container element.

    The first six usable strings under ``soup`` are read, in order, as
    position, song, artist, last week's position, peak position and weeks on
    chart. The "new"/"re-entry" badge strings are skipped, and a ``'-'``
    placeholder becomes NaN. The row is flagged as having an award when the
    container holds a ``<path fill="currentColor">`` element.

    Args:
        soup: Element exposing ``stripped_strings`` and ``find``.

    Returns:
        Tuple ``(position, record)`` where ``position`` is a float and
        ``record`` is a dict with keys ``song``, ``artist``, ``award``,
        ``last_week``, ``peak_pos`` and ``wks_on_chart``.
    """
    placeholders = ('-', None)
    badges = ('RE-\nENTRY', 'NEW')
    max_fields = 6

    fields = []
    for text in soup.stripped_strings:
        if len(fields) >= max_fields:
            break
        if text in placeholders:
            fields.append(np.nan)
        elif text not in badges:
            fields.append(text)

    # The award flag rides along as the seventh field.
    has_award = soup.find('path',fill='currentColor') is not None
    fields.append(has_award)

    position = float(fields[0])
    record = {
        'song': fields[1],
        'artist': fields[2],
        'award': fields[6],
        'last_week': float(fields[3]),
        'peak_pos': float(fields[4]),
        'wks_on_chart': float(fields[5]),
    }
    return position, record

In [50]:
def get_chart(date, soup):
    """Parse every row of one chart page.

    Args:
        date: Chart date; paired with each row's position to form index keys.
        soup: Parsed page. Rows are the
            ``div.o-chart-results-list-row-container`` elements inside the
            ``div.pmc-paywall`` element under ``<main>``.

    Returns:
        Tuple ``(multi_idx, rows)``: a list of ``(date, position)`` tuples and
        the parallel list of row dicts produced by ``scrape``.
    """
    paywall = soup.main.find('div',class_='pmc-paywall')
    containers = paywall.find_all('div',class_='o-chart-results-list-row-container')

    scraped = [scrape(container) for container in containers]
    multi_idx = [(date, pos) for pos, _ in scraped]
    rows = [row for _, row in scraped]
    return multi_idx, rows

In [51]:
def validate_args(start_dt,end_dt,num_charts):
    """Check the date-range arguments and snap both dates onto Saturdays.

    ``start_dt`` is moved back to the Saturday on or before it, and ``end_dt``
    is moved forward to the Saturday on or after it (``weekday() == 5``).
    ``None`` values pass through untouched. The "not in the future" check is
    applied to ``end_dt`` as given, before snapping, so the returned ``end_dt``
    may be up to six days ahead of today.

    Args:
        start_dt: Range start as a ``datetime``, or ``None``.
        end_dt: Range end as a ``datetime``, or ``None``.
        num_charts: Number of charts requested, or ``None``.

    Returns:
        Tuple ``(start_dt, end_dt, num_charts)`` with the dates adjusted.

    Raises:
        ValueError: If both ``start_dt`` and ``num_charts`` are ``None``, or
            if ``end_dt`` is later than the current time.
    """
    if start_dt is None and num_charts is None:
        raise ValueError("Must provide either a start date or the number of charts")

    if end_dt is not None and end_dt > dt.datetime.today():
        raise ValueError(f"Date range cannot extend beyond {dt.date.today()}")

    saturday = 5

    if start_dt is not None:
        # Days elapsed since the most recent Saturday (0 if already Saturday).
        start_dt -= dt.timedelta(days=(start_dt.weekday() - saturday) % 7)

    if end_dt is not None:
        # Days until the next Saturday (0 if already Saturday). This must move
        # end_dt: advancing start_dt here would never satisfy the condition.
        end_dt += dt.timedelta(days=(saturday - end_dt.weekday()) % 7)

    return start_dt,end_dt,num_charts
    

In [57]:
async def compose(start_dt,end_dt, num_charts):
    """Download and parse a range of weekly charts into one DataFrame.

    The arguments are validated, one request per chart date is issued
    concurrently, every page is parsed, and the rows are assembled into a
    frame indexed by ``(date, position)``.

    Args:
        start_dt: Range start (``datetime``) or ``None``.
        end_dt: Range end (``datetime``).
        num_charts: Number of charts to fetch, or ``None`` to use ``start_dt``.

    Returns:
        DataFrame with a ``("date", "position")`` MultiIndex, sorted by date
        descending then position ascending. If the range produces no rows, an
        empty, column-less frame with that index is returned.

    Raises:
        Whatever ``validate_args`` raises for bad arguments; an exception
        group from the task group if any download fails.
    """
    args = validate_args(start_dt,end_dt,num_charts)

    async with httpx.AsyncClient(base_url="https://www.billboard.com/charts/hot-100/",
                                 timeout=10.0) as client:
        async with asyncio.TaskGroup() as tg:
            tasks = [tg.create_task(fetch_url(date,client))
                     for date in date_generator(*args)]

    # Leaving the TaskGroup block means every task has already finished
    # successfully, so results can be read directly. Arrival order does not
    # matter because the frame is sorted below.
    idx = []
    data = []
    for task in tasks:
        date, r = task.result()
        chart_idx, chart_data = get_chart(
            date, BeautifulSoup(r.text,"lxml"))
        idx.extend(chart_idx)
        data.extend(chart_data)

    names = ["date","position"]
    if not idx:
        # MultiIndex.from_tuples cannot infer the level count from an empty list.
        return pd.DataFrame(index=pd.MultiIndex.from_arrays([[], []],names=names))

    multi_idx = pd.MultiIndex.from_tuples(idx,names=names)
    return pd.DataFrame(data,index=multi_idx).sort_values(by=names,ascending=[False,True])
            

In [58]:
# Positional arguments for compose(): start date, end date, num_charts.
args = [
    *(dt.datetime.strptime(text,"%m-%d-%Y") for text in ("10-29-1995","01-01-2000")),
    None,
]

In [59]:
# Download and parse every chart in the range described by `args`.
# Top-level `await` is only valid inside an IPython/Jupyter cell.
df5 = await compose(*args)

In [65]:
# Hand pandas the path itself rather than an open handle: the fastparquet
# engine writes by path and does not accept file buffers.
df5.to_parquet("charts1995-2000.parquet",engine="fastparquet")
