In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import httpx
import asyncio
import datetime as dt
import time
import pylint
import lxml
import fastparquet

In [49]:
# Earliest chart date to request; passed below as the start bound of the scrape.
# NOTE(review): presumably the date of the first published Hot 100 chart - confirm.
OLDEST = dt.datetime(1958, 8, 4)

In [3]:
def date_generator(start_dt,end_dt,num_charts):
    """Yield chart dates one week apart, walking backwards from ``end_dt``.

    Args:
        start_dt: Exclusive lower bound; only consulted when ``num_charts``
            is ``None``.
        end_dt: First (most recent) date to yield.
        num_charts: Maximum number of dates to yield, or ``None`` to keep
            going until the date is no longer later than ``start_dt``.

    Yields:
        ``end_dt``, ``end_dt`` minus one week, and so on.

    Returns:
        Generation stops early as soon as the current date falls in a year
        divisible by 10; that date is not yielded but becomes the
        generator's return value (``StopIteration.value``).
    """
    one_week = dt.timedelta(weeks=1)
    cursor = end_dt

    if num_charts is None:
        # Unbounded mode: limited only by start_dt and the decade cut-off.
        while cursor > start_dt:
            if not cursor.year % 10:
                return cursor
            yield cursor
            cursor = cursor - one_week
        return

    # Counted mode: at most num_charts dates; start_dt is not consulted.
    for _ in range(num_charts):
        if not cursor.year % 10:
            return cursor
        yield cursor
        cursor = cursor - one_week

In [None]:
async def fetch_url(date,client):
    """Request the chart page for ``date`` and return it with its date.

    Args:
        date: Object with ``strftime``; formatted as ``YYYY-MM-DD`` and used,
            with a trailing slash, as the request path handed to ``client``.
        client: Object exposing an awaitable ``get(url)`` (an
            ``httpx.AsyncClient`` in this notebook).

    Returns:
        ``(date, response)`` so the caller can tell which chart a response
        belongs to.
    """
    # One buffered GET per chart: an empty streaming loop used to precede
    # this call, which was a syntax error and would have requested every
    # page twice.
    r = await client.get(date.strftime('%Y-%m-%d')+"/")
    return date, r

In [44]:
def scrape(soup):
    """Turn one chart-row container into ``(position, record)``.

    The first six usable text fragments of ``soup.stripped_strings`` are
    taken in order: position, song, artist, last week, peak position and
    weeks on chart. ``'-'`` fragments become ``np.nan``; the ``'NEW'`` and
    ``'RE-\\nENTRY'`` fragments are skipped. A seventh value records
    whether the row contains a ``<path fill="currentColor">`` element.

    Args:
        soup: Object exposing ``stripped_strings`` and ``find`` (a
            BeautifulSoup tag in this notebook).

    Returns:
        ``(float position, dict)`` with keys ``song``, ``artist``,
        ``award``, ``last_week``, ``peak_pos`` and ``wks_on_chart``.

    Raises:
        IndexError: If fewer than six usable fragments are present.
    """
    missing_markers = ('-', None)
    badges = ('RE-\nENTRY', 'NEW')

    # Lazily drop badge text and map "missing" markers to NaN.
    cleaned = (
        np.nan if text in missing_markers else text
        for text in soup.stripped_strings
        if text in missing_markers or text not in badges
    )
    # zip against range(6) caps the row at six fields.
    fields = [value for _, value in zip(range(6), cleaned)]
    fields.append(soup.find('path', fill='currentColor') is not None)

    position = float(fields[0])
    record = {
        'song': fields[1],
        'artist': fields[2],
        'award': fields[6],
    }
    record['last_week'] = float(fields[3])
    record['peak_pos'] = float(fields[4])
    record['wks_on_chart'] = float(fields[5])
    return position, record

In [45]:
def get_chart(date, soup):
    """Collect every chart row found on a parsed page.

    Args:
        date: Value stored as the first element of each index tuple.
        soup: Parsed page; rows are the
            ``div.o-chart-results-list-row-container`` elements inside the
            first ``div.pmc-paywall`` under ``soup.main``.

    Returns:
        ``(multi_idx, rows)``: a list of ``(date, position)`` tuples and the
        parallel list of row dicts produced by ``scrape``.
    """
    paywall = soup.main.find('div', class_='pmc-paywall')
    containers = paywall.find_all(
        'div', class_='o-chart-results-list-row-container')

    scraped = [scrape(container) for container in containers]
    multi_idx = [(date, pos) for pos, _ in scraped]
    rows = [row for _, row in scraped]
    return multi_idx, rows

In [46]:
def round_date(date, scalar=1):
    """Move ``date`` to a Saturday by stepping ``scalar`` days at a time.

    Args:
        date: A ``datetime``/``date``-like object supporting ``weekday()``
            and addition of a ``timedelta``.
        scalar: Signed step in days; ``1`` walks forward, ``-1`` backward.
            Intended to be an integer.

    Returns:
        ``date`` unchanged if it already is a Saturday, otherwise the first
        Saturday reached by repeatedly adding ``scalar`` days.

    Raises:
        ValueError: If ``scalar`` is 0 or a multiple of 7 and ``date`` is
            not a Saturday. Such a step never changes the weekday, so the
            search would never finish.
    """
    saturday = 5  # value of weekday() for Saturday
    if date.weekday() == saturday:
        return date
    if scalar % 7 == 0:
        raise ValueError(
            f"scalar={scalar!r} never changes the weekday; cannot reach a Saturday")

    step = scalar * dt.timedelta(days=1)
    while date.weekday() != saturday:
        date = date + step
    return date

In [None]:
def check_page_d(soup):
    """Read the date printed on a chart page.

    The text of the first ``<p>`` inside ``div.pmc-paywall`` under
    ``soup.main`` is split on whitespace; everything after the first two
    words is parsed with the format ``'%B %d, %Y'``.

    Args:
        soup: Parsed chart page.

    Returns:
        The parsed ``datetime.datetime``.
    """
    paywall = soup.main.find('div', class_='pmc-paywall')
    heading = paywall.p.string
    # Keep only the tail after the two leading words.
    pieces = heading.split(maxsplit=2)
    date_text = pieces[-1]
    return dt.datetime.strptime(date_text, '%B %d, %Y')
        


In [39]:
def validate_args(start_dt,end_dt,num_charts):
    """Check the scrape bounds and snap both dates to Saturdays.

    Args:
        start_dt: Oldest date wanted, or ``None``. Rounded backwards to a
            Saturday.
        end_dt: Most recent date wanted, or ``None``. Rounded forwards to a
            Saturday.
        num_charts: Number of charts wanted, or ``None``.

    Returns:
        ``(start_dt, end_dt, num_charts)`` with the non-``None`` dates
        rounded; ``None`` values are passed through untouched.

    Raises:
        ValueError: If neither ``start_dt`` nor ``num_charts`` is given, or
            if ``end_dt`` lies after the current moment.
    """
    if start_dt is None and num_charts is None:
        raise ValueError("Must provide either a start date or the number of charts")

    # Checked before rounding: rounding may legitimately push end_dt to the
    # Saturday after today.
    if end_dt is not None and end_dt > dt.datetime.today():
        raise ValueError(f"Date range cannot extend beyond {dt.date.today()}")

    if start_dt is not None:
        start_dt = round_date(start_dt, -1)

    if end_dt is not None:
        end_dt = round_date(end_dt)

    return start_dt, end_dt, num_charts
    

In [None]:
def _chart_frame(idx, data):
    """Build one DataFrame indexed by (date, position), newest date first."""
    multi_idx = pd.MultiIndex.from_tuples(idx,names=["date","position"])
    return pd.DataFrame(data,index=multi_idx).sort_values(
        by=["date","position"],ascending=[False,True])


async def compose(start_dt,end_dt, num_charts):
    """Download the requested charts and return them as DataFrame chunks.

    All pages are fetched concurrently, then parsed in the order their
    dates were generated. A chunk is closed each time a chart from a
    not-yet-seen year divisible by 5 is reached; whatever remains after the
    last such year is returned as a final chunk.

    Args:
        start_dt: Oldest date wanted, or ``None``.
        end_dt: Most recent date wanted.
        num_charts: Number of charts wanted, or ``None``.

    Returns:
        A list of DataFrames, each indexed by ``(date, position)`` and
        sorted by date descending, then position ascending.

    Raises:
        ValueError: Propagated from ``validate_args`` for invalid bounds.
        ExceptionGroup: Raised by the task group if any download fails.
    """
    start_t = time.time()
    args = validate_args(start_dt,end_dt,num_charts)

    async with httpx.AsyncClient(base_url="https://www.billboard.com/charts/hot-100/",
                                 timeout=15.0) as client:
        async with asyncio.TaskGroup() as tg:
            tasks = [tg.create_task(
                fetch_url(date,client)) for date in date_generator(*args)]

    idx = []
    data = []
    dfs = []
    # NOTE(review): 2025 is hard-coded, presumably so the year the scrape
    # starts in does not close a chunk immediately - confirm and derive it
    # from end_dt if so.
    flushed_years = [2025]

    # Leaving the TaskGroup guarantees every task has finished, so the
    # results are read in creation order. That keeps each chunk a contiguous
    # date range and avoids `async for` over asyncio.as_completed, which
    # needs Python 3.13+.
    for task in tasks:
        date, r = task.result()
        chart_idx, chart_data = get_chart(
            date, BeautifulSoup(r.text,"lxml"))

        idx+=chart_idx
        data+=chart_data

        if date.year % 5 == 0 and date.year not in flushed_years:
            print(f"Reached year {date.year} at {time.time()-start_t} seconds")
            flushed_years.append(date.year)
            dfs.append(_chart_frame(idx, data))
            idx = []
            data = []

    # Rows gathered after the last 5-year boundary used to be dropped.
    if data:
        dfs.append(_chart_frame(idx, data))

    return dfs
            

In [53]:
# Positional arguments for compose(): start at OLDEST, end now, no chart-count cap.
args = [OLDEST,dt.datetime.today(),None]

In [None]:
# Run the scrape. Top-level `await` is used directly in this notebook cell;
# the result is a list of DataFrames.
dfs = await compose(*args)

In [1]:
# Disabled example of saving one chunk to parquet.
# NOTE(review): `df5` is not defined in any visible cell - presumably one
# element of `dfs`; confirm before re-enabling.
# with open("charts1995-2000.parquet","wb") as file:
#     df5.to_parquet(file,engine="fastparquet")
# Show the scraped chunks. The cell that assigns `dfs` must have run first;
# on a fresh kernel this line raises NameError.
dfs


NameError: name 'dfs' is not defined