In [158]:
import httpx
import asyncio
import datetime as dt
import pylint
from lxml import etree
import time
import re
import pandas as pd

In [159]:
# Lower bound used by date_generator: iteration stops once the current date
# falls before this value.
# NOTE(review): presumably the date of the earliest available chart -- confirm.
OLDEST = dt.datetime(1958, 8, 4)

In [160]:
def to_saturday(date, scalar=1):
    """Step ``date`` one day at a time until it falls on a Saturday.

    Args:
        date: object exposing ``weekday()`` and supporting ``+ timedelta``
            (e.g. ``datetime.datetime`` or ``datetime.date``).
        scalar: multiplier applied to the one-day step; ``1`` walks forward,
            ``-1`` walks backward.

    Returns:
        ``date`` itself when it is already a Saturday, otherwise the first
        Saturday reached by repeatedly adding ``scalar`` days.
    """
    saturday = 5  # datetime.weekday(): Monday == 0 ... Saturday == 5
    while True:
        if date.weekday() == saturday:
            return date
        # The step is computed lazily so nothing is evaluated when the input
        # is already a Saturday.
        date = date + scalar * dt.timedelta(days=1)

In [161]:
def date_generator(latest_date=None, delta=1):
    """Yield Saturdays from ``latest_date`` back to ``OLDEST``, newest first.

    The generator is finite: it stops as soon as the current date is earlier
    than ``OLDEST``.

    Args:
        latest_date: starting point; it is rounded with ``to_saturday`` before
            use. ``None`` (the default) means "today", resolved when iteration
            starts rather than when the function is defined, so a long-lived
            kernel never reuses a stale date.
        delta: spacing between yielded dates, in weeks. Must be positive.

    Yields:
        Date objects on a Saturday, each ``delta`` weeks before the previous.

    Raises:
        ValueError: if ``delta`` is not positive (the loop would never reach
            ``OLDEST``). Raised on the first ``next()`` call because this is a
            generator.
    """
    if delta <= 0:
        raise ValueError("delta must be a positive number of weeks")
    if latest_date is None:
        latest_date = dt.datetime.today()

    step = dt.timedelta(weeks=delta)
    curr = to_saturday(latest_date)
    while curr >= OLDEST:
        yield curr
        curr -= step

In [162]:
def get_url(date):
    """Return the relative URL path for ``date`` in the form ``'YYYY-MM-DD/'``."""
    stamp = date.strftime('%Y-%m-%d')
    return f"{stamp}/"

In [163]:
async def html_driver(date, client):
    """Stream the page for ``date`` through an lxml pull parser and format the rows.

    The response body is fed to the parser chunk by chunk (``feed()``), and the
    events available after each chunk (``read_events()``) are forwarded to an
    ``HTMLHandler`` instance.

    Args:
        date: date of the page to fetch; converted to a relative URL by
            ``get_url``.
        client: object whose ``stream('GET', url)`` is an async context manager
            yielding a response with ``aiter_text()`` (``main`` passes an
            ``httpx.AsyncClient`` configured with a ``base_url``).

    Returns:
        The result of ``format_data(date, rows)`` for the rows collected by the
        handler.
    """
    handler = HTMLHandler()
    pull_parser = etree.HTMLPullParser(events=("start", "end"))

    async with client.stream('GET', get_url(date)) as response:
        async for text_chunk in response.aiter_text():
            pull_parser.feed(text_chunk)
            # Drain whatever events this chunk made available before reading more.
            for kind, node in pull_parser.read_events():
                handler.on_event(kind, node)

    rows = handler.get_content()
    return format_data(date, rows)
                

In [164]:
class HTMLHandler:
    """Collect the text of selected child tags, grouped by parent row element.

    Feed it ``("start" | "end", element)`` pairs through ``on_event``. While
    inside an element recognised by ``is_parent``, the stripped text of every
    ``CHILD_TAGS`` element is appended to the current row; the row is stored
    when the parent element ends. ``get_content`` returns all stored rows.
    """

    PARENT_TAG = 'ul' # tag of the parent node that contains all relevant nodes
    CHILD_TAGS = ['span', 'h3'] # tags within the parent node that contain relevant data
    PARENT_CLASS = 'o-chart-results-list-row //' # substring the parent's class attribute must contain
    TEXT_IGNORE = ['NEW', 'RE-\nENTRY'] # irrelevant text that shouldn't be returned

    def __init__(self):
        self.content = [] # completed rows, each a list of strings
        self.curr_row = [] # strings collected for the row currently being parsed
        self.within_parent = False # True while the parser is inside a parent node

    def is_parent(self, tag, ele):
        """Return True if ``ele`` is a row container (right tag and class substring)."""
        return tag == self.PARENT_TAG and self.PARENT_CLASS in ele.attrib.get('class', '')

    def is_child(self, tag):
        """Return True if ``tag`` is a data tag and we are currently inside a parent."""
        return self.within_parent and tag in self.CHILD_TAGS

    def on_event(self, event, ele):
        """Dispatch a parser event to ``on_start`` / ``on_end``; other events are ignored."""
        tag = ele.tag
        if event == 'start':
            self.on_start(tag, ele)
        elif event == 'end':
            self.on_end(tag, ele)

    def on_start(self, tag, ele):
        """Handle a start tag: entering a parent node switches collection on."""
        if self.is_parent(tag, ele):
            assert not self.curr_row # row should be empty if entering a new parent node
            self.within_parent = True

    def on_end(self, tag, ele):
        """Handle an end tag: store a finished row or collect a child's text.

        The element is cleared afterwards in every case.
        """
        if self.is_parent(tag, ele):
            if self.curr_row:
                self.content.append(self.curr_row)
                self.curr_row = []
            # Always leave the parent, even when the row was empty; otherwise
            # span/h3 text outside any row would be collected as row data.
            self.within_parent = False
        elif self.is_child(tag):
            # Text is read on the end event, when the element is complete.
            if ele.text:
                data = ele.text.strip()
                if data and data not in self.TEXT_IGNORE:
                    self.curr_row.append(data)
        ele.clear()

    def get_content(self):
        """Return all collected rows, first storing any unfinished non-empty row."""
        if self.curr_row:
            self.content.append(self.curr_row)
            self.curr_row = []
        return self.content


    

    


In [165]:
def format_data(date, parsed_data):
    """Turn parsed rows into a list of record dicts tagged with ``date``.

    Args:
        date: value stored under the ``'date'`` key of every record.
        parsed_data: iterable of rows, each a sequence of strings. Fields are
            taken by position: index 0 -> ``position``, 1 -> ``song``,
            2 -> ``artist``, 5 -> ``wks_on_chart``.

    Returns:
        One dict per row with keys ``date``, ``position``, ``song``, ``artist``
        and ``wks_on_chart`` (in that order). A field whose index is beyond the
        end of a row is set to ``None``.
    """
    # Indices refer to the parsed row itself. Looking them up in a copy with
    # the date appended would let a short row return the date as one of its
    # fields, or raise IndexError.
    field_idxs = {'position': 0, 'song': 1, 'artist': 2, 'wks_on_chart': 5}

    records = []
    for row in parsed_data:
        record = {'date': date}
        for column, idx in field_idxs.items():
            record[column] = row[idx] if idx < len(row) else None
        records.append(record)
    return records

In [None]:
async def main():
    """Fetch one chart page per generated date concurrently and batch the rows.

    All pages are requested through a single ``httpx.AsyncClient``; one task
    per date is started in an ``asyncio.TaskGroup``. The rows returned by
    ``html_driver`` are accumulated in a buffer, and every time the buffer
    holds at least ``BUFFER_SIZE`` rows it is turned into a DataFrame.

    Returns:
        A list of ``pandas.DataFrame`` objects, one per flushed buffer, in the
        order the dates were generated. The list is empty if no rows were
        parsed.

    Raises:
        ExceptionGroup: raised by the task group if any fetch task fails.
    """
    start_time = time.time()
    BUFFER_SIZE = 800
    dates = date_generator(delta=52)
    buffer = []
    results = []  # flushed DataFrames; this is what the caller receives

    async with httpx.AsyncClient(
        base_url="https://www.billboard.com/charts/hot-100/", timeout=15.0
    ) as client:
        async with asyncio.TaskGroup() as tg:
            tasks = [
                tg.create_task(html_driver(date, client)) for date in dates
            ]

    # Leaving the TaskGroup block waits for every task, so all results are
    # already available here; reading them in task order keeps the output
    # order deterministic.
    for task in tasks:
        buffer += task.result()

        if len(buffer) >= BUFFER_SIZE:
            results.append(pd.DataFrame(buffer))
            buffer.clear()
            print(f"flushed buffer at {time.time() - start_time} seconds")

    if buffer:
        results.append(pd.DataFrame(buffer))
        print(f"final flush at {time.time() - start_time} seconds")
    return results




In [183]:
# Run the scraper with a top-level await in the notebook cell and keep
# whatever main() returns.
dfs = await main()


flushed buffer at 2.262375831604004 seconds
flushed buffer at 2.2629051208496094 seconds
flushed buffer at 2.2634239196777344 seconds
flushed buffer at 2.2639269828796387 seconds
flushed buffer at 2.2643980979919434 seconds
flushed buffer at 2.2648561000823975 seconds
flushed buffer at 2.2653160095214844 seconds
flushed buffer at 2.2657721042633057 seconds
final flush at 2.266050100326538 seconds


['1', 'Roses Are Red (My Love)', 'Bobby Vinton', '7']