In [None]:
%pip install -U -q beautifulsoup4 openpyxl pandas

In [1]:
# Cub Scout rank to scrape. Besides the listing URL below, this value is also
# interpolated into the CSS class used to find adventure cards, the Bobcat
# adventure link, and the Excel sheet name.
# NOTE(review): presumably it must match the site's URL slug for the rank - confirm.
rank = "tiger"

# Listing page for the selected rank's adventures.
base_url = f"https://www.scouting.org/programs/cub-scouts/adventures/{rank}/"

In [8]:
from bs4 import BeautifulSoup
import requests


def get_html_document(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        str: The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never parsed as if it were real content.
        requests.RequestException: On connection failures or timeouts.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly here instead of letting callers hit confusing parse errors
    # on the HTML of an error page.
    response.raise_for_status()
    return response.text


def get_adventure_highlights():
    """Collect the name, link and required flag of every adventure for ``rank``.

    Downloads the listing page at ``base_url`` and reads every ``<div>`` whose
    class includes ``cs-adv-rank-{rank}``. The returned list always begins with
    a hard-coded "Bobcat" entry, followed by one dict per matching div in the
    order ``find_all`` returns them.

    Returns:
        list[dict]: Dicts with the keys ``adventure_name``, ``adventure_link``
        and ``adventure_required``.
    """
    listing = BeautifulSoup(get_html_document(base_url), "html.parser")
    cards = listing.find_all("div", attrs={"class": f"cs-adv-rank-{rank}"})

    # The Bobcat adventure is added as a literal entry rather than scraped
    # (presumably it is not part of the listing page - TODO confirm).
    highlights = [
        {
            "adventure_name": "Bobcat",
            "adventure_link": f"https://www.scouting.org/cub-scout-adventures/bobcat-{rank}/",
            "adventure_required": True,
        }
    ]

    for card in cards:
        # Required adventures are flagged through an extra CSS class on the div.
        is_required = "cs-adv-type-required" in card.get("class", [])
        anchor = card.find("a")  # first link inside the card
        highlights.append(
            {
                "adventure_name": anchor.text,
                "adventure_link": anchor.get("href"),
                "adventure_required": is_required,
            }
        )

    return highlights


def get_activity_details(url):
    """Scrape the description, supplies and directions from an activity page.

    Args:
        url: Address of the activity page to download.

    Returns:
        dict: ``activity_description`` (text of the first ``<p>`` in the
        ``adv-act`` section), ``activity_supplies`` (text of the first ``<ul>``
        in the ``xadv-requirements`` section) and ``activity_directions``
        (text of that section's second ``pp-accordion-tab-content`` div).

    Raises:
        AttributeError: If an expected section, ``<p>`` or ``<ul>`` is missing,
            because ``find`` then returns ``None``.
        IndexError: If the requirements section holds fewer than two accordion
            content divs.
    """
    page = BeautifulSoup(get_html_document(url), "html.parser")

    intro_section = page.find("section", attrs={"class": "adv-act"})
    description_text = intro_section.find("p").text

    requirements_section = page.find(
        "section", attrs={"class": "xadv-requirements"}
    )
    supplies_text = requirements_section.find("ul").text

    accordion_panels = requirements_section.find_all(
        "div", attrs={"class": "pp-accordion-tab-content"}
    )
    directions_text = accordion_panels[1].text  # index 1 = second panel

    return {
        "activity_description": description_text,
        "activity_supplies": supplies_text,
        "activity_directions": directions_text,
    }


def get_adventure_details(adventure_hightlight):
    """Expand one adventure highlight into a flat record per activity.

    Downloads the adventure page found under ``adventure_link``, walks its
    ``adv-requirements`` sections and, for every ``<article>`` inside them,
    fetches the linked activity page through ``get_activity_details``.
    Requirement and activity names are printed as progress output.

    Args:
        adventure_hightlight: Mapping with at least an ``adventure_link`` key;
            all of its items are copied into every returned record.

    Returns:
        list[dict]: One dict per activity combining the highlight fields, the
        adventure description, the requirement name/description, the activity
        name/link, the three ``activity_attr_*`` values and the activity
        details.
    """
    adventure_page = BeautifulSoup(
        get_html_document(adventure_hightlight["adventure_link"]), "html.parser"
    )

    snapshot = adventure_page.find(
        "section", attrs={"class": "adv-requirements-snapshot"}
    )
    adventure_description = snapshot.find("p").text

    requirement_sections = adventure_page.find_all(
        "section", attrs={"class": "adv-requirements"}
    )

    records = []
    # The first two matching sections are skipped; only the remaining ones are
    # processed as requirements.
    for section in requirement_sections[2:]:
        requirement_name = section.find("h2").text
        print(f"  {requirement_name}")
        requirement_description = section.find("p").text

        # Fields shared by every activity under this requirement.
        requirement_fields = {
            **adventure_hightlight,
            "adventure_description": adventure_description,
            "requirement_name": requirement_name,
            "requirement_description": requirement_description,
        }

        for card in section.find_all("article"):
            anchor = card.find("a")

            icon_titles = card.find_all(
                "div", attrs={"class": "elementor-icon-box-title"}
            )
            # The first icon-box title is dropped; the next three are read as
            # energy, supplies and time, in that order.
            labels = [title.text.strip() for title in icon_titles[1:]]
            activity_attributes = {
                "activity_attr_energy": labels[0],
                "activity_attr_supplies": labels[1],
                "activity_attr_time": labels[2],
            }

            activity_name = anchor.text
            print(f"    {activity_name}")
            activity_url = anchor.get("href")

            activity_details = get_activity_details(activity_url)

            records.append(
                {
                    **requirement_fields,
                    "activity_name": activity_name,
                    "activity_link": activity_url,
                    **activity_attributes,
                    **activity_details,
                }
            )

    return records

In [None]:
import pandas as pd

# Scrape every adventure for the configured rank and flatten the results into
# one row per activity.
adventure_highlights = get_adventure_highlights()

data = []
for idx, adventure in enumerate(adventure_highlights):
    # Progress line: adventure name plus its 1-based position in the list.
    print(f"Adventure: {adventure['adventure_name']} ({idx + 1}/{len(adventure_highlights)})")
    details = get_adventure_details(adventure)
    data.extend(details)

# Build the frame once from the accumulated list of record dicts.
df = pd.DataFrame(data)

In [12]:
from pathlib import Path

# Write the scraped rows to a workbook, one sheet named after the rank.
output_file = './output/data.xlsx'

# ExcelWriter does not create missing parent folders, so on a fresh checkout
# the export would fail; make sure the target directory exists first.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

with pd.ExcelWriter(output_file) as writer:
    df.to_excel(writer, sheet_name=rank, index=False)