# Namu Scraper

In [14]:
import requests
from bs4 import BeautifulSoup
import re
import json
import copy

# Matches an entry heading such as "Sephira 6 (Kether)" or
# "Path 12: Burning Raging" (case-insensitive via the inline (?i) flag).
# Groups: 1 = kind ("sephira"/"path"), 2 = number,
#         3 = name taken from "(...)", 4 = name taken from ": ...".
_TITLE_PATTERN = re.compile(
    r"(?i)^\s*(sephira|path)\s+(\d+)\s*"
    r"(?:\(\s*(.*?)\s*\)|:\s*(.*))?\s*$"
)

# Separator placed between the text nodes of an <li>; each piece is treated
# as one "paragraph".
_PARAGRAPH_SEP = "\n\n"

# Seconds to wait for the server before giving up. Without a timeout,
# requests waits forever on a stalled connection.
_REQUEST_TIMEOUT = 30


def _parse_li_for_data(li):
    """Split one entry ``<li>`` into ``(title, name, description)``.

    The first text paragraph of ``li`` is matched against ``_TITLE_PATTERN``.

    * No match: returns ``("", "", full_text)`` so nothing is lost.
    * Match: ``title`` is the title-cased "<kind> <number>" (e.g. "Path 11"),
      ``name`` is the title-cased text from the parentheses or after the
      colon, and ``description`` is every paragraph after the heading,
      joined by blank lines.

    If the heading carries no name, the text of the first ``<strong>``
    inside ``li`` is used as the name.

    Side effect: when a ``<strong>`` is used as the name it is removed from
    ``li`` (and therefore from the parsed document) so that it does not
    appear again in the description.
    """
    full_text = li.get_text(separator=_PARAGRAPH_SEP, strip=True)
    # str.split always yields at least one element, so [0] is safe.
    first_line = full_text.split(_PARAGRAPH_SEP)[0]

    match = _TITLE_PATTERN.match(first_line)
    if not match:
        # Unrecognised heading: keep the whole text as the description.
        return ("", "", full_text)

    kind, number, name_paren, name_colon = match.groups()
    item_title = f"{kind} {number}".title()  # e.g. "Sephira 6" / "Path 11"

    # Prefer the parenthesised name, then the one after the colon.
    raw_name = (name_paren or "").strip() or (name_colon or "").strip()

    if not raw_name:
        strong_tag = li.find("strong")
        if strong_tag:
            raw_name = strong_tag.get_text(strip=True)
            # Detach it so the name is not repeated in the description.
            strong_tag.extract()

    item_name = raw_name.title() if raw_name else ""

    # Re-read the text now that <strong> may be gone, then drop the heading.
    remaining = li.get_text(separator=_PARAGRAPH_SEP, strip=True).split(
        _PARAGRAPH_SEP
    )
    # Only skip the first paragraph if it is still the heading. If the
    # removed <strong> *was* the heading node, paragraph 0 is already
    # description text and must be kept.
    if remaining[0] == first_line:
        remaining = remaining[1:]
    final_description = _PARAGRAPH_SEP.join(remaining)

    return (item_title, item_name, final_description)


def main():
    """Scrape section 3.6 of the namu.wiki page and write ``paths.js``.

    Steps:
      1. Download the page (with a timeout) and parse it with BeautifulSoup.
      2. Locate ``<a id="s-3.6">``, its enclosing ``<h3>``, and the ``<div>``
         that is the next sibling of that ``<h3>``.
      3. For every ``<ul>`` directly under that div, parse its first direct
         ``<li>`` into title / name / description.
      4. Write the result to ``paths.js`` as ``const pathsData = {...};``.

    Raises:
        requests.exceptions.RequestException: on connection failure,
            timeout, or a non-2xx HTTP status (via ``raise_for_status``).
        RuntimeError: if the anchor, its ``<h3>``, or the following
            ``<div>`` cannot be found in the page.
    """
    url = "https://en.namu.wiki/w/세피로트의%20나무#s-3.6"
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # 1) Find <h3> with <a id="s-3.6">
    anchor = soup.find("a", id="s-3.6")
    if not anchor:
        raise RuntimeError("Could not locate <a id='s-3.6'> anchor.")

    h3_tag = anchor.find_parent("h3")
    if not h3_tag:
        raise RuntimeError("Could not locate the <h3> that wraps anchor #s-3.6")

    # 2) The immediate next sibling div
    main_div = h3_tag.find_next_sibling("div")
    if not main_div:
        raise RuntimeError("Could not find next-sibling <div> after <h3>")

    # 3) Top-level <ul> blocks under that div
    all_uls = main_div.find_all("ul", recursive=False)
    print(f"Found {len(all_uls)} <ul> blocks right under that div.")

    # Keys are the 1-based position of each <ul>; a <ul> without a direct
    # <li> child is skipped, which leaves a gap in the numbering.
    data_dict = {}
    for position, ul_tag in enumerate(all_uls, start=1):
        li = ul_tag.find("li", recursive=False)
        if not li:
            continue

        item_title, item_name, item_desc = _parse_li_for_data(li)
        data_dict[position] = {
            "title": item_title,
            "name": item_name,
            "description": item_desc,
        }

    # 4) Write out "paths.js"
    payload = json.dumps(data_dict, ensure_ascii=False, indent=2)
    with open("paths.js", "w", encoding="utf-8") as f:
        f.write(f"const pathsData = {payload};\n")

    print("Done! Wrote 'paths.js' with keys 1..{}.".format(len(data_dict)))

# Run the scraper only when this file is executed as a script (or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Found 32 <ul> blocks right under that div.
Done! Wrote 'paths.js' with keys 1..32.
