# Simple man7 scrape with BeautifulSoup

Fetch the index page and return links whose anchor text ends with `(1)`.

In [30]:
import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup, Tag
from __future__ import annotations
from typing import Dict, List, Optional
import json
from functools import lru_cache
from typing import Optional, Dict, Any, List, Tuple

# Destination file for the scraped records; the final cell opens this path
# for writing and emits a JSON array into it. The path is absolute and
# machine-specific, so adjust it before running elsewhere.
OUTPUT_PATH = '/scratch4/home/akrik/NTILC/data/man/raw.json'


In [None]:
# Matches a whole line made of optional indentation, a "•" bullet and some
# text; group 1 is the text with surrounding whitespace removed.
# NOTE(review): not referenced by any code visible in this notebook.
_BULLET_RE = re.compile(r"^\s*•\s*(.+?)\s*$")
# Matches a trailing "<placeholder>" (preceded by whitespace) at the end of a
# string, e.g. the "<file>" in "--output <file>".
# NOTE(review): the same pattern is assigned again further down in this cell.
_ARG_SAME_LINE_RE = re.compile(r"(\s+<[^>]+>\s*)$")

def _find_section_pre(soup: BeautifulSoup, section_id: str):
    """Locate the ``<pre>`` block that belongs to a page section.

    The lookup goes: ``<a id=section_id>`` anchor -> its enclosing ``<h2>``
    heading -> the next ``<pre>`` sibling of that heading.

    Args:
        soup: Parsed page to search.
        section_id: Value of the anchor's ``id`` attribute (e.g. ``"NAME"``).

    Returns:
        The ``<pre>`` element, or ``None`` if the anchor, the enclosing
        heading, or a following ``<pre>`` sibling cannot be found.
    """
    anchor = soup.find("a", id=section_id)
    heading = anchor.find_parent("h2") if anchor else None
    return heading.find_next_sibling("pre") if heading else None

def _clean_pre_text(pre) -> str:
    if pre is None:
        return ""
    txt = pre.get_text("\n")
    lines = [ln.rstrip() for ln in txt.splitlines()]
    i, j = 0, len(lines)
    while i < j and not lines[i].strip():
        i += 1
    while j > i and not lines[j - 1].strip():
        j -= 1
    return "\n".join(lines[i:j])

def _parse_name_section(name_text: str) -> Dict[str, str]:
    lines = [ln.strip() for ln in name_text.splitlines()]
    nonempty = [ln for ln in lines if ln]

    command = short = desc = ""
    if nonempty:
        m = re.match(r"^(\S+)\s*-\s*(.+)$", nonempty[0])
        if m:
            command = m.group(1).strip()
            short = m.group(2).strip()

            orig = name_text.splitlines()
            first_idx = next((i for i, ln in enumerate(orig) if ln.strip() == nonempty[0]), 0)
            rest = orig[first_idx + 1 :]
            while rest and not rest[0].strip():
                rest.pop(0)
            desc = "\n".join(r.rstrip() for r in rest).strip()
        else:
            desc = name_text.strip()

    return {"command": command, "short_description": short, "description": desc}

def _parse_invocation_section(inv_text: str) -> str:
    lines = [ln.strip() for ln in inv_text.splitlines() if ln.strip()]
    s = " ".join(lines)
    return re.sub(r"\s+", " ", s).strip()

# Recognises the first line of an option entry. Two alternatives:
#   group 1 - text after a bullet ("•", or the two-character sequence "Â•",
#             presumably a mis-decoded bullet - TODO confirm) plus whitespace;
#   group 2 - a bare line that starts with "-" or "--" followed directly by a
#             non-space character.
_OPT_START_RE = re.compile(r"^\s*(?:•|Â•)\s+(.+?)\s*$|^\s*(--?\S.*)$")

# Matches a trailing "<placeholder>" argument at the end of an option header.
# NOTE(review): identical to the assignment of the same name earlier in this
# cell, so this line rebinds the name to an equivalent pattern.
_ARG_SAME_LINE_RE = re.compile(r"(\s+<[^>]+>\s*)$")

def _split_option_blocks(options_text: str):
    """Cut the OPTIONS text into one list of lines per option.

    A block starts at each line matched by ``_OPT_START_RE`` and runs up to
    (not including) the next such line, or to the end of the text. Lines
    before the first option header are discarded.

    Args:
        options_text: Raw text of the OPTIONS section.

    Returns:
        A list of blocks, each a list of lines whose first element is the
        option header line; ``[]`` when no header line is found.
    """
    rows = options_text.splitlines()
    header_rows = [idx for idx, row in enumerate(rows) if _OPT_START_RE.match(row)]
    # Pair every header index with the following one; the last block is
    # closed by the end of the text.
    block_ends = header_rows[1:] + [len(rows)]
    return [rows[begin:end] for begin, end in zip(header_rows, block_ends)]

def _parse_option_header(header_line: str):
    """Break an option header line into its flags and inline argument.

    Accepted header shapes:
      • --help
      Â• --help
      --help
      -h
      --version | -v

    A trailing ``<placeholder>`` on the same line is taken as the argument.
    What remains is split on ``|`` into alternative flag spellings.

    Args:
        header_line: First line of an option block.

    Returns:
        ``(flags, arg)`` where ``flags`` is a list of whitespace-normalised
        flag strings and ``arg`` is the ``<placeholder>`` text or ``""``.
        ``([], "")`` when the line is not an option header.
    """
    matched = _OPT_START_RE.match(header_line)
    if matched is None:
        return [], ""

    # Group 1 carries the bulleted form, group 2 the bare "-x" / "--xyz" form.
    spec = (matched.group(1) or matched.group(2) or "").strip()

    arg = ""
    trailing_arg = _ARG_SAME_LINE_RE.search(spec)
    if trailing_arg is not None:
        arg = trailing_arg.group(0).strip()
        spec = spec[: trailing_arg.start()].strip()

    flags = []
    for piece in spec.split("|"):
        words = piece.split()
        if words:
            flags.append(" ".join(words))
    return flags, arg

def _parse_option_block(block_lines: List[str]) -> Dict[str, Any]:
    """Turn the lines of one option entry into a structured record.

    The first line is the header (flags plus an optional inline
    ``<placeholder>`` argument). If the header has no argument, the first
    three body lines are checked for a line consisting only of
    ``<placeholder>``; the first such line is taken as the argument and
    removed from the body. The remaining body is grouped into paragraphs
    separated by blank lines, with whitespace inside a paragraph collapsed
    to single spaces.

    Args:
        block_lines: Lines of one option block; must contain at least the
            header line.

    Returns:
        A dict with ``"flags"`` (list of flag strings), ``"arg"``
        (placeholder text or ``""``) and ``"description"`` (paragraphs
        joined by a blank line).
    """
    flags, arg = _parse_option_header(block_lines[0])
    # Slicing yields a fresh list, so the caller's list is never modified.
    body = block_lines[1:]

    if not arg:
        for idx, line in enumerate(body[:3]):
            candidate = line.strip()
            if re.fullmatch(r"<[^>]+>", candidate):
                arg = candidate
                del body[idx]
                break

    paragraphs = []
    current = []
    for line in body:
        words = line.split()
        if words:
            current.append(" ".join(words))
        elif current:
            # A blank line closes the paragraph in progress; blank lines seen
            # while no paragraph is open (including leading ones) are ignored.
            paragraphs.append(" ".join(current))
            current = []
    if current:
        paragraphs.append(" ".join(current))

    return {
        "flags": flags,
        "arg": arg,
        "description": "\n\n".join(paragraphs),
    }

def _parse_options_section(options_text: str) -> List[Dict[str, Any]]:
    """Parse the whole OPTIONS section into a list of option records.

    The text is split into per-option blocks and each block is parsed on its
    own, in document order.

    Args:
        options_text: Raw text of the OPTIONS section.

    Returns:
        One record per option block; an empty list when the text contains no
        option header.
    """
    return list(map(_parse_option_block, _split_option_blocks(options_text)))


In [None]:
# Request headers applied by the fetch helpers below: a desktop-browser
# User-Agent string sent instead of the HTTP library's own default.
_DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128 Safari/537.36"
}

@lru_cache(maxsize=256)
def _fetch_html_cached(url: str, timeout: float = 15.0) -> str:
    """Download ``url`` and memoise the decoded body.

    Results are kept in an LRU cache of up to 256 entries keyed on the call
    arguments. A short-lived session carrying ``_DEFAULT_HEADERS`` is used
    for the request and closed afterwards.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds passed to the HTTP request.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    with requests.Session() as s:
        s.headers.update(_DEFAULT_HEADERS)
        r = s.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode exactly like the uncached path of fetch_html so that the
        # same URL yields the same text whether or not the cache is used.
        r.encoding = "utf-8"
        return r.text

def fetch_html(url: str, *, session=None, timeout: float = 15.0, use_cache: bool = True) -> str:
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        session: Optional ``requests.Session`` to reuse. When given, the
            cache is bypassed and the session is left open for the caller.
        timeout: Timeout in seconds passed to the HTTP request.
        use_cache: When true and no session is supplied, the request goes
            through the memoising ``_fetch_html_cached`` helper.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if use_cache and session is None:
        return _fetch_html_cached(url, timeout=timeout)

    owns_session = session is None
    s = requests.Session() if owns_session else session
    try:
        # A requests.Session always starts with the library's own
        # "python-requests/x.y" User-Agent, so testing for a missing header
        # never fires. Compare against that default instead, and override per
        # request so a caller-supplied session is not modified.
        headers = None
        if s.headers.get("User-Agent") in (None, requests.utils.default_user_agent()):
            headers = _DEFAULT_HEADERS

        r = s.get(url, timeout=timeout, headers=headers)
        r.raise_for_status()

        r.encoding = "utf-8"
        return r.text
    finally:
        # Only close what was opened here; a caller's session stays usable.
        if owns_session:
            s.close()



In [None]:
def parse_man7_page(url: str, *, session: Optional[requests.Session] = None, timeout: float = 15.0, use_cache: bool = True) -> Dict[str, Any]:
    """Download a man7.org HTML manual page and extract a structured record.

    Only the NAME, INVOCATION and OPTIONS sections are read; a section that
    is missing from the page contributes an empty value.

    Args:
        url: Address of the manual page.
        session: Optional ``requests.Session`` to reuse across calls.
        timeout: Timeout in seconds for the HTTP request.
        use_cache: Forwarded to ``fetch_html``.

    Returns:
        A dict with the keys ``name``, ``one_line``, ``description``,
        ``invocation``, ``options`` and ``source_url``.
    """
    page = BeautifulSoup(
        fetch_html(url, session=session, timeout=timeout, use_cache=use_cache),
        "html.parser",
    )

    def section_text(section_id):
        # Cleaned text of the <pre> that follows the section heading, or "".
        return _clean_pre_text(_find_section_pre(page, section_id))

    name_fields = _parse_name_section(section_text("NAME"))
    invocation_text = section_text("INVOCATION")
    options_text = section_text("OPTIONS")

    return {
        "name": name_fields["command"],
        "one_line": name_fields["short_description"],
        "description": name_fields["description"],
        "invocation": _parse_invocation_section(invocation_text) if invocation_text else "",
        "options": _parse_options_section(options_text) if options_text else [],
        "source_url": url,
    }


In [34]:
# Download the alphabetic index of all man pages and collect the absolute
# URLs of the entries whose link text ends in "(1)".
url = "https://man7.org/linux/man-pages/dir_all_alphabetic.html"
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
pattern = re.compile(r"\(1\)$")

# href=True skips anchors without a target: urljoin(url, None) would return
# the index URL itself, which would then be fetched as if it were a man page.
# An empty link text cannot match the pattern, so no separate emptiness check
# is needed.
section_1_links = [
    urljoin(url, a["href"])
    for a in soup.find_all("a", href=True)
    if pattern.search(a.get_text(strip=True))
]

len(section_1_links)

1576

In [39]:
# Parse every collected page and stream the records into OUTPUT_PATH as one
# JSON array, reusing a single HTTP session for all requests.
with requests.Session() as session, open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write("[\n")
    wrote_any = False
    try:
        for url in section_1_links:
            try:
                rec = parse_man7_page(url, session=session, use_cache=False)
            except requests.RequestException as exc:
                # One unreachable page should not discard the whole run.
                print(f"skipping {url}: {exc}")
                continue

            # Write the separator before each record after the first, so the
            # file never ends in a dangling comma.
            if wrote_any:
                f.write(",\n")
            json.dump(rec, f, indent=2)
            wrote_any = True
    finally:
        # Close the array even if an unexpected error interrupts the loop, so
        # the records written so far remain loadable JSON.
        f.write("\n]" if wrote_any else "]")
