In [1]:
!pip install requests bs4 lxml pandas tenacity tqdm



In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [5]:
# Target page
url = "https://rulebook.sama.gov.sa/en/prudential-and-supervisory-requirements-3"

# Download the page
resp = requests.get(url)
resp.raise_for_status()   # make sure request succeeded

# Parse HTML
soup = BeautifulSoup(resp.text, "html.parser")

# Select all <a> tags inside ul.menu li.menu-item
items = soup.select("ul.menu li.menu-item a")

# Extract titles and links
titles = [a.get_text(strip=True) for a in items]
links  = [a["href"] for a in items]

# Make full URLs (they’re relative like "/en/preface")
links = ["https://rulebook.sama.gov.sa" + l for l in links]

# Put into DataFrame
df = pd.DataFrame({"title": titles, "url": links})

df.head(32)  # preview first 31

Unnamed: 0,title,url
0,Preface,https://rulebook.sama.gov.sa/en/preface
1,Scope of Application of Basel Framework,https://rulebook.sama.gov.sa/en/scope-applicat...
2,Minimum Capital Requirements,https://rulebook.sama.gov.sa/en/minimum-capita...
3,Minimum Capital Requirements for Credit Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
4,Minimum Capital Requirements for Market Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
5,Minimum Capital Requirements for Operational Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
6,Minimum Capital Requirements for Counterparty ...,https://rulebook.sama.gov.sa/en/minimum-capita...
7,Output Floor Requirements,https://rulebook.sama.gov.sa/en/output-floor-r...
8,Additional Requirements on Capital Adequacy fo...,https://rulebook.sama.gov.sa/en/additional-req...
9,Leverage,https://rulebook.sama.gov.sa/en/leverage


In [6]:
from bs4 import BeautifulSoup, NavigableString
import pandas as pd, re

HEADINGS = ("h1","h2","h3","h4","h5","h6")
ALLOWED  = {"p","ul","ol","table","blockquote","pre","div","section"}

def extract_outline(url, soup):
    # Prefer a main/article root if present
    root = (soup.select_one("main article") or
            soup.select_one("article") or
            soup.select_one("#main-content") or
            soup)

    def is_utility(tag):
        """True for left nav/related blocks/footers; False for main content wrappers."""
        if not tag: return False
        if tag.name in {"nav","aside","footer"}: return True
        if tag.get("role") in {"navigation","complementary"}: return True
        aria = (tag.get("aria-label") or "").lower()
        if "book outline" in aria: return True
        tid = (tag.get("id") or "")
        # left book menu block ids look like 'book-block-menu-1363' or 'block-rulebook-booknavigation'
        if tid.startswith("book-block-menu") or tid == "block-rulebook-booknavigation":
            return True
        return False

    # keep headings that are NOT inside utility containers
    hs = [h for h in root.find_all(HEADINGS) if h.find_parent(is_utility) is None]

    if not hs:
        txt = root.get_text(" ", strip=True)
        return pd.DataFrame([{
            "url": url, "order": 1, "level": 1,
            "number": None, "heading": "(unheaded content)", "text": txt
        }])

    rows = []
    for i, h in enumerate(hs, start=1):
        level = int(h.name[1])

        # collect until the next heading of same or higher level
        text_chunks = []
        for sib in h.next_siblings:
            if isinstance(sib, NavigableString):
                continue
            nm = getattr(sib, "name", None)
            if nm in HEADINGS and int(nm[1]) <= level:
                break
            if nm in ALLOWED and sib.find_parent(is_utility) is None:
                text_chunks.append(sib.get_text(" ", strip=True))
        full_text = "\n\n".join(t for t in text_chunks if t)

        title = h.get_text(" ", strip=True)
        m = re.search(r"^\s*([0-9IVXivx]+(?:\.[0-9A-Za-z]+)*)\b|\b(Pillar\s*[0-9]+)\b", title)
        number  = m.group(0) if m else None
        heading = re.sub(r"^\s*([0-9IVXivx]+(?:\.[0-9A-Za-z]+)*)\s*[-–—:]?\s*", "", title).strip()

        rows.append({
            "url": url, "order": i, "level": level,
            "number": number, "heading": heading, "text": full_text
        })

    return pd.DataFrame(rows)

In [7]:
import re, json, csv, time, hashlib
from urllib.parse import urljoin, urlparse, urldefrag
from collections import deque
import requests
from bs4 import BeautifulSoup

BASE = "https://rulebook.sama.gov.sa"
LANG = "en"
START_URL = "https://rulebook.sama.gov.sa/en/prudential-and-supervisory-requirements-3/"

UA = {"User-Agent": "SAMA"}
CONTAINERS_CSS = (
    "main, article, .entry-content, .content, #content, #main, "
    ".wp-block-group, .wp-site-blocks, .wp-block-group__inner-container, .wp-block-columns"
)

def fetch(url, timeout=30):
    r = requests.get(url, headers=UA, timeout=timeout, allow_redirects=True)
    if r.status_code != 200: return None
    return BeautifulSoup(r.text, "html.parser")

def page_title(soup):
    t = soup.find("title")
    return t.get_text(" ", strip=True) if t else ""

def text_of(el):
    return re.sub(r"\s+"," ", el.get_text(" ", strip=True)).strip()

def choose_main_container(soup):
    conts = soup.select(CONTAINERS_CSS)
    return max(conts, key=lambda el: len(el.get_text().strip())) if conts else soup

In [8]:
def extract_blocks(soup):
    c = choose_main_container(soup)
    blocks, seq = [], 0
    def add(kind, text, meta=None):
        nonlocal seq; seq += 1
        blocks.append({"kind": kind, "text": text, "meta": meta or {}, "block_ix": seq})

    for el in c.descendants:
        if not getattr(el, "name", None): continue
        if re.fullmatch(r"h[1-6]", el.name, re.I):
            add("heading", text_of(el), {"level": int(el.name[1])})
        elif el.name == "p":
            t = text_of(el)
            if t: add("paragraph", t)
        elif el.name in ("ul","ol"):
            items = [text_of(li) for li in el.select(":scope > li") if text_of(li)]
            if items: add("list", "\n".join(items), {"ordered": el.name=="ol", "n": len(items)})
        elif el.name == "table":
            rows = []
            for tr in el.select("tr"):
                cells = [text_of(td) for td in tr.select("th,td")]
                rows.append(cells)
            if rows:
                md = []
                if rows[0]:
                    md += ["| " + " | ".join(rows[0]) + " |",
                           "| " + " | ".join(["---"]*len(rows[0])) + " |"]
                for r in rows[1:]:
                    md.append("| " + " | ".join(r) + " |")
                add("table", "\n".join(md), {"rows": len(rows)})
    return blocks

# quick sanity check on your page
s = fetch(START_URL)
print("Title:", page_title(s))
blks = extract_blocks(s)
print("Blocks:", len(blks))
blks[:6]

Title: Prudential and Supervisory Requirements | SAMA Rulebook
Blocks: 29


[{'kind': 'table',
  'text': '| Entire section | Custom print | Text Only | Rich Text | Print / Save as PDF |\n| --- | --- | --- | --- | --- |',
  'meta': {'rows': 1},
  'block_ix': 1},
 {'kind': 'list',
  'text': 'SAMA Rulebook\nBanking Sector\nPrudential and Supervisory Requirements',
  'meta': {'ordered': True, 'n': 3},
  'block_ix': 2},
 {'kind': 'table',
  'text': '|  |  | Versions |  |  |\n| --- | --- | --- | --- | --- |',
  'meta': {'rows': 1},
  'block_ix': 3},
 {'kind': 'heading',
  'text': 'Book traversal links for Prudential and Supervisory Requirements',
  'meta': {'level': 2},
  'block_ix': 4},
 {'kind': 'list',
  'text': '‹ Appendix-I\nUp\nPreface ›',
  'meta': {'ordered': False, 'n': 3},
  'block_ix': 5},
 {'kind': 'heading',
  'text': 'Prudential and Supervisory Requirements',
  'meta': {'level': 2},
  'block_ix': 6}]

In [30]:
from bs4 import BeautifulSoup
import requests

url = "https://rulebook.sama.gov.sa/en/prudential-and-supervisory-requirements-3/"
html = requests.get(url).text
soup = BeautifulSoup(html, "html.parser")

links = soup.select("ul.menu li.menu-item a")
for a in links[:10]:
    print(a.get_text(strip=True), "→", a["href"])
print("Total links found:", len(links))

Preface → /en/preface
Scope of Application of Basel Framework → /en/scope-application-basel-framework
Minimum Capital Requirements → /en/minimum-capital-requirements
Minimum Capital Requirements for Credit Risk → /en/minimum-capital-requirements-credit-risk
Minimum Capital Requirements for Market Risk → /en/minimum-capital-requirements-market-risk
Minimum Capital Requirements for Operational Risk → /en/minimum-capital-requirements-operational-risk
Minimum Capital Requirements for Counterparty Credit Risk (CCR) and Credit Valuation Adjustment (CVA) → /en/minimum-capital-requirements-counterparty-credit-risk-ccr-and-credit-valuation-adjustment-cva
Output Floor Requirements → /en/output-floor-requirements
Additional Requirements on Capital Adequacy for Shari’ah Compliant Banking → /en/additional-requirements-capital-adequacy-shari%E2%80%99ah-compliant-banking
Leverage → /en/leverage
Total links found: 58


In [31]:
base = "https://rulebook.sama.gov.sa"
toc = [(a.get_text(strip=True), base + a["href"]) for a in links]
import pandas as pd
structure = pd.DataFrame(toc, columns=["title","url"])
structure.to_csv("sama_structure.csv", index=False)
structure.head(32)

Unnamed: 0,title,url
0,Preface,https://rulebook.sama.gov.sa/en/preface
1,Scope of Application of Basel Framework,https://rulebook.sama.gov.sa/en/scope-applicat...
2,Minimum Capital Requirements,https://rulebook.sama.gov.sa/en/minimum-capita...
3,Minimum Capital Requirements for Credit Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
4,Minimum Capital Requirements for Market Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
5,Minimum Capital Requirements for Operational Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
6,Minimum Capital Requirements for Counterparty ...,https://rulebook.sama.gov.sa/en/minimum-capita...
7,Output Floor Requirements,https://rulebook.sama.gov.sa/en/output-floor-r...
8,Additional Requirements on Capital Adequacy fo...,https://rulebook.sama.gov.sa/en/additional-req...
9,Leverage,https://rulebook.sama.gov.sa/en/leverage


In [39]:
def get_content(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    c = soup.select_one("main, article, .entry-content, .content, #content, .wp-block-group")
    if not c:
        return ""
     
    content = []

    # Paragraphs & headers
    for p in c.select("p, h2"):
        content.append(p.get_text(" ", strip=True))

    # Tables with proper alignment
    for table in c.select("table"):
        table_rows = []
        for tr in table.select("tr"):
            cells = [td.get_text(" ", strip=True) for td in tr.select("th, td")]
            if cells:
                table_rows.append(cells)

        if table_rows:
            # Determine max width per column
            num_cols = max(len(row) for row in table_rows)
            col_widths = [0] * num_cols
            for row in table_rows:
                for j, cell in enumerate(row):
                    col_widths[j] = max(col_widths[j], len(cell))

            # Build separator
            separator = "+".join("-" * (w + 2) for w in col_widths)

            # Format table
            formatted_rows = []
            formatted_rows.append(separator)
            for row in table_rows:
                row_text = " | ".join(
                    row[j].ljust(col_widths[j]) if j < len(row) else " " * col_widths[j]
                    for j in range(num_cols)
                )
                formatted_rows.append(row_text)
                formatted_rows.append(separator)
            content.append("\n[TABLE]\n" + "\n".join(formatted_rows) + "\n[/TABLE]")

    # Images
    for img in c.select("img"):
        src = img.get("src")
        alt = img.get("alt", "")
        if src:
            if not src.startswith("http"):
                src = base + src
            content.append(f"[IMAGE: {alt}] {src}")

    return "\n".join(content)

with open("sama_rulebook.txt", "w", encoding="utf-8") as f:
    for i, row in structure.iterrows():
        f.write("=" * 120 + "\n")
        f.write(f"[{i+1}] {row['title']} → {row['url']}\n\n")
        content = get_content(row["url"])
        f.write(content + "\n\n")

print("✅ Exported all content to sama_rulebook.txt")

✅ Exported all content to sama_rulebook.txt


In [26]:
DROP_PHRASES = [
    "Entire section", "Custom print", "Text Only", "Rich Text",
    "Print / Save as PDF", "Book traversal links", "Up", "‹", "›"
]

def get_content(url):
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")
    c = soup.select_one("main, article, .entry-content, .content, #content, .wp-block-group")
    if not c:
        return ""

    parts = []
    for node in c.select("p, h1, h2, h3, li"):
        t = node.get_text(" ", strip=True)
        if not t:
            continue
        # skip if it contains known junk
        if any(p.lower() in t.lower() for p in DROP_PHRASES):
            continue
        parts.append(t)

    text = "\n".join(parts)
    # remove pipe-heavy menu tables
    text = re.sub(r"[|]{2,}", " ", text)
    return text.strip()

In [27]:
print(get_content(structure["url"].iloc[0])[:800])

SAMA Rulebook
Banking Sector
Preface
Preface
Preface
Scope of Application of Basel Framework
The scope remains unchanged since the issuance of Basel II –Detailed Guidance Document relating to Pillar 1 issued by SAMA in 2006 in addition, the prudential returns requirements are also aligned to the scope of application implemented by SAMA.
1 Local Banks who are engaged predominantly in banking business including licensed subsidiaries of banks located outside the kingdom, operating in Saudi Arabia.
Introduction
Objective
Definition
For the purpose of this Guidance Note only:
Standalone (Solo) level: Refers to the local bank entity excluding it subsidiaries. For the avoidance of doubt, standalone level includes domestic and foreign branches and representative offices.
Consolidated level: Refers


In [28]:
from bs4 import BeautifulSoup
import requests, re, json
import pandas as pd

BASE_SELECTORS = "main, article, .entry-content, .content, #content, .wp-block-group, .wp-site-blocks, .wp-block-group__inner-container"

DROP_PHRASES = [
    "Entire section","Custom print","Text Only","Rich Text","Print / Save as PDF",
    "Book traversal links","Up","‹","›"
]

# --- heuristics for templates vs data tables -----------------
TEMPLATE_KEYS = {"name","signature","sign","date","position","title","stamp","prepared by","approved by"}
def is_template_table(text, header_cells):
    txt = text.lower()
    hdr = " ".join([h.lower() for h in header_cells])
    if any(k in txt for k in TEMPLATE_KEYS): return True
    if any(k in hdr for k in TEMPLATE_KEYS): return True
    # many empty cells and few characters -> likely a blank form
    words = re.findall(r"\w+", txt)
    return (len(words) < 30)

def table_to_rows(tb):
    rows = []
    for tr in tb.select("tr"):
        cells = [re.sub(r"\s+"," ", td.get_text(" ", strip=True)) for td in tr.select("th,td")]
        rows.append(cells)
    return rows

def rows_to_markdown(rows):
    if not rows: return ""
    md = []
    header = rows[0]
    if header:
        md.append("| " + " | ".join(header) + " |")
        md.append("| " + " | ".join(["---"] * len(header)) + " |")
    for r in rows[1:]:
        md.append("| " + " | ".join(r) + " |")
    return "\n".join(md)

# --- equations detection -------------------------------------
EQ_LATEX = re.compile(r"(\$\$.*?\$\$|\$[^$\n]+\$|\\\[.*?\\\]|\\\([^)]*\\\))", re.S)
def find_equations(container):
    eqs = []
    # MathML
    for m in container.select("math"):
        eqs.append({"kind":"mathml", "html":str(m)})
        m.decompose()
    # LaTeX in text nodes
    txt = container.get_text("\n", strip=False)
    for m in EQ_LATEX.findall(txt):
        eqs.append({"kind":"latex", "raw":m})
    # images with alt/title hinting they are equations
    for img in container.select("img[alt], img[title]"):
        hint = (img.get("alt","") + " " + img.get("title","")).lower()
        if any(k in hint for k in ["equation","formula"]):
            eqs.append({"kind":"img", "src":img.get("src",""), "alt":img.get("alt","")})
            img.decompose()
    return eqs

def strip_boilerplate(container):
    # remove menus/crumbs/nav/print blocks
    for sel in ["nav",".breadcrumbs",".breadcrumb",".book-navigation","#block-booknavigation",
                "#book-block-menu-1361","#book-block-menu-1363",".wp-block-navigation",
                ".contextual-region",".pager",".pagination",".print__links",".print-links"]:
        for node in container.select(sel):
            node.decompose()
    # kill paragraphs/headings that are obvious boilerplate
    for el in container.select("p, h1, h2, h3, h4, h5, h6, li"):
        t = el.get_text(" ", strip=True)
        if any(p.lower() in t.lower() for p in DROP_PHRASES):
            el.decompose()

def extract_prose(container):
    parts = []
    for node in container.descendants:
        if not getattr(node, "name", None):
            continue
        if re.fullmatch(r"h[1-6]", node.name, re.I):
            parts.append("\n\n" + node.get_text(" ", strip=True) + "\n")
        elif node.name == "p":
            t = node.get_text(" ", strip=True)
            if t: parts.append(t)
        elif node.name in ("ul","ol"):
            items = [li.get_text(" ", strip=True) for li in node.select(":scope > li")]
            items = [i for i in items if i]
            if items:
                parts.append("\n".join("- " + i for i in items))
    text = "\n".join(parts)
    text = re.sub(r"[|]{2,}", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text

def get_content_rich(url):
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")
    container = soup.select_one(BASE_SELECTORS) or soup

    # equations first (so we don't count them in prose)
    equations = find_equations(container)
    strip_boilerplate(container)

    # collect tables (remove them so prose won’t include them)
    tables = []
    for tb in container.select("table"):
        rows = table_to_rows(tb)
        header = rows[0] if rows else []
        txt = tb.get_text(" ", strip=True)
        templ = is_template_table(txt, header)
        tables.append({
            "shape": [len(rows), max((len(r) for r in rows), default=0)],
            "header": header,
            "rows": rows,
            "markdown": rows_to_markdown(rows),
            "is_template": templ
        })
        tb.decompose()

    prose_text = extract_prose(container)
    return {"prose_text": prose_text, "tables": tables, "equations": equations}

In [29]:
# assumes you already built `structure` (title, url)
prose_records = []
with open("sama_tables.jsonl", "w", encoding="utf-8") as ftab, \
     open("sama_equations.jsonl", "w", encoding="utf-8") as feq:
    for _, r in structure.iterrows():
        rich = get_content_rich(r["url"])
        # prose
        if rich["prose_text"].strip():
            prose_records.append({"title": r["title"], "url": r["url"], "text": rich["prose_text"]})
        # tables (write one record per table)
        for i, t in enumerate(rich["tables"], 1):
            rec = {"title": r["title"], "url": r["url"], "table_ix": i, **t}
            ftab.write(json.dumps(rec, ensure_ascii=False) + "\n")
        # equations
        for i, e in enumerate(rich["equations"], 1):
            rec = {"title": r["title"], "url": r["url"], "eq_ix": i, **e}
            feq.write(json.dumps(rec, ensure_ascii=False) + "\n")

df_prose = pd.DataFrame(prose_records)
df_prose.to_csv("sama_prose.csv", index=False)
print("Saved prose pages:", len(df_prose))

Saved prose pages: 8
