In [13]:
!pip install requests bs4 lxml pandas tenacity tqdm



In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [20]:
# Target page
url = "https://rulebook.sama.gov.sa/en/prudential-and-supervisory-requirements-3"

# Download the page
resp = requests.get(url)
resp.raise_for_status()   # make sure request succeeded

# Parse HTML
soup = BeautifulSoup(resp.text, "html.parser")

# Select all <a> tags inside ul.menu li.menu-item
items = soup.select("ul.menu li.menu-item a")

# Extract titles and links
titles = [a.get_text(strip=True) for a in items]
links  = [a["href"] for a in items]

# Make full URLs (they’re relative like "/en/preface")
links = ["https://rulebook.sama.gov.sa" + l for l in links]

# Put into DataFrame
df = pd.DataFrame({"title": titles, "url": links})

df.head(32)  # preview first 31

Unnamed: 0,title,url
0,Preface,https://rulebook.sama.gov.sa/en/preface
1,Scope of Application of Basel Framework,https://rulebook.sama.gov.sa/en/scope-applicat...
2,Minimum Capital Requirements,https://rulebook.sama.gov.sa/en/minimum-capita...
3,Minimum Capital Requirements for Credit Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
4,Minimum Capital Requirements for Market Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
5,Minimum Capital Requirements for Operational Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
6,Minimum Capital Requirements for Counterparty ...,https://rulebook.sama.gov.sa/en/minimum-capita...
7,Output Floor Requirements,https://rulebook.sama.gov.sa/en/output-floor-r...
8,Additional Requirements on Capital Adequacy fo...,https://rulebook.sama.gov.sa/en/additional-req...
9,Leverage,https://rulebook.sama.gov.sa/en/leverage


In [22]:
from bs4 import BeautifulSoup, NavigableString
import pandas as pd, re

HEADINGS = ("h1","h2","h3","h4","h5","h6")
ALLOWED  = {"p","ul","ol","table","blockquote","pre","div","section"}

def extract_outline(url, soup):
    # Prefer a main/article root if present
    root = (soup.select_one("main article") or
            soup.select_one("article") or
            soup.select_one("#main-content") or
            soup)

    def is_utility(tag):
        """True for left nav/related blocks/footers; False for main content wrappers."""
        if not tag: return False
        if tag.name in {"nav","aside","footer"}: return True
        if tag.get("role") in {"navigation","complementary"}: return True
        aria = (tag.get("aria-label") or "").lower()
        if "book outline" in aria: return True
        tid = (tag.get("id") or "")
        # left book menu block ids look like 'book-block-menu-1363' or 'block-rulebook-booknavigation'
        if tid.startswith("book-block-menu") or tid == "block-rulebook-booknavigation":
            return True
        return False

    # keep headings that are NOT inside utility containers
    hs = [h for h in root.find_all(HEADINGS) if h.find_parent(is_utility) is None]

    if not hs:
        txt = root.get_text(" ", strip=True)
        return pd.DataFrame([{
            "url": url, "order": 1, "level": 1,
            "number": None, "heading": "(unheaded content)", "text": txt
        }])

    rows = []
    for i, h in enumerate(hs, start=1):
        level = int(h.name[1])

        # collect until the next heading of same or higher level
        text_chunks = []
        for sib in h.next_siblings:
            if isinstance(sib, NavigableString):
                continue
            nm = getattr(sib, "name", None)
            if nm in HEADINGS and int(nm[1]) <= level:
                break
            if nm in ALLOWED and sib.find_parent(is_utility) is None:
                text_chunks.append(sib.get_text(" ", strip=True))
        full_text = "\n\n".join(t for t in text_chunks if t)

        title = h.get_text(" ", strip=True)
        m = re.search(r"^\s*([0-9IVXivx]+(?:\.[0-9A-Za-z]+)*)\b|\b(Pillar\s*[0-9]+)\b", title)
        number  = m.group(0) if m else None
        heading = re.sub(r"^\s*([0-9IVXivx]+(?:\.[0-9A-Za-z]+)*)\s*[-–—:]?\s*", "", title).strip()

        rows.append({
            "url": url, "order": i, "level": level,
            "number": number, "heading": heading, "text": full_text
        })

    return pd.DataFrame(rows)

In [25]:
import re, json, csv, time, hashlib
from urllib.parse import urljoin, urlparse, urldefrag
from collections import deque
import requests
from bs4 import BeautifulSoup

BASE = "https://rulebook.sama.gov.sa"
LANG = "en"
START_URL = "https://rulebook.sama.gov.sa/en/prudential-and-supervisory-requirements-3/"

UA = {"User-Agent": "SAMA"}
CONTAINERS_CSS = (
    "main, article, .entry-content, .content, #content, #main, "
    ".wp-block-group, .wp-site-blocks, .wp-block-group__inner-container, .wp-block-columns"
)

def fetch(url, timeout=30):
    r = requests.get(url, headers=UA, timeout=timeout, allow_redirects=True)
    if r.status_code != 200: return None
    return BeautifulSoup(r.text, "html.parser")

def page_title(soup):
    t = soup.find("title")
    return t.get_text(" ", strip=True) if t else ""

def text_of(el):
    return re.sub(r"\s+"," ", el.get_text(" ", strip=True)).strip()

def choose_main_container(soup):
    conts = soup.select(CONTAINERS_CSS)
    return max(conts, key=lambda el: len(el.get_text().strip())) if conts else soup

In [26]:
def extract_blocks(soup):
    c = choose_main_container(soup)
    blocks, seq = [], 0
    def add(kind, text, meta=None):
        nonlocal seq; seq += 1
        blocks.append({"kind": kind, "text": text, "meta": meta or {}, "block_ix": seq})

    for el in c.descendants:
        if not getattr(el, "name", None): continue
        if re.fullmatch(r"h[1-6]", el.name, re.I):
            add("heading", text_of(el), {"level": int(el.name[1])})
        elif el.name == "p":
            t = text_of(el)
            if t: add("paragraph", t)
        elif el.name in ("ul","ol"):
            items = [text_of(li) for li in el.select(":scope > li") if text_of(li)]
            if items: add("list", "\n".join(items), {"ordered": el.name=="ol", "n": len(items)})
        elif el.name == "table":
            rows = []
            for tr in el.select("tr"):
                cells = [text_of(td) for td in tr.select("th,td")]
                rows.append(cells)
            if rows:
                md = []
                if rows[0]:
                    md += ["| " + " | ".join(rows[0]) + " |",
                           "| " + " | ".join(["---"]*len(rows[0])) + " |"]
                for r in rows[1:]:
                    md.append("| " + " | ".join(r) + " |")
                add("table", "\n".join(md), {"rows": len(rows)})
    return blocks

# quick sanity check on your page
s = fetch(START_URL)
print("Title:", page_title(s))
blks = extract_blocks(s)
print("Blocks:", len(blks))
blks[:6]

Title: Prudential and Supervisory Requirements | SAMA Rulebook
Blocks: 29


[{'kind': 'table',
  'text': '| Entire section | Custom print | Text Only | Rich Text | Print / Save as PDF |\n| --- | --- | --- | --- | --- |',
  'meta': {'rows': 1},
  'block_ix': 1},
 {'kind': 'list',
  'text': 'SAMA Rulebook\nBanking Sector\nPrudential and Supervisory Requirements',
  'meta': {'ordered': True, 'n': 3},
  'block_ix': 2},
 {'kind': 'table',
  'text': '|  |  | Versions |  |  |\n| --- | --- | --- | --- | --- |',
  'meta': {'rows': 1},
  'block_ix': 3},
 {'kind': 'heading',
  'text': 'Book traversal links for Prudential and Supervisory Requirements',
  'meta': {'level': 2},
  'block_ix': 4},
 {'kind': 'list',
  'text': '‹ Appendix-I\nUp\nPreface ›',
  'meta': {'ordered': False, 'n': 3},
  'block_ix': 5},
 {'kind': 'heading',
  'text': 'Prudential and Supervisory Requirements',
  'meta': {'level': 2},
  'block_ix': 6}]

In [27]:
from bs4 import BeautifulSoup
import requests

url = "https://rulebook.sama.gov.sa/en/prudential-and-supervisory-requirements-3/"
html = requests.get(url).text
soup = BeautifulSoup(html, "html.parser")

links = soup.select("ul.menu li.menu-item a")
for a in links[:10]:
    print(a.get_text(strip=True), "→", a["href"])
print("Total links found:", len(links))

Preface → /en/preface
Scope of Application of Basel Framework → /en/scope-application-basel-framework
Minimum Capital Requirements → /en/minimum-capital-requirements
Minimum Capital Requirements for Credit Risk → /en/minimum-capital-requirements-credit-risk
Minimum Capital Requirements for Market Risk → /en/minimum-capital-requirements-market-risk
Minimum Capital Requirements for Operational Risk → /en/minimum-capital-requirements-operational-risk
Minimum Capital Requirements for Counterparty Credit Risk (CCR) and Credit Valuation Adjustment (CVA) → /en/minimum-capital-requirements-counterparty-credit-risk-ccr-and-credit-valuation-adjustment-cva
Output Floor Requirements → /en/output-floor-requirements
Additional Requirements on Capital Adequacy for Shari’ah Compliant Banking → /en/additional-requirements-capital-adequacy-shari%E2%80%99ah-compliant-banking
Leverage → /en/leverage
Total links found: 58


In [28]:
base = "https://rulebook.sama.gov.sa"
toc = [(a.get_text(strip=True), base + a["href"]) for a in links]
import pandas as pd
structure = pd.DataFrame(toc, columns=["title","url"])
structure.to_csv("sama_structure.csv", index=False)
structure.head(32)

Unnamed: 0,title,url
0,Preface,https://rulebook.sama.gov.sa/en/preface
1,Scope of Application of Basel Framework,https://rulebook.sama.gov.sa/en/scope-applicat...
2,Minimum Capital Requirements,https://rulebook.sama.gov.sa/en/minimum-capita...
3,Minimum Capital Requirements for Credit Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
4,Minimum Capital Requirements for Market Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
5,Minimum Capital Requirements for Operational Risk,https://rulebook.sama.gov.sa/en/minimum-capita...
6,Minimum Capital Requirements for Counterparty ...,https://rulebook.sama.gov.sa/en/minimum-capita...
7,Output Floor Requirements,https://rulebook.sama.gov.sa/en/output-floor-r...
8,Additional Requirements on Capital Adequacy fo...,https://rulebook.sama.gov.sa/en/additional-req...
9,Leverage,https://rulebook.sama.gov.sa/en/leverage


In [30]:
DROP_PHRASES = [
    "Entire section", "Custom print", "Text Only", "Rich Text",
    "Print / Save as PDF", "Book traversal links", "Up", "‹", "›"
]

def get_content(url):
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")
    c = soup.select_one("main, article, .entry-content, .content, #content, .wp-block-group")
    if not c:
        return ""

    parts = []
    for node in c.select("p, h1, h2, h3, li"):
        t = node.get_text(" ", strip=True)
        if not t:
            continue
        # skip if it contains known junk
        if any(p.lower() in t.lower() for p in DROP_PHRASES):
            continue
        parts.append(t)

    text = "\n".join(parts)
    # remove pipe-heavy menu tables
    text = re.sub(r"[|]{2,}", " ", text)
    return text.strip()

In [57]:
def get_content(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    c = soup.select_one("main, article, .entry-content, .content, #content, .wp-block-group")
    if not c:
        return ""

    parts = []
    
# 1) Handle tables
    for table in c.select("table"):
        for row in table.select("tr"):
            cells = [td.get_text(" ", strip=True) for td in row.select("td, th")]
            if cells:
                parts.append(" | ".join(cells))
        parts.append("")  # blank line between tables
        table.extract()   # remove table so it doesn’t get re-read

    # 2) Handle normal text (paragraphs & headers)
    for el in c.select("p, h2"):
        txt = el.get_text(" ", strip=True)
        if txt:
            parts.append(txt)

    return "\n".join(parts)
# test one
print(get_content(structure["url"].iloc[0])[:10000000]) 
print("=====================================================================================================================================")
print(structure["url"].iloc[2])
print(get_content("https://rulebook.sama.gov.sa/en/minimum-capital-requirements-credit-risk")[:10000000]) 


Entire section | Custom print | Text Only | Rich Text | Print / Save as PDF

 |  | Versions |  | 

No: 44047144 | Date(g): 27/12/2022 | Date(h): 4/6/1444 | Status: In-Force

1. | Local banks must comply with SAMA’s Basel Framework (the Framework) at both standalone and consolidated level 4 .
2. | For purposes of the Framework, the consolidation will include all subsidiaries undertaking financial or banking activities, which the bank have a majority ownership 5 or –control, except insurance entities.
3. | Where consolidation of a subsidiary is not feasible 6 , banks are required to seek SAMA’s approval to exclude the subsidiary from the scope of application and reporting requirements. The application should include proper justifications and risk management controls to ensure group risks are managed effectively.
4. | Subject to SAMA discretion, the framework may apply to the bank subsidiaries at every tier or level within the banking group on a consolidated and/or on standalone basis, as