In [51]:
%pip install -q pdfminer.six pypdf pandas numpy

Note: you may need to restart the kernel to use updated packages.


In [54]:
from pathlib import Path
from pdfminer.high_level import extract_text

PDF = Path("SAMA_EN_4234_VER1.pdf")   
OUT = Path("results"); OUT.mkdir(exist_ok=True)

def extract_with_pdfminer(p: Path) -> str:
    try:
        return extract_text(str(p)) or ""
    except Exception:
        return ""
raw = extract_with_pdfminer(PDF)
if len(raw.strip()) < 300:           
    raw = extract_with_pypdf(PDF)

(Path(OUT/"pillar3_raw.txt")).write_text(raw, encoding="utf-8")
print("Raw length:", len(raw), "| saved →", (OUT/"pillar3_raw.txt").resolve())
print(raw[:800])

Raw length: 431914 | saved → /Users/alaajohani/RegulatoryComplexity_Public/results/pillar3_raw.txt
Saudi Central Bank (SAMA) 

Pillar 3 Disclosure Requirements Framework

December 2022 

Contents 
1. Introduction: ......................................................................................................................................................... 4 

2. Scope of application: ............................................................................................................................................ 5 

3. Implementation dates: ......................................................................................................................................... 5 

4. Guiding principles of banks’ Pillar 3 disclosures: ............................................................................................. 5 

5. Assurance of Pillar 3 data: ......


In [11]:
import re, json
from pathlib import Path

raw_text = (OUT/"pillar3_raw.txt").read_text(encoding="utf-8")

def strip_headers_footers(t: str) -> str:
    keep=[]
    for ln in t.splitlines():
        L = ln.strip()
        if L.startswith("Page Number") and "of" in L: continue
        if "Issue Date Version Number" in L: continue
        if L in {"Saudi Central Bank (SAMA)","Pillar 3 Disclosure Requirements Framework","December 2022","2.1"}: 
            continue
        keep.append(ln)
    return "\n".join(keep)

def strip_toc_dotleaders(t:str)->str:
    return re.sub(r"\.{2,}\s*\d+\s*$","",t,flags=re.M)

def drop_front_matter(t:str)->str:
    return re.sub(r"(?s)\bContents\b.*?\bGlossary\b","Glossary\n",t,flags=re.I)

def normalize_layout(t:str)->str:
    t=t.replace("\r","")
    t=re.sub(r"-\s*\n\s*(?=[A-Za-z])","-",t)        # join hyphen+wraps
    t=re.sub(r"(?<!\n)\n(?!\n)"," ",t)              # single NL → space
    t=re.sub(r"[ \t]+"," ",t)
    t=t.replace("–","-").replace("—","-").replace("’","'").replace("“","\"").replace("”","\"")
    t=re.sub(r"\n[ \t]+","\n",t)
    return t.strip()

clean = normalize_layout(drop_front_matter(strip_toc_dotleaders(strip_headers_footers(raw_text))))
(OUT/"pillar3_clean.txt").write_text(clean, encoding="utf-8")
print("Clean length:", len(clean), "| saved →", (OUT/"pillar3_clean.txt").resolve())
print(clean[:600])

# Extract Template/Table blocks:  "Template CODE: Title" or "Table CODE: Title"
ITEM_RX = re.compile(r"(?mi)^(Template|Table)\s+([A-Z0-9\-]+)\s*:\s*(.+?)\s*(?=^(?:Template|Table)\s+[A-Z0-9\-]+\s*:|\Z)", re.S)
items=[]
for m in ITEM_RX.finditer(clean):
    kind=m.group(1).title()
    code=m.group(2).strip()
    title=m.group(3).splitlines()[0].strip()
    body=m.group(0)
    items.append({"kind":kind,"code":code,"title":title,"text":body})

with open(OUT/"pillar3_items.jsonl","w",encoding="utf-8") as f:
    for r in items:
        f.write(json.dumps(r, ensure_ascii=False)+"\n")

print("Items found:", len(items), "| saved →", (OUT/'pillar3_items.jsonl').resolve())
for r in items[:8]:
    print(f"- {r['kind']} {r['code']}: {r['title'][:90]}")

Clean length: 393728 | saved → /Users/alaajohani/RegulatoryComplexity_Public/results/pillar3_clean.txt
Glossary 

SACAP 

SCRE 

SCCR 

SMAR 

SOPE 

SLEV 

SLCR 

SNSF 

SAMA's Final Guidance Document Concerning Implementation of Capital Reforms Under Basel III Framework No.341000015689 date 06/02/1434AH, Section A 

Calculation of the Minimum Capital Requirements for Credit Risk issued by SAMA as part of its adoption of Basel III post-crisis final reforms. 

Minimum Capital Requirements for Counterparty Credit Risk (CCR) and Credit Valuation Adjustment (CVA) issued by SAMA as part of its adoption of Basel III post-crisis final reforms. 

Minimum Capital Requirements for market risk issued by 
Items found: 60 | saved → /Users/alaajohani/RegulatoryComplexity_Public/results/pillar3_items.jsonl
- Template KM1: Key metrics (at consolidated group level) Purpose: To provide an overview of a bank's prud
- Template KM2: Key metrics - TLAC requirements (at resolution group level) Purpose: Prov

In [None]:
import re, json
from pathlib import Path

raw_text = (OUT/"pillar3_raw.txt").read_text(encoding="utf-8")

def strip_headers_footers(t: str) -> str:
    keep=[]
    for ln in t.splitlines():
        L = ln.strip()
        if L.startswith("Page Number") and "of" in L: continue
        if "Issue Date Version Number" in L: continue
        if L in {"Saudi Central Bank (SAMA)","Pillar 3 Disclosure Requirements Framework","December 2022","2.1"}: 
            continue
        keep.append(ln)
    return "\n".join(keep)

def strip_toc_dotleaders(t:str)->str:
    return re.sub(r"\.{2,}\s*\d+\s*$","",t,flags=re.M)

def drop_front_matter(t:str)->str:
    return re.sub(r"(?s)\bContents\b.*?\bGlossary\b","Glossary\n",t,flags=re.I)

def normalize_layout(t:str)->str:
    t=t.replace("\r","")
    t=re.sub(r"-\s*\n\s*(?=[A-Za-z])","-",t)        # join hyphen+wraps
    t=re.sub(r"(?<!\n)\n(?!\n)"," ",t)              # single NL → space
    t=re.sub(r"[ \t]+"," ",t)
    t=t.replace("–","-").replace("—","-").replace("’","'").replace("“","\"").replace("”","\"")
    t=re.sub(r"\n[ \t]+","\n",t)
    return t.strip()

clean = normalize_layout(drop_front_matter(strip_toc_dotleaders(strip_headers_footers(raw_text))))
(OUT/"pillar3_clean.txt").write_text(clean, encoding="utf-8")
print("Clean length:", len(clean), "| saved →", (OUT/"pillar3_clean.txt").resolve())
print(clean[:600])

# Extract Template/Table blocks:  "Template CODE: Title" or "Table CODE: Title"
ITEM_RX = re.compile(r"(?mi)^(Template|Table)\s+([A-Z0-9\-]+)\s*:\s*(.+?)\s*(?=^(?:Template|Table)\s+[A-Z0-9\-]+\s*:|\Z)", re.S)
items=[]
for m in ITEM_RX.finditer(clean):
    kind=m.group(1).title()
    code=m.group(2).strip()
    title=m.group(3).splitlines()[0].strip()
    body=m.group(0)
    items.append({"kind":kind,"code":code,"title":title,"text":body})

with open(OUT/"pillar3_items.jsonl","w",encoding="utf-8") as f:
    for r in items:
        f.write(json.dumps(r, ensure_ascii=False)+"\n")

print("Items found:", len(items), "| saved →", (OUT/'pillar3_items.jsonl').resolve())
for r in items[:8]:
    print(f"- {r['kind']} {r['code']}: {r['title'][:90]}")

from pathlib import Path
import json  # <-- separate import

clean_text = Path("results/pillar3_clean.txt").read_text(encoding="utf-8")

with open("results/pillar3_items.jsonl", "r", encoding="utf-8") as f:
    items = [json.loads(line) for line in f if line.strip()]

print("Items:", len(items))
print(items[0]["kind"], items[0]["code"], "→", items[0]["title"])

In [11]:
import re, json
from pathlib import Path

raw_text = (OUT/"pillar3_raw.txt").read_text(encoding="utf-8")

def strip_headers_footers(t: str) -> str:
    keep=[]
    for ln in t.splitlines():
        L = ln.strip()
        if L.startswith("Page Number") and "of" in L: continue
        if "Issue Date Version Number" in L: continue
        if L in {"Saudi Central Bank (SAMA)","Pillar 3 Disclosure Requirements Framework","December 2022","2.1"}: 
            continue
        keep.append(ln)
    return "\n".join(keep)

def strip_toc_dotleaders(t:str)->str:
    return re.sub(r"\.{2,}\s*\d+\s*$","",t,flags=re.M)

def drop_front_matter(t:str)->str:
    return re.sub(r"(?s)\bContents\b.*?\bGlossary\b","Glossary\n",t,flags=re.I)

def normalize_layout(t:str)->str:
    t=t.replace("\r","")
    t=re.sub(r"-\s*\n\s*(?=[A-Za-z])","-",t)        # join hyphen+wraps
    t=re.sub(r"(?<!\n)\n(?!\n)"," ",t)              # single NL → space
    t=re.sub(r"[ \t]+"," ",t)
    t=t.replace("–","-").replace("—","-").replace("’","'").replace("“","\"").replace("”","\"")
    t=re.sub(r"\n[ \t]+","\n",t)
    return t.strip()

clean = normalize_layout(drop_front_matter(strip_toc_dotleaders(strip_headers_footers(raw_text))))
(OUT/"pillar3_clean.txt").write_text(clean, encoding="utf-8")
print("Clean length:", len(clean), "| saved →", (OUT/"pillar3_clean.txt").resolve())
print(clean[:600])

# Extract Template/Table blocks:  "Template CODE: Title" or "Table CODE: Title"
ITEM_RX = re.compile(r"(?mi)^(Template|Table)\s+([A-Z0-9\-]+)\s*:\s*(.+?)\s*(?=^(?:Template|Table)\s+[A-Z0-9\-]+\s*:|\Z)", re.S)
items=[]
for m in ITEM_RX.finditer(clean):
    kind=m.group(1).title()
    code=m.group(2).strip()
    title=m.group(3).splitlines()[0].strip()
    body=m.group(0)
    items.append({"kind":kind,"code":code,"title":title,"text":body})

with open(OUT/"pillar3_items.jsonl","w",encoding="utf-8") as f:
    for r in items:
        f.write(json.dumps(r, ensure_ascii=False)+"\n")

print("Items found:", len(items), "| saved →", (OUT/'pillar3_items.jsonl').resolve())
for r in items[:8]:
    print(f"- {r['kind']} {r['code']}: {r['title'][:90]}")

Clean length: 393728 | saved → /Users/alaajohani/RegulatoryComplexity_Public/results/pillar3_clean.txt
Glossary 

SACAP 

SCRE 

SCCR 

SMAR 

SOPE 

SLEV 

SLCR 

SNSF 

SAMA's Final Guidance Document Concerning Implementation of Capital Reforms Under Basel III Framework No.341000015689 date 06/02/1434AH, Section A 

Calculation of the Minimum Capital Requirements for Credit Risk issued by SAMA as part of its adoption of Basel III post-crisis final reforms. 

Minimum Capital Requirements for Counterparty Credit Risk (CCR) and Credit Valuation Adjustment (CVA) issued by SAMA as part of its adoption of Basel III post-crisis final reforms. 

Minimum Capital Requirements for market risk issued by 
Items found: 60 | saved → /Users/alaajohani/RegulatoryComplexity_Public/results/pillar3_items.jsonl
- Template KM1: Key metrics (at consolidated group level) Purpose: To provide an overview of a bank's prud
- Template KM2: Key metrics - TLAC requirements (at resolution group level) Purpose: Prov

In [53]:
def six_measures(blocks, dictionary):
    import re, math, pandas as pd

    def phrase_rx(p):
        p = re.sub(r"\s+"," ", p.strip())
        return re.compile(rf"\b{re.escape(p)}\b", re.I)

    REG_RX = [(p, phrase_rx(p)) for p in sorted(REGULATORY_PHRASES, key=len, reverse=True)]
    LOG_RX = [(p, phrase_rx(p)) for p in sorted(LOGICAL_PHRASES,   key=len, reverse=True)]
    MATH_RX= [(p, phrase_rx(p)) for p in sorted(MATH_PHRASES,      key=len, reverse=True)]

    def make_token_regex(token: str) -> re.Pattern:
        tok = token.strip()
        m = re.fullmatch(r"([A-Za-z]{2,})(\d+)", tok)
        if m:
            letters, digits = m.groups()
            return re.compile(rf"\b{re.escape(letters)}\s*[-–]?\s*{digits}\b", re.I)
        esc = re.escape(tok).replace(r"\ ", r"\s+").replace(r"\-", r"[-–]?")
        return re.compile(rf"\b{esc}\b", re.I)

    def operands_in(text_block: str, dictionary: pd.DataFrame) -> pd.DataFrame:
        rows=[]
        for tok, typ in dictionary[["Token","Type"]].itertuples(index=False):
            n = sum(1 for _ in make_token_regex(str(tok)).finditer(text_block))
            if n:
                rows.append({"token": tok, "type": typ, "count": n})
        return pd.DataFrame(rows) if rows else pd.DataFrame(columns=["token","type","count"])

    def count_phrases(block: str, pairs):
        return sum(1 for _, rx in pairs for _ in rx.finditer(block))

    COND_SET = {"if","when","unless","subject to","except where"}
    def cyclo_count(block: str) -> int:
        cond = sum(1 for w in COND_SET if phrase_rx(w).search(block))
        or_cnt = 1 if phrase_rx("or").search(block) else 0
        return cond + or_cnt

    recs=[]
    for r in blocks:
        item = f"{r['kind']} {r['code']}"
        text_block = r["text"]

        ods = operands_in(text_block, dictionary)
        length     = int(ods["count"].sum()) if not ods.empty else 0
        potential  = int(ods["token"].nunique()) if not ods.empty else 0

        reg  = count_phrases(text_block, REG_RX)
        logg = count_phrases(text_block, LOG_RX)
        math = count_phrases(text_block, MATH_RX) + sum(1 for s in MATH_SYMBOLS if re.search(re.escape(s), text_block))
        quantity  = int(reg>0) + int(logg>0) + int(math>0)

        # FIXED: presence via .search(), not len(finditer)
        diversity = (
            sum(1 for _,rx in REG_RX  if rx.search(text_block)) +
            sum(1 for _,rx in LOG_RX  if rx.search(text_block)) +
            sum(1 for _,rx in MATH_RX if rx.search(text_block)) +
            sum(1 for s in MATH_SYMBOLS if re.search(re.escape(s), text_block))
        )

        cyclomatic = cyclo_count(text_block)
        level = round(potential/length, 2) if length>0 else 0.0

        recs.append({
            "Item": item,
            "length": length,
            "cyclomatic": cyclomatic,
            "quantity": quantity,
            "potential": potential,
            "diversity": diversity,
            "level": level
        })

    return (pd.DataFrame(recs)
            .sort_values(["length","cyclomatic","potential"], ascending=False)
            .reset_index(drop=True))

In [41]:
tbl6_full = six_measures(items, dict_df)   # using full dictionary
display(tbl6_full.head())
tbl6_full.to_csv("results/SAMA_six_complexity_measures_FULLDICT.csv", index=False)
print("Saved → results/SAMA_six_complexity_measures_FULLDICT.csv")
print("Items with length=0:", int((tbl6_full["length"]==0).sum()))

Unnamed: 0,Item,length,cyclomatic,quantity,potential,diversity,level
0,Template OV1,10301,5,3,896,28,0.09
1,Template OR3,6712,5,3,776,24,0.12
2,Template LIQ2,5981,5,3,741,30,0.12
3,Template TLAC1,2398,4,3,411,28,0.17
4,Template LIQ1,1341,4,3,351,17,0.26


Saved → results/SAMA_six_complexity_measures_FULLDICT.csv
Items with length=0: 0


In [21]:
import pandas as pd
pd.set_option("display.max_rows", 1000)   # show plenty
display(tbl6_full)                        # no .head()

Unnamed: 0,Item,length,cyclomatic,quantity,potential,diversity,level
0,Template OV1,10301,5,3,896,28,0.09
1,Template OR3,6712,5,3,776,24,0.12
2,Template LIQ2,5981,5,3,741,30,0.12
3,Template TLAC1,2398,4,3,411,28,0.17
4,Template LIQ1,1341,4,3,351,17,0.26
5,Table CRB,1071,3,3,275,12,0.26
6,Template KM1,1021,2,3,248,16,0.24
7,Template CCR8,1006,4,3,332,14,0.33
8,Template CR5,884,3,3,261,13,0.3
9,Table MR2,858,4,3,212,17,0.25


In [22]:
import re, json

def numbered_sections_from_text(txt):
    # capture headings like: 1. Introduction, 2. Scope of application, 3.1 Something, ...
    pat = re.compile(r"(?m)^\s*(\d+(?:\.\d+)*)\.\s+([A-Z][^\n]{3,120})\s*$")
    hits = [(m.start(), m.group(1), m.group(2)) for m in pat.finditer(txt)]
    blocks = []
    for i,(pos, code, title) in enumerate(hits):
        end = hits[i+1][0] if i+1 < len(hits) else len(txt)
        block = txt[pos:end].strip()
        blocks.append({"kind":"Section", "code":code, "title":title, "text":block})
    return blocks

# Build a combined item list = Templates/Tables (60) + Numbered Sections
section_items = numbered_sections_from_text(clean_text)
print("Templates/Tables:", len(items), "| Sections:", len(section_items))
items_all = items + section_items
print("Total items to analyze:", len(items_all))

Templates/Tables: 60 | Sections: 30
Total items to analyze: 90


In [23]:
tbl6_sections = six_measures(items_all, dict_df)   # uses FULL dictionary
tbl6_sections.to_csv("results/SAMA_six_measures_FULLDICT_TemplatesPlusSections.csv", index=False)
display(tbl6_sections.head())

Unnamed: 0,Item,length,cyclomatic,quantity,potential,diversity,level
0,Template OV1,10301,5,3,896,28,0.09
1,Section 14,9299,5,3,842,31,0.09
2,Section 19,7396,5,3,770,26,0.1
3,Template OR3,6712,5,3,776,24,0.12
4,Template LIQ2,5981,5,3,741,30,0.12


In [24]:
display(tbl6_sections.head(len(tbl6_sections)))

Unnamed: 0,Item,length,cyclomatic,quantity,potential,diversity,level
0,Template OV1,10301,5,3,896,28,0.09
1,Section 14,9299,5,3,842,31,0.09
2,Section 19,7396,5,3,770,26,0.1
3,Template OR3,6712,5,3,776,24,0.12
4,Template LIQ2,5981,5,3,741,30,0.12
5,Section 27,3556,4,3,417,18,0.12
6,Section 12,3532,5,3,557,24,0.16
7,Section 28,3295,5,3,560,23,0.17
8,Section 16,3019,4,3,528,18,0.17
9,Section 20,2890,4,3,527,21,0.18


In [25]:
import pandas as pd

# add titles
title_map = {f"{r['kind']} {r['code']}": r['title'] for r in items_all}  # use items or items_all
tbl6_sections["Title"] = tbl6_sections["Item"].map(title_map)

# original order instead of ranking by length
doc_order = [f"{r['kind']} {r['code']}" for r in items_all]
tbl_doc = (
    tbl6_sections.assign(_ord=pd.Categorical(tbl6_sections["Item"], doc_order, ordered=True))
                 .sort_values("_ord").drop(columns="_ord")
)

pd.set_option("display.max_rows", None)
display(tbl_doc)

Unnamed: 0,Item,length,cyclomatic,quantity,potential,diversity,level,Title
23,Template KM1,1021,2,3,248,16,0.24,Key metrics (at consolidated group level) Purp...
39,Template KM2,567,3,3,199,13,0.35,Key metrics - TLAC requirements (at resolution...
58,Table OVA,319,2,3,153,5,0.48,Bank risk management approach Purpose: Descrip...
0,Template OV1,10301,5,3,896,28,0.09,Overview of RWA Purpose: To provide an overvie...
12,Template TLAC1,2398,4,3,411,28,0.17,TLAC composition for G-SIBs (at resolution gro...
30,Template CDC,756,3,3,206,15,0.27,Capital distribution constraints Purpose: To p...
62,Table LIA,280,2,3,127,6,0.45,Explanations of differences between accounting...
38,Template LI1,599,4,3,195,13,0.33,Differences between accounting and regulatory ...
27,Template LI2,835,3,3,228,12,0.27,Main sources of differences between regulatory...
29,Template PV1,814,2,3,279,12,0.34,Prudent valuation adjustments (PVAs) Purpose: ...


In [26]:
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

tbl_clean = tbl6_sections.copy().reset_index(drop=True)  # <- index 0..N
display(tbl_clean)

Unnamed: 0,Item,length,cyclomatic,quantity,potential,diversity,level,Title
0,Template OV1,10301,5,3,896,28,0.09,Overview of RWA Purpose: To provide an overview of total RWA forming the denominator of the risk-based capital requirements. Further breakdowns of RWA are presented in subsequent parts. Scope of application: The template is mandatory for all banks.
1,Section 14,9299,5,3,842,31,0.09,Composition of capital and TLAC:
2,Section 19,7396,5,3,770,26,0.1,Credit risk:
3,Template OR3,6712,5,3,776,24,0.12,Minimum required operational risk capital Purpose: To disclose operational risk regulatory capital requirements.
4,Template LIQ2,5981,5,3,741,30,0.12,Net Stable Funding Ratio (NSFR) Purpose: Provide details of a bank's NSFR and selected details of its NSFR components.
5,Section 27,3556,4,3,417,18,0.12,Leverage ratio:
6,Section 12,3532,5,3,557,24,0.16,"Overview of risk management, key prudential metrics and RWA:"
7,Section 28,3295,5,3,560,23,0.17,Liquidity:
8,Section 16,3019,4,3,528,18,0.17,Links between financial statements and regulatory exposures:
9,Section 20,2890,4,3,527,21,0.18,Counterparty credit risk:


In [27]:
title_map = {f"{r['kind']} {r['code']}": r['title'] for r in items_all}
doc_order = [f"{r['kind']} {r['code']}" for r in items_all]

tbl_doc = (
    tbl6_sections.assign(Title=tbl6_sections["Item"].map(title_map),
                         _ord=pd.Categorical(tbl6_sections["Item"], doc_order, ordered=True))
                .sort_values("_ord")
                .drop(columns="_ord")
                .reset_index(drop=True)
)
display(tbl_doc)

Unnamed: 0,Item,length,cyclomatic,quantity,potential,diversity,level,Title
0,Template KM1,1021,2,3,248,16,0.24,Key metrics (at consolidated group level) Purpose: To provide an overview of a bank's prudential regulatory metrics.
1,Template KM2,567,3,3,199,13,0.35,"Key metrics - TLAC requirements (at resolution group level) Purpose: Provide summary information about total loss-absorbing capacity (TLAC) available, and TLAC requirements applied, at resolution group level under the single point of entry and multiple point of entry (MPE) approaches. Scope of application: The template is mandatory for all resolution groups of G-SIBs."
2,Table OVA,319,2,3,153,5,0.48,"Bank risk management approach Purpose: Description of the bank's strategy and how senior management and the board of directors assess and manage risks, enabling users to gain a clear understanding of the bank's risk tolerance/appetite in relation to its main activities and all significant risks. Scope of application: The template is mandatory for all banks. Content: Qualitative information. Frequency: Annual Format: Flexible Banks must describe their risk management objectives and policies, in particular:"
3,Template OV1,10301,5,3,896,28,0.09,Overview of RWA Purpose: To provide an overview of total RWA forming the denominator of the risk-based capital requirements. Further breakdowns of RWA are presented in subsequent parts. Scope of application: The template is mandatory for all banks.
4,Template TLAC1,2398,4,3,411,28,0.17,"TLAC composition for G-SIBs (at resolution group level) Purpose: Provide details of the composition of a G-SIB's TLAC. Scope of application: This template is mandatory for all G-SIBs. It should be completed at the level of each resolution group within a G-SIB. Content: Carrying values (corresponding to the values reported in financial statements). Frequency: Semiannual. Format: Fixed. Accompanying narrative: G-SIBs are expected to supplement the template with a narrative commentary to explain any significant changes over the reporting period and the key drivers of any such change(s). Qualitative narrative on the G-SIB resolution strategy, including the approach (SPE or multiple point of entry (MPE)) and structure to which the resolution measures are applied, may be included to help understand the templates."
5,Template CDC,756,3,3,206,15,0.27,"Capital distribution constraints Purpose: To provide disclosure of the capital ratio(s) below which capital distribution constraints are triggered as required under the Basel framework (i.e. risk-based, leverage, etc.) to allow meaningful assessment by market participants of the likelihood of capital distributions becoming restricted. Scope of application The table is mandatory for banks. Where applicable, the template may include additional rows to accommodate other national requirements that could trigger capital distribution constraints. Content: Quantitative information. Includes the CET1 capital ratio that would trigger capital distribution constraints when taking into account (i) CET1 capital that banks must maintain to meet the minimum CET1 capital ratio, applicable risk based buffer requirements (i.e. capital conservation buffer, G-SIB surcharge and countercyclical capital buffer) and Pillar 2 capital requirements (if CET1 capital is required); (ii) CET1 capital that banks must maintain to meet the minimum regulatory capital ratios and any CET1 capital used to meet Tier 1 capital, total capital and TLAC3 requirements, applicable risk-based buffer requirements (i.e. capital conservation buffer, G-SIB surcharge and countercyclical capital buffer) and Pillar 2 capital requirements (if CET1 capital is required); and (iii) the leverage ratio inclusive of leverage ratio buffer requirement. Frequency: Annual. Format: Fixed."
6,Table LIA,280,2,3,127,6,0.45,Explanations of differences between accounting and regulatory exposure amounts Purpose: Provide qualitative explanations on the differences observed between accounting carrying value (as defined in Template LI1) and amounts considered for regulatory purposes (as defined in Template LI2) under each framework. Scope of application: The template is mandatory for all banks. Content: Qualitative information. Frequency: Annual. Format: Flexible.
7,Template LI1,599,4,3,195,13,0.33,Differences between accounting and regulatory scopes of consolidation and mapping of financial statement categories with regulatory risk categories Purpose: Columns (a) and (b) enable users to identify the differences between the scope of accounting consolidation and the scope of regulatory consolidation; and columns (c)-(g) break down how the amounts reported in banks' financial statements (rows) correspond to regulatory risk categories. Scope of application: The template is mandatory for all banks. Content: Carrying values (corresponding to the values reported in financial statements). Frequency: Annual. Format: Flexible (but the rows must align with the presentation of the bank's financial report). Accompanying narrative: See Table LIA. Banks are expected to provide qualitative explanation on items that are subject to regulatory capital charges in more than one risk category.
8,Template LI2,835,3,3,228,12,0.27,Main sources of differences between regulatory exposure amounts and carrying values in financial statements Purpose: Provide information on the main sources of differences (other than due to different scopes of consolidation which are shown in Template LI1) between the financial statements' carrying value amounts and the exposure amounts used for regulatory purposes. Scope of application: The template is mandatory for all banks. Content: Carrying values that correspond to values reported in financial statements but according to the scope of regulatory consolidation (rows 1-3) and amounts considered for regulatory exposure purposes (row 10). Frequency: Annual. Format: Flexible. Row headings shown below are provided for illustrative purposes only and should be adapted by the bank to describe the most meaningful drivers for differences between its financial statement carrying values and the amounts considered for regulatory purposes. Accompanying narrative: See Table LIA.
9,Template PV1,814,2,3,279,12,0.34,"Prudent valuation adjustments (PVAs) Purpose: Provide a breakdown of the constituent elements of a bank's PVAs according to the requirements of Basel Framework ""prudent valuation guidance"", taking into account SAMA's circular No. 301000000768 on Supervisory guidance for assessing banks' financial instrument fair value practices, July 2009."


In [28]:
import re, pandas as pd
from functools import lru_cache

# --- safety: if helpers not in memory, re-declare minimal versions ---
try:
    phrase_rx
except NameError:
    def phrase_rx(p):
        p = re.sub(r"\s+"," ", p.strip())
        return re.compile(rf"\b{re.escape(p)}\b", re.I)

try:
    make_token_regex
except NameError:
    @lru_cache(maxsize=20000)
    def make_token_regex(token: str) -> re.Pattern:
        tok = token.strip()
        m = re.fullmatch(r"([A-Za-z]{2,})(\d+)", tok)
        if m:
            letters, digits = m.groups()
            return re.compile(rf"\b{re.escape(letters)}\s*[-–]?\s*{digits}\b", re.I)
        esc = re.escape(tok).replace(r"\ ", r"\s+").replace(r"\-", r"[-–]?")
        return re.compile(rf"\b{esc}\b", re.I)

# --- fast prefilter to avoid compiling regex for tokens that don't occur at all ---
simple_text = re.sub(r"\s+"," ", clean_text).lower().replace("-", " ")
dict_pref = [
    (str(tok), str(typ), re.sub(r"\s+"," ", str(tok)).lower().replace("-", " "))
    for tok, typ in dict_df[["Token","Type"]].itertuples(index=False)
]

op_rows = []
for tok, typ, simp in dict_pref:
    if not simp or simp not in simple_text:
        continue
    n = sum(1 for _ in make_token_regex(tok).finditer(clean_text))
    if n:
        op_rows.append({"token": tok.upper(), "count": n})
operands_df = (pd.DataFrame(op_rows)
               .groupby("token", as_index=False)["count"].sum()
               .sort_values("count", ascending=False)
               .reset_index(drop=True))
operands_top10 = operands_df.head(10)
display(operands_top10)

Unnamed: 0,token,count
0,THE,3540
1,OF,2437
2,TO,1570
3,AND,1418
4,IN,1283
5,A,990
6,FOR,788
7,RISK,681
8,CAPITAL,631
9,AS,522


In [29]:
# ensure the phrase sets exist; tweak/extend them if you want broader coverage
REG_RX  = [(p.upper(), phrase_rx(p)) for p in sorted(REGULATORY_PHRASES, key=len, reverse=True)]
LOG_RX  = [(p.upper(), phrase_rx(p)) for p in sorted(LOGICAL_PHRASES,   key=len, reverse=True)]
MATH_RX = [(p.upper(), phrase_rx(p)) for p in sorted(MATH_PHRASES,      key=len, reverse=True)]

def phrase_counts(pairs, text):
    rows=[]
    for name, rx in pairs:
        c = sum(1 for _ in rx.finditer(text))
        if c:
            rows.append({"token": name, "count": c})
    return (pd.DataFrame(rows)
            .sort_values("count", ascending=False)
            .reset_index(drop=True)
            if rows else pd.DataFrame(columns=["token","count"]))

reg_df  = phrase_counts(REG_RX,  clean_text)
log_df  = phrase_counts(LOG_RX,  clean_text)
math_df = phrase_counts(MATH_RX, clean_text)

# include mathematical symbols as separate "tokens"
sym_rows = []
for s in MATH_SYMBOLS:
    c = len(re.findall(re.escape(s), clean_text))
    if c: sym_rows.append({"token": s, "count": c})
if sym_rows:
    math_df = (pd.concat([math_df, pd.DataFrame(sym_rows)], ignore_index=True)
               .groupby("token", as_index=False)["count"].sum()
               .sort_values("count", ascending=False)
               .reset_index(drop=True))

reg_top10  = reg_df.head(10)
log_top10  = log_df.head(10)
math_top10 = math_df.head(10)

display(reg_top10, log_top10, math_top10)

Unnamed: 0,token,count
0,SHOULD,227
1,MUST,170
2,MAY,89
3,SHALL,32
4,ARE REQUIRED TO,22
5,MUST NOT,8
6,IS REQUIRED TO,4
7,SHOULD NOT,2


Unnamed: 0,token,count
0,AND,1418
1,OR,330
2,NOT,246
3,IF,151
4,SUBJECT TO,115
5,INCLUDING,98
6,WHEN,53
7,EXCLUDING,32
8,UNLESS,30
9,EXCEPT WHERE,1


Unnamed: 0,token,count
0,-,1270
1,/,378
2,EQUAL TO,91
3,+,71
4,SUM,69
5,ABOVE,51
6,<,24
7,BELOW,24
8,*,23
9,PLUS,21


In [30]:
# table with 10 rows, side-by-side categories
k = max(len(operands_top10), len(reg_top10), len(log_top10), len(math_top10), 10)

def pad_top(df, k):
    df = df.copy()
    if len(df) < k:
        df = pd.concat([df, pd.DataFrame([{"token":"", "count":""}]*(k-len(df)))], ignore_index=True)
    return df.iloc[:k].reset_index(drop=True)

opsW  = pad_top(operands_top10, k).rename(columns={"token":"Operands",   "count":"Ops_count"})
regW  = pad_top(reg_top10,      k).rename(columns={"token":"Regulatory", "count":"Reg_count"})
logW  = pad_top(log_top10,      k).rename(columns={"token":"Logical",    "count":"Log_count"})
mathW = pad_top(math_top10,     k).rename(columns={"token":"Mathematical","count":"Math_count"})

table15 = pd.concat([opsW, regW, logW, mathW], axis=1)
display(table15)

# save
table15.to_csv("results/SAMA_top10_words_by_category.csv", index=False)
with open("results/SAMA_top10_words_by_category.tex","w", encoding="utf-8") as f:
    f.write(table15.to_latex(index=False, column_format="rlrlrlrl"))
print("Saved → results/SAMA_top10_words_by_category.(csv|tex)")

Unnamed: 0,Operands,Ops_count,Regulatory,Reg_count,Logical,Log_count,Mathematical,Math_count
0,THE,3540,SHOULD,227.0,AND,1418,-,1270
1,OF,2437,MUST,170.0,OR,330,/,378
2,TO,1570,MAY,89.0,NOT,246,EQUAL TO,91
3,AND,1418,SHALL,32.0,IF,151,+,71
4,IN,1283,ARE REQUIRED TO,22.0,SUBJECT TO,115,SUM,69
5,A,990,MUST NOT,8.0,INCLUDING,98,ABOVE,51
6,FOR,788,IS REQUIRED TO,4.0,WHEN,53,<,24
7,RISK,681,SHOULD NOT,2.0,EXCLUDING,32,BELOW,24
8,CAPITAL,631,,,UNLESS,30,*,23
9,AS,522,,,EXCEPT WHERE,1,PLUS,21


Saved → results/SAMA_top10_words_by_category.(csv|tex)


In [37]:
dict_df["Type"].str.upper().value_counts().head(50)

Type
ECONOMICOPERANDS         5510
LEGALREFERENCES          2450
ATTRIBUTES                362
LOGICALCONNECTORS         230
FUNCTIONWORDS             222
REGULATORYOPERATORS       161
OTHER                     127
MATHEMATICALOPERATORS      38
Name: count, dtype: int64

In [55]:
clean = Path("results/pillar3_clean.txt").read_text(encoding="utf-8")  

def seen(tok, txt):
    import re
    patt = re.compile(rf"\b{re.escape(tok)}\b", re.I)
    return bool(patt.search(txt))

for tok in ["LCR","NSFR","RWA","CET1","credit risk","counterparty"]:
    print(
        tok,
        "→ in dictionary:", dict_df["Token"].str.fullmatch(tok, case=False).any(),
        "| in text:",       seen(tok, clean),
    )


LCR → in dictionary: True | in text: True
NSFR → in dictionary: True | in text: True
RWA → in dictionary: True | in text: True
CET1 → in dictionary: False | in text: True
credit risk → in dictionary: True | in text: True
counterparty → in dictionary: True | in text: True


In [42]:
from pathlib import Path
import pandas as pd
import re

# Path to dictionary file
DICT_PATH = Path("/Users/alaajohani/RegulatoryComplexity_Public/020_auxiliary_data/Master_clean.csv")

# Load dictionary (same logic as before, robust to commas/semicolons)
def load_dictionary(dict_path):
    raw = dict_path.read_text(encoding="utf-8", errors="ignore")
    first = next((ln for ln in raw.splitlines() if ln.strip()), "")
    sep = ";" if first.count(";") >= first.count(",") else ","
    try:
        df = pd.read_csv(dict_path, sep=sep, header=0, dtype=str, encoding="utf-8", on_bad_lines="skip", engine="python")
    except Exception:
        df = pd.read_csv(dict_path, sep=sep, header=None, names=["Token","Type"], dtype=str, encoding="utf-8", on_bad_lines="skip", engine="python")
    cols = {c.lower().strip(): c for c in df.columns}
    if "token" in cols and "type" in cols:
        df = df[[cols["token"], cols["type"]]].rename(columns={cols["token"]: "Token", cols["type"]: "Type"})
    else:
        df = df.iloc[:, :2]; df.columns = ["Token","Type"]
    df["Token"] = df["Token"].astype(str).str.strip()
    return df.dropna().drop_duplicates()

dict_df = load_dictionary(DICT_PATH)
print("Dictionary loaded:", len(dict_df), "entries")

# Load your cleaned Pillar 3 text
clean_text = Path("results/pillar3_clean.txt").read_text(encoding="utf-8")

# Helper: check if token is present in text
def seen(tok, txt):
    patt = re.compile(rf"\b{re.escape(tok)}\b", re.I)
    return bool(patt.search(txt))

# Terms to check
terms = ["LCR", "NSFR", "RWA", "CET1", "credit risk", "counterparty"]

# Report coverage
for tok in terms:
    in_dict = dict_df["Token"].str.fullmatch(tok, case=False).any()
    in_text = seen(tok, clean_text)
    print(f"{tok:<12} → in dictionary: {in_dict:<5} | in text: {in_text}")

Dictionary loaded: 0 entries
LCR          → in dictionary: 0     | in text: True
NSFR         → in dictionary: 0     | in text: True
RWA          → in dictionary: 0     | in text: True
CET1         → in dictionary: 0     | in text: True
credit risk  → in dictionary: 0     | in text: True
counterparty → in dictionary: 0     | in text: True


In [44]:
from pathlib import Path
import pandas as pd
import re

print("CWD:", Path.cwd())

# --- 1) Diagnose path visibility
ABS_DICT = Path("/Users/alaajohani/RegulatoryComplexity_Public/020_auxiliary_data/Master_clean.csv")
print("Absolute path exists?", ABS_DICT.exists())

# --- 2) Helper: robust loader (works for Master_clean.csv or MasterDictionary_v1.0.csv)
def load_dictionary(dict_path: Path) -> pd.DataFrame:
    raw = dict_path.read_text(encoding="utf-8", errors="ignore")
    first = next((ln for ln in raw.splitlines() if ln.strip()), "")
    sep = ";" if first.count(";") >= first.count(",") else ","
    try:
        df = pd.read_csv(dict_path, sep=sep, header=0, dtype=str, encoding="utf-8",
                         on_bad_lines="skip", engine="python")
    except Exception:
        df = pd.read_csv(dict_path, sep=sep, header=None, names=["Token","Type"], dtype=str,
                         encoding="utf-8", on_bad_lines="skip", engine="python")
    cols = {c.lower().strip(): c for c in df.columns}
    if "token" in cols and "type" in cols:
        df = df[[cols["token"], cols["type"]]].rename(columns={cols["token"]: "Token",
                                                               cols["type"]: "Type"})
    else:
        df = df.iloc[:, :2]; df.columns = ["Token","Type"]
    df["Token"] = df["Token"].astype(str).str.strip()
    return df.dropna().drop_duplicates()

# --- 3) Auto-locate the dictionary if the absolute path isn't reachable
def find_dictionary() -> Path:
    candidates = []
    roots = [
        Path.cwd(),
        Path.cwd() / "results",
        Path.cwd() / "020_auxiliary_data",
        Path.cwd().parent,                        # project root if you're in a subdir
        Path.home() / "RegulatoryComplexity_Public",      # your home clone
        Path.home() / "RegulatoryComplexity_Public" / "020_auxiliary_data"
    ]
    patterns = [
        "Master_clean.csv",
        "MasterDictionary_v1.0.csv",
        "*Master*clean*.csv",
        "*Master*Dictionary*.csv",
    ]
    for r in roots:
        if r.exists():
            for pat in patterns:
                candidates += list(r.rglob(pat))
    if not candidates:
        raise FileNotFoundError(
            "Could not find a master dictionary. "
            "Put Master_clean.csv or MasterDictionary_v1.0.csv in ./020_auxiliary_data or provide DICT_PATH."
        )
    # Prefer the cleaned version if both exist
    candidates_sorted = sorted(candidates, key=lambda p: ("clean" not in p.name.lower(), len(str(p))))
    best = candidates_sorted[0]
    print("Using dictionary at:", best)
    return best

# --- 4) Pick the path to use
DICT_PATH = ABS_DICT if ABS_DICT.exists() else find_dictionary()

# --- 5) Load dictionary
dict_df = load_dictionary(DICT_PATH)
print("Dictionary loaded:", len(dict_df), "entries")

# --- 6) Coverage check (same as before)
clean_text = Path("results/pillar3_clean.txt").read_text(encoding="utf-8")

def seen(tok, txt):
    patt = re.compile(rf"\b{re.escape(tok)}\b", re.I)
    return bool(patt.search(txt))

terms = ["LCR","NSFR","RWA","CET1","credit risk","counterparty"]
for tok in terms:
    in_dict = dict_df["Token"].str.fullmatch(tok, case=False).any()
    in_text = seen(tok, clean_text)
    print(f"{tok:<12} → in dictionary: {in_dict:<5} | in text: {in_text}")

CWD: /Users/alaajohani/RegulatoryComplexity_Public
Absolute path exists? True
Dictionary loaded: 0 entries
LCR          → in dictionary: 0     | in text: True
NSFR         → in dictionary: 0     | in text: True
RWA          → in dictionary: 0     | in text: True
CET1         → in dictionary: 0     | in text: True
credit risk  → in dictionary: 0     | in text: True
counterparty → in dictionary: 0     | in text: True


In [48]:
from pathlib import Path

DICT_PATH = Path("/Users/alaajohani/RegulatoryComplexity_Public/020_auxiliary_data/Master_clean.csv")

# --- show first 20 lines 
with open(DICT_PATH, encoding="utf-8", errors="ignore") as f:
    for i in range(20):
        print(f.readline().rstrip())


"$1,000"	"Attributes"
"$1,000,000"	"Attributes"
"$1,000,000,000"	"Attributes"
"$1,000,000,000,000"	"Attributes"
"$1,244,000,000"	"Attributes"
"$1,259,000,000"	"Attributes"
"$1,300,000,000"	"Attributes"
"$1,500,000,000"	"Attributes"
"$1,750,000,000"	"Attributes"
"$10,000"	"Attributes"
"$10,000,000,000"	"Attributes"
"$10.00"	"Attributes"
"$100"	"Attributes"
"$100,000"	"Attributes"
"$100,000,000"	"Attributes"
"$140,000"	"Attributes"
"$15,000,000,000"	"Attributes"
"$150"	"Attributes"
"$150,000"	"Attributes"
"$150,000,000"	"Attributes"


In [46]:
from pathlib import Path
import pandas as pd, re

DICT_PATH = Path("/Users/alaajohani/RegulatoryComplexity_Public/020_auxiliary_data/Master_clean.csv")

def load_master_clean_quoted(path: Path) -> pd.DataFrame:
    rows = []
    with open(path, encoding="utf-8", errors="ignore") as f:
        for ln in f:
            # grab all "..."; works for lines like: "1,000"    "Attributes"
            parts = re.findall(r'"([^"]*)"', ln)
            if len(parts) >= 2:
                token, typ = parts[0], parts[1]
                rows.append((token.strip(), typ.strip()))
    if not rows:
        raise ValueError("Parsed 0 rows. Check file format/path.")
    df = pd.DataFrame(rows, columns=["Token","Type"])
    # clean & de-duplicate
    df["Token"] = df["Token"].astype(str).str.strip()
    df["Type"]  = df["Type"].astype(str).str.strip()
    return df.dropna().drop_duplicates()

dict_df = load_master_clean_quoted(DICT_PATH)
print("Dictionary loaded:", len(dict_df), "entries")
print(dict_df.head(10))

Dictionary loaded: 14939 entries
                Token        Type
0              $1,000  Attributes
1          $1,000,000  Attributes
2      $1,000,000,000  Attributes
3  $1,000,000,000,000  Attributes
4      $1,244,000,000  Attributes
5      $1,259,000,000  Attributes
6      $1,300,000,000  Attributes
7      $1,500,000,000  Attributes
8      $1,750,000,000  Attributes
9             $10,000  Attributes


In [47]:
from pathlib import Path
import re

clean_text = Path("results/pillar3_clean.txt").read_text(encoding="utf-8")

def seen(tok, txt):
    patt = re.compile(rf"\b{re.escape(tok)}\b", re.I)
    return bool(patt.search(txt))

terms = ["LCR","NSFR","RWA","CET1","credit risk","counterparty"]
for tok in terms:
    in_dict = dict_df["Token"].str.fullmatch(tok, case=False).any()
    in_text = seen(tok, clean_text)
    print(f"{tok:<12} → in dictionary: {in_dict:<5} | in text: {in_text}")


LCR          → in dictionary: 1     | in text: True
NSFR         → in dictionary: 1     | in text: True
RWA          → in dictionary: 1     | in text: True
CET1         → in dictionary: 0     | in text: True
credit risk  → in dictionary: 1     | in text: True
counterparty → in dictionary: 1     | in text: True
