In [5]:
import os
from pathlib import Path

PDF_PATH = Path("data/annual-report-ubs-group-2024.pdf")

assert PDF_PATH.exists(), f"PDF not found at: {PDF_PATH.resolve()}"
PDF_PATH

PosixPath('data/annual-report-ubs-group-2024.pdf')

In [6]:
!pip -q install pypdf

In [7]:
from pypdf import PdfReader

reader = PdfReader(str(PDF_PATH))
n_pages = len(reader.pages)
n_pages

395

In [8]:
pages = []
for i, page in enumerate(reader.pages):
    text = page.extract_text() or ""
    pages.append({"page": i + 1, "text": text})

total_chars = sum(len(p["text"]) for p in pages)
empty_pages = sum(1 for p in pages if len(p["text"].strip()) == 0)

print("Pages:", n_pages)
print("Total characters extracted:", total_chars)
print("Empty pages:", empty_pages)

Pages: 395
Total characters extracted: 1736065
Empty pages: 1


In [9]:
# Show a sample of extracted text from a few pages
for idx in [0, 1, 2, n_pages//2, n_pages-1]:
    snippet = pages[idx]["text"].strip().replace("\n", " ")
    print(f"\n--- Page {pages[idx]['page']} ---")
    print(snippet[:800] if snippet else "[NO TEXT EXTRACTED]")


--- Page 1 ---
Annual Report  2024 UBS Group

--- Page 2 ---
Our external reporting approach The scope and content of our external reports are determined by Swiss legal and regulatory requirements, accounting  standards, relevant stock and debt listing rules, including regulations promulgated by the Swiss Financial Market  Supervisory Authority (FINMA), the SIX Swiss Exchange, the US Securities and Exchange Commission (the SEC) and other  regulatory requirements, as well as by our financial reporting policies. At the center of our external reporting approach is the annual report of the UBS Group, which consists of disclosures for  UBS Group AG and its consolidated subsidiaries. We also provide a separate annual report for UBS AG on a sub- consolidated basis. Both of the aforementioned annual reports are the basis for the corresponding 2024 SEC Form 

--- Page 3 ---
Contents 2 Letter to shareholders 8 Our key figures 10 Our Board of Directors 12 Our Group Executive Board 14 Our evoluti

In [10]:
import re

def page_has_keywords(text: str, keywords):
    t = text.lower()
    return any(k.lower() in t for k in keywords)

keywords = ["risk", "outlook", "forward-looking", "uncertainty", "regulatory"]
candidate_pages = [p["page"] for p in pages if page_has_keywords(p["text"], keywords)]

print("Candidate pages count:", len(candidate_pages))
print("First 30 candidate pages:", candidate_pages[:30])

Candidate pages count: 295
First 30 candidate pages: [2, 3, 4, 5, 6, 10, 12, 13, 14, 15, 16, 17, 18, 19, 21, 24, 26, 28, 29, 30, 31, 32, 35, 36, 37, 40, 41, 42, 43, 44]


In [11]:
import re

anchors = [
    r"\brisk factors\b",
    r"\brisk management\b",
    r"\bprincipal risks\b",
    r"\bkey risks\b",
    r"\bforward[-\s]?looking statements\b",
]

hits = []
for p in pages:
    text = (p["text"] or "").lower()
    for a in anchors:
        if re.search(a, text):
            hits.append((p["page"], a))
            break

hits[:30], len(hits)

([(3, '\\brisk factors\\b'),
  (4, '\\brisk management\\b'),
  (5, '\\brisk management\\b'),
  (16, '\\brisk factors\\b'),
  (17, '\\brisk management\\b'),
  (18, '\\brisk management\\b'),
  (21, '\\brisk management\\b'),
  (29, '\\brisk management\\b'),
  (40, '\\brisk management\\b'),
  (41, '\\brisk factors\\b'),
  (43, '\\brisk factors\\b'),
  (45, '\\brisk management\\b'),
  (50, '\\brisk factors\\b'),
  (51, '\\brisk factors\\b'),
  (52, '\\brisk factors\\b'),
  (53, '\\brisk factors\\b'),
  (54, '\\brisk factors\\b'),
  (55, '\\brisk factors\\b'),
  (56, '\\brisk factors\\b'),
  (57, '\\brisk factors\\b'),
  (58, '\\brisk factors\\b'),
  (59, '\\brisk factors\\b'),
  (60, '\\brisk factors\\b'),
  (61, '\\brisk factors\\b'),
  (62, '\\brisk factors\\b'),
  (63, '\\brisk factors\\b'),
  (64, '\\brisk factors\\b'),
  (69, '\\brisk factors\\b'),
  (70, '\\brisk factors\\b'),
  (73, '\\brisk management\\b')],
 127)

In [12]:
# Show unique pages where anchors appear (first 50)
unique_pages = sorted({h[0] for h in hits})
unique_pages[:50], len(unique_pages)

([3,
  4,
  5,
  16,
  17,
  18,
  21,
  29,
  40,
  41,
  43,
  45,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  69,
  70,
  73,
  75,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106],
 127)

In [13]:
# Inspect a few candidate pages to identify the true section start
for pg in unique_pages[:15]:
    txt = pages[pg-1]["text"].strip()
    print(f"\n===== PAGE {pg} =====")
    print(txt[:1200])


===== PAGE 3 =====
Contents
2 Letter to shareholders
8 Our key figures
10 Our Board of Directors
12 Our Group Executive Board
14 Our evolution
1
Our strategy, business model and 
environment
15 Integration of Credit Suisse
17 Our strategy
19 Targets, capital guidance and ambitions
20 Our businesses
29 Our environment
34 How we create value for our stakeholders
41 Regulation and supervision
46 Regulatory and legal developments
50 Risk factors
2
Financial and 
operating performance
64 Accounting and financial reporting
65 Group performance
73 Global Wealth Management
76 Personal & Corporate Banking
79 Asset Management
82 Investment Bank
84 Non-core and Legacy
86 Group Items
3
Risk, capital, liquidity and funding,
and balance sheet
88 Risk management and control
136 Capital, liquidity and funding, and balance sheet
4
Corporate governance 
and compensation
161 Corporate governance
199 Compensation
5
Financial 
statements
243 Consolidated financial statements
6
Significant regulated subsid

In [14]:
SCOPE_START = 41
SCOPE_END = 75  # inclusive

scoped_pages = [p for p in pages if SCOPE_START <= p["page"] <= SCOPE_END]

print("Scoped pages:", len(scoped_pages))
print("Scoped page range:", scoped_pages[0]["page"], "to", scoped_pages[-1]["page"])
print("Scoped characters:", sum(len(p["text"]) for p in scoped_pages))

Scoped pages: 35
Scoped page range: 41 to 75
Scoped characters: 188870


In [15]:
for pg in [41, 46, 50, 55, 64, 75]:
    txt = pages[pg-1]["text"].strip().replace("\n", " ")
    print(f"\n--- Page {pg} ---")
    print(txt[:900] if txt else "[NO TEXT EXTRACTED]")


--- Page 41 ---
Annual Report 2024 | Our strategy, business model and environment | How we create value for our stakeholders 41 Our Code of Conduct and Ethics In our Code, the BoD and the GEB set out the principles and practices that define our ethical standards, and the way we  do business, which apply to all aspects of our business. All employees must affirm annually that they have read and will  adhere to the Code and other key policies, supporting a culture where ethical and responsible behavior is part of our  everyday operations. In our Code, we make a commitment to acting with the long term in mind and creating value for  clients, employees, communities and investors. We aspire to play our part in creating a fairer and more prosperous society,  championing a healthier environment and addressing inequalities. Every year, the BoD and the GEB conduct a review of  our Code to ensure that developments

--- Page 46 ---
Annual Report 2024 | Our strategy, business model and environment

In [16]:
def chunk_text(text: str, chunk_size: int = 1800, overlap: int = 250):
    chunks = []
    start = 0
    text = text.replace("\x00", " ")
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start += chunk_size - overlap
    return chunks

docs = []
for p in scoped_pages:
    for j, chunk in enumerate(chunk_text(p["text"])):
        docs.append({
            "page": p["page"],
            "chunk_id": f"p{p['page']}_c{j}",
            "text": chunk
        })

print("Total chunks:", len(docs))
print("Example chunk:", docs[0]["chunk_id"], "page", docs[0]["page"])
print(docs[0]["text"][:600])

Total chunks: 141
Example chunk: p41_c0 page 41
Annual Report 2024 | Our strategy, business model and environment | How we create value for our stakeholders 41
Our Code of Conduct and Ethics
In our Code, the BoD and the GEB set out the principles and practices that define our ethical standards, and the way we 
do business, which apply to all aspects of our business. All employees must affirm annually that they have read and will 
adhere to the Code and other key policies, supporting a culture where ethical and responsible behavior is part of our 
everyday operations. In our Code, we make a commitment to acting with the long term in mind and


In [17]:
!pip -q install sentence-transformers numpy

In [18]:
import numpy as np
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

texts = [d["text"] for d in docs]
embeddings = embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

embeddings.shape

  from .autonotebook import tqdm as notebook_tqdm


(141, 384)

In [19]:
def retrieve(query: str, top_k: int = 5):
    q_emb = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
    scores = embeddings @ q_emb  # cosine similarity because normalized
    top_idx = np.argsort(-scores)[:top_k]

    results = []
    for idx in top_idx:
        results.append({
            "score": float(scores[idx]),
            "page": docs[idx]["page"],
            "chunk_id": docs[idx]["chunk_id"],
            "text": docs[idx]["text"]
        })
    return results

In [20]:
test_queries = [
    "What does UBS say about regulatory fragmentation and how it expects regulation to evolve in 2025?",
    "What heightened risks does UBS mention related to the acquisition of Credit Suisse?",
    "What does the report say about risks that are not currently considered material or known?",
    "What are 'critical accounting estimates and judgments' and why do they matter under IFRS?",
    "Where does UBS mention litigation risk and regulatory scrutiny in relation to Credit Suisse integration?"
]

for q in test_queries:
    print("\n" + "="*120)
    print("QUERY:", q)
    results = retrieve(q, top_k=3)
    for r in results:
        print(f"\n  -> score={r['score']:.3f} | page={r['page']} | {r['chunk_id']}")
        print(r["text"][:700].replace("\n", " "))


QUERY: What does UBS say about regulatory fragmentation and how it expects regulation to evolve in 2025?

  -> score=0.614 | page=46 | p46_c0
Annual Report 2024 | Our strategy, business model and environment | Regulation and supervision 46 In 2025, various jurisdictions, including the EU, UK, and US, are shifting their stated policy and regulatory approaches  toward promoting a growth- and competitiveness-focused agenda, with related measures to simplify and boost the  framework conditions, while other jurisdictions, including Switzerland, remain focused on strengthening their regulatory  environment as a consequence of the events in March 2023. This adds to the ongoing trend of regulatory fragmentation.  However, we believe the continued adaptations made to our business model and our proactive management of  regulatory change put

  -> score=0.614 | page=48 | p48_c4
pean Commission announced an intention to streamline and simplify sustainability  regulations, including the Taxonomy R

In [21]:
import os
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [22]:
SYSTEM_PROMPT = """You are a careful analyst.
Answer ONLY using the provided context.
If the context does not contain the answer, say: "Not found in the provided pages."
For every factual claim, include citations like (page 46).
Do not add outside knowledge.
"""

def build_context(results, max_chars=7000):
    parts, total = [], 0
    for r in results:
        chunk = r["text"].strip().replace("\n", " ")
        header = f"[page {r['page']} | {r['chunk_id']} | score={r['score']:.3f}] "
        piece = header + chunk
        if total + len(piece) > max_chars:
            break
        parts.append(piece)
        total += len(piece)
    return "\n\n".join(parts)

def answer_with_citations(question: str, top_k: int = 5, model: str = "gpt-4o-mini"):
    results = retrieve(question, top_k=top_k)
    context = build_context(results)

    user_prompt = f"""Question: {question}

Context:
{context}

Write 5-8 sentences. Every sentence with a factual claim MUST include a citation like (page 46).
"""
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.1,
    )
    return resp.choices[0].message.content, results

In [24]:
portfolio_questions = [
    "What does UBS say about regulatory fragmentation and how it expects regulation to evolve in 2025?",
    "What heightened risks does UBS mention related to the acquisition of Credit Suisse?",
    "What does UBS say about risks that may only become apparent with hindsight?",
    "What are 'critical accounting estimates and judgments' and why do they matter under IFRS?",
    "Where does UBS mention litigation risk and regulatory scrutiny related to the Credit Suisse integration?"
]

    
def format_sources(sources, max_sources=5):
    lines = []
    for s in sources[:max_sources]:
        lines.append(f"- page {s['page']} | {s['chunk_id']} | score={s['score']:.3f}")
    return "\n".join(lines)

for q in portfolio_questions:
    print("\n" + "="*120)
    print("Q:", q)
    ans, sources = answer_with_citations(q, top_k=5)
    print("\nANSWER:\n", ans)
    print("\nSOURCES:\n", format_sources(sources))


Q: What does UBS say about regulatory fragmentation and how it expects regulation to evolve in 2025?

ANSWER:
 UBS indicates that in 2025, various jurisdictions, including the EU, UK, and US, are shifting their regulatory approaches towards promoting a growth- and competitiveness-focused agenda, while Switzerland remains focused on strengthening its regulatory environment (page 46). This situation contributes to ongoing regulatory fragmentation (page 46). UBS believes that its proactive management of regulatory change positions it well to handle upcoming challenges in the regulatory environment (page 46). However, it also notes that Switzerland is expected to introduce proposals for regulatory changes in 2025, particularly in response to the failure of Credit Suisse, which may include adjustments to capital and liquidity requirements (page 51). These changes could place UBS at a competitive disadvantage compared to institutions subject to less stringent regulations (page 51). Addition