In [1]:
import requests, pandas as pd
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE = "https://www.bcb.gov.br/api/servico/sitebcb/copom"

# Create session with retry logic
session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

def get_comunicados(n=50):
    url = f"{BASE}/comunicados?quantidade={n}"
    return pd.DataFrame(session.get(url, timeout=30).json()["conteudo"])

def get_comunicado_detalhe(nro_reuniao: int):
    url = f"{BASE}/comunicados_detalhes?nro_reuniao={nro_reuniao}"
    j = session.get(url, timeout=30).json()["conteudo"][0]
    return j  # includes textoComunicado

lst = get_comunicados(50)
rows = []
for r in lst["nro_reuniao"]:
    rows.append(get_comunicado_detalhe(int(r)))
    time.sleep(0.5)  # Small delay to avoid rate limiting

df_meetings = pd.DataFrame(rows)

In [2]:
# cell 1
from __future__ import annotations

import os, re, json, time
from dataclasses import dataclass
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

from bs4 import BeautifulSoup

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer

DATA_DIR = "data_copom"
RAW_DIR  = os.path.join(DATA_DIR, "raw_json")
os.makedirs(RAW_DIR, exist_ok=True)

# ---- API endpoints (from BCB open-data resources) ----
BASE = "https://www.bcb.gov.br/api/servico/sitebcb/copom"
URL_LIST = f"{BASE}/comunicados"             # ?quantidade=50
URL_DET  = f"{BASE}/comunicados_detalhes"    # ?nro_reuniao=255

# How many meetings?
N_MEETINGS = 50

# Embedding model: pick a strong multilingual encoder.
# Good defaults:
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (solid, common)
# - "intfloat/multilingual-e5-large" (often stronger retrieval embeddings; slower/heavier)
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Paragraph-unit splitting controls
SPLIT_ENUMERATIONS = True   # split "(i) ... (ii) ... (iii) ..." into separate units
SPLIT_BULLETS      = True   # split "• item • item" into separate units (also handles bullet lines)
MAX_CHUNK_CHARS    = 700    # chunk long units for embeddings to reduce truncation

# Clustering controls
MERGE_SIM_THRESHOLD = 0.92  # merge clusters whose centroids are very similar (style changes)
TOP2_MARGIN         = 0.04  # allow 2 themes if top1-top2 similarity is within this margin
LOWCONF_SIM         = 0.30  # mark low-confidence assignments
RANDOM_SEED         = 42


  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# cell 2
session = requests.Session()
session.headers.update({"User-Agent": "copom-theme-notebook/1.0"})

def _cache_path(nro_reuniao: int) -> str:
    return os.path.join(RAW_DIR, f"comunicado_{nro_reuniao}.json")

def list_comunicados(quantidade: int = 50) -> pd.DataFrame:
    r = session.get(URL_LIST, params={"quantidade": int(quantidade)}, timeout=30)
    r.raise_for_status()
    j = r.json()
    return pd.DataFrame(j["conteudo"])

def get_comunicado_detalhe(nro_reuniao: int, use_cache: bool = True, sleep_s: float = 0.15) -> Dict:
    path = _cache_path(nro_reuniao)
    if use_cache and os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    r = session.get(URL_DET, params={"nro_reuniao": int(nro_reuniao)}, timeout=30)
    r.raise_for_status()
    j = r.json()["conteudo"][0]

    with open(path, "w", encoding="utf-8") as f:
        json.dump(j, f, ensure_ascii=False, indent=2)

    time.sleep(sleep_s)
    return j

df_list = list_comunicados(N_MEETINGS)
df_list["nro_reuniao"] = df_list["nro_reuniao"].astype(int)
df_list["dataReferencia"] = pd.to_datetime(df_list["dataReferencia"])

rows = []
for nro in tqdm(df_list["nro_reuniao"].tolist(), desc="Fetching detalhes"):
    rows.append(get_comunicado_detalhe(nro, use_cache=True))

df_meetings = pd.DataFrame(rows)
df_meetings["nro_reuniao"] = df_meetings["nro_reuniao"].astype(int)
df_meetings["dataReferencia"] = pd.to_datetime(df_meetings["dataReferencia"])
df_meetings = df_meetings.sort_values("dataReferencia").reset_index(drop=True)

df_meetings[["nro_reuniao","dataReferencia","titulo"]].tail()


Fetching detalhes: 100%|██████████| 50/50 [00:01<00:00, 48.57it/s]


Unnamed: 0,nro_reuniao,dataReferencia,titulo
45,271,2025-06-18,"Copom eleva a taxa Selic para 15,00% a.a."
46,272,2025-07-30,"Copom mantém a taxa Selic em 15,00% a.a."
47,273,2025-09-17,"Copom mantém a taxa Selic em 15,00% a.a."
48,274,2025-11-05,"Copom mantém a taxa Selic em 15,00% a.a."
49,275,2025-12-10,"Copom mantém a taxa Selic em 15,00% a.a."


In [7]:
# cell 3
ENUM_RE = re.compile(r"\(\s*([ivx]{1,5}|\d{1,2})\s*\)", flags=re.IGNORECASE)
BULLET_SPLIT_RE = re.compile(r"(?:^|\s)([•\u2022])\s+")   # "• " bullets
BULLET_LINE_RE  = re.compile(r"^\s*(?:[-–•*]|\d+[\.\)])\s+")

def strip_html(text: str) -> str:
    # Sometimes textoComunicado is plain text; this safely handles either.
    soup = BeautifulSoup(text, "lxml")
    return soup.get_text("\n")

def norm_ws(text: str) -> str:
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\u00a0", " ")
    # collapse repeated spaces, but keep newlines
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text

def split_enumerations(para: str) -> List[str]:
    """Split '(i) ... (ii) ...' paragraphs; prefix each item with header to keep context."""
    matches = list(ENUM_RE.finditer(para))
    if not (SPLIT_ENUMERATIONS and len(matches) >= 2 and len(para) >= 180):
        return [para]

    header = para[:matches[0].start()].strip().rstrip(":;,-")
    rest = para[matches[0].start():]

    out = []
    ms = list(ENUM_RE.finditer(rest))
    for k, m in enumerate(ms):
        start = m.end()
        end = ms[k+1].start() if k+1 < len(ms) else len(rest)
        item = rest[start:end].strip().strip(";").strip()
        if not item:
            continue
        idx = m.group(1).lower()
        if header:
            out.append(f"{header}: ({idx}) {item}")
        else:
            out.append(f"({idx}) {item}")
    return out if out else [para]

def split_bullets_inline(para: str) -> List[str]:
    """Split '... • item • item' inside a single line."""
    if not (SPLIT_BULLETS and "•" in para and para.count("•") >= 2):
        return [para]
    parts = [p.strip() for p in para.split("•") if p.strip()]
    header = parts[0].rstrip(":;,-")
    items = parts[1:]
    out = []
    for it in items:
        # keep context
        if header and len(header) < 180:
            out.append(f"{header}: {it}")
        else:
            out.append(it)
    return out if out else [para]

def merge_continuations(lines: List[str]) -> List[str]:
    """Merge lines that look like soft-wrapped continuations (rare, but happens)."""
    out = []
    buf = ""
    for line in lines:
        if not buf:
            buf = line
            continue
        # If previous doesn't look finished AND current doesn't look like a new paragraph starter:
        prev = buf
        looks_finished = bool(re.search(r"[.!?]\s*$", prev)) or prev.endswith("):")
        starts_new = bool(BULLET_LINE_RE.match(line)) or bool(re.match(r"^[A-ZÁÂÃÀÉÊÍÓÔÕÚÜÇ]", line))
        if (not looks_finished) and (not starts_new) and len(line) < 180:
            buf = (buf + " " + line).strip()
        else:
            out.append(buf.strip())
            buf = line
    if buf:
        out.append(buf.strip())
    return out

def is_clerical(para: str) -> bool:
    t = para.strip().lower()
    return (
        t.startswith("votaram por essa decisão") or
        t.startswith("* no cenário") or
        t.startswith("o valor para o câmbio") or
        t.startswith("* no cenário de referência")
    )

def statement_to_units(texto: str) -> Tuple[List[Dict], List[str]]:
    raw = strip_html(texto)
    txt = norm_ws(raw)

    # Coarse split: BCB often uses single newline per paragraph (as in the JSON).
    lines = [ln.strip() for ln in re.split(r"\n+", txt) if ln.strip()]
    lines = merge_continuations(lines)

    units = []
    for coarse_id, para in enumerate(lines):
        # 1) inline bullets
        parts = split_bullets_inline(para)

        # 2) enumerations inside each part
        subparts = []
        for p in parts:
            subparts.extend(split_enumerations(p))

        for sub_id, p in enumerate(subparts):
            units.append({
                "coarse_id": coarse_id,
                "sub_id": sub_id,
                "text": p.strip()
            })
    return units, lines

para_rows = []
for _, row in df_meetings.iterrows():
    units, lines = statement_to_units(row["textoComunicado"])
    for i, u in enumerate(units):
        para_rows.append({
            "nro_reuniao": row["nro_reuniao"],
            "dataReferencia": row["dataReferencia"],
            "titulo": row["titulo"],
            "para_id": i,                 # final sequential unit id within meeting
            "coarse_id": u["coarse_id"],  # original line-paragraph index
            "sub_id": u["sub_id"],        # within-paragraph split index
            "text": u["text"],
            "is_clerical": is_clerical(u["text"]),
        })

df_paras = pd.DataFrame(para_rows).sort_values(["dataReferencia","para_id"]).reset_index(drop=True)

# Sanity check: average units per statement (after splitting)
avg_units = df_paras.groupby("nro_reuniao").size().mean()
avg_units


np.float64(18.38)

In [5]:
# cell 4
def chunk_text(text: str, max_chars: int = 700) -> List[str]:
    text = text.strip()
    if len(text) <= max_chars:
        return [text]
    # Split on sentence-ish boundaries then pack
    pieces = re.split(r"(?<=[.;:!?])\s+", text)
    chunks, cur = [], ""
    for p in pieces:
        if not p:
            continue
        if len(cur) + len(p) + 1 <= max_chars:
            cur = (cur + " " + p).strip()
        else:
            if cur:
                chunks.append(cur)
            cur = p.strip()
    if cur:
        chunks.append(cur)
    return chunks if chunks else [text]

model = SentenceTransformer(MODEL_NAME)

texts = df_paras["text"].tolist()

# Build chunk index mapping
all_chunks = []
chunk_map = []   # list of list of chunk indices per paragraph
for t in texts:
    chs = chunk_text(t, max_chars=MAX_CHUNK_CHARS)
    idxs = []
    for ch in chs:
        idxs.append(len(all_chunks))
        all_chunks.append(ch)
    chunk_map.append(idxs)

chunk_emb = model.encode(
    all_chunks,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True
)

emb = np.zeros((len(texts), chunk_emb.shape[1]), dtype=np.float32)
for i, idxs in enumerate(chunk_map):
    emb[i] = chunk_emb[idxs].mean(axis=0)

# Re-normalize (mean of unit vectors isn't unit)
emb /= (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12)

# Persist
os.makedirs(DATA_DIR, exist_ok=True)
np.save(os.path.join(DATA_DIR, "embeddings.npy"), emb)
df_paras.to_parquet(os.path.join(DATA_DIR, "df_paras_raw.parquet"), index=False)

emb.shape


Batches: 100%|██████████| 15/15 [00:15<00:00,  1.06s/it]


(919, 768)

In [8]:
# cell 5
avg_units = df_paras.groupby("nro_reuniao").size().mean()
k_min = int(np.ceil(0.50 * avg_units))
k_max = int(np.ceil(2.00 * avg_units))
k_min, k_max

# Build candidate thresholds based on empirical distance distribution
rng = np.random.RandomState(RANDOM_SEED)
n = emb.shape[0]
sample_n = min(300, n)
sample_idx = rng.choice(n, size=sample_n, replace=False)
D = pairwise_distances(emb[sample_idx], metric="cosine")
vals = D[np.triu_indices_from(D, k=1)]
lo, hi = np.quantile(vals, [0.05, 0.35])
thresholds = np.linspace(lo, hi, 28)

rows = []
for t in tqdm(thresholds, desc="Threshold search"):
    try:
        m = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=float(t),
            linkage="average",
            metric="cosine",
            compute_full_tree=True,
        )
        labels = m.fit_predict(emb)
        k = len(set(labels))
        if k < 2:
            continue
        # Restrict to your expected theme-count band
        if not (k_min <= k <= k_max):
            continue
        sil = silhouette_score(emb, labels, metric="cosine")
        rows.append((t, k, sil))
    except Exception as e:
        rows.append((t, np.nan, np.nan))

df_search = pd.DataFrame(rows, columns=["threshold","k","sil"]).dropna()
df_search.sort_values("sil", ascending=False).head(10)

best = df_search.sort_values("sil", ascending=False).iloc[0]
best_t = float(best["threshold"])
best_k = int(best["k"])
best_t, best_k

Threshold search: 100%|██████████| 28/28 [00:11<00:00,  2.42it/s]


(0.2937389224767685, 36)

In [9]:
# cell 6
m = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=best_t,
    linkage="average",
    metric="cosine",
    compute_full_tree=True,
)
labels = m.fit_predict(emb)
df_paras["theme_id_raw"] = labels

def normalize_rows(X: np.ndarray) -> np.ndarray:
    return X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)

# Centroids
theme_ids = sorted(df_paras["theme_id_raw"].unique().tolist())
centroids = []
for tid in theme_ids:
    idx = df_paras.index[df_paras["theme_id_raw"] == tid].to_numpy()
    c = emb[idx].mean(axis=0)
    centroids.append(c)
centroids = normalize_rows(np.vstack(centroids))
S = centroids @ centroids.T  # cosine similarity

# Union-find merge where sim > threshold
parent = {i: i for i in range(len(theme_ids))}
def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x
def union(a,b):
    ra, rb = find(a), find(b)
    if ra != rb:
        parent[rb] = ra

for i in range(len(theme_ids)):
    for j in range(i+1, len(theme_ids)):
        if S[i,j] >= MERGE_SIM_THRESHOLD:
            union(i,j)

rep = [find(i) for i in range(len(theme_ids))]
# Map old theme_id_raw -> merged_id (compressed to 0..K-1)
merge_groups: Dict[int, List[int]] = {}
for pos, r in enumerate(rep):
    merge_groups.setdefault(r, []).append(pos)

merged_map = {}
new_id = 0
for root, members in merge_groups.items():
    for pos in members:
        merged_map[theme_ids[pos]] = new_id
    new_id += 1

df_paras["theme_id"] = df_paras["theme_id_raw"].map(merged_map).astype(int)
df_paras["theme_id"].nunique()

36

In [10]:
# cell 7
K = df_paras["theme_id"].nunique()
centroids = np.zeros((K, emb.shape[1]), dtype=np.float32)
for k in range(K):
    idx = df_paras.index[df_paras["theme_id"] == k].to_numpy()
    c = emb[idx].mean(axis=0)
    centroids[k] = c
centroids = normalize_rows(centroids)

sims = emb @ centroids.T
top2 = np.argsort(-sims, axis=1)[:, :2]
t1 = top2[:,0]
t2 = top2[:,1]
s1 = sims[np.arange(n), t1]
s2 = sims[np.arange(n), t2]

df_paras["theme_1"] = t1
df_paras["score_1"] = s1
df_paras["theme_2"] = t2
df_paras["score_2"] = s2

df_paras["is_lowconf"] = df_paras["score_1"] < LOWCONF_SIM
df_paras["is_multilabel"] = (df_paras["score_1"] - df_paras["score_2"]) <= TOP2_MARGIN

df_paras[["dataReferencia","nro_reuniao","para_id","theme_id","theme_1","score_1","theme_2","score_2","is_multilabel","is_lowconf"]].head()


Unnamed: 0,dataReferencia,nro_reuniao,para_id,theme_id,theme_1,score_1,theme_2,score_2,is_multilabel,is_lowconf
0,2019-10-30,226,0,33,33,1.0,35,0.429008,False,False
1,2019-10-30,226,1,4,4,0.821997,28,0.461895,False,False
2,2019-10-30,226,2,20,20,0.998384,12,0.6314,False,False
3,2019-10-30,226,3,6,6,0.921193,25,0.810556,False,False
4,2019-10-30,226,4,2,2,0.967505,1,0.788569,False,False


In [11]:
# cell 8
# Minimal PT stopwords (keep small; you can expand later)
PT_STOP = [
    "a","o","as","os","um","uma","uns","umas","de","do","da","dos","das","em","no","na","nos","nas",
    "e","ou","para","por","com","sem","que","se","ao","à","às","aos","como","mais","menos","também",
    "já","não","sim","ser","são","foi","sua","seu","suas","seus","sobre","entre","após","antes",
    "ano","anos","meses","mês","trimestre","trimestres"
]

def ctfidf_keywords(texts: List[str], labels: np.ndarray, topn: int = 12) -> Dict[int, List[Tuple[str,float]]]:
    # Count terms
    vec = CountVectorizer(
        lowercase=True,
        stop_words=PT_STOP,
        ngram_range=(1,2),
        min_df=2
    )
    X = vec.fit_transform(texts)
    vocab = np.array(vec.get_feature_names_out())

    # Aggregate per cluster
    labs = np.array(labels)
    uniq = np.unique(labs)
    agg = []
    for k in uniq:
        rows = X[labs == k]
        agg.append(np.asarray(rows.sum(axis=0)).ravel())
    agg = np.vstack(agg)  # (K, V)

    # c-TF-IDF
    tf = agg / (agg.sum(axis=1, keepdims=True) + 1e-12)
    df = (agg > 0).sum(axis=0)
    idf = np.log((1 + len(uniq)) / (1 + df)) + 1.0
    ctfidf = tf * idf

    out = {}
    for i, k in enumerate(uniq):
        top_idx = np.argsort(-ctfidf[i])[:topn]
        out[int(k)] = [(vocab[j], float(ctfidf[i, j])) for j in top_idx]
    return out

kw = ctfidf_keywords(df_paras["text"].tolist(), df_paras["theme_id"].to_numpy(), topn=12)

# Representative paragraph per theme: nearest to centroid
rep_rows = []
for k in range(K):
    idx = df_paras.index[df_paras["theme_id"] == k].to_numpy()
    sims_k = emb[idx] @ centroids[k]
    rep_i = idx[int(np.argmax(sims_k))]
    rep_rows.append({
        "theme_id": k,
        "size": len(idx),
        "keywords": ", ".join([w for w,_ in kw.get(k, [])[:8]]),
        "representative_paragraph": df_paras.loc[rep_i, "text"]
    })

df_themes = pd.DataFrame(rep_rows).sort_values("size", ascending=False).reset_index(drop=True)
df_themes.to_clipboard()
df_themes.head(10)

Unnamed: 0,theme_id,size,keywords,representative_paragraph
0,7,258,"inflação, riscos, inflacionário, cenário infla...","O Comitê ressalta que, em seus cenários para a..."
1,3,157,"inflação, taxa, focus, pesquisa focus, pesquis...",As expectativas de inflação para 2024 e 2025 a...
2,0,147,"inflação, balanço riscos, balanço, essa decisã...","Considerando o cenário básico, o balanço de ri..."
3,11,50,"votaram, seguintes membros, presidente, seguin...",Votaram por essa decisão os seguintes membros ...
4,9,33,"ipca, livres, ipca administrados, ipca livres,...","IPCA 4,4 3,5 3,2"
5,6,31,"economia, reformas, processo, brasileira, recu...",O Copom enfatiza que perseverar no processo de...
6,4,27,"copom decidiu, decidiu, reunião comitê, comitê...","Em sua 247ª reunião, o Comitê de Política Mone..."
7,20,27,"seguintes observações, descrita, observações, ...",A atualização do cenário básico do Copom pode ...
8,28,26,"valor obtido, úteis encerrados, úteis, valor, ...",*Valor obtido pelo procedimento usual de arred...
9,15,22,"inflação, relação cenário, meta inflação, domé...","Em relação ao cenário doméstico, o conjunto do..."


In [12]:
# cell 9
# Fill this as you inspect df_themes (keywords + representative paragraph).
theme_name = {
    0: 'balanco de riscos',
    11: 'votos'
}
df_paras["theme_name"] = df_paras["theme_id"].map(theme_name).fillna(df_paras["theme_id"].astype(str))
df_themes["theme_name"] = df_themes["theme_id"].map(theme_name).fillna(df_themes["theme_id"].astype(str))
df_themes.head(15)


Unnamed: 0,theme_id,size,keywords,representative_paragraph,theme_name
0,7,258,"inflação, riscos, inflacionário, cenário infla...","O Comitê ressalta que, em seus cenários para a...",7
1,3,157,"inflação, taxa, focus, pesquisa focus, pesquis...",As expectativas de inflação para 2024 e 2025 a...,3
2,0,147,"inflação, balanço riscos, balanço, essa decisã...","Considerando o cenário básico, o balanço de ri...",balanco de riscos
3,11,50,"votaram, seguintes membros, presidente, seguin...",Votaram por essa decisão os seguintes membros ...,votos
4,9,33,"ipca, livres, ipca administrados, ipca livres,...","IPCA 4,4 3,5 3,2",9
5,6,31,"economia, reformas, processo, brasileira, recu...",O Copom enfatiza que perseverar no processo de...,6
6,4,27,"copom decidiu, decidiu, reunião comitê, comitê...","Em sua 247ª reunião, o Comitê de Política Mone...",4
7,20,27,"seguintes observações, descrita, observações, ...",A atualização do cenário básico do Copom pode ...,20
8,28,26,"valor obtido, úteis encerrados, úteis, valor, ...",*Valor obtido pelo procedimento usual de arred...,28
9,15,22,"inflação, relação cenário, meta inflação, domé...","Em relação ao cenário doméstico, o conjunto do...",15


In [13]:
# cell 10
def show_theme(df: pd.DataFrame, name_or_id, only_primary: bool = True) -> pd.DataFrame:
    if isinstance(name_or_id, str):
        m = df["theme_name"] == name_or_id
    else:
        m = df["theme_id"] == int(name_or_id)
    if only_primary:
        # only those whose primary assignment matches the theme_id
        if not isinstance(name_or_id, str):
            m = m & (df["theme_1"] == int(name_or_id))
    out = (df.loc[m]
             .sort_values(["dataReferencia","para_id"])
             [["dataReferencia","nro_reuniao","para_id","coarse_id","sub_id",
               "theme_id","theme_name","score_1","theme_2","score_2","is_multilabel","is_lowconf","text"]])
    return out

# Example (after you set theme_name mapping):
df_bal = show_theme(df_paras, "balanco de riscos", only_primary=False)

df_bal.to_clipboard()
df_bal.head(30)

Unnamed: 0,dataReferencia,nro_reuniao,para_id,coarse_id,sub_id,theme_id,theme_name,score_1,theme_2,score_2,is_multilabel,is_lowconf,text
5,2019-10-30,226,5,5,0,0,balanco de riscos,0.816038,14,0.781218,True,False,O Comitê avalia que diversas medidas de inflaç...
15,2019-10-30,226,15,10,0,0,balanco de riscos,0.883396,3,0.738698,False,False,"Considerando o cenário básico, o balanço de ri..."
16,2019-10-30,226,16,11,0,0,balanco de riscos,0.809753,2,0.72592,False,False,O Copom reitera que a conjuntura econômica pre...
18,2019-10-30,226,18,13,0,0,balanco de riscos,0.931359,7,0.872078,False,False,"Na avaliação do Copom, a evolução do cenário b..."
25,2019-12-11,227,4,4,0,0,balanco de riscos,0.816038,14,0.781218,True,False,O Comitê avalia que diversas medidas de inflaç...
34,2019-12-11,227,13,9,0,0,balanco de riscos,0.897645,3,0.7541,False,False,"Considerando o cenário básico, o balanço de ri..."
35,2019-12-11,227,14,10,0,0,balanco de riscos,0.809753,2,0.72592,False,False,O Copom reitera que a conjuntura econômica pre...
37,2019-12-11,227,16,12,0,0,balanco de riscos,0.903562,7,0.826399,False,False,O Copom entende que o atual estágio do ciclo e...
54,2020-02-05,228,14,9,0,0,balanco de riscos,0.898502,3,0.76383,False,False,"Considerando o cenário básico, o balanço de ri..."
55,2020-02-05,228,15,10,0,0,balanco de riscos,0.809753,2,0.72592,False,False,O Copom reitera que a conjuntura econômica pre...


In [14]:
# cell 10 B
def theme_by_meeting(df: pd.DataFrame, name_or_id) -> pd.DataFrame:
    x = show_theme(df, name_or_id, only_primary=False)
    g = (x.groupby(["dataReferencia","nro_reuniao"])["text"]
           .apply(lambda s: "\n\n".join(s.tolist()))
           .reset_index(name="paragraphs"))
    return g.sort_values("dataReferencia")

theme_by_meeting(df_paras, "balanço de riscos").tail(10)

Unnamed: 0,dataReferencia,nro_reuniao,paragraphs
40,2024-09-18,265,"O ambiente externo permanece desafiador, em fu..."
41,2024-11-06,266,"O ambiente externo permanece desafiador, em fu..."
42,2024-12-11,267,"O ambiente externo permanece desafiador, em fu..."
43,2025-01-29,268,O ambiente externo permanece desafiador em fun...
44,2025-03-19,269,​O ambiente externo permanece desafiador em fu...
45,2025-05-07,270,em função da conjuntura e da política econômic...
46,2025-06-18,271,​O ambiente externo mantém-se adverso e partic...
47,2025-07-30,272,​O ambiente externo está mais adverso e incert...
48,2025-09-17,273,externo se mantém incerto em função da conjunt...
49,2025-11-05,274,"Os riscos para a inflação, tanto de alta quant..."


In [None]:
# cell 11 
#   export for later fine-tuning / analysis
OUT = os.path.join(DATA_DIR, "df_paras_labeled.parquet")
df_paras.to_parquet(OUT, index=False)
OUT
