In [4]:
# Imports and Configuration
from __future__ import annotations

import os
import json
import time
from typing import Dict, List

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm.auto import tqdm
from bs4 import BeautifulSoup

# ---- Configuration ----
DATA_DIR = "data_copom"
RAW_DIR = os.path.join(DATA_DIR, "raw_json")
os.makedirs(RAW_DIR, exist_ok=True)

# BCB API endpoints
BASE = "https://www.bcb.gov.br/api/servico/sitebcb/copom"
URL_LIST = f"{BASE}/comunicados"           # ?quantidade=N
URL_DET = f"{BASE}/comunicados_detalhes"   # ?nro_reuniao=255

# How many meetings to fetch (increase as needed)
N_MEETINGS = 100

In [5]:
# Session setup with retry logic
session = requests.Session()
session.headers.update({"User-Agent": "copom-parser-dec2025/1.0"})

retries = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504]
)
session.mount("https://", HTTPAdapter(max_retries=retries))


def _cache_path(nro_reuniao: int) -> str:
    """Return the local cache path for a given meeting number."""
    return os.path.join(RAW_DIR, f"comunicado_{nro_reuniao}.json")


def list_comunicados(quantidade: int = 50) -> pd.DataFrame:
    """Fetch list of available COPOM statements."""
    r = session.get(URL_LIST, params={"quantidade": int(quantidade)}, timeout=30)
    r.raise_for_status()
    j = r.json()
    return pd.DataFrame(j["conteudo"])


def get_comunicado_detalhe(
    nro_reuniao: int,
    use_cache: bool = True,
    sleep_s: float = 0.2
) -> Dict:
    """Fetch detailed statement for a specific meeting, with local caching."""
    path = _cache_path(nro_reuniao)
    
    if use_cache and os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    
    r = session.get(URL_DET, params={"nro_reuniao": int(nro_reuniao)}, timeout=30)
    r.raise_for_status()
    j = r.json()["conteudo"][0]
    
    with open(path, "w", encoding="utf-8") as f:
        json.dump(j, f, ensure_ascii=False, indent=2)
    
    time.sleep(sleep_s)
    return j

In [6]:
# Fetch list of meetings
df_list = list_comunicados(N_MEETINGS)
df_list["nro_reuniao"] = df_list["nro_reuniao"].astype(int)
df_list["dataReferencia"] = pd.to_datetime(df_list["dataReferencia"])

print(f"Found {len(df_list)} meetings available")
print(f"Date range: {df_list['dataReferencia'].min()} to {df_list['dataReferencia'].max()}")
df_list.head()

Found 100 meetings available
Date range: 2013-07-10 00:00:00 to 2025-12-10 00:00:00


Unnamed: 0,nro_reuniao,dataReferencia,titulo
0,275,2025-12-10,275ª reunião - Copom mantém a taxa Selic em 15...
1,274,2025-11-05,274ª reunião - Copom mantém a taxa Selic em 15...
2,273,2025-09-17,273ª reunião - Copom mantém a taxa Selic em 15...
3,272,2025-07-30,272ª reunião - Copom mantém a taxa Selic em 15...
4,271,2025-06-18,271ª reunião - Copom eleva a taxa Selic para 1...


In [7]:
# Fetch detailed statements for all meetings (uses cache when available)
rows = []
for nro in tqdm(df_list["nro_reuniao"].tolist(), desc="Fetching statements"):
    rows.append(get_comunicado_detalhe(nro, use_cache=True))

df_meetings = pd.DataFrame(rows)
df_meetings["nro_reuniao"] = df_meetings["nro_reuniao"].astype(int)
df_meetings["dataReferencia"] = pd.to_datetime(df_meetings["dataReferencia"])
df_meetings = df_meetings.sort_values("dataReferencia").reset_index(drop=True)

print(f"Loaded {len(df_meetings)} meeting statements")
df_meetings[["nro_reuniao", "dataReferencia", "titulo"]].tail(10)

Fetching statements: 100%|██████████| 100/100 [00:42<00:00,  2.37it/s]

Loaded 100 meeting statements





Unnamed: 0,nro_reuniao,dataReferencia,titulo
90,266,2024-11-06,"Copom eleva a taxa Selic para 11,25% a.a."
91,267,2024-12-11,"Copom eleva a taxa Selic para 12,25% a.a."
92,268,2025-01-29,"Copom eleva a taxa Selic para 13,25% a.a."
93,269,2025-03-19,"Copom eleva a taxa Selic para 14,25% a.a."
94,270,2025-05-07,"Copom eleva a taxa Selic para 14,75% a.a."
95,271,2025-06-18,"Copom eleva a taxa Selic para 15,00% a.a."
96,272,2025-07-30,"Copom mantém a taxa Selic em 15,00% a.a."
97,273,2025-09-17,"Copom mantém a taxa Selic em 15,00% a.a."
98,274,2025-11-05,"Copom mantém a taxa Selic em 15,00% a.a."
99,275,2025-12-10,"Copom mantém a taxa Selic em 15,00% a.a."


In [8]:
# Quick summary of the data
print("Columns available:")
print(df_meetings.columns.tolist())
print(f"\nTotal meetings: {len(df_meetings)}")
print(f"Date range: {df_meetings['dataReferencia'].min().date()} to {df_meetings['dataReferencia'].max().date()}")

Columns available:
['nro_reuniao', 'dataReferencia', 'titulo', 'textoComunicado']

Total meetings: 100
Date range: 2013-07-10 to 2025-12-10


In [9]:
# Example: View a sample statement
sample = df_meetings.iloc[-1]  # Most recent
print(f"Meeting #{sample['nro_reuniao']} - {sample['dataReferencia'].date()}")
print(f"Title: {sample['titulo']}")
print("\n" + "="*60 + "\n")
print(sample["textoComunicado"][:2000] + "...")

Meeting #275 - 2025-12-10
Title: Copom mantém a taxa Selic em 15,00% a.a.


<div class="ExternalClassCDCBFF1C33134307B946FED3C094CD9C"><p style="text-align&#58;justify;">​O ambiente externo ainda se mantém incerto em função da conjuntura e da política econômica nos Estados Unidos, com reflexos nas condições financeiras globais. Tal cenário exige cautela por parte de países emergentes em ambiente marcado por tensão geopolítica.<br></p><p style="text-align&#58;justify;">Em relação ao cenário doméstico, o conjunto dos indicadores segue apresentando, conforme esperado, trajetória de moderação no crescimento da atividade econômica, como observado na última divulgação do PIB, enquanto o mercado de trabalho mostra resiliência. Nas divulgações mais recentes, a inflação cheia e as medidas subjacentes seguiram apresentando algum arrefecimento, mas mantiveram-se acima da meta para a inflação.</p><p style="text-align&#58;justify;">As expectativas de inflação para 2025 e 2026 apuradas pela pesquisa

---

## Your Analysis Starts Here

The `df_meetings` DataFrame contains all fetched COPOM statements with columns:
- `nro_reuniao`: Meeting number
- `dataReferencia`: Meeting date
- `titulo`: Title
- `textoComunicado`: Full statement text (HTML or plain text)

Add your custom analysis below!

In [None]:
# Your analysis code here

In [10]:
# Extract tables from HTML statements using BeautifulSoup

def extract_tables_from_html(html_text: str) -> List[pd.DataFrame]:
    """Parse HTML and extract all <table> elements as DataFrames."""
    soup = BeautifulSoup(html_text, "lxml")
    tables = soup.find_all("table")
    
    dfs = []
    for table in tables:
        try:
            # pandas can parse HTML tables directly
            df = pd.read_html(str(table))[0]
            dfs.append(df)
        except Exception as e:
            print(f"  Warning: Could not parse table - {e}")
    return dfs


def has_table(html_text: str) -> bool:
    """Check if HTML contains any <table> element."""
    soup = BeautifulSoup(html_text, "lxml")
    return len(soup.find_all("table")) > 0


# Check which meetings have tables
meetings_with_tables = []
for _, row in df_meetings.iterrows():
    html = row["textoComunicado"]
    if has_table(html):
        meetings_with_tables.append({
            "nro_reuniao": row["nro_reuniao"],
            "dataReferencia": row["dataReferencia"],
            "titulo": row["titulo"],
        })

df_with_tables = pd.DataFrame(meetings_with_tables)
print(f"Meetings with tables: {len(df_with_tables)} out of {len(df_meetings)}")
df_with_tables

Meetings with tables: 11 out of 100


Unnamed: 0,nro_reuniao,dataReferencia,titulo
0,265,2024-09-18,"Copom eleva a taxa Selic para 10,75% a.a."
1,266,2024-11-06,"Copom eleva a taxa Selic para 11,25% a.a."
2,267,2024-12-11,"Copom eleva a taxa Selic para 12,25% a.a."
3,268,2025-01-29,"Copom eleva a taxa Selic para 13,25% a.a."
4,269,2025-03-19,"Copom eleva a taxa Selic para 14,25% a.a."
5,270,2025-05-07,"Copom eleva a taxa Selic para 14,75% a.a."
6,271,2025-06-18,"Copom eleva a taxa Selic para 15,00% a.a."
7,272,2025-07-30,"Copom mantém a taxa Selic em 15,00% a.a."
8,273,2025-09-17,"Copom mantém a taxa Selic em 15,00% a.a."
9,274,2025-11-05,"Copom mantém a taxa Selic em 15,00% a.a."


In [22]:
# Extract and display tables from a specific meeting

def show_meeting_tables(nro_reuniao: int) -> List[pd.DataFrame]:
    """Extract and display all tables from a specific meeting."""
    row = df_meetings[df_meetings["nro_reuniao"] == nro_reuniao].iloc[0]
    html = row["textoComunicado"]
    
    print(f"Meeting #{nro_reuniao} - {row['dataReferencia'].date()}")
    print(f"Title: {row['titulo']}")
    print("=" * 60)
    
    tables = extract_tables_from_html(html)
    if not tables:
        print("No tables found in this statement.")
        return []
    
    for i, df_table in enumerate(tables):
        print(f"\nTable {i+1}:")
        display(df_table)
    
    return tables


# Example: View tables from the most recent meeting
latest_nro = df_meetings["nro_reuniao"].max()
#comunicado = 264
tables = show_meeting_tables(latest_nro)

Meeting #275 - 2025-12-10
Title: Copom mantém a taxa Selic em 15,00% a.a.

Table 1:


  df = pd.read_html(str(table))[0]


Unnamed: 0,0,1,2,3
0,Índice de preços,2025,2026,2º tri 2027
1,IPCA,44,35,32
2,IPCA livres,40,36,32
3,IPCA administrados,53,32,34


In [23]:
# Build a consolidated DataFrame with all tables from all meetings

all_table_rows = []

for _, row in tqdm(df_meetings.iterrows(), total=len(df_meetings), desc="Extracting tables"):
    html = row["textoComunicado"]
    tables = extract_tables_from_html(html)
    
    for table_idx, df_table in enumerate(tables):
        # Add metadata to each table
        df_table = df_table.copy()
        df_table["_nro_reuniao"] = row["nro_reuniao"]
        df_table["_dataReferencia"] = row["dataReferencia"]
        df_table["_table_idx"] = table_idx
        all_table_rows.append(df_table)

if all_table_rows:
    df_all_tables = pd.concat(all_table_rows, ignore_index=True)
    print(f"Total tables extracted: {len(all_table_rows)}")
    print(f"Total rows across all tables: {len(df_all_tables)}")
    print(f"\nColumns: {df_all_tables.columns.tolist()}")
    df_all_tables.head(20)
else:
    print("No tables found in any meeting.")

  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
Extracting tables: 100%|██████████| 100/100 [00:00<00:00, 741.93it/s]

Total tables extracted: 11
Total rows across all tables: 44

Columns: [0, 1, 2, 3, '_nro_reuniao', '_dataReferencia', '_table_idx']





In [21]:
# Save the consolidated tables for later use

if all_table_rows:
    # Convert all columns to string to avoid type conflicts 
    # (tables have mixed types like "3º tri 2026" and numeric values)
    df_export = df_all_tables.copy()
    for col in df_export.columns:
        if col not in ["_dataReferencia"]:  # Keep datetime as-is
            df_export[col] = df_export[col].astype(str)
    
    output_path = os.path.join(DATA_DIR, "copom_tables_consolidated.parquet")
    df_export.to_parquet(output_path, index=False)
    print(f"Saved to: {os.path.abspath(output_path)}")
    
    # Also save as CSV for easy inspection
    csv_path = os.path.join(DATA_DIR, "copom_tables_consolidated.csv")
    df_all_tables.to_csv(csv_path, index=False)
    print(f"Saved to: {os.path.abspath(csv_path)}")

Saved to: c:\Users\alber\projects\hello-nlp\notebooks\data_copom\copom_tables_consolidated.parquet
Saved to: c:\Users\alber\projects\hello-nlp\notebooks\data_copom\copom_tables_consolidated.csv
