In [2]:
# Imports and Configuration
from __future__ import annotations

import os
import json
import time
from typing import Dict, List

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm.auto import tqdm
from bs4 import BeautifulSoup

# ---- Configuration ----
DATA_DIR = "data_copom"
RAW_DIR = os.path.join(DATA_DIR, "raw_json")
os.makedirs(RAW_DIR, exist_ok=True)

# BCB API endpoints
BASE = "https://www.bcb.gov.br/api/servico/sitebcb/copom"
URL_LIST = f"{BASE}/comunicados"           # ?quantidade=N
URL_DET = f"{BASE}/comunicados_detalhes"   # ?nro_reuniao=255

# How many meetings to fetch (increase as needed)
N_MEETINGS = 100

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Session setup with retry logic
session = requests.Session()
session.headers.update({"User-Agent": "copom-parser-dec2025/1.0"})

retries = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504]
)
session.mount("https://", HTTPAdapter(max_retries=retries))


def _cache_path(nro_reuniao: int) -> str:
    """Return the local cache path for a given meeting number."""
    return os.path.join(RAW_DIR, f"comunicado_{nro_reuniao}.json")


def list_comunicados(quantidade: int = 50) -> pd.DataFrame:
    """Fetch list of available COPOM statements."""
    r = session.get(URL_LIST, params={"quantidade": int(quantidade)}, timeout=30)
    r.raise_for_status()
    j = r.json()
    return pd.DataFrame(j["conteudo"])


def get_comunicado_detalhe(
    nro_reuniao: int,
    use_cache: bool = True,
    sleep_s: float = 0.2
) -> Dict:
    """Fetch detailed statement for a specific meeting, with local caching."""
    path = _cache_path(nro_reuniao)
    
    if use_cache and os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    
    r = session.get(URL_DET, params={"nro_reuniao": int(nro_reuniao)}, timeout=30)
    r.raise_for_status()
    j = r.json()["conteudo"][0]
    
    with open(path, "w", encoding="utf-8") as f:
        json.dump(j, f, ensure_ascii=False, indent=2)
    
    time.sleep(sleep_s)
    return j

In [4]:
# Fetch list of meetings
df_list = list_comunicados(N_MEETINGS)
df_list["nro_reuniao"] = df_list["nro_reuniao"].astype(int)
df_list["dataReferencia"] = pd.to_datetime(df_list["dataReferencia"])

print(f"Found {len(df_list)} meetings available")
print(f"Date range: {df_list['dataReferencia'].min()} to {df_list['dataReferencia'].max()}")
df_list.head()

Found 100 meetings available
Date range: 2013-07-10 00:00:00 to 2025-12-10 00:00:00


Unnamed: 0,nro_reuniao,dataReferencia,titulo
0,275,2025-12-10,275ª reunião - Copom mantém a taxa Selic em 15...
1,274,2025-11-05,274ª reunião - Copom mantém a taxa Selic em 15...
2,273,2025-09-17,273ª reunião - Copom mantém a taxa Selic em 15...
3,272,2025-07-30,272ª reunião - Copom mantém a taxa Selic em 15...
4,271,2025-06-18,271ª reunião - Copom eleva a taxa Selic para 1...


In [5]:
# Fetch detailed statements for all meetings (uses cache when available)
rows = []
for nro in tqdm(df_list["nro_reuniao"].tolist(), desc="Fetching statements"):
    rows.append(get_comunicado_detalhe(nro, use_cache=True))

df_meetings = pd.DataFrame(rows)
df_meetings["nro_reuniao"] = df_meetings["nro_reuniao"].astype(int)
df_meetings["dataReferencia"] = pd.to_datetime(df_meetings["dataReferencia"])
df_meetings = df_meetings.sort_values("dataReferencia").reset_index(drop=True)

print(f"Loaded {len(df_meetings)} meeting statements")
df_meetings[["nro_reuniao", "dataReferencia", "titulo"]].tail(10)

Fetching statements:   0%|          | 0/100 [00:00<?, ?it/s]

Fetching statements: 100%|██████████| 100/100 [00:00<00:00, 2941.49it/s]

Loaded 100 meeting statements





Unnamed: 0,nro_reuniao,dataReferencia,titulo
90,266,2024-11-06,"Copom eleva a taxa Selic para 11,25% a.a."
91,267,2024-12-11,"Copom eleva a taxa Selic para 12,25% a.a."
92,268,2025-01-29,"Copom eleva a taxa Selic para 13,25% a.a."
93,269,2025-03-19,"Copom eleva a taxa Selic para 14,25% a.a."
94,270,2025-05-07,"Copom eleva a taxa Selic para 14,75% a.a."
95,271,2025-06-18,"Copom eleva a taxa Selic para 15,00% a.a."
96,272,2025-07-30,"Copom mantém a taxa Selic em 15,00% a.a."
97,273,2025-09-17,"Copom mantém a taxa Selic em 15,00% a.a."
98,274,2025-11-05,"Copom mantém a taxa Selic em 15,00% a.a."
99,275,2025-12-10,"Copom mantém a taxa Selic em 15,00% a.a."


In [6]:
# Quick summary of the data
print("Columns available:")
print(df_meetings.columns.tolist())
print(f"\nTotal meetings: {len(df_meetings)}")
print(f"Date range: {df_meetings['dataReferencia'].min().date()} to {df_meetings['dataReferencia'].max().date()}")

Columns available:
['nro_reuniao', 'dataReferencia', 'titulo', 'textoComunicado']

Total meetings: 100
Date range: 2013-07-10 to 2025-12-10


In [7]:
# Example: View a sample statement
sample = df_meetings.iloc[-1]  # Most recent
print(f"Meeting #{sample['nro_reuniao']} - {sample['dataReferencia'].date()}")
print(f"Title: {sample['titulo']}")
print("\n" + "="*60 + "\n")
print(sample["textoComunicado"][:2000] + "...")

Meeting #275 - 2025-12-10
Title: Copom mantém a taxa Selic em 15,00% a.a.


<div class="ExternalClassCDCBFF1C33134307B946FED3C094CD9C"><p style="text-align&#58;justify;">​O ambiente externo ainda se mantém incerto em função da conjuntura e da política econômica nos Estados Unidos, com reflexos nas condições financeiras globais. Tal cenário exige cautela por parte de países emergentes em ambiente marcado por tensão geopolítica.<br></p><p style="text-align&#58;justify;">Em relação ao cenário doméstico, o conjunto dos indicadores segue apresentando, conforme esperado, trajetória de moderação no crescimento da atividade econômica, como observado na última divulgação do PIB, enquanto o mercado de trabalho mostra resiliência. Nas divulgações mais recentes, a inflação cheia e as medidas subjacentes seguiram apresentando algum arrefecimento, mas mantiveram-se acima da meta para a inflação.</p><p style="text-align&#58;justify;">As expectativas de inflação para 2025 e 2026 apuradas pela pesquisa

---

## Your Analysis Starts Here

The `df_meetings` DataFrame contains all fetched COPOM statements with columns:
- `nro_reuniao`: Meeting number
- `dataReferencia`: Meeting date
- `titulo`: Title
- `textoComunicado`: Full statement text (HTML or plain text)

Add your custom analysis below!

In [22]:
# Extract tables from HTML statements using BeautifulSoup
import re
from io import StringIO

def parse_table_preserve_decimals(table_element) -> pd.DataFrame:
    """
    Parse an HTML table element, preserving Portuguese decimal notation.
    Manually extracts cell values to avoid pd.read_html's comma-stripping.
    """
    rows = []
    for tr in table_element.find_all("tr"):
        cells = tr.find_all(["td", "th"])
        row = [cell.get_text(strip=True) for cell in cells]
        rows.append(row)
    
    if not rows:
        return pd.DataFrame()
    
    # Create DataFrame
    df = pd.DataFrame(rows[1:], columns=rows[0]) if len(rows) > 1 else pd.DataFrame(rows)
    
    # Convert Portuguese decimals to floats
    for col in df.columns:
        def convert_pt_decimal(val):
            if pd.isna(val):
                return val
            s = str(val).strip()
            # Pattern: digits, comma, digits (Portuguese decimal like "4,4" or "10,25")
            if re.match(r"^\d+,\d+$", s):
                return float(s.replace(",", "."))
            return val
        
        df[col] = df[col].apply(convert_pt_decimal)
    
    return df


def extract_tables_from_html(html_text: str) -> List[pd.DataFrame]:
    """Parse HTML and extract all <table> elements as DataFrames."""
    soup = BeautifulSoup(html_text, "lxml")
    tables = soup.find_all("table")
    
    dfs = []
    for table in tables:
        try:
            df = parse_table_preserve_decimals(table)
            if not df.empty:
                dfs.append(df)
        except Exception as e:
            print(f"  Warning: Could not parse table - {e}")
    return dfs


def has_table(html_text: str) -> bool:
    """Check if HTML contains any <table> element."""
    soup = BeautifulSoup(html_text, "lxml")
    return len(soup.find_all("table")) > 0


# Check which meetings have tables
meetings_with_tables = []
for _, row in df_meetings.iterrows():
    html = row["textoComunicado"]
    if has_table(html):
        meetings_with_tables.append({
            "nro_reuniao": row["nro_reuniao"],
            "dataReferencia": row["dataReferencia"],
            "titulo": row["titulo"],
        })

df_with_tables = pd.DataFrame(meetings_with_tables)
print(f"Meetings with tables: {len(df_with_tables)} out of {len(df_meetings)}")
df_with_tables

Meetings with tables: 11 out of 100


Unnamed: 0,nro_reuniao,dataReferencia,titulo
0,265,2024-09-18,"Copom eleva a taxa Selic para 10,75% a.a."
1,266,2024-11-06,"Copom eleva a taxa Selic para 11,25% a.a."
2,267,2024-12-11,"Copom eleva a taxa Selic para 12,25% a.a."
3,268,2025-01-29,"Copom eleva a taxa Selic para 13,25% a.a."
4,269,2025-03-19,"Copom eleva a taxa Selic para 14,25% a.a."
5,270,2025-05-07,"Copom eleva a taxa Selic para 14,75% a.a."
6,271,2025-06-18,"Copom eleva a taxa Selic para 15,00% a.a."
7,272,2025-07-30,"Copom mantém a taxa Selic em 15,00% a.a."
8,273,2025-09-17,"Copom mantém a taxa Selic em 15,00% a.a."
9,274,2025-11-05,"Copom mantém a taxa Selic em 15,00% a.a."


In [23]:
# Extract and display tables from a specific meeting

def show_meeting_tables(nro_reuniao: int) -> List[pd.DataFrame]:
    """Extract and display all tables from a specific meeting."""
    row = df_meetings[df_meetings["nro_reuniao"] == nro_reuniao].iloc[0]
    html = row["textoComunicado"]
    
    print(f"Meeting #{nro_reuniao} - {row['dataReferencia'].date()}")
    print(f"Title: {row['titulo']}")
    print("=" * 60)
    
    tables = extract_tables_from_html(html)
    if not tables:
        print("No tables found in this statement.")
        return []
    
    for i, df_table in enumerate(tables):
        print(f"\nTable {i+1}:")
        display(df_table)
    
    return tables


# Example: View tables from the most recent meeting
latest_nro = df_meetings["nro_reuniao"].max()
#comunicado = 264
tables = show_meeting_tables(latest_nro)


Meeting #275 - 2025-12-10
Title: Copom mantém a taxa Selic em 15,00% a.a.

Table 1:


Unnamed: 0,Índice de preços,2025,2026,2º tri 2027
0,IPCA,4.4,3.5,3.2
1,IPCA livres,4.0,3.6,3.2
2,IPCA administrados,5.3,3.2,3.4


In [24]:
# Copy tables to clipboard (for pasting into Excel, etc.) -- temporary

def tables_to_clipboard(tables: List[pd.DataFrame], sep: str = "\n\n"):
    """Combine all tables into one text block and copy to clipboard."""
    if not tables:
        print("No tables to copy.")
        return
    
    # Combine all tables with separator
    combined = pd.concat(tables, ignore_index=True)
    combined.to_clipboard(index=False)
    print(f"Copied {len(tables)} table(s) ({len(combined)} rows) to clipboard.")
    return combined


# Copy the tables from the most recent meeting
df_clipboard = tables_to_clipboard(tables)
df_clipboard

Copied 1 table(s) (3 rows) to clipboard.


Unnamed: 0,Índice de preços,2025,2026,2º tri 2027
0,IPCA,4.4,3.5,3.2
1,IPCA livres,4.0,3.6,3.2
2,IPCA administrados,5.3,3.2,3.4


In [25]:
# Build a consolidated DataFrame with all tables from all meetings

all_table_rows = []

for _, row in tqdm(df_meetings.iterrows(), total=len(df_meetings), desc="Extracting tables"):
    html = row["textoComunicado"]
    tables = extract_tables_from_html(html)
    
    for table_idx, df_table in enumerate(tables):
        # Add metadata to each table
        df_table = df_table.copy()
        df_table["_nro_reuniao"] = row["nro_reuniao"]
        df_table["_dataReferencia"] = row["dataReferencia"]
        df_table["_table_idx"] = table_idx
        all_table_rows.append(df_table)

if all_table_rows:
    df_all_tables = pd.concat(all_table_rows, ignore_index=True)
    print(f"Total tables extracted: {len(all_table_rows)}")
    print(f"Total rows across all tables: {len(df_all_tables)}")
    print(f"\nColumns: {df_all_tables.columns.tolist()}")
    df_all_tables.head(20)
else:
    print("No tables found in any meeting.")

df_all_tables.to_clipboard()

Extracting tables: 100%|██████████| 100/100 [00:00<00:00, 646.30it/s]

Total tables extracted: 11
Total rows across all tables: 33

Columns: ['Índice de preços', '2024', '2025', '1º tri 2026', '_nro_reuniao', '_dataReferencia', '_table_idx', '2º tri \u200b2026', '2º tri 2026', '3º tri 2026', '2026', '\u200b2026', '1º tri 2027', '2º tri 2027']





In [26]:
# Save the consolidated tables for later use

if all_table_rows:
    # Convert all columns to string to avoid type conflicts 
    # (tables have mixed types like "3º tri 2026" and numeric values)
    df_export = df_all_tables.copy()
    for col in df_export.columns:
        if col not in ["_dataReferencia"]:  # Keep datetime as-is
            df_export[col] = df_export[col].astype(str)
    
    output_path = os.path.join(DATA_DIR, "copom_tables_consolidated.parquet")
    df_export.to_parquet(output_path, index=False)
    print(f"Saved to: {os.path.abspath(output_path)}")
    
    # Also save as CSV for easy inspection
    csv_path = os.path.join(DATA_DIR, "copom_tables_consolidated.csv")
    df_all_tables.to_csv(csv_path, index=False)
    print(f"Saved to: {os.path.abspath(csv_path)}")

Saved to: c:\Users\alber\projects\hello-nlp\notebooks\data_copom\copom_tables_consolidated.parquet
Saved to: c:\Users\alber\projects\hello-nlp\notebooks\data_copom\copom_tables_consolidated.csv


In [27]:
'''---

## Reconstructing Tables from Pre-265 Paragraphs

Before meeting #265, COPOM published the same projection data in paragraph form rather than tables. 
We'll extract this structured data using regex patterns and add it to our consolidated dataset with a `_reconstructed=True` flag.
'''

"---\n\n## Reconstructing Tables from Pre-265 Paragraphs\n\nBefore meeting #265, COPOM published the same projection data in paragraph form rather than tables. \nWe'll extract this structured data using regex patterns and add it to our consolidated dataset with a `_reconstructed=True` flag.\n"

In [59]:
# Regex patterns to extract projection data from paragraphs (multi-scenario support)
import re

def find_projection_paragraph(text: str) -> str | None:
    """Find the paragraph containing scenario projections."""
    soup = BeautifulSoup(text, "lxml")
    plain = soup.get_text("\n")
    
    # Look for paragraphs mentioning scenario projections
    for para in plain.split("\n\n"):
        para_lower = para.lower()
        # Match various scenario descriptions
        if any(term in para_lower for term in [
            "cenário de referência", "cenário básico", "cenário com trajetórias",
            "cenário com juros", "projeções do copom", "projeções de inflação"
        ]):
            if any(term in para_lower for term in ["situam-se", "situa-se", "projeções"]):
                return para.strip()
    return None


def normalize_abbreviations(text: str) -> str:
    """Replace common abbreviations to avoid period-based sentence splitting issues."""
    # Replace "a.a." with a placeholder that won't be split on
    return text.replace("a.a.", "a·a·")


def restore_abbreviations(text: str) -> str:
    """Restore abbreviations after processing."""
    return text.replace("a·a·", "a.a.")


def extract_year_values(text: str) -> dict:
    """
    Extract year-value pairs like '3,3% para 2019 e 3,6% para 2020'.
    Returns dict like {"2019": 3.3, "2020": 3.6}
    """
    results = {}
    # Pattern: number% para/em YYYY
    matches = re.findall(r"(\d+[,\.]\d+)\s*%?\s*(?:a\.a\.|a·a·)?\s*(?:para|em|até[^0-9]*final[^0-9]*de)?\s*(\d{4})", text)
    for val, year in matches:
        results[year] = float(val.replace(",", "."))
    return results


def extract_fx_values(text: str) -> dict:
    """
    Extract FX assumptions like 'R$/US$ 3,90' with years.
    Handles patterns like:
    - "termina 2019 em R$/US$ 3,90 e permanece nesse patamar até o final de 2020"
    - "taxa de câmbio constante a R$/US$ 4,05"
    """
    results = {}
    
    # Pattern 1: "termina YYYY em R$/US$ X,XX"
    term_matches = re.findall(r"termina\s+(\d{4})\s+em\s+R\$\s*/?\s*US\$\s*(\d+[,\.]\d+)", text, re.IGNORECASE)
    for year, val in term_matches:
        results[year] = float(val.replace(",", "."))
    
    # Pattern 2: R$/US$ X,XX ... YYYY
    matches = re.findall(r"R\$\s*/?\s*US\$\s*(\d+[,\.]\d+)[^0-9]*?(\d{4})", text)
    for val, year in matches:
        if year not in results:
            results[year] = float(val.replace(",", "."))
    
    # Check for "permanece nesse patamar até o final de YYYY" - propagate value
    if "permanece" in text.lower() and results:
        perm_match = re.search(r"permanece[^0-9]*(?:até[^0-9]*final[^0-9]*de\s+)?(\d{4})", text, re.IGNORECASE)
        if perm_match:
            end_year = perm_match.group(1)
            base_val = list(results.values())[0]  # Use first extracted value
            results[end_year] = base_val
    
    # Check for "constante" pattern
    if "constante" in text.lower():
        const_match = re.search(r"R\$\s*/?\s*US\$\s*(\d+[,\.]\d+)", text)
        if const_match:
            base_val = float(const_match.group(1).replace(",", "."))
            all_years = re.findall(r"\b(20\d{2})\b", text)
            for year in all_years:
                if year not in results:
                    results[year] = base_val
    
    return results


def extract_selic_values(text: str) -> dict:
    """
    Extract Selic assumptions like '5,00% a.a.' with years.
    Handles patterns like:
    - "encerra 2019 em 5,00% a.a. e permanece nesse patamar até o final de 2020"
    - "juros constantes a 6,00% a.a."
    """
    results = {}
    
    # Pattern 1: "encerra YYYY em X,XX% a.a."
    enc_matches = re.findall(r"encerra\s+(\d{4})\s+em\s+(\d+[,\.]\d+)\s*%", text, re.IGNORECASE)
    for year, val in enc_matches:
        results[year] = float(val.replace(",", "."))
    
    # Pattern 2: Generic "X,XX% a.a. ... YYYY"
    if not results:
        gen_matches = re.findall(r"(\d+[,\.]\d+)\s*%\s*(?:a\.a\.|a·a·)?\s*[^0-9]*?(\d{4})", text)
        for val, year in gen_matches:
            if year not in results:
                results[year] = float(val.replace(",", "."))
    
    # Check for "permanece nesse patamar até o final de YYYY" - propagate value
    if "permanece" in text.lower() and results:
        perm_match = re.search(r"permanece[^0-9]*(?:até[^0-9]*final[^0-9]*de\s+)?(\d{4})", text, re.IGNORECASE)
        if perm_match:
            end_year = perm_match.group(1)
            base_val = list(results.values())[0]  # Use first extracted value
            if end_year not in results:
                results[end_year] = base_val
    
    # Check for "constante" pattern - same value for all years
    if "constante" in text.lower():
        const_match = re.search(r"(\d+[,\.]\d+)\s*%\s*(?:a\.a\.|a·a·)?", text)
        if const_match:
            base_val = float(const_match.group(1).replace(",", "."))
            all_years = re.findall(r"\b(20\d{2})\b", text)
            for year in all_years:
                if year not in results:
                    results[year] = base_val
    
    return results


def extract_projections_from_text(text: str, nro_reuniao: int, data_ref) -> List[dict]:
    """
    Extract inflation projections from a paragraph with multiple scenarios.
    Returns list of dicts with structured data.
    """
    records = []
    
    # Normalize abbreviations to avoid period-splitting issues
    text_norm = normalize_abbreviations(text)
    
    # ========== SCENARIO A: Focus trajectory ==========
    focus_match = re.search(
        r"cenário\s+(?:básico\s+)?(?:com\s+)?trajetórias?\s+(?:para\s+)?(?:as\s+)?(?:taxas?\s+de\s+)?(?:juros\s+e\s+)?(?:câmbio\s+)?(?:extraídas?\s+da\s+)?(?:pesquisa\s+)?[Ff]ocus[^.]*?projeções[^.]*?situam-se\s+em\s+torno\s+de\s+([^.]+)",
        text_norm, re.IGNORECASE | re.DOTALL
    )
    
    if focus_match:
        projection_text = focus_match.group(1)
        ipca_vals = extract_year_values(projection_text)
        
        if ipca_vals:
            # Get the full Focus scenario block (up to next "No cenário" or end)
            focus_start = focus_match.start()
            next_scenario = re.search(r"No\s+cenário\s+com\s+juros", text_norm[focus_match.end():], re.IGNORECASE)
            focus_end = focus_match.end() + next_scenario.start() if next_scenario else len(text_norm)
            focus_block = text_norm[focus_start:focus_end]
            
            # Extract Selic: "trajetória de juros que encerra 2019 em 5,00% a.a. e permanece..."
            # Now we can safely split on periods since "a.a." is normalized
            selic_section = re.search(r"trajetória\s+de\s+juros[^.]+\.", focus_block, re.IGNORECASE)
            selic_vals = extract_selic_values(selic_section.group(0)) if selic_section else {}
            
            # Extract FX: "trajetória para a taxa de câmbio que termina 2019 em R$/US$ 3,90 e permanece..."
            fx_section = re.search(r"trajetória\s+(?:para\s+a\s+)?(?:taxa\s+de\s+)?câmbio[^.]+\.", focus_block, re.IGNORECASE)
            fx_vals = extract_fx_values(fx_section.group(0)) if fx_section else {}
            
            record = {
                "Índice de preços": "IPCA",
                "_nro_reuniao": nro_reuniao,
                "_dataReferencia": data_ref,
                "_table_idx": 0,
                "_reconstructed": True,
                "_scenario": "Focus (trajetórias pesquisa Focus)",
            }
            record.update({f"ipca_{y}": v for y, v in ipca_vals.items()})
            record.update({f"selic_{y}": v for y, v in selic_vals.items()})
            record.update({f"fx_{y}": v for y, v in fx_vals.items()})
            records.append(record)
    
    # ========== SCENARIO B: Selic constant ==========
    const_match = re.search(
        r"cenário\s+com\s+juros\s+constantes?\s+(?:a\s+)?(\d+[,\.]\d+)\s*%\s*(?:a\.a\.|a·a·)?\s+e\s+taxa\s+de\s+câmbio\s+constante\s+a\s+R\$\s*/?\s*US\$\s*(\d+[,\.]\d+)[^.]*projeções\s+situam-se\s+em\s+torno\s+de\s+([^.]+)",
        text_norm, re.IGNORECASE | re.DOTALL
    )
    
    if const_match:
        selic_const = float(const_match.group(1).replace(",", "."))
        fx_const = float(const_match.group(2).replace(",", "."))
        projection_text = const_match.group(3)
        ipca_vals = extract_year_values(projection_text)
        
        if ipca_vals:
            years = list(ipca_vals.keys())
            record = {
                "Índice de preços": "IPCA",
                "_nro_reuniao": nro_reuniao,
                "_dataReferencia": data_ref,
                "_table_idx": 0,
                "_reconstructed": True,
                "_scenario": "Selic constante",
            }
            record.update({f"ipca_{y}": v for y, v in ipca_vals.items()})
            for y in years:
                record[f"selic_{y}"] = selic_const
                record[f"fx_{y}"] = fx_const
            records.append(record)
    
    # ========== SCENARIO C: Hybrid (FX constant, Selic from Focus) ==========
    hybrid_match = re.search(
        r"cenário\s+híbrido\s+com\s+taxa\s+de\s+câmbio\s+constante\s+e\s+trajetória\s+de\s+juros\s+da\s+pesquisa\s+[Ff]ocus[^.]*?(?:implica\s+)?(?:inflação\s+)?(?:em\s+torno\s+de\s+)?([^.]+)",
        text_norm, re.IGNORECASE | re.DOTALL
    )
    
    if hybrid_match:
        projection_text = hybrid_match.group(1)
        ipca_vals = extract_year_values(projection_text)
        
        if ipca_vals:
            years = list(ipca_vals.keys())
            record = {
                "Índice de preços": "IPCA",
                "_nro_reuniao": nro_reuniao,
                "_dataReferencia": data_ref,
                "_table_idx": 0,
                "_reconstructed": True,
                "_scenario": "Híbrido (FX constante, Selic Focus)",
            }
            record.update({f"ipca_{y}": v for y, v in ipca_vals.items()})
            
            # Hybrid uses Selic from Focus scenario
            for rec in records:
                if "Focus" in rec.get("_scenario", ""):
                    for k, v in rec.items():
                        if k.startswith("selic_"):
                            record[k] = v
                    break
            
            # Hybrid uses FX from constant scenario (same constant FX value)
            for rec in records:
                if "constante" in rec.get("_scenario", "").lower():
                    for k, v in rec.items():
                        if k.startswith("fx_"):
                            record[k] = v
                    break
            
            records.append(record)
    
    # ========== NEWER FORMAT: "cenário de referência" or "cenário básico" ==========
    if not records:
        ref_match = re.search(
            r"(?:cenário\s+(?:de\s+referência|básico)[^.]*)?projeções\s+(?:de\s+inflação\s+)?(?:do\s+Copom\s+)?situam-se\s+em\s+torno\s+de\s+([^.]+)",
            text_norm, re.IGNORECASE | re.DOTALL
        )
        
        if ref_match:
            projection_text = ref_match.group(1)
            ipca_vals = extract_year_values(projection_text)
            
            if ipca_vals:
                record = {
                    "Índice de preços": "IPCA",
                    "_nro_reuniao": nro_reuniao,
                    "_dataReferencia": data_ref,
                    "_table_idx": 0,
                    "_reconstructed": True,
                    "_scenario": "referência",
                }
                record.update({f"ipca_{y}": v for y, v in ipca_vals.items()})
                
                # Look for Selic trajectory
                selic_match = re.search(r"trajetória\s+(?:para\s+a\s+taxa\s+)?de\s+juros[^.]+\.", text_norm, re.IGNORECASE)
                if selic_match:
                    selic_vals = extract_selic_values(selic_match.group(0))
                    record.update({f"selic_{y}": v for y, v in selic_vals.items()})
                
                # Look for FX
                fx_match = re.search(r"R\$\s*(\d+[,\.]\d+)\s*/\s*US\$", text_norm)
                if fx_match:
                    record["_fx_assumption"] = float(fx_match.group(1).replace(",", "."))
                
                records.append(record)
    
    # ========== IPCA Administrados (applies to main scenario) ==========
    adm_match = re.search(
        r"(?:projeções\s+para\s+)?(?:a\s+)?(?:inflação\s+de\s+)?(?:preços\s+)?administrados\s+(?:são\s+de\s+)?([^.]+)",
        text_norm, re.IGNORECASE
    )
    
    if adm_match:
        adm_vals = extract_year_values(adm_match.group(1))
        if adm_vals:
            scenario = records[0]["_scenario"] if records else "referência"
            record = {
                "Índice de preços": "IPCA administrados",
                "_nro_reuniao": nro_reuniao,
                "_dataReferencia": data_ref,
                "_table_idx": 0,
                "_reconstructed": True,
                "_scenario": scenario,
            }
            record.update({f"ipca_{y}": v for y, v in adm_vals.items()})
            records.append(record)
    
    return records


# Test on meeting 225's paragraph
example_225 = """No cenário com trajetórias para as taxas de juros e câmbio extraídas da pesquisa Focus, as projeções do Copom situam-se em torno de 3,3% para 2019 e 3,6% para 2020. Esse cenário supõe trajetória de juros que encerra 2019 em 5,00% a.a. e permanece nesse patamar até o final de 2020. Também supõe trajetória para a taxa de câmbio que termina 2019 em R$/US$ 3,90 e permanece nesse patamar até o final de 2020. No cenário com juros constantes a 6,00% a.a. e taxa de câmbio constante a R$/US$ 4,05*, as projeções situam-se em torno de 3,4% para 2019 e 3,6% para 2020. O cenário híbrido com taxa de câmbio constante e trajetória de juros da pesquisa Focus implica inflação em torno de 3,4% para 2019 e 3,8% para 2020."""

test_records = extract_projections_from_text(example_225, 225, pd.Timestamp("2019-05-08"))
print(f"Test extraction (meeting 225) - {len(test_records)} records:")
df_test = pd.DataFrame(test_records)

# Reorder columns for clarity
col_order = ["Índice de preços", "_scenario", "ipca_2019", "ipca_2020", "selic_2019", "selic_2020", "fx_2019", "fx_2020", "_nro_reuniao", "_dataReferencia", "_table_idx", "_reconstructed"]
col_order = [c for c in col_order if c in df_test.columns]
df_test = df_test[col_order]

df_test.to_clipboard(index=False)
print("\nExpected:")
print("Focus:    ipca 3.3/3.6, selic 5.0/5.0, fx 3.9/3.9")
print("Constant: ipca 3.4/3.6, selic 6.0/6.0, fx 4.05/4.05") 
print("Hybrid:   ipca 3.4/3.8, selic 5.0/5.0, fx 4.05/4.05")
print()
df_test

Test extraction (meeting 225) - 3 records:

Expected:
Focus:    ipca 3.3/3.6, selic 5.0/5.0, fx 3.9/3.9
Constant: ipca 3.4/3.6, selic 6.0/6.0, fx 4.05/4.05
Hybrid:   ipca 3.4/3.8, selic 5.0/5.0, fx 4.05/4.05



Unnamed: 0,Índice de preços,_scenario,ipca_2019,ipca_2020,selic_2019,selic_2020,fx_2019,fx_2020,_nro_reuniao,_dataReferencia,_table_idx,_reconstructed
0,IPCA,Focus (trajetórias pesquisa Focus),3.3,3.6,5.0,5.0,3.9,3.9,225,2019-05-08,0,True
1,IPCA,Selic constante,3.4,3.6,6.0,6.0,4.05,4.05,225,2019-05-08,0,True
2,IPCA,"Híbrido (FX constante, Selic Focus)",3.4,3.8,5.0,5.0,4.05,4.05,225,2019-05-08,0,True


In [60]:
# Process all pre-265 meetings to extract projections from text

# Get meetings without HTML tables (pre-265)
meetings_without_tables = df_meetings[~df_meetings["nro_reuniao"].isin(df_with_tables["nro_reuniao"])]
print(f"Meetings without HTML tables: {len(meetings_without_tables)}")

reconstructed_records = []
failed_extractions = []

for _, row in tqdm(meetings_without_tables.iterrows(), total=len(meetings_without_tables), desc="Extracting from text"):
    html = row["textoComunicado"]
    para = find_projection_paragraph(html)
    
    if para:
        records = extract_projections_from_text(para, row["nro_reuniao"], row["dataReferencia"])
        if records:
            reconstructed_records.extend(records)
        else:
            failed_extractions.append({
                "nro_reuniao": row["nro_reuniao"],
                "dataReferencia": row["dataReferencia"],
                "paragraph": para[:200] + "..."
            })
    else:
        failed_extractions.append({
            "nro_reuniao": row["nro_reuniao"],
            "dataReferencia": row["dataReferencia"],
            "paragraph": None
        })

print(f"\nSuccessfully extracted: {len(reconstructed_records)} records")
print(f"Failed extractions: {len(failed_extractions)} meetings")

if reconstructed_records:
    df_reconstructed = pd.DataFrame(reconstructed_records)
    display(df_reconstructed.head(10))

Meetings without HTML tables: 89


Extracting from text: 100%|██████████| 89/89 [00:00<00:00, 515.01it/s]


Successfully extracted: 75 records
Failed extractions: 34 meetings





Unnamed: 0,Índice de preços,_nro_reuniao,_dataReferencia,_table_idx,_reconstructed,_scenario,ipca_2017,ipca_2018,ipca_2019,selic_2019,...,fx_2020,ipca_2021,selic_2021,_fx_assumption,ipca_2022,selic_2022,ipca_2023,selic_2023,ipca_2024,ipca_2025
0,IPCA,210,2017-10-25,0,True,Focus (trajetórias pesquisa Focus),3.3,4.3,4.2,8.0,...,,,,,,,,,,
1,IPCA,211,2017-12-06,0,True,Focus (trajetórias pesquisa Focus),2.9,4.2,4.2,7.0,...,,,,,,,,,,
2,IPCA,212,2018-02-07,0,True,Focus (trajetórias pesquisa Focus),,4.2,,,...,,,,,,,,,,
3,IPCA,213,2018-03-21,0,True,Focus (trajetórias pesquisa Focus),,3.8,4.1,,...,,,,,,,,,,
4,IPCA,214,2018-05-16,0,True,Focus (trajetórias pesquisa Focus),,3.6,3.9,,...,,,,,,,,,,
5,IPCA,214,2018-05-16,0,True,Selic constante,,4.0,,,...,,,,,,,,,,
6,IPCA,215,2018-06-20,0,True,Focus (trajetórias pesquisa Focus),,4.2,3.7,,...,,,,,,,,,,
7,IPCA,215,2018-06-20,0,True,Selic constante,,4.2,4.1,6.5,...,,,,,,,,,,
8,IPCA,216,2018-08-01,0,True,Focus (trajetórias pesquisa Focus),,4.2,3.8,,...,,,,,,,,,,
9,IPCA,216,2018-08-01,0,True,Selic constante,,4.2,4.1,6.5,...,,,,,,,,,,


In [61]:
# Inspect failed extractions to understand edge cases
if failed_extractions:
    df_failed = pd.DataFrame(failed_extractions)
    print("Failed extractions:")
    display(df_failed)
    
    # Show a sample paragraph that failed
    sample_fail = df_failed[df_failed["paragraph"].notna()].iloc[0] if df_failed["paragraph"].notna().any() else None
    if sample_fail is not None:
        print(f"\nSample failed paragraph (meeting {sample_fail['nro_reuniao']}):")
        print(sample_fail["paragraph"])

Failed extractions:


Unnamed: 0,nro_reuniao,dataReferencia,paragraph
0,176,2013-07-10,
1,177,2013-08-28,
2,178,2013-10-09,
3,179,2013-11-27,
4,180,2014-01-15,
5,181,2014-02-26,
6,182,2014-04-02,
7,183,2014-05-28,
8,184,2014-07-16,
9,185,2014-09-03,



Sample failed paragraph (meeting 200):
"O Copom decidiu, por unanimidade, manter a taxa Selic em 14,25% a.a., sem viés.
 
O cenário básico com que o Comitê trabalha pode ser resumido pelas seguintes observações:
 
O conjunto dos indicadore...


In [62]:
# Combine reconstructed data with HTML-extracted tables

if reconstructed_records:
    # Add _reconstructed=False to original tables
    df_all_tables["_reconstructed"] = False
    df_all_tables["_scenario"] = "referência"  # Tables are typically reference scenario
    
    # Merge
    df_combined = pd.concat([df_all_tables, df_reconstructed], ignore_index=True)
    df_combined = df_combined.sort_values(["_dataReferencia", "_nro_reuniao"]).reset_index(drop=True)
    
    print(f"Combined dataset:")
    print(f"  - Original table rows: {len(df_all_tables)}")
    print(f"  - Reconstructed rows: {len(df_reconstructed)}")
    print(f"  - Total rows: {len(df_combined)}")
    print(f"\nMeetings covered: {df_combined['_nro_reuniao'].nunique()}")
    print(f"Date range: {df_combined['_dataReferencia'].min().date()} to {df_combined['_dataReferencia'].max().date()}")
    
    display(df_combined.tail(15))
else:
    df_combined = df_all_tables.copy()
    df_combined["_reconstructed"] = False
    print("No reconstructed records to add.")

Combined dataset:
  - Original table rows: 33
  - Reconstructed rows: 75
  - Total rows: 108

Meetings covered: 66
Date range: 2017-10-25 to 2025-12-10


Unnamed: 0,Índice de preços,2024,2025,1º tri 2026,_nro_reuniao,_dataReferencia,_table_idx,2º tri ​2026,2º tri 2026,3º tri 2026,...,fx_2020,ipca_2021,selic_2021,_fx_assumption,ipca_2022,selic_2022,ipca_2023,selic_2023,ipca_2024,ipca_2025
93,IPCA,,4.9,,271,2025-06-18,0,,,,...,,,,,,,,,,
94,IPCA livres,,5.2,,271,2025-06-18,0,,,,...,,,,,,,,,,
95,IPCA administrados,,3.8,,271,2025-06-18,0,,,,...,,,,,,,,,,
96,IPCA,,4.9,,272,2025-07-30,0,,,,...,,,,,,,,,,
97,IPCA livres,,5.1,,272,2025-07-30,0,,,,...,,,,,,,,,,
98,IPCA administrados,,4.4,,272,2025-07-30,0,,,,...,,,,,,,,,,
99,IPCA,,4.8,,273,2025-09-17,0,,,,...,,,,,,,,,,
100,IPCA livres,,5.0,,273,2025-09-17,0,,,,...,,,,,,,,,,
101,IPCA administrados,,4.3,,273,2025-09-17,0,,,,...,,,,,,,,,,
102,IPCA,,4.6,,274,2025-11-05,0,,,,...,,,,,,,,,,


In [63]:
# Save the combined dataset

output_path = os.path.join(DATA_DIR, "copom_projections_combined.parquet")
csv_path = os.path.join(DATA_DIR, "copom_projections_combined.csv")

# Convert mixed columns to string for parquet compatibility
df_export_combined = df_combined.copy()
for col in df_export_combined.columns:
    if col not in ["_dataReferencia", "_reconstructed"]:
        df_export_combined[col] = df_export_combined[col].astype(str)

df_export_combined.to_parquet(output_path, index=False)
df_combined.to_csv(csv_path, index=False)

print(f"Saved combined projections:")
print(f"  - Parquet: {os.path.abspath(output_path)}")
print(f"  - CSV: {os.path.abspath(csv_path)}")

Saved combined projections:
  - Parquet: c:\Users\alber\projects\hello-nlp\notebooks\data_copom\copom_projections_combined.parquet
  - CSV: c:\Users\alber\projects\hello-nlp\notebooks\data_copom\copom_projections_combined.csv


In [64]:
# Test the fix - should now show 4.4 instead of 44
tables_test = show_meeting_tables(275)
tables_test[0] if tables_test else "No tables"

Meeting #275 - 2025-12-10
Title: Copom mantém a taxa Selic em 15,00% a.a.

Table 1:


Unnamed: 0,Índice de preços,2025,2026,2º tri 2027
0,IPCA,4.4,3.5,3.2
1,IPCA livres,4.0,3.6,3.2
2,IPCA administrados,5.3,3.2,3.4


Unnamed: 0,Índice de preços,2025,2026,2º tri 2027
0,IPCA,4.4,3.5,3.2
1,IPCA livres,4.0,3.6,3.2
2,IPCA administrados,5.3,3.2,3.4


In [65]:
# Combine original tables with reconstructed tables and copy to clipboard

# Add flags to original tables if not already present
if "_reconstructed" not in df_all_tables.columns:
    df_all_tables["_reconstructed"] = False
if "_scenario" not in df_all_tables.columns:
    df_all_tables["_scenario"] = "referência"

# Combine
if reconstructed_records:
    df_combined = pd.concat([df_all_tables, df_reconstructed], ignore_index=True)
else:
    df_combined = df_all_tables.copy()

df_combined = df_combined.sort_values(["_dataReferencia", "_nro_reuniao"]).reset_index(drop=True)

print(f"Combined dataset:")
print(f"  - Original table rows (HTML): {len(df_all_tables)}")
print(f"  - Reconstructed rows (text): {len(df_reconstructed) if reconstructed_records else 0}")
print(f"  - Total rows: {len(df_combined)}")
print(f"\nMeetings covered: {df_combined['_nro_reuniao'].nunique()}")
print(f"Date range: {df_combined['_dataReferencia'].min().date()} to {df_combined['_dataReferencia'].max().date()}")

# Copy to clipboard
df_combined.to_clipboard(index=False)
print(f"\n✓ Copied {len(df_combined)} rows to clipboard!")

df_combined

Combined dataset:
  - Original table rows (HTML): 33
  - Reconstructed rows (text): 75
  - Total rows: 108

Meetings covered: 66
Date range: 2017-10-25 to 2025-12-10

✓ Copied 108 rows to clipboard!


Unnamed: 0,Índice de preços,2024,2025,1º tri 2026,_nro_reuniao,_dataReferencia,_table_idx,2º tri ​2026,2º tri 2026,3º tri 2026,...,fx_2020,ipca_2021,selic_2021,_fx_assumption,ipca_2022,selic_2022,ipca_2023,selic_2023,ipca_2024,ipca_2025
0,IPCA,,,,210,2017-10-25,0,,,,...,,,,,,,,,,
1,IPCA,,,,211,2017-12-06,0,,,,...,,,,,,,,,,
2,IPCA,,,,212,2018-02-07,0,,,,...,,,,,,,,,,
3,IPCA,,,,213,2018-03-21,0,,,,...,,,,,,,,,,
4,IPCA,,,,214,2018-05-16,0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,IPCA livres,,4.5,,274,2025-11-05,0,,,,...,,,,,,,,,,
104,IPCA administrados,,5.0,,274,2025-11-05,0,,,,...,,,,,,,,,,
105,IPCA,,4.4,,275,2025-12-10,0,,,,...,,,,,,,,,,
106,IPCA livres,,4.0,,275,2025-12-10,0,,,,...,,,,,,,,,,


In [66]:
# Create a long-format (melted) version - all meetings stacked vertically

# Identify year columns (columns that look like years or quarters like "2024", "2025", "2º tri 2027")
meta_cols = ["Índice de preços", "_nro_reuniao", "_dataReferencia", "_table_idx", "_reconstructed", "_scenario", "_fx_assumption"]
year_cols = [c for c in df_combined.columns if c not in meta_cols]

print(f"Year/period columns found: {year_cols}")

# Melt: transform year columns into rows
df_long = df_combined.melt(
    id_vars=[c for c in meta_cols if c in df_combined.columns],
    value_vars=year_cols,
    var_name="periodo",
    value_name="projecao"
)

# Drop rows with missing projections
df_long = df_long.dropna(subset=["projecao"])

# Sort by meeting date, then by index type
df_long = df_long.sort_values(["_dataReferencia", "_nro_reuniao", "Índice de preços", "periodo"]).reset_index(drop=True)

print(f"\nLong-format dataset:")
print(f"  - Total rows: {len(df_long)}")
print(f"  - Meetings covered: {df_long['_nro_reuniao'].nunique()}")

# Copy to clipboard
df_long.to_clipboard(index=False)
print(f"\n✓ Copied {len(df_long)} rows to clipboard!")

df_long

Year/period columns found: ['2024', '2025', '1º tri 2026', '2º tri \u200b2026', '2º tri 2026', '3º tri 2026', '2026', '\u200b2026', '1º tri 2027', '2º tri 2027', 'ipca_2017', 'ipca_2018', 'ipca_2019', 'selic_2019', 'selic_2018', 'fx_2018', 'fx_2019', 'ipca_2020', 'selic_2020', 'fx_2020', 'ipca_2021', 'selic_2021', 'ipca_2022', 'selic_2022', 'ipca_2023', 'selic_2023', 'ipca_2024', 'ipca_2025']

Long-format dataset:
  - Total rows: 391
  - Meetings covered: 66

✓ Copied 391 rows to clipboard!


Unnamed: 0,Índice de preços,_nro_reuniao,_dataReferencia,_table_idx,_reconstructed,_scenario,_fx_assumption,periodo,projecao
0,IPCA,210,2017-10-25,0,True,Focus (trajetórias pesquisa Focus),,ipca_2017,3.3
1,IPCA,210,2017-10-25,0,True,Focus (trajetórias pesquisa Focus),,ipca_2018,4.3
2,IPCA,210,2017-10-25,0,True,Focus (trajetórias pesquisa Focus),,ipca_2019,4.2
3,IPCA,210,2017-10-25,0,True,Focus (trajetórias pesquisa Focus),,selic_2019,8.0
4,IPCA,211,2017-12-06,0,True,Focus (trajetórias pesquisa Focus),,ipca_2017,2.9
...,...,...,...,...,...,...,...,...,...
386,IPCA administrados,275,2025-12-10,0,False,referência,,2026,3.2
387,IPCA administrados,275,2025-12-10,0,False,referência,,2º tri 2027,3.4
388,IPCA livres,275,2025-12-10,0,False,referência,,2025,4.0
389,IPCA livres,275,2025-12-10,0,False,referência,,2026,3.6


In [71]:
# Pretty-print a statement - strip HTML and show paragraphs

def pretty_print_statement(nro_reuniao: int, show_paragraphs: bool = True) -> List[str]:
    """
    Extract clean text from a COPOM statement and optionally show numbered paragraphs.
    Returns list of paragraphs for further processing.
    """
    row = df_meetings[df_meetings["nro_reuniao"] == nro_reuniao].iloc[0]
    html = row["textoComunicado"]
    
    soup = BeautifulSoup(html, "lxml")
    
    # Extract paragraphs from <p> tags
    paragraphs = []
    for p in soup.find_all("p"):
        text = p.get_text(separator=" ", strip=True)
        if text:  # Skip empty paragraphs
            paragraphs.append(text)
    
    # Also extract table content if present
    tables = soup.find_all("table")
    if tables:
        paragraphs.append("[TABLE DATA PRESENT - see df_all_tables for structured data]")
    
    # Print header
    print(f"{'='*80}")
    print(f"Meeting #{nro_reuniao} - {row['dataReferencia'].date()}")
    print(f"Title: {row['titulo']}")
    print(f"{'='*80}\n")
    
    if show_paragraphs:
        for i, para in enumerate(paragraphs, 1):
            print(f"[{i:02d}] {para}\n")
    else:
        print("\n\n".join(paragraphs))
    
    return paragraphs


# Test on meeting 270
paragraphs_270 = pretty_print_statement(270)

Meeting #270 - 2025-05-07
Title: Copom eleva a taxa Selic para 14,75% a.a.

[01] ​ O a mbiente externo mo stra-se adverso e particularmente incerto em função da conjuntura e da política econômica nos Estados Unidos, principalmente acerca de sua política comercial e de seus efeitos. A política comercial alimenta incertezas sobre a economia global, notadamente acerca da m​agnitude da desaceleração econômica e sobre o efeito heterogêneo no cenário inflacionário entre os países, com repercussões relevantes sobre a condução da política monetária. Além disso, o comportamento e a volatilidade de diferentes classes de ativos também têm sido afetados, com fortes reflexos nas condições financeiras globais. Tal cenário segue exigindo cautela por parte de países emergentes em ambiente de maior tensão geopolítica.

[02] Em relação ao cenário doméstico, o conjunto dos indicadores de atividade econômica e do mercado de trabalho ainda tem apresentado dinamismo, mas observa-se uma incipiente moderação 

In [72]:
# Prepare dataset for semantic paragraph labeling

def extract_paragraphs_for_labeling(df_meetings: pd.DataFrame) -> pd.DataFrame:
    """
    Extract all paragraphs from all meetings for semantic labeling.
    Returns a DataFrame ready for annotation.
    """
    rows = []
    
    for _, meeting in tqdm(df_meetings.iterrows(), total=len(df_meetings), desc="Extracting paragraphs"):
        html = meeting["textoComunicado"]
        soup = BeautifulSoup(html, "lxml")
        
        for para_idx, p in enumerate(soup.find_all("p")):
            text = p.get_text(separator=" ", strip=True)
            if text and len(text) > 20:  # Skip very short/empty paragraphs
                rows.append({
                    "nro_reuniao": meeting["nro_reuniao"],
                    "dataReferencia": meeting["dataReferencia"],
                    "para_idx": para_idx,
                    "text": text,
                    "char_len": len(text),
                    "word_count": len(text.split()),
                    # Placeholder for labels - to be filled manually or by model
                    "label": None,
                })
    
    return pd.DataFrame(rows)


# Suggested semantic labels for COPOM paragraphs:
SEMANTIC_LABELS = {
    "cenario_externo": "External scenario / global conditions",
    "cenario_domestico": "Domestic economic scenario",
    "atividade": "Economic activity and labor market",
    "inflacao": "Inflation observations and measures",
    "expectativas": "Inflation expectations",
    "projecoes": "COPOM projections / forecasts",
    "riscos_alta": "Upside risks to inflation",
    "riscos_baixa": "Downside risks to inflation",
    "fiscal": "Fiscal policy considerations",
    "decisao": "Monetary policy decision",
    "forward_guidance": "Forward guidance / next steps",
    "votacao": "Voting record",
    "tabela": "Table reference",
    "premissas": "Scenario assumptions (Selic, FX, etc.)",
    "outros": "Other / miscellaneous",
}

print("Suggested semantic labels:")
for k, v in SEMANTIC_LABELS.items():
    print(f"  {k:20} - {v}")

# Extract paragraphs
df_paragraphs = extract_paragraphs_for_labeling(df_meetings)
print(f"\nExtracted {len(df_paragraphs)} paragraphs from {df_paragraphs['nro_reuniao'].nunique()} meetings")
print(f"Avg paragraph length: {df_paragraphs['word_count'].mean():.1f} words")

df_paragraphs.head(10)

Suggested semantic labels:
  cenario_externo      - External scenario / global conditions
  cenario_domestico    - Domestic economic scenario
  atividade            - Economic activity and labor market
  inflacao             - Inflation observations and measures
  expectativas         - Inflation expectations
  projecoes            - COPOM projections / forecasts
  riscos_alta          - Upside risks to inflation
  riscos_baixa         - Downside risks to inflation
  fiscal               - Fiscal policy considerations
  decisao              - Monetary policy decision
  forward_guidance     - Forward guidance / next steps
  votacao              - Voting record
  tabela               - Table reference
  premissas            - Scenario assumptions (Selic, FX, etc.)
  outros               - Other / miscellaneous


Extracting paragraphs: 100%|██████████| 100/100 [00:00<00:00, 289.58it/s]


Extracted 837 paragraphs from 92 meetings
Avg paragraph length: 55.5 words





Unnamed: 0,nro_reuniao,dataReferencia,para_idx,text,char_len,word_count,label
0,176,2013-07-10,0,"""Dando prosseguimento ao ajuste da taxa básica...",139,24,
1,176,2013-07-10,1,O Comitê avalia que essa decisão contribuirá p...,136,22,
2,176,2013-07-10,2,Votaram por essa decisão os seguintes membros ...,269,38,
3,177,2013-08-28,0,"""Dando prosseguimento ao ajuste da taxa básica...",139,24,
4,177,2013-08-28,1,O Comitê avalia que essa decisão contribuirá p...,136,22,
5,177,2013-08-28,2,Votaram por essa decisão os seguintes membros ...,270,39,
6,178,2013-10-09,0,"""Dando prosseguimento ao ajuste da taxa básica...",139,24,
7,178,2013-10-09,1,O Comitê avalia que essa decisão contribuirá p...,136,22,
8,178,2013-10-09,2,Votaram por essa decisão os seguintes membros ...,270,39,
9,179,2013-11-27,0,"""Dando prosseguimento ao processo de ajuste da...",190,33,


In [73]:
# Auto-label paragraphs using keyword heuristics (as a starting point for fine-tuning)

def auto_label_paragraph(text: str) -> str:
    """
    Heuristic-based auto-labeling using keywords.
    Returns suggested label (to be reviewed/corrected for training data).
    """
    text_lower = text.lower()
    
    # Check patterns in priority order
    if "votaram por essa decisão" in text_lower or "membros do comitê" in text_lower:
        return "votacao"
    
    if "tabela" in text_lower and ("projeções" in text_lower or "cenário" in text_lower):
        return "tabela"
    
    if "cenário de referência" in text_lower and ("trajetória" in text_lower or "taxa de câmbio" in text_lower):
        return "premissas"
    
    if "decidiu" in text_lower and ("taxa" in text_lower or "selic" in text_lower or "juros" in text_lower):
        return "decisao"
    
    if "próxima reunião" in text_lower or "próximos passos" in text_lower or "comunicará" in text_lower:
        return "forward_guidance"
    
    if "riscos de alta" in text_lower or "riscos para a inflação" in text_lower:
        if "riscos de baixa" in text_lower:
            return "riscos_alta"  # Usually discusses both, but alta first
        return "riscos_alta"
    
    if "riscos de baixa" in text_lower:
        return "riscos_baixa"
    
    if "projeção" in text_lower or "projeções" in text_lower:
        if "copom" in text_lower or "situa-se" in text_lower:
            return "projecoes"
    
    if "expectativas de inflação" in text_lower or "pesquisa focus" in text_lower:
        return "expectativas"
    
    if "ambiente externo" in text_lower or "economia global" in text_lower or "estados unidos" in text_lower:
        return "cenario_externo"
    
    if "fiscal" in text_lower or "contas públicas" in text_lower:
        return "fiscal"
    
    if "atividade econômica" in text_lower or "mercado de trabalho" in text_lower or "pib" in text_lower:
        return "atividade"
    
    if "cenário doméstico" in text_lower or "economia brasileira" in text_lower:
        return "cenario_domestico"
    
    if "inflação" in text_lower or "ipca" in text_lower:
        return "inflacao"
    
    return "outros"


# Apply auto-labeling
df_paragraphs["label_auto"] = df_paragraphs["text"].apply(auto_label_paragraph)

# Show distribution
print("Auto-label distribution:")
print(df_paragraphs["label_auto"].value_counts())

# Save for manual review and fine-tuning
output_path = os.path.join(DATA_DIR, "paragraphs_for_labeling.csv")
df_paragraphs.to_csv(output_path, index=False)
print(f"\n✓ Saved {len(df_paragraphs)} paragraphs to: {output_path}")
print("  → Review 'label_auto' column and correct as needed for training data")

df_paragraphs.sample(10)

Auto-label distribution:
label_auto
decisao              137
inflacao             111
outros               106
votacao               99
expectativas          73
projecoes             64
atividade             42
fiscal                42
riscos_alta           36
forward_guidance      35
cenario_externo       32
premissas             26
cenario_domestico     23
tabela                11
Name: count, dtype: int64

✓ Saved 837 paragraphs to: data_copom\paragraphs_for_labeling.csv
  → Review 'label_auto' column and correct as needed for training data


Unnamed: 0,nro_reuniao,dataReferencia,para_idx,text,char_len,word_count,label,label_auto
263,223,2019-06-19,8,"Considerando o cenário básico, o balanço de ri...",478,78,,decisao
633,257,2023-09-20,7,"Em se confirmando o cenário esperado, os membr...",631,97,,votacao
63,200,2016-07-20,4,As expectativas de inflação apuradas pela pesq...,129,22,,expectativas
104,203,2016-11-30,10,"Por outro lado, (iv) a atividade econômica mai...",451,73,,projecoes
129,205,2017-02-22,6,"No cenário de mercado, as projeções do Copom r...",243,44,,projecoes
387,232,2020-08-05,8,Apesar de uma assimetria em seu balanço dos ri...,512,84,,projecoes
774,270,2025-05-07,1,"Em relação ao cenário doméstico, o conjunto do...",314,46,,atividade
833,275,2025-12-10,6,"O cenário atual, marcado por elevada incerteza...",474,76,,inflacao
284,225,2019-09-18,1,"Em sua 225ª reunião, o Copom decidiu, por unan...",91,16,,decisao
402,234,2020-10-28,2,A atualização do cenário básico do Copom pode ...,88,14,,outros
