In [78]:
import pandas as pd
import requests
import re
import html
from bs4 import BeautifulSoup
import time

# Configuration
BASE_URL = "https://www.bcb.gov.br/api/servico/sitebcb/copom/comunicados_detalhes?nro_reuniao={}"

In [79]:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def fetch_copom_data(meeting_number):
    """Fetches raw data for a specific meeting number."""
    url = BASE_URL.format(meeting_number)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/json"
    }
    try:
        # BCB API often has certificate issues, so we skip verification
        response = requests.get(url, headers=headers, timeout=30, verify=False)
        response.raise_for_status()
        data = response.json()
        
        if not data:
            return None

        # Check for "conteudo" key (new API structure)
        if isinstance(data, dict) and "conteudo" in data:
            items = data["conteudo"]
            if items and len(items) > 0:
                return items[0]
        
        # Fallback for list structure
        if isinstance(data, list) and len(data) > 0:
            return data[0]
            
        return None
    except Exception as e:
        print(f"Error fetching meeting {meeting_number}: {e}")
        return None

def clean_html(raw_html):
    """Parses HTML, extracts text and tables."""
    if not raw_html:
        return "", []

    decoded = html.unescape(raw_html)
    # Remove zero-width spaces and other artifacts
    decoded = re.sub(r'[\u200b\ufeff\u00a0]', ' ', decoded)
    soup = BeautifulSoup(decoded, 'html.parser')

    tables = []
    # Extract tables
    for i, table in enumerate(soup.find_all('table')):
        rows = []
        for tr in table.find_all('tr'):
            cells = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
            if any(cells): # Only add non-empty rows
                rows.append(cells)
        if rows:
            df_table = pd.DataFrame(rows)
            tables.append(df_table)
        
        # Try to find and remove the table title (preceding paragraph)
        prev = table.find_previous_sibling()
        while prev and isinstance(prev, str) and not prev.strip():
             prev = prev.find_previous_sibling()
             
        if prev and prev.name in ['p', 'div', 'span', 'strong', 'b', 'h4', 'h5', 'h6']:
            text_content = prev.get_text(strip=True)
            # Heuristic: Titles are usually short (< 200 chars)
            if len(text_content) < 200:
                 prev.decompose()

        table.decompose() # Remove table from soup to avoid duplicating text

    # formatting
    for br in soup.find_all('br'):
        br.replace_with('\n')
    for p in soup.find_all('p'):
        p.insert_after('\n\n')

    text = soup.get_text()
    # Clean up whitespace
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = text.strip()

    return text, tables

In [80]:
def extract_metadata(text, title, date_ref):
    """Extracts structured data from the text using Regex."""
    metadata = {
        "selic_rate": None,
        "decision": None,
        "num_directors": None,
        "directors": [],
        "board_president": None,
        "score": None,
    }

    # 1. Selic Rate
    # Patterns: "Selic para X,XX%", "Selic em X,XX%"
    selic_match = re.search(r'Selic\s+(?:para|em|de)\s+(\d+(?:[,\.]\d+)?)\s*%', text, re.IGNORECASE)
    if selic_match:
        metadata["selic_rate"] = selic_match.group(1).replace(',', '.')

    # 2. Decision
    text_lower = text.lower()
    if "manteve" in text_lower or "manter" in text_lower:
        metadata["decision"] = "maintain"
    elif "elevou" in text_lower or "elevar" in text_lower or "aumentou" in text_lower:
        metadata["decision"] = "hike"
    elif "reduziu" in text_lower or "reduzir" in text_lower:
        metadata["decision"] = "cut"

    # 3. Score / Unanimity
    if "unanimidade" in text_lower:
        metadata["score"] = "unanimous"
    else:
        # Try to find "X votos a Y"
        vote_match = re.search(r'(\d+)\s+votos?\s+a\s+(\d+)', text_lower)
        if vote_match:
            metadata["score"] = f"{vote_match.group(1)}x{vote_match.group(2)}"

    # 4. Directors & President
    # Simple extraction of President
    pres_match = re.search(r'Presidente:\s*([A-Z][a-záéíóúàâêôãõç\s\.]+)(?:\.|,|$)', text)
    if pres_match:
        metadata["board_president"] = pres_match.group(1).strip()

    # Try to extract list of directors (heuristic)
    # Look for the section starting with "Votaram por..." or "Membros do Copom presentes:"
    members_match = re.search(r'(?:Votaram por|Membros do Copom presentes)[:\s]+([\s\S]+?)(?:\n\n|$)', text)
    if members_match:
        members_text = members_match.group(1)
        # Split by commas or 'e'
        names = re.split(r',|\se\s', members_text)
        clean_names = [n.strip().strip('.') for n in names if len(n.strip()) > 3 and n.strip()[0].isupper()]
        metadata["directors"] = clean_names
        metadata["num_directors"] = len(clean_names)

    return metadata

In [81]:
def build_dataset(start_meeting, end_meeting):
    meetings_data = []
    paragraphs_data = []
    tables_data = []

    for meeting_num in range(start_meeting, end_meeting + 1):
        print(f"Processing meeting {meeting_num}...")
        raw_data = fetch_copom_data(meeting_num)
        
        if not raw_data:
            print(f"  No data found for {meeting_num}")
            continue

        # Basic info
        date_ref = raw_data.get('dataReferencia')
        title = raw_data.get('titulo')
        raw_html = raw_data.get('textoComunicado') or raw_data.get('conteudoHtml')

        # Clean and Parse
        full_text, tables = clean_html(raw_html)
        
        # Extract Metadata
        meta = extract_metadata(full_text, title, date_ref)

        # Store Meeting Level Data
        meeting_entry = {
            "meeting_number": meeting_num,
            "date": date_ref,
            "title": title,
            "full_text": full_text,
            **meta
        }
        meetings_data.append(meeting_entry)

        # Store Paragraphs
        # Split by double newlines
        paras = [p.strip() for p in full_text.split('\n\n') if p.strip()]
        for i, p in enumerate(paras):
            paragraphs_data.append({
                "meeting_number": meeting_num,
                "paragraph_id": i,
                "text": p,
                "char_count": len(p)
            })

        # Store Tables
        for i, tbl in enumerate(tables):
            tables_data.append({
                "meeting_number": meeting_num,
                "table_id": i,
                "dataframe": tbl
            })
        
        time.sleep(0.5) # Be nice to the API

    return pd.DataFrame(meetings_data), pd.DataFrame(paragraphs_data), tables_data

# Run for a sample range (e.g., last 10 meetings)
# Assuming current is around 274
df_meetings, df_paragraphs, list_tables = build_dataset(255, 274)

Processing meeting 255...
Processing meeting 256...
Processing meeting 256...
Processing meeting 257...
Processing meeting 257...
Processing meeting 258...
Processing meeting 258...
Processing meeting 259...
Processing meeting 259...
Processing meeting 260...
Processing meeting 260...
Processing meeting 261...
Processing meeting 261...
Processing meeting 262...
Processing meeting 262...
Processing meeting 263...
Processing meeting 263...
Processing meeting 264...
Processing meeting 264...
Processing meeting 265...
Processing meeting 265...
Processing meeting 266...
Processing meeting 266...
Processing meeting 267...
Processing meeting 267...
Processing meeting 268...
Processing meeting 268...
Processing meeting 269...
Processing meeting 269...
Processing meeting 270...
Processing meeting 270...
Processing meeting 271...
Processing meeting 271...
Processing meeting 272...
Processing meeting 272...
Processing meeting 273...
Processing meeting 273...
Processing meeting 274...
Processing m

In [84]:
# Display Results
print("Meetings DataFrame:")
display(df_meetings.tail())

print("\nParagraphs DataFrame:")
display(df_paragraphs.head())

print(f"\nTotal Tables Extracted: {len(list_tables)}")
if list_tables:
    print("Example Table:")
    display(list_tables[-1]['dataframe'])

Meetings DataFrame:


Unnamed: 0,meeting_number,date,title,full_text,selic_rate,decision,num_directors,directors,board_president,score
15,270,2025-05-07,"Copom eleva a taxa Selic para 14,75% a.a.",O ambiente externo mostra-se adverso e particu...,,maintain,8,"[Ailton de Aquino Santos, Diogo Abry Guillen, ...",,
16,271,2025-06-18,"Copom eleva a taxa Selic para 15,00% a.a.",O ambiente externo mantém-se adverso e particu...,,hike,8,"[Ailton de Aquino Santos, Diogo Abry Guillen, ...",,
17,272,2025-07-30,"Copom mantém a taxa Selic em 15,00% a.a.",O ambiente externo está mais adverso e incerto...,,maintain,8,"[Ailton de Aquino Santos, Diogo Abry Guillen, ...",,
18,273,2025-09-17,"Copom mantém a taxa Selic em 15,00% a.a.",O ambiente externo se mantém incerto em função...,,maintain,8,"[Ailton de Aquino Santos, Diogo Abry Guillen, ...",,
19,274,2025-11-05,"Copom mantém a taxa Selic em 15,00% a.a.",O ambiente externo ainda se mantém incerto em ...,,maintain,8,"[Ailton de Aquino Santos, Diogo Abry Guillen, ...",,



Paragraphs DataFrame:


Unnamed: 0,meeting_number,paragraph_id,text,char_count
0,255,0,"O ambiente externo se mantém adverso, ainda qu...",478
1,255,1,"Em relação ao cenário doméstico, o conjunto do...",745
2,255,2,As projeções de inflação do Copom em seu cenár...,194
3,255,3,"O Comitê ressalta que, em seus cenários para a...",1162
4,255,4,"Considerando os cenários avaliados, o balanço ...",522



Total Tables Extracted: 10
Example Table:


Unnamed: 0,0,1,2,3
0,Índice de preços,2025,2026,2º tri 2027
1,IPCA,46,36,33
2,IPCA livres,45,36,32
3,IPCA administrados,50,34,35


In [85]:
df_paragraphs[df_paragraphs['meeting_number'] == 255].to_clipboard()