In [1]:
import torch
print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device count:", torch.cuda.device_count())
    print("GPU 0:", torch.cuda.get_device_name(0))


PyTorch: 2.9.0+cu128
CUDA available: True
Device count: 1
GPU 0: NVIDIA GeForce RTX 4050 Laptop GPU


In [2]:
# Retry: save CSV, and only attempt Parquet if an engine is available.
import importlib
import re
from pathlib import Path
import pandas as pd
from typing import List, Optional, Tuple

FOLDER = Path("copom_texts")  # <-- change this if your files are elsewhere

# ---------- Helpers ----------
def extract_between(text: str, start_label: str, end_labels: List[str]) -> Tuple[str, str]:
    start = re.search(re.escape(start_label), text, flags=re.IGNORECASE)
    if not start:
        return "", text
    rest = text[start.end():]
    next_positions = []
    for lbl in end_labels:
        m = re.search(re.escape(lbl), rest, flags=re.IGNORECASE)
        if m:
            next_positions.append(m.start())
    if next_positions:
        end_idx = min(next_positions)
        return rest[:end_idx].strip(), rest[end_idx:].lstrip()
    else:
        return rest.strip(), ""

def clean_lines_block(block: str) -> str:
    lines = [re.sub(r"^[\-\u2022•\s]+", "", ln.strip()) for ln in block.splitlines()]
    lines = [ln for ln in lines if ln]
    return "\n".join(lines).strip()

def first_nonempty_lines(text: str, n: int = 2) -> str:
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    return " — ".join(lines[:n]) if lines else ""

def extract_field(text: str, label: str) -> Optional[str]:
    m = re.search(rf"{re.escape(label)}\s*:\s*(.+)", text, flags=re.IGNORECASE)
    if m:
        return m.group(1).splitlines()[0].strip()
    return None

def normalize_date_pt(date_str: Optional[str]) -> Optional[str]:
    if not date_str:
        return None
    m = re.search(r"(\d{1,2})[./\-](\d{1,2})[./\-](\d{2,4})", date_str)
    if not m:
        return date_str
    d, mth, y = m.groups()
    d = int(d); mth = int(mth)
    if len(y) == 2:
        yy = int(y)
        y = 1900 + yy if yy >= 50 else 2000 + yy
    else:
        y = int(y)
    try:
        return f"{y:04d}-{mth:02d}-{d:02d}"
    except Exception:
        return date_str

def extract_sumario(text: str) -> Tuple[str, str]:
    labels_after = ["Data:", "Local:", "Horário de início:", "Horário de término:", "Presentes:", "Membros da Diretoria", "Chefes de Departamento", "Demais participantes"]
    block, remainder = extract_between(text, "Sumário", labels_after)
    if not block:
        return "", text
    block = clean_lines_block(block)
    for lbl in labels_after:
        idx = block.lower().find(lbl.lower())
        if idx != -1:
            block = block[:idx].strip()
            break
    block_joined = "; ".join([ln.strip() for ln in block.splitlines() if ln.strip()])
    return block_joined, remainder if remainder else text

def extract_people_list(text: str, header: str, next_headers: List[str]) -> Tuple[str, str]:
    block, remainder = extract_between(text, header, next_headers)
    block = clean_lines_block(block)
    people = "; ".join([ln.strip(" ;") for ln in block.splitlines() if ln.strip()])
    return people, remainder if remainder else text

'''def parse_copom_text(txt: str) -> dict:
    original_text = txt
    txt = txt.replace("–", "-").replace("—", "-")
    title = first_nonempty_lines(txt, n=2)
    sumario, txt_after_sumario = extract_sumario(txt)
    data = extract_field(txt, "Data")
    local = extract_field(txt, "Local")
    h_inicio = extract_field(txt, "Horário de início") or extract_field(txt, "Horario de inicio")
    h_termino = extract_field(txt, "Horário de término") or extract_field(txt, "Horario de termino")
    data_norm = normalize_date_pt(data)
    next_headers = ["Chefes de Departamento", "Demais participantes", "Data:", "Local:", "Horário", "Diretrizes", "Preços", "Agregados", "Finanças", "Balanço", "Ambiente", "Evolução", "Reservas", "Liquidez", "Mercado", "Diretrizes", "Sumário"]
    membros, remainder1 = extract_people_list(txt, "Membros da Diretoria", next_headers)
    chefes, remainder2 = extract_people_list(remainder1 or txt, "Chefes de Departamento", ["Demais participantes"] + next_headers)
    demais, remainder3 = extract_people_list(remainder2 or txt, "Demais participantes", next_headers)
    body_candidate = remainder3 or txt_after_sumario or original_text
    body_candidate = re.sub(r"^(?s).*?(A Diretoria Colegiada|Preços e Nível de Atividade|Diretrizes de Política Monetária)", r"\1", body_candidate, flags=re.IGNORECASE)
    body = body_candidate.strip()
    return {
        "title": title,
        "data": data_norm or data or "",
        "local": local or "",
        "horário de início e de término": f"{h_inicio or ''} — {h_termino or ''}".strip(" —"),
        "sumário": sumario or "",
        "membros da diretoria": membros or "",
        "chefes de departamento": chefes or "",
        "demais participantes": demais or "",
        "Body": body,
    }
'''

def parse_copom_text(txt: str) -> dict:
    original_text = txt
    txt = txt.replace("–", "-").replace("—", "-")

    # Title and metadata
    title = first_nonempty_lines(txt, n=2)
    sumario, txt_after_sumario = extract_sumario(txt)
    data = extract_field(txt, "Data")
    local = extract_field(txt, "Local")
    h_inicio = extract_field(txt, "Horário de início") or extract_field(txt, "Horario de inicio")
    h_termino = extract_field(txt, "Horário de término") or extract_field(txt, "Horario de termino")
    data_norm = normalize_date_pt(data)

    # Get people sections
    membros, remainder1 = extract_people_list(txt, "Membros da Diretoria", ["Chefes de Departamento", "Demais participantes"])
    chefes, remainder2 = extract_people_list(txt, "Chefes de Departamento", ["Demais participantes"])
    demais, remainder3 = extract_people_list(txt, "Demais participantes", [])

    # 🟢 New logic for the Body:
    # Everything AFTER the "Demais participantes" header
    body_match = re.search(r"Demais participantes.*?(?=\Z)", txt, flags=re.IGNORECASE | re.DOTALL)
    if body_match:
        # Get text after the "Demais participantes" section
        # Remove the header itself
        after = re.sub(r"(?is)^.*?Demais participantes", "", txt, count=1).strip()
        body = clean_lines_block(after)
    else:
        # fallback — whole text
        body = clean_lines_block(original_text)

    return {
        "title": title,
        "data": data_norm or data or "",
        "local": local or "",
        "horário de início e de término": f"{h_inicio or ''} — {h_termino or ''}".strip(" —"),
        "sumário": sumario or "",
        "membros da diretoria": membros or "",
        "chefes de departamento": chefes or "",
        "demais participantes": demais or "",
        "Body": body,
    }


rows = []
for path in sorted(FOLDER.glob("*.txt")):
    try:
        content = path.read_text(encoding="utf-8", errors="ignore")
        parsed = parse_copom_text(content)
        parsed["__filename"] = path.name
        rows.append(parsed)
    except Exception as e:
        rows.append({
            "title": "",
            "data": "",
            "local": "",
            "horário de início e de término": "",
            "sumário": "",
            "membros da diretoria": "",
            "chefes de departamento": "",
            "demais participantes": "",
            "Body": "",
            "__filename": path.name,
            "__error": str(e),
        })

cols = [
    "title",
    "data",
    "local",
    "horário de início e de término",
    "sumário",
    "membros da diretoria",
    "chefes de departamento",
    "demais participantes",
    "Body",
    "__filename",
]

df = pd.DataFrame(rows, columns=cols)

# Save CSV
csv_path = FOLDER / "copom_parsed.csv"
df.to_csv(csv_path, index=False)

# Try Parquet if possible
parquet_path = FOLDER / "copom_parsed.parquet"
parquet_ok = False
for engine in ("pyarrow", "fastparquet"):
    try:
        importlib.import_module(engine)
        df.to_parquet(parquet_path, index=False, engine=engine)
        parquet_ok = True
        break
    except Exception:
        continue

from IPython.display import display
display(df)

print("CSV saved to:", csv_path)
print("Parquet saved to:", parquet_path if parquet_ok else "Parquet not saved (no engine available)")


Unnamed: 0,title,data,local,horário de início e de término,sumário,membros da diretoria,chefes de departamento,demais participantes,Body,__filename
0,Atas do Comitê de Política Monetária - Copom —...,1998-01-28,Sala de reuniões do 8o andar do Edifício-Sede ...,17:20h — 20:00h,Preços e Nível de Atividade; Agregados Monetár...,Gustavo H. B. Franco - Presidente; Carlos Edua...,:; Altamir Lopes - Chefe do DEPEC; Maria do So...,Alexandre Pundek Rocha - Consultor da Diretori...,Alexandre Pundek Rocha - Consultor da Diretori...,19980128.txt
1,Atas do Comitê de Política Monetária - Copom —...,1998-03-04,Sala de reuniões do 8o andar do Edifício-Sede ...,15:30h. — 18:00h,Preços e Nível de Atividade; Agregados Monetár...,Gustavo H. B. Franco - Presidente; Carlos Edua...,:; Altamir Lopes - Chefe do DEPEC; Maria do So...,Alexandre Pundek Rocha - Consultor da Diretori...,Alexandre Pundek Rocha - Consultor da Diretori...,19980304.txt
2,Atas do Comitê de Política Monetária - Copom —...,1998-04-15,Sala de reuniões do 8o andar do Edifício-Sede ...,16:50h — 19:00h,Preços e Nível de Atividade; Agregados Monetár...,:; Francisco Lafaiete de Pádua Lopes - Preside...,:; Altamir Lopes - Chefe do DEPEC; Carlos Yosh...,:; Alexandre Pundek Rocha - Consultor da Diret...,:\nAlexandre Pundek Rocha - Consultor da Diret...,19980415.txt
3,Atas do Comitê de Política Monetária - Copom —...,1998-05-20,Sala de reuniões do 8o andar do Edifício-Sede ...,17:30h — 19:35h,Preços e Nível de Atividade; Agregados Monetár...,:; Gustavo H. B. Franco - Presidente; Cláudio ...,:; Altamir Lopes - Chefe do DEPEC; Carlos Yosh...,:; Alexandre Pundek Rocha - Consultor da Diret...,:\nAlexandre Pundek Rocha - Consultor da Diret...,19980520.txt
4,Atas do Comitê de Política Monetária - Copom —...,1998-06-24,Sala de reuniões do 8o andar do Edifício-Sede ...,17:20h — 20:20h,Preços e Nível de Atividade; Agregados Monetár...,Colegiada; Gustavo H. B. Franco - Presidente; ...,:; Altamir Lopes - Chefe do DEPEC; Eduardo Hit...,:; Alexandre Pundek Rocha - Consultor da Diret...,:\nAlexandre Pundek Rocha - Consultor da Diret...,19980624.txt
...,...,...,...,...,...,...,...,...,...,...
176,1 bcb.gov.br — Ata da Reunião do,18 e 19 de março de 2025,Sala s de reuniões do 8º andar (18/3 e 19/3 - ...,,,,,Alexandre de Carvalho - Chefe da Assessoria Ec...,Alexandre de Carvalho - Chefe da Assessoria Ec...,20250319.txt
177,1 bcb.gov.br — Ata da Reunião do,6 e 7 de maio de 2025,Sala s de reuniões do 8º andar (6/5 e 7/5 - ma...,,,,,Alexandre de Carvalho - Chefe da Assessoria Ec...,Alexandre de Carvalho - Chefe da Assessoria Ec...,20250507.txt
178,1 bcb.gov.br — Ata da Reunião do,17 e 18 de junho de 2025,Sala s de reuniões do 8º andar (17/6 e 18/6 ...,,,,,Alexandre de Carvalho - Chefe da Assessoria Ec...,Alexandre de Carvalho - Chefe da Assessoria Ec...,20250618.txt
179,1 bcb.gov.br — Ata da Reunião do,29 e 30 de julho de 2025,Sala s de reuniões do 8º andar (29/7 e 30/7 - ...,,,,,Alexandre de Carvalho - Chefe da Assessoria Ec...,Alexandre de Carvalho - Chefe da Assessoria Ec...,20250730.txt


CSV saved to: copom_texts\copom_parsed.csv
Parquet saved to: Parquet not saved (no engine available)


In [3]:
# Check if the word "selic" (case-insensitive) appears in the Body column
mask = df["Body"].str.contains(r"\bselic\b", case=False, na=False)

# Count how many rows contain "selic"
count_selic = mask.sum()

# Total number of rows
total_rows = len(df)

# Percentage of rows that mention "selic"
percentage = (count_selic / total_rows) * 100

print(f"Rows mentioning 'selic': {count_selic} out of {total_rows} ({percentage:.2f}%)")


Rows mentioning 'selic': 149 out of 181 (82.32%)


In [4]:
# Count how many Body entries contain "ao ano"
mask = df["Body"].str.contains(r"% a.a", case=False, na=False)

count_with_term = mask.sum()
total_rows = len(df)
percentage = (count_with_term / total_rows) * 100

print(f"Rows containing 'a.a': {count_with_term} out of {total_rows} ({percentage:.2f}%)")

df_no_decisao = df[~mask]
df_no_decisao.tail()


Rows containing 'a.a': 176 out of 181 (97.24%)


Unnamed: 0,title,data,local,horário de início e de término,sumário,membros da diretoria,chefes de departamento,demais participantes,Body,__filename
27,Atas do Comitê de Política Monetária - Copom —...,2001-07-18,Sala de reuniões do 8o andar (no dia 17) e 20o...,16:00 h dia 17 e 17:48 h dia 18 — 19:21 h dia ...,Atividade econômica; Preços; Ambiente Externo;...,Colegiada; Arminio Fraga Neto - Presidente; Ca...,(todos presentes no dia 17); Altamir Lopes - D...,(todos presentes no dia 17); José Pedro Ramos ...,(todos presentes no dia 17)\nJosé Pedro Ramos ...,20010718.txt
31,Atas do Comitê de Política Monetária - Copom —...,2002-05-22,Sala de reuniões do 8o andar (no dia 21) e 20o...,15:30 h dia 21 e 11:35 h dia 22 — 20:05 h dia ...,Atividade econômica; Ambiente externo; Preços;...,Colegiada; Arminio Fraga Neto - Presidente; Be...,(todos presentes no dia 21); Altamir Lopes - D...,(todos presentes no dia 21); Antônio Carlos Mo...,(todos presentes no dia 21)\nAntônio Carlos Mo...,20020522.txt
32,Atas do Comitê de Política Monetária - Copom —...,2002-06-19,Sala de reuniões do 8o andar (no dia 18) e 20o...,15:43 h dia 18 e 11:18 h dia 19 — 19:33 h dia ...,Atividade econômica; Ambiente externo; Preços;...,Colegiada; Arminio Fraga Neto - Presidente; Be...,(todos presentes no dia 18); Altamir Lopes - D...,(todos presentes no dia 18); Antônio Carlos Mo...,(todos presentes no dia 18)\nAntônio Carlos Mo...,20020619.txt
33,Atas do Comitê de Política Monetária - Copom —...,2002-07-17,Sala de reuniões do 8o andar (no dia 16) e 20o...,15h35 (16/7) e 11h28 (17/7) — 19h52 (16/7) e 1...,Atividade econômica; Ambiente externo; Preços;...,Colegiada; Arminio Fraga Neto - Presidente; Be...,(todos presentes no dia 16); Altamir Lopes - D...,(todos presentes no dia 16); Antônio Carlos Mo...,(todos presentes no dia 16)\nAntônio Carlos Mo...,20020717.txt
35,Atas do Comitê de Política Monetária - Copom —...,2002-09-18,Sala de reuniões do 8o andar (17/9) e 20o anda...,15h40 (17/9) e 11h49 (18/9) — 19h20 (17/9) e 1...,Atividade econômica; Ambiente externo; Preços;...,Colegiada; Arminio Fraga Neto - Presidente; Be...,(todos presentes no dia 17); Altamir Lopes - D...,(todos presentes no dia 17); Antônio Carlos Mo...,(todos presentes no dia 17)\nAntônio Carlos Mo...,20020918.txt


In [5]:
import re

pattern = re.compile(r'(\d{1,3}(?:[.,]\d+)?)\s*%?\s*(?:a\.?\s*a\.?|ao\s+ano)', re.IGNORECASE)

def extrai_taxa(texto):
    m = pattern.search(texto)
    if m:
        return float(m.group(1).replace(',', '.'))
    return None


In [6]:
df["taxa_juros"] = df["Body"].apply(extrai_taxa)


In [7]:
df["taxa_juros"].isna().value_counts()


taxa_juros
False    181
Name: count, dtype: int64

In [8]:
df["taxa_juros"].describe()


count    181.000000
mean      11.031326
std       10.542380
min        0.000000
25%        4.500000
50%        8.750000
75%       14.250000
max       75.000000
Name: taxa_juros, dtype: float64

In [9]:
print(df.columns)


Index(['title', 'data', 'local', 'horário de início e de término', 'sumário',
       'membros da diretoria', 'chefes de departamento',
       'demais participantes', 'Body', '__filename', 'taxa_juros'],
      dtype='object')


In [10]:
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords

# stopwords_pt = stopwords.words('portuguese')


In [11]:
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LinearRegression

# # 1. Seleciona X e y
# X_text = df["Body"]
# y = df["taxa_juros"]

# # 2. Vetorização do texto
# vectorizer = TfidfVectorizer(
#     stop_words=stopwords_pt,
#     max_features=5000,
#     ngram_range=(1,2)
# )

# X = vectorizer.fit_transform(X_text)

# # 3. Train-Test Split
# X_train, X_test, y_train, y_test = train_test_split(
#     X,
#     y,
#     test_size=0.2,       # 20% para teste
#     random_state=42      # garante reprodutibilidade
# )

# # 4. Modelo inicial
# model = LinearRegression()
# model.fit(X_train, y_train)

# # 5. Avaliação
# r2_train = model.score(X_train, y_train)
# r2_test = model.score(X_test, y_test)

# print(f"R² (treino): {r2_train:.4f}")
# print(f"R² (teste):  {r2_test:.4f}")


In [12]:
# from transformers import DistilBertTokenizer, DistilBertModel
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from tqdm import tqdm
# from torch.utils.data import DataLoader, TensorDataset
# import pandas as pd

In [13]:
# class BertRegressor(nn.Module):
#     def __init__(self, bert_model, freeze_bert=False):
#         super().__init__()
#         self.bert = bert_model
#         self.reg_head = nn.Linear(self.bert.config.hidden_size, 1)  # single value
#         if freeze_bert:
#             for p in self.bert.parameters():
#                 p.requires_grad = False

#     def forward(self, input_ids, attention_mask=None):
#         out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         cls = out.last_hidden_state[:, 0, :]  # CLS token
#         pred = self.reg_head(cls)             # shape: (batch, 1)
#         return pred


In [14]:
# from transformers import DistilBertTokenizer, DistilBertModel
# from sklearn.model_selection import train_test_split
# from torch.utils.data import DataLoader, TensorDataset

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Use the correct column name:
# text_series = df["Body"].fillna("").astype(str)
# y = torch.tensor(df["taxa_juros"].values, dtype=torch.float32).unsqueeze(1)  # (N, 1)

# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# enc = tokenizer(
#     list(text_series),
#     truncation=True,
#     padding=True,
#     max_length=512,
#     return_tensors="pt"
# )

# # Train/test split by indices
# idx_train, idx_test = train_test_split(
#     torch.arange(len(text_series)),
#     test_size=0.2,
#     random_state=42
# )

# train_ds = TensorDataset(
#     enc["input_ids"][idx_train],
#     enc["attention_mask"][idx_train],
#     y[idx_train]
# )
# test_ds = TensorDataset(
#     enc["input_ids"][idx_test],
#     enc["attention_mask"][idx_test],
#     y[idx_test]
# )

# train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
# test_loader  = DataLoader(test_ds,  batch_size=16, shuffle=False)


In [15]:
# bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
# model = BertRegressor(bert).to(device)

# from torch.optim import AdamW
# optimizer = AdamW(model.parameters(), lr=2e-5)
# criterion = nn.MSELoss()


In [16]:
# from tqdm import tqdm
# import numpy as np
# from sklearn.metrics import r2_score, mean_absolute_error

# def evaluate(model, loader):
#     model.eval()
#     preds, golds = [], []
#     with torch.no_grad():
#         for input_ids, attn_mask, labels in loader:
#             input_ids = input_ids.to(device)
#             attn_mask = attn_mask.to(device)
#             labels = labels.to(device)

#             out = model(input_ids, attention_mask=attn_mask)  # (B,1)
#             preds.append(out.squeeze(1).cpu().numpy())
#             golds.append(labels.squeeze(1).cpu().numpy())
#     preds = np.concatenate(preds)
#     golds = np.concatenate(golds)
#     r2 = r2_score(golds, preds)
#     mae = mean_absolute_error(golds, preds)
#     return r2, mae

# EPOCHS = 3
# for epoch in range(1, EPOCHS+1):
#     model.train()
#     running_loss = 0.0
#     for input_ids, attn_mask, labels in tqdm(train_loader, desc=f"Epoch {epoch}"):
#         input_ids = input_ids.to(device)
#         attn_mask = attn_mask.to(device)
#         labels = labels.to(device)

#         optimizer.zero_grad()
#         out = model(input_ids, attention_mask=attn_mask)  # (B,1)
#         loss = criterion(out, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item() * input_ids.size(0)

#     train_loss = running_loss / len(train_loader.dataset)
#     r2_test, mae_test = evaluate(model, test_loader)
#     print(f"Epoch {epoch} | train MSE: {train_loss:.4f} | test R²: {r2_test:.4f} | test MAE: {mae_test:.3f} p.p.")


In [17]:
# def predict_rate(text: str) -> float:
#     model.eval()
#     tok = tokenizer(
#         [text],
#         truncation=True,
#         padding=True,
#         max_length=512,
#         return_tensors="pt"
#     )
#     with torch.no_grad():
#         out = model(
#             tok["input_ids"].to(device),
#             attention_mask=tok["attention_mask"].to(device)
#         )
#     return float(out.squeeze(1).cpu().item())

# # Example:
# pred = predict_rate(df.iloc[0]["Body"])
# print("Predicted rate:", pred)


## Usando BERT Português

In [18]:
import os, random, numpy as np, torch

# (Optional) better error traces — set BEFORE any CUDA init:
os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Seed CPU only (does NOT touch CUDA):
torch.random.manual_seed(SEED)

# Seed CUDA ONLY if available & healthy
if torch.cuda.is_available():
    try:
        torch.cuda.manual_seed_all(SEED)
    except Exception as e:
        print("Warning: CUDA seeding failed; continuing without CUDA seed:", e)


In [19]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [20]:
if torch.cuda.is_available():
    print("CUDA devices:", torch.cuda.device_count())
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(0))


CUDA devices: 1
Current device: 0
Device name: NVIDIA GeForce RTX 4050 Laptop GPU


In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm   # <<< added

# ------------------ Config ------------------
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"  # PT-BR BERT
MAX_LEN = 512
BATCH_TRAIN = 8
BATCH_TEST  = 16
EPOCHS = 21                    # example: more epochs to see periodic eval
LR = 2e-5
FREEZE_BERT_WARMUP_EPOCHS = 1
TOLERANCE_PP = 0.05

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ------------------ Prepare labels ------------------
assert {"Body", "taxa_juros", "data"}.issubset(set(df.columns)), "df must have Body, taxa_juros, data"

df_cls = df.dropna(subset=["Body", "taxa_juros"]).copy()
df_cls["data"] = pd.to_datetime(df_cls["data"], errors="coerce")
df_cls = df_cls.sort_values("data").reset_index(drop=True)

df_cls["delta_pp"] = df_cls["taxa_juros"].diff()

def to_label(delta):
    if pd.isna(delta):
        return np.nan
    if abs(delta) <= TOLERANCE_PP:
        return 0
    return 1 if delta > 0 else -1

df_cls["label"] = df_cls["delta_pp"].apply(to_label)
df_cls = df_cls.dropna(subset=["label"]).reset_index(drop=True)
df_cls["label"] = df_cls["label"].astype(int)

print("Class counts:")
print(df_cls["label"].value_counts().sort_index())

label_to_idx = {-1: 0, 0: 1, 1: 2}
idx_to_label = {0: -1, 1: 0, 2: 1}
df_cls["label_idx"] = df_cls["label"].map(label_to_idx).astype(int)

# ------------------ Tokenization ------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

enc = tokenizer(
    df_cls["Body"].fillna("").astype(str).tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="pt"
)

X_input_ids = enc["input_ids"]
X_attn      = enc["attention_mask"]
y_labels    = torch.tensor(df_cls["label_idx"].values, dtype=torch.long)

assert y_labels.min().item() >= 0 and y_labels.max().item() <= 2

# ------------------ Split ------------------
X_ids_tr, X_ids_te, X_attn_tr, X_attn_te, y_tr, y_te = train_test_split(
    X_input_ids, X_attn, y_labels,
    test_size=0.2,
    random_state=SEED,
    stratify=y_labels,
)

train_loader = DataLoader(TensorDataset(X_ids_tr, X_attn_tr, y_tr), batch_size=BATCH_TRAIN, shuffle=True)
test_loader  = DataLoader(TensorDataset(X_ids_te, X_attn_te, y_te), batch_size=BATCH_TEST, shuffle=False)

# ------------------ Model ------------------
class BertClassifier(nn.Module):
    def __init__(self, model_name, num_classes=3, freeze_bert=False):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

    def forward(self, input_ids, attention_mask=None):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0, :]
        return self.classifier(cls)

# class weights
counts = df_cls["label_idx"].value_counts().sort_index()
weights = torch.tensor([counts.get(i, 0) for i in [0,1,2]], dtype=torch.float32)
weights = (weights.sum() / (weights + 1e-8))
weights = weights / weights.mean()
print("Class weights:", weights.tolist())

model = BertClassifier(MODEL_NAME, num_classes=3, freeze_bert=True).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(weight=weights.to(device))

# ------------------ Eval ------------------
def eval_model(model, loader):
    model.eval()
    all_pred, all_true = [], []
    with torch.no_grad():
        for ids, attn, yb in loader:
            ids, attn, yb = ids.to(device), attn.to(device), yb.to(device)
            preds = model(ids, attention_mask=attn).argmax(dim=1)
            all_pred.append(preds.cpu().numpy())
            all_true.append(yb.cpu().numpy())
    all_pred = np.concatenate(all_pred)
    all_true = np.concatenate(all_true)
    return (
        np.vectorize(idx_to_label.get)(all_true),
        np.vectorize(idx_to_label.get)(all_pred)
    )

# ------------------ Train ------------------
for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs"):
    model.train()
    running = 0.0

    # tqdm on batches
    for ids, attn, yb in tqdm(train_loader, desc=f"Training {epoch}", leave=False):
        ids, attn, yb = ids.to(device), attn.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(ids, attention_mask=attn), yb)
        loss.backward()
        optimizer.step()
        running += loss.item() * ids.size(0)

    # warm-up unfreeze
    if epoch == FREEZE_BERT_WARMUP_EPOCHS:
        for p in model.bert.parameters():
            p.requires_grad = True
        optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    train_loss = running / len(train_loader.dataset)
    print(f"\nEpoch {epoch} | train loss: {train_loss:.4f}")

    # only evaluate every 5 epochs
    if epoch % 5 == 0:
        y_true, y_pred = eval_model(model, test_loader)
        print("\n--- Evaluation ---")
        print(confusion_matrix(y_true, y_pred, labels=[-1,0,1]))
        print(classification_report(
            y_true, y_pred,
            labels=[-1,0,1],
            target_names=["Corte (-1)","Manutenção (0)","Alta (+1)"],
            digits=3
        ))

# ------------------ Predict ------------------
def predict_direction(text: str) -> str:
    model.eval()
    tok = tokenizer([str(text)], truncation=True, padding=True, max_length=MAX_LEN, return_tensors="pt")
    with torch.no_grad():
        pred_idx = int(model(tok["input_ids"].to(device), attention_mask=tok["attention_mask"].to(device)).argmax(dim=1))
    return { -1:"Corte", 0:"Manutenção", 1:"Alta" }[idx_to_label[pred_idx]]

print(predict_direction(df_cls.iloc[0]["Body"]))
