In [3]:
# Notebook: copom_semantic_drift_themes.ipynb

# ============================================================
# 0. Setup & Configuration
# ============================================================

import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer, util
from sklearn.decomposition import PCA

import sqlalchemy as sa

import matplotlib.pyplot as plt

# ---------- CONFIG ----------
# DataFrame column names; change these to match your own data.
COL_MEETING_ID = "meeting_id"
COL_MEETING_DATE = "meeting_date"
# Full statement text: one Portuguese string per meeting.
COL_FULL_TEXT = "full_text_pt"
# Optional column holding a list[str] of paragraphs per meeting.
COL_PARAGRAPHS = "paragraphs"

# Local SQLite file used to track themes and their scores over time.
SQLITE_PATH = "copom_themes.sqlite"

# Multilingual sentence-transformers model used for every embedding.
EMBEDDING_MODEL_NAME = "distiluse-base-multilingual-cased-v2"

# How many preceding meetings form the rolling baseline.
ROLLING_WINDOW = 4





ModuleNotFoundError: No module named 'sqlalchemy'

In [9]:
# ============================================================
# 1. Load / Inspect DataFrame
# ============================================================

# If df is already in memory, skip this cell.
# Otherwise, example of loading from a parquet or CSV:

# df = pd.read_parquet("copom_statements.parquet")
# OR:
# df = pd.read_csv("copom_statements.csv")

# Parse the date column BEFORE sorting. If the dates arrive as strings
# (e.g. from a CSV), sorting first orders them lexicographically rather than
# chronologically, and every drift metric computed later depends on the rows
# being in true chronological order.
# `assign` builds a new frame, so the frame that was loaded is not mutated and
# re-running this cell gives the same result.
df = (
    df.assign(**{COL_MEETING_DATE: pd.to_datetime(df[COL_MEETING_DATE])})
    # Stable sort keeps the incoming order for rows that share a date.
    .sort_values(COL_MEETING_DATE, kind="stable")
    .reset_index(drop=True)
)

df.head()


NameError: name 'df' is not defined

In [6]:
# ============================================================
# 2. Load Sentence-Transformer Model & Embed Full Statements
# ============================================================

# The encoder is created once here; later cells reuse `model`.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# One document per meeting, coerced to str before encoding.
full_text_series = df[COL_FULL_TEXT].astype(str)
texts = full_text_series.tolist()

# Unit-length embeddings stacked into one 2-D array: (meetings, dims).
emb_full = np.vstack(model.encode(texts, normalize_embeddings=True))

emb_full.shape


NameError: name 'df' is not defined

In [7]:
# ============================================================
# 3. Drift Metrics on Full-Statement Embeddings
#    - delta vs the previous meeting
#    - delta vs the centroid of the last ROLLING_WINDOW meetings
# ============================================================

n_meetings = len(emb_full)

# Both series start as NaN; the first meeting has nothing to compare against.
cos_prev = np.full(n_meetings, np.nan, dtype=float)
delta_cos_roll = np.full(n_meetings, np.nan, dtype=float)

for cur in range(1, n_meetings):
    current_vec = emb_full[cur]

    # Dot product equals cosine similarity because rows are unit-length.
    cos_prev[cur] = float(current_vec @ emb_full[cur - 1])

    # Baseline: re-normalised mean of the preceding window. When the window
    # is empty (ROLLING_WINDOW <= 0) fall back to the previous meeting.
    window_start = max(0, cur - ROLLING_WINDOW)
    if window_start < cur:
        baseline = emb_full[window_start:cur].mean(axis=0)
        baseline = baseline / np.linalg.norm(baseline)
    else:
        baseline = emb_full[cur - 1]
    delta_cos_roll[cur] = 1 - float(current_vec @ baseline)

# Cosine distance: larger values mean a bigger semantic jump.
delta_cos_prev = 1 - cos_prev

df["delta_cos_prev_full"] = delta_cos_prev
df["delta_cos_roll_full"] = delta_cos_roll

drift_preview_cols = [
    COL_MEETING_ID,
    COL_MEETING_DATE,
    "delta_cos_prev_full",
    "delta_cos_roll_full",
]
df[drift_preview_cols].tail()


NameError: name 'emb_full' is not defined

In [10]:
# ============================================================
# 4. Differential PCA (optional; mainly for visualization)
# ============================================================

# Project the meeting embeddings onto two principal components.
pca = PCA(n_components=2)
Z = pca.fit_transform(emb_full)

# Step vectors between consecutive meetings; the first meeting has no
# predecessor, so its step is zero.
dZ = np.vstack([np.zeros_like(Z[:1]), Z[1:] - Z[:-1]])

df["pca_x_full"], df["pca_y_full"] = Z[:, 0], Z[:, 1]
df["dpca_dx_full"], df["dpca_dy_full"] = dZ[:, 0], dZ[:, 1]

# Trajectory of the statements in PCA space, labelled with the meeting id.
plt.figure(figsize=(8, 6))
plt.plot(df["pca_x_full"], df["pca_y_full"], marker="o")
for x_pos, y_pos, meeting_label in zip(
    df["pca_x_full"], df["pca_y_full"], df[COL_MEETING_ID]
):
    plt.text(x_pos, y_pos, str(meeting_label))
plt.title("COPOM Statements in PCA Space (Full Text)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.show()
# 

NameError: name 'emb_full' is not defined

In [11]:
# ============================================================
# 5. Define Themes & Extract Theme-Specific Text per Meeting
# ============================================================

# Theme name -> keyword substrings that flag a paragraph as belonging to it.
# Several keywords appear both with and without diacritics.
THEMES = {
    "balanco_riscos": [
        "balanço de riscos",
        "balanco de riscos",
        "assimetria",
        "assimétrico",
        "cauda",
        "caudas",
        "riscos altistas",
        "riscos baixistas",
        "incerteza",
    ],
    "inflacao_projecoes": [
        "inflação",
        "inflacao",
        "expectativas de inflação",
        "projeções de inflação",
        "projecoes de inflacao",
        "metas de inflação",
        "núcleo",
        "desancoragem",
        "desancoradas",
        "horizonte relevante",
    ],
    "prescricao_politica": [
        "política monetária",
        "politica monetaria",
        "Selic",
        "taxa básica",
        "taxa basica",
        "postura contracionista",
        "postura expansionista",
        "ciclo de aperto",
        "ciclo de afrouxamento",
        "comitê decidiu",
        "comite decidiu",
    ],
}

def pick_theme_text_from_paragraphs(paragraphs, keywords):
    """
    Collect the paragraphs that mention at least one keyword.

    Parameters
    ----------
    paragraphs : list or tuple of str
        Paragraphs of a single meeting. Any other type (for example a
        missing value) makes the function return None. Entries that are
        not strings are skipped instead of raising.
    keywords : iterable of str
        Substrings to search for. Matching is done against the lowercased
        paragraph, so the keywords are expected to be lowercase already.

    Returns
    -------
    str or None
        The matching paragraphs, in their original order, joined with
        newlines; None when no paragraph matches.
    """
    if not isinstance(paragraphs, (list, tuple)):
        return None

    # Materialise once so a one-shot iterable of keywords is not exhausted
    # after the first paragraph.
    keywords = tuple(keywords)

    matches = []
    for paragraph in paragraphs:
        # A None/NaN entry inside the list would otherwise crash on .lower().
        if not isinstance(paragraph, str):
            continue
        lowered = paragraph.lower()
        if any(keyword in lowered for keyword in keywords):
            matches.append(paragraph)

    return "\n".join(matches) if matches else None

def pick_theme_text_row(row, theme_keywords):
    """
    Return the theme-relevant text for one meeting row, or None.

    If the row has a list/tuple under COL_PARAGRAPHS and any of its
    paragraphs match, those paragraphs are returned. Otherwise the whole
    COL_FULL_TEXT string is returned when it contains any keyword.
    Keywords are compared against lowercased text.
    """
    paragraphs = row[COL_PARAGRAPHS] if COL_PARAGRAPHS in row else None
    if isinstance(paragraphs, (list, tuple)):
        from_paragraphs = pick_theme_text_from_paragraphs(paragraphs, theme_keywords)
        if from_paragraphs:
            return from_paragraphs

    # Cheap fallback: keep the entire statement when it mentions the theme.
    full_text = str(row[COL_FULL_TEXT])
    lowered = full_text.lower()
    mentions_theme = any(keyword in lowered for keyword in theme_keywords)
    return full_text if mentions_theme else None

# Add one text_<theme> column per theme with that meeting's matching text.
for theme_name, kwds in THEMES.items():
    # Lowercase once per theme; the matchers compare against lowercased text.
    lowered_kwds = [k.lower() for k in kwds]
    df[f"text_{theme_name}"] = df.apply(
        lambda r, _kw=lowered_kwds: pick_theme_text_row(r, _kw),
        axis=1,
    )

theme_text_cols = [f"text_{t}" for t in THEMES.keys()]
df[[COL_MEETING_ID] + theme_text_cols].head()


NameError: name 'df' is not defined

In [12]:
# ============================================================
# 6. Embed Themes & Compute Theme-Level Drift
# ============================================================

# Per-theme results keyed by theme name. Embedding lists hold None for
# meetings without usable theme text.
theme_embeddings = {}
theme_delta_prev = {}
theme_delta_roll = {}

for theme_name in THEMES.keys():
    # Encode each meeting's theme text one at a time, keeping None placeholders.
    emb_theme = [
        model.encode([txt], normalize_embeddings=True)[0]
        if isinstance(txt, str) and txt.strip()
        else None
        for txt in df[f"text_{theme_name}"].tolist()
    ]
    theme_embeddings[theme_name] = emb_theme

    n = len(emb_theme)
    d_prev = np.full(n, np.nan, dtype=float)
    d_roll = np.full(n, np.nan, dtype=float)

    # Embeddings of earlier meetings that covered this theme, oldest first.
    history = []
    for i, v in enumerate(emb_theme):
        if v is None:
            continue

        if history:
            # --- vs the most recent earlier meeting that has this theme ---
            d_prev[i] = 1 - float(v @ history[-1])

            # --- vs the centroid of the last ROLLING_WINDOW theme embeddings ---
            # A non-positive window selects nothing, leaving NaN.
            recent = history[-ROLLING_WINDOW:] if ROLLING_WINDOW > 0 else []
            if recent:
                # Accumulate from newest to oldest.
                acc = recent[-1].copy()
                for older in reversed(recent[:-1]):
                    acc += older
                acc = acc / np.linalg.norm(acc)
                d_roll[i] = 1 - float(v @ acc)

        history.append(v)

    theme_delta_prev[theme_name] = d_prev
    theme_delta_roll[theme_name] = d_roll
    df[f"delta_cos_prev_{theme_name}"] = d_prev
    df[f"delta_cos_roll_{theme_name}"] = d_roll

theme_prev_cols = [f"delta_cos_prev_{t}" for t in THEMES.keys()]
df[[COL_MEETING_ID, COL_MEETING_DATE] + theme_prev_cols].tail()


NameError: name 'df' is not defined

In [13]:
# ============================================================
# 7. Theme-level Visualization Example
# ============================================================

# Theme whose drift vs the previous meeting is plotted below.
theme_to_plot = "inflacao_projecoes"

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df[COL_MEETING_DATE], df[f"delta_cos_prev_{theme_to_plot}"], marker="o")
ax.set_title(f"Tema '{theme_to_plot}': Drift vs reunião anterior")
ax.set_xlabel("Data da reunião")
ax.set_ylabel("1 - cos(sim)")
ax.grid(True)
# Tilt the date labels so they do not overlap.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


NameError: name 'df' is not defined

<Figure size 1000x400 with 0 Axes>

In [14]:
# ============================================================
# 8. Build & Populate SQLite DB with Themes and Scores
# ============================================================

# File-backed SQLite engine; each to_sql call below replaces its table.
engine = sa.create_engine(f"sqlite:///{SQLITE_PATH}")

# ---------- 8.1 meetings table ----------
# Stored under fixed column names regardless of the configured ones.
meetings_df = df[[COL_MEETING_ID, COL_MEETING_DATE]].rename(
    columns={
        COL_MEETING_ID: "meeting_id",
        COL_MEETING_DATE: "meeting_date",
    }
)
meetings_df.to_sql("meetings", engine, if_exists="replace", index=False)

# ---------- 8.2 themes table ----------
# theme_id is the 1-based position of the theme in THEMES.
themes_list = [
    {
        "theme_id": theme_id,
        "theme_name": theme_label,
        "keywords": ", ".join(theme_kwds),
    }
    for theme_id, (theme_label, theme_kwds) in enumerate(THEMES.items(), start=1)
]
themes_df = pd.DataFrame(themes_list)
themes_df.to_sql("themes", engine, if_exists="replace", index=False)

themes_df


NameError: name 'df' is not defined

In [15]:
# ---------- 8.3 meeting_theme_scores table ----------
# One row per (meeting, theme) with the theme's drift scores.

theme_name_to_id = dict(zip(themes_df["theme_name"], themes_df["theme_id"]))


def _has_theme_text(value):
    """Return 1 when value is a non-blank string, else 0."""
    return int(isinstance(value, str) and bool(value.strip()))


records = [
    {
        "meeting_id": row[COL_MEETING_ID],
        "meeting_date": row[COL_MEETING_DATE],
        "theme_id": theme_name_to_id[theme_name],
        "theme_name": theme_name,
        "has_theme_text": _has_theme_text(row[f"text_{theme_name}"]),
        "delta_cos_prev": row[f"delta_cos_prev_{theme_name}"],
        "delta_cos_roll": row[f"delta_cos_roll_{theme_name}"],
    }
    for _, row in df.iterrows()
    for theme_name in THEMES.keys()
]

mt_scores_df = pd.DataFrame(records)
mt_scores_df.to_sql("meeting_theme_scores", engine, if_exists="replace", index=False)

mt_scores_df.head()


NameError: name 'themes_df' is not defined

In [16]:
# Example skeleton: plug in your own market data loader below.
#
# Expected swap_df columns:
#   'date'                        - datetime
#   'di1m', 'di3m', 'di1y', ...   - levels or changes
#   or 'swap_1y', 'swap_2y', etc.

# swap_df = load_your_swap_data(...)

swap_df["date"] = pd.to_datetime(swap_df["date"])

# Join keys: calendar dates with the time-of-day dropped on both sides.
swap_df["date_dateonly"] = swap_df["date"].dt.date
mt_scores_df["meeting_date_dateonly"] = mt_scores_df["meeting_date"].dt.date

# Left join keeps every (meeting, theme) row even without market data that day.
merged = pd.merge(
    left=mt_scores_df,
    right=swap_df,
    how="left",
    left_on="meeting_date_dateonly",
    right_on="date_dateonly",
)

merged.head()


NameError: name 'swap_df' is not defined