##FOMC - Extraction (Steps 1-3)


In [21]:
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get("Hugging_Face_HF_Token")
assert hf_token, "Hugging_Face_HF_Token not found in Colab Secrets."

login(hf_token)
print("Logged in to Hugging Face.")

Logged in to Hugging Face.


In [4]:
!pip -q install transformers datasets accelerate evaluate scikit-learn pandas numpy matplotlib openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import os, re, json, math, random
import numpy as np
import pandas as pd

import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, pipeline,
    DataCollatorWithPadding, TrainingArguments, Trainer
)

device = 0 if torch.cuda.is_available() else -1
print("CUDA:", torch.cuda.is_available(), "device:", device)

CUDA: True device: 0


Step 1

In [6]:
PDF_PATH = "/content/FOMCpresconf20251029.pdf"

In [7]:
!pip -q install pypdf
from pypdf import PdfReader

def pdf_to_text(pdf_path: str) -> str:
    reader = PdfReader(pdf_path)
    pages = []
    for p in reader.pages:
        pages.append(p.extract_text() or "")
    return "\n".join(pages)

raw_text = pdf_to_text(PDF_PATH)
print("Chars:", len(raw_text))
print(raw_text[:1500])

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.2/328.2 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hChars: 55519
October 29, 2025   Chair Powell’s Press Conference  FINAL 
Page 1 of 28 
 
Transcript of Chair Powell’s Press Conference 
October 29, 2025 
 
CHAIR POWELL.  Good afternoon.  My colleagues and I remain squarely focused on 
achieving our dual-mandate goals of maximum employment and stable prices for the benefit of 
the American people.  Although some important federal government data have been delayed due 
to the shutdown, the public- and private-sector data that have remained available suggest that the 
outlook for employment and inflation has not changed much since our meeting in September.  
Conditions in the labor market appear to be gradually cooling, and inflation remains somewhat 
elevated. 
In support of our goals, and in light of 

We will pass transcript text and ask GPT to return strict JSON of Q&A pairs.

In [11]:
from google.colab import userdata
from openai import OpenAI

api_key = userdata.get("OpenAI_API")
assert api_key, "OpenAI_API secret not found in Colab Secrets."

client = OpenAI(api_key=api_key)

print("OpenAI client initialized successfully.")

OpenAI client initialized successfully.


In [12]:
resp = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Reply with OK"}],
    temperature=0
)
print(resp.choices[0].message.content)

OK


In [13]:
raw_text = pdf_to_text(PDF_PATH)

In [14]:
QA_EXTRACTION_PROMPT = """
You are extracting question-answer (Q&A) pairs from a Federal Reserve press conference transcript.

Rules:
- A Q&A pair is ONE question and its immediate answer by the Chair.
- Follow-up questions count as separate questions.
- Exclude moderator transitions (e.g., "MICHELLE SMITH: Nick").
- Do NOT merge multiple questions together.
- Preserve original wording.
- Output STRICT JSON ONLY.

Schema:
{
  "pairs": [
    {"question": "...", "answer": "..."}
  ]
}

Transcript:
<<TRANSCRIPT>>
"""

In [15]:
import json

def chunk_text(text, max_chars=12000):
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + max_chars)
        chunks.append(text[start:end])
        start = end
    return chunks

chunks = chunk_text(raw_text, max_chars=12000)
print("Number of chunks:", len(chunks))
print("First chunk preview:\n", chunks[0][:500])

Number of chunks: 5
First chunk preview:
 October 29, 2025   Chair Powell’s Press Conference  FINAL 
Page 1 of 28 
 
Transcript of Chair Powell’s Press Conference 
October 29, 2025 
 
CHAIR POWELL.  Good afternoon.  My colleagues and I remain squarely focused on 
achieving our dual-mandate goals of maximum employment and stable prices for the benefit of 
the American people.  Although some important federal government data have been delayed due 
to the shutdown, the public- and private-sector data that have remained available suggest th


In [18]:
pairs = []

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")

    prompt = QA_EXTRACTION_PROMPT.replace("<<TRANSCRIPT>>", chunk)

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    content = resp.choices[0].message.content

    # Robust JSON parsing: extract JSON from potential markdown code blocks
    json_match = re.search(r"```json\n([\s\S]*?)\n```", content)
    if json_match:
        json_string = json_match.group(1)
    else:
        json_string = content.strip()

    try:
        data = json.loads(json_string)
        pairs.extend(data.get("pairs", []))
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError in chunk {i+1}: {e}")
        print(f"Problematic content: {json_string}")
        continue # Skip this chunk and continue to the next one

print("Total extracted pairs:", len(pairs))

Processing chunk 1/5
Processing chunk 2/5
Processing chunk 3/5
Processing chunk 4/5
Processing chunk 5/5
Total extracted pairs: 30


In [19]:
import pandas as pd

df_powell = pd.DataFrame(pairs)
df_powell["question"] = df_powell["question"].astype(str).str.strip()
df_powell["answer"] = df_powell["answer"].astype(str).str.strip()

df_powell = df_powell[
    (df_powell["question"].str.len() > 0) &
    (df_powell["answer"].str.len() > 0)
].drop_duplicates().reset_index(drop=True)

df_powell.to_csv("powell_qa.csv", index=False)

print("Saved powell_qa.csv with rows:", len(df_powell))
df_powell.head(10)


Saved powell_qa.csv with rows: 30


Unnamed: 0,question,answer
0,"Chair Powell, are you uncomfortable with how m...","Well, as I just mentioned, a further reduction..."
1,At what point do you conclude that you’ve take...,So the way we have been thinking—the way I’ve ...
2,"Was there any consideration, for instance, of ...",I wouldn’t say that’s a—that’s a factor in eve...
3,How much of the fund impressions we’ve seen in...,"That could be one of the factors, but the real..."
4,So much of the rationale for cutting interest ...,"Yeah, I mean in principle if you were to see d..."
5,But if this shutdown lasts a while longer and ...,Yeah. So we’ll—we get—I mentioned what we get ...
6,"Mr. Chairman, can you characterize the meeting...",So I was referring to the discussion about—to ...
7,Just a follow-up on the balance sheet. If you ...,So you’re right. The place we’ll be on Decembe...
8,How are officials interpreting the latest CPI ...,"So, okay, so the September CPI report, we didn..."
9,"With a stubborn services inflation, what are s...","Oh, stubborn services inflation. Well, again, ..."


Add SubjECTive pretrained labels (assertive/optimistic/specific)

In [22]:
df_powell = pd.read_csv("powell_qa.csv")
df_powell["qa_text"] = "Question: " + df_powell["question"].astype(str) + "\nAnswer: " + df_powell["answer"].astype(str)
df_powell.head(2)

Unnamed: 0,question,answer,qa_text
0,"Chair Powell, are you uncomfortable with how m...","Well, as I just mentioned, a further reduction...","Question: Chair Powell, are you uncomfortable ..."
1,At what point do you conclude that you’ve take...,So the way we have been thinking—the way I’ve ...,Question: At what point do you conclude that y...


In [23]:
from transformers import pipeline
import torch, re

device = 0 if torch.cuda.is_available() else -1
print("device:", device)

SUBJECTIVE_MODELS = {
    "assertive": "gtfintechlab/SubjECTiveQA-ASSERTIVE",
    "optimistic": "gtfintechlab/SubjECTiveQA-OPTIMISTIC",
    "specific": "gtfintechlab/SubjECTiveQA-SPECIFIC",
}

pipes = {}
for feat, mid in SUBJECTIVE_MODELS.items():
    pipes[feat] = pipeline(
        "text-classification",
        model=mid,
        tokenizer=mid,
        device=device,
        truncation=True,
        max_length=512
    )

print("Loaded SubjECTive models:", list(pipes.keys()))

device: 0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/910 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/910 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/871 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


Loaded SubjECTive models: ['assertive', 'optimistic', 'specific']


In [24]:
def label_to_int(label: str):
    # Expect LABEL_0 / LABEL_1 / LABEL_2
    m = re.search(r"(\d+)$", label)
    return int(m.group(1)) if m else None

def run_model(pipe, texts, batch_size=16):
    out = []
    for i in range(0, len(texts), batch_size):
        preds = pipe(texts[i:i+batch_size])
        out.extend([label_to_int(p["label"]) for p in preds])
    return out

texts = df_powell["qa_text"].tolist()

for feat in ["assertive", "optimistic", "specific"]:
    df_powell[f"{feat}_subj"] = run_model(pipes[feat], texts)

df_powell[["assertive_subj","optimistic_subj","specific_subj"]].head()

Unnamed: 0,assertive_subj,optimistic_subj,specific_subj
0,1,1,1
1,1,1,1
2,1,1,1
3,1,1,1
4,2,1,1


In [25]:
df_powell.to_csv("powell_step3_subjective.csv", index=False)
print("Saved powell_step3_subjective.csv")

Saved powell_step3_subjective.csv


Label SubjECTive-QA with GPT (forward-looking + certain)

In [29]:
from datasets import load_dataset
import pandas as pd

ds = load_dataset('gtfintechlab/SubjECTive-QA', '5768')
print(ds)

subj_train = ds["train"].to_pandas()
subj_test  = ds["test"].to_pandas()

print("Original columns:", subj_train.columns.tolist())

# --- FORCE column normalization ---
if "question" not in subj_train.columns:
    subj_train = subj_train.rename(columns={"QUESTION": "question"})
    subj_test  = subj_test.rename(columns={"QUESTION": "question"})

if "answer" not in subj_train.columns:
    subj_train = subj_train.rename(columns={"ANSWER": "answer"})
    subj_test  = subj_test.rename(columns={"ANSWER": "answer"})

# sanity check
assert "question" in subj_train.columns
assert "answer" in subj_train.columns

# build qa_text
subj_train["qa_text"] = (
    "Question: " + subj_train["question"].astype(str)
    + "\nAnswer: " + subj_train["answer"].astype(str)
)

subj_test["qa_text"] = (
    "Question: " + subj_test["question"].astype(str)
    + "\nAnswer: " + subj_test["answer"].astype(str)
)

subj_train[["question", "answer"]].head()


DatasetDict({
    train: Dataset({
        features: ['COMPANYNAME', 'QUARTER', 'YEAR', 'ASKER', 'RESPONDER', 'QUESTION', 'ANSWER', 'CLEAR', 'ASSERTIVE', 'CAUTIOUS', 'OPTIMISTIC', 'SPECIFIC', 'RELEVANT', '__index_level_0__'],
        num_rows: 1922
    })
    test: Dataset({
        features: ['COMPANYNAME', 'QUARTER', 'YEAR', 'ASKER', 'RESPONDER', 'QUESTION', 'ANSWER', 'CLEAR', 'ASSERTIVE', 'CAUTIOUS', 'OPTIMISTIC', 'SPECIFIC', 'RELEVANT', '__index_level_0__'],
        num_rows: 577
    })
    val: Dataset({
        features: ['COMPANYNAME', 'QUARTER', 'YEAR', 'ASKER', 'RESPONDER', 'QUESTION', 'ANSWER', 'CLEAR', 'ASSERTIVE', 'CAUTIOUS', 'OPTIMISTIC', 'SPECIFIC', 'RELEVANT', '__index_level_0__'],
        num_rows: 248
    })
})
Original columns: ['COMPANYNAME', 'QUARTER', 'YEAR', 'ASKER', 'RESPONDER', 'QUESTION', 'ANSWER', 'CLEAR', 'ASSERTIVE', 'CAUTIOUS', 'OPTIMISTIC', 'SPECIFIC', 'RELEVANT', '__index_level_0__']


Unnamed: 0,question,answer
0,Did you open up any new Apartment Guides in th...,Not in the second quarter. In the first quarte...
1,"So the first one, just curious if you could pr...","So the nice part is, as we launched this prog..."
2,"Okay, great. That's helpful. And then just one...","As is typical, when we give guidance, we base ..."
3,"Right, that’s understandable. And just to clar...","Yes. it’s a little, it’s a bit uncertain, I me..."
4,I wanted to ask a couple about the elacestrant...,"At this point, we're not providing guidance on..."


In [31]:
import json
import re
import time

FORWARD_PROMPT = """
You are labeling a Q&A pair.

Forward-looking (1): discusses future economic events, future policy decisions, future outlook, projections, or what will happen.
Not forward-looking (0): focuses on past or current conditions/explanations without discussing the future.

Return STRICT JSON ONLY:
{"forward_looking": 1 or 0}

Q&A:
<<QA>>
"""

CERTAIN_PROMPT = """
You are labeling a Q&A pair.

Certain (1): the answer presents information definitively/committed with minimal hedging.
Uncertain (0): the answer includes speculation/possibility/conditions/hedging (might, could, depends, we’ll see, uncertain).

Return STRICT JSON ONLY:
{"certain": 1 or 0}

Q&A:
<<QA>>
"""

def extract_json(content: str):
    # handles occasional extra text (rare, but safe)
    m = re.search(r"\{.*\}", content, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON object found")
    return json.loads(m.group(0))

def gpt_json(prompt: str, model="gpt-4o-mini", max_retries=3):
    for attempt in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role":"user","content":prompt}],
                temperature=0
            )
            return extract_json(resp.choices[0].message.content)
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            time.sleep(1.0 * (attempt + 1))

In [32]:
import numpy as np

N_LABEL = 1000  # start here, then bump to 3000 once it works end-to-end
subj_labeled = subj_train.sample(n=min(N_LABEL, len(subj_train)), random_state=7).copy()

fw, ce = [], []
for i, qa in enumerate(subj_labeled["qa_text"].tolist(), start=1):
    if i % 50 == 0:
        print(f"Labeled {i}/{len(subj_labeled)}")

    fw.append(gpt_json(FORWARD_PROMPT.replace("<<QA>>", qa))["forward_looking"])
    ce.append(gpt_json(CERTAIN_PROMPT.replace("<<QA>>", qa))["certain"])

subj_labeled["forward_looking"] = fw
subj_labeled["certain"] = ce

print("Forward-looking balance:\n", subj_labeled["forward_looking"].value_counts())
print("\nCertain balance:\n", subj_labeled["certain"].value_counts())

subj_labeled.to_csv("subjective_labeled_forward_certain.csv", index=False)
print("\nSaved subjective_labeled_forward_certain.csv with rows:", len(subj_labeled))

Labeled 50/1000
Labeled 100/1000
Labeled 150/1000
Labeled 200/1000
Labeled 250/1000
Labeled 300/1000
Labeled 350/1000
Labeled 400/1000
Labeled 450/1000
Labeled 500/1000
Labeled 550/1000
Labeled 600/1000
Labeled 650/1000
Labeled 700/1000
Labeled 750/1000
Labeled 800/1000
Labeled 850/1000
Labeled 900/1000
Labeled 950/1000
Labeled 1000/1000
Forward-looking balance:
 forward_looking
1    725
0    275
Name: count, dtype: int64

Certain balance:
 certain
0    724
1    276
Name: count, dtype: int64

Saved subjective_labeled_forward_certain.csv with rows: 1000


checkpoint

In [33]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [34]:
import shutil
shutil.copy(
    "subjective_labeled_forward_certain.csv",
    "/content/drive/MyDrive/subjective_labeled_forward_certain.csv"
)

print("Saved checkpoint to Google Drive.")

Saved checkpoint to Google Drive.


Validation sample (required by rubric)

In [None]:
val25 = subj_train.sample(n=25, random_state=42)[["question","answer","qa_text"]].copy()
val25.to_csv("prompt_validation_25_unlabeled.csv", index=False)
print("Saved prompt_validation_25_unlabeled.csv — add forward_true and certain_true (0/1) later.")
val25.head()

Fine-tune DistilBERT for Forward-looking

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("subjective_labeled_forward_certain.csv").dropna(subset=["qa_text","forward_looking","certain"]).copy()
df["forward_looking"] = df["forward_looking"].astype(int)
df["certain"] = df["certain"].astype(int)

# split for forward-looking
train_df, test_df = train_test_split(df, test_size=0.2, random_state=7, stratify=df["forward_looking"])
train_df, val_df  = train_test_split(train_df, test_size=0.2, random_state=7, stratify=train_df["forward_looking"])

print("Sizes:", len(train_df), len(val_df), len(test_df))
print("Forward class balance (train):\n", train_df["forward_looking"].value_counts())

Sizes: 640 160 200
Forward class balance (train):
 forward_looking
1    464
0    176
Name: count, dtype: int64


In [38]:
import inspect
import numpy as np
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

acc = evaluate.load("accuracy")
f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="binary")["f1"]
    }

def to_hf_dataset(df_in, label_col):
    out = df_in[["qa_text", label_col]].rename(columns={label_col: "labels"}).copy()
    return Dataset.from_pandas(out)

def finetune_distilbert(label_col: str, out_dir: str):
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    ds_train = to_hf_dataset(train_df, label_col)
    ds_val   = to_hf_dataset(val_df, label_col)
    ds_test  = to_hf_dataset(test_df, label_col)

    def tokenize(batch):
        return tokenizer(batch["qa_text"], truncation=True, max_length=256)

    ds_train = ds_train.map(tokenize, batched=True)
    ds_val   = ds_val.map(tokenize, batched=True)
    ds_test  = ds_test.map(tokenize, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    )

    # ------------------------------
    # VERSION-SAFE TrainingArguments
    # ------------------------------
    ta_kwargs = dict(
        output_dir=out_dir,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        num_train_epochs=3,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        logging_steps=25,
        report_to="none"
    )

    sig = inspect.signature(TrainingArguments.__init__).parameters
    if "evaluation_strategy" in sig:
        ta_kwargs["evaluation_strategy"] = "epoch"
    elif "eval_strategy" in sig:
        ta_kwargs["eval_strategy"] = "epoch"
    else:
        raise RuntimeError(
            "TrainingArguments has neither evaluation_strategy nor eval_strategy"
        )

    args = TrainingArguments(**ta_kwargs)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    print("TEST METRICS:", trainer.evaluate(ds_test))

    trainer.save_model(out_dir)
    tokenizer.save_pretrained(out_dir)

    return out_dir


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [39]:
forward_dir = finetune_distilbert(
    label_col="forward_looking",
    out_dir="distilbert_forward_model"
)

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5806,0.505204,0.7875,0.867188
2,0.4507,0.47875,0.7625,0.844262
3,0.417,0.482811,0.7875,0.862903


TEST METRICS: {'eval_loss': 0.48393675684928894, 'eval_accuracy': 0.79, 'eval_f1': 0.8695652173913043, 'eval_runtime': 1.3358, 'eval_samples_per_second': 149.726, 'eval_steps_per_second': 9.732, 'epoch': 3.0}


In [40]:
!zip -r distilbert_forward_model.zip distilbert_forward_model
print("Saved distilbert_forward_model.zip")

  adding: distilbert_forward_model/ (stored 0%)
  adding: distilbert_forward_model/config.json (deflated 45%)
  adding: distilbert_forward_model/model.safetensors (deflated 8%)
  adding: distilbert_forward_model/training_args.bin (deflated 53%)
  adding: distilbert_forward_model/checkpoint-80/ (stored 0%)
  adding: distilbert_forward_model/checkpoint-80/config.json (deflated 45%)
  adding: distilbert_forward_model/checkpoint-80/model.safetensors (deflated 8%)
  adding: distilbert_forward_model/checkpoint-80/training_args.bin (deflated 53%)
  adding: distilbert_forward_model/checkpoint-80/special_tokens_map.json (deflated 42%)
  adding: distilbert_forward_model/checkpoint-80/vocab.txt (deflated 53%)
  adding: distilbert_forward_model/checkpoint-80/tokenizer_config.json (deflated 75%)
  adding: distilbert_forward_model/checkpoint-80/rng_state.pth (deflated 26%)
  adding: distilbert_forward_model/checkpoint-80/trainer_state.json (deflated 64%)
  adding: distilbert_forward_model/checkpoint

Fine-tune DistilBERT for Certain

In [41]:
from sklearn.model_selection import train_test_split

# df is your full labeled dataset (subjective_labeled_forward_certain.csv already loaded earlier)
# If needed:
# df = pd.read_csv("subjective_labeled_forward_certain.csv").dropna(subset=["qa_text","forward_looking","certain"]).copy()
# df["forward_looking"] = df["forward_looking"].astype(int)
# df["certain"] = df["certain"].astype(int)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=7, stratify=df["certain"])
train_df, val_df  = train_test_split(train_df, test_size=0.2, random_state=7, stratify=train_df["certain"])

print("Sizes:", len(train_df), len(val_df), len(test_df))
print("Certain class balance (train):\n", train_df["certain"].value_counts())


Sizes: 640 160 200
Certain class balance (train):
 certain
0    463
1    177
Name: count, dtype: int64


In [42]:
certain_dir = finetune_distilbert(
    label_col="certain",
    out_dir="distilbert_certain_model"
)

!zip -r distilbert_certain_model.zip distilbert_certain_model
print("Saved distilbert_certain_model.zip")


Map:   0%|          | 0/640 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5946,0.576688,0.725,0.0
2,0.5366,0.566727,0.725,0.0
3,0.5449,0.552815,0.725,0.0


TEST METRICS: {'eval_loss': 0.5682868361473083, 'eval_accuracy': 0.725, 'eval_f1': 0.0, 'eval_runtime': 1.3941, 'eval_samples_per_second': 143.46, 'eval_steps_per_second': 9.325, 'epoch': 3.0}
  adding: distilbert_certain_model/ (stored 0%)
  adding: distilbert_certain_model/config.json (deflated 45%)
  adding: distilbert_certain_model/model.safetensors (deflated 8%)
  adding: distilbert_certain_model/training_args.bin (deflated 53%)
  adding: distilbert_certain_model/checkpoint-80/ (stored 0%)
  adding: distilbert_certain_model/checkpoint-80/config.json (deflated 45%)
  adding: distilbert_certain_model/checkpoint-80/model.safetensors (deflated 8%)
  adding: distilbert_certain_model/checkpoint-80/training_args.bin (deflated 53%)
  adding: distilbert_certain_model/checkpoint-80/special_tokens_map.json (deflated 42%)
  adding: distilbert_certain_model/checkpoint-80/vocab.txt (deflated 53%)
  adding: distilbert_certain_model/checkpoint-80/tokenizer_config.json (deflated 75%)
  adding: dis

In [43]:
import shutil

shutil.copy("distilbert_forward_model.zip", "/content/drive/MyDrive/distilbert_forward_model.zip")
shutil.copy("distilbert_certain_model.zip", "/content/drive/MyDrive/distilbert_certain_model.zip")

print("Saved model zips to Drive.")

Saved model zips to Drive.


Apply DistilBERT models to Powell Q&A

In [44]:
import pandas as pd

powell = pd.read_csv("powell_qa.csv")

powell["qa_text"] = (
    "Question: " + powell["question"].astype(str)
    + "\nAnswer: " + powell["answer"].astype(str)
)

powell.head()

Unnamed: 0,question,answer,qa_text
0,"Chair Powell, are you uncomfortable with how m...","Well, as I just mentioned, a further reduction...","Question: Chair Powell, are you uncomfortable ..."
1,At what point do you conclude that you’ve take...,So the way we have been thinking—the way I’ve ...,Question: At what point do you conclude that y...
2,"Was there any consideration, for instance, of ...",I wouldn’t say that’s a—that’s a factor in eve...,"Question: Was there any consideration, for ins..."
3,How much of the fund impressions we’ve seen in...,"That could be one of the factors, but the real...",Question: How much of the fund impressions we’...
4,So much of the rationale for cutting interest ...,"Yeah, I mean in principle if you were to see d...",Question: So much of the rationale for cutting...


In [45]:
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1
print("device:", device)

forward_pipe = pipeline(
    "text-classification",
    model="distilbert_forward_model",
    tokenizer="distilbert_forward_model",
    device=device,
    truncation=True,
    max_length=256
)

certain_pipe = pipeline(
    "text-classification",
    model="distilbert_certain_model",
    tokenizer="distilbert_certain_model",
    device=device,
    truncation=True,
    max_length=256
)

device: 0


Device set to use cuda:0
Device set to use cuda:0


In [46]:
import re

def label_to_int(label):
    m = re.search(r"(\d+)$", label)
    return int(m.group(1)) if m else None

powell["forward_looking"] = [
    label_to_int(p["label"])
    for p in forward_pipe(powell["qa_text"].tolist())
]

powell["certain"] = [
    label_to_int(p["label"])
    for p in certain_pipe(powell["qa_text"].tolist())
]

powell[["forward_looking","certain"]].value_counts(dropna=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
forward_looking,certain,Unnamed: 2_level_1
1,0,22
0,0,8


In [47]:
subj_powell = pd.read_csv("powell_step3_subjective.csv")

powell = powell.merge(
    subj_powell[
        ["question","assertive_subj","optimistic_subj","specific_subj"]
    ],
    on="question",
    how="left"
)

powell.head()

Unnamed: 0,question,answer,qa_text,forward_looking,certain,assertive_subj,optimistic_subj,specific_subj
0,"Chair Powell, are you uncomfortable with how m...","Well, as I just mentioned, a further reduction...","Question: Chair Powell, are you uncomfortable ...",1,0,1,1,1
1,At what point do you conclude that you’ve take...,So the way we have been thinking—the way I’ve ...,Question: At what point do you conclude that y...,1,0,1,1,1
2,"Was there any consideration, for instance, of ...",I wouldn’t say that’s a—that’s a factor in eve...,"Question: Was there any consideration, for ins...",0,0,1,1,1
3,How much of the fund impressions we’ve seen in...,"That could be one of the factors, but the real...",Question: How much of the fund impressions we’...,1,0,1,1,1
4,So much of the rationale for cutting interest ...,"Yeah, I mean in principle if you were to see d...",Question: So much of the rationale for cutting...,1,0,2,1,1


In [48]:
powell.to_csv("powell_labeled_full.csv", index=False)
print("Saved powell_labeled_full.csv")

Saved powell_labeled_full.csv


Hawkish / Dovish / Neutral labels (GPT)

In [49]:
import json, re, time

HAWK_DOVE_PROMPT = """
You are labeling the TONE of a Federal Reserve Chair's answer.

Definitions:
- Hawkish (2): emphasizes inflation risks, tighter policy, higher rates for longer, need to restrain demand, concern about inflation persistence.
- Dovish (0): emphasizes growth/employment risks, downside risks, openness to easing, inflation improving, patience, or support for accommodation.
- Neutral (1): balanced, technical, or non-directional; acknowledges both sides; mostly explanatory without a clear tilt.

Return STRICT JSON ONLY:
{"hawk_dove_neutral": 0 or 1 or 2}

Answer:
<<ANSWER>>
"""

def extract_json_obj(content: str):
    m = re.search(r"\{.*\}", content, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON found")
    return json.loads(m.group(0))

def gpt_label_hdn(answer: str, model="gpt-4o-mini", max_retries=3):
    prompt = HAWK_DOVE_PROMPT.replace("<<ANSWER>>", answer)
    for attempt in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role":"user","content":prompt}],
                temperature=0
            )
            return extract_json_obj(resp.choices[0].message.content)["hawk_dove_neutral"]
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(1.0 * (attempt + 1))

In [50]:
powell = pd.read_csv("powell_labeled_full.csv")

labels = []
for i, ans in enumerate(powell["answer"].astype(str).tolist(), start=1):
    if i % 10 == 0:
        print(f"Labeled {i}/{len(powell)}")
    labels.append(gpt_label_hdn(ans))

powell["hawk_dove_neutral"] = labels

print(powell["hawk_dove_neutral"].value_counts())
powell.to_csv("powell_labeled_full.csv", index=False)
print("Updated powell_labeled_full.csv with hawk/dove/neutral.")

Labeled 10/30
Labeled 20/30
Labeled 30/30
hawk_dove_neutral
1    28
0     2
Name: count, dtype: int64
Updated powell_labeled_full.csv with hawk/dove/neutral.


Counts + Correlation Matrix

In [51]:
import numpy as np

df = pd.read_csv("powell_labeled_full.csv")

count_cols = [
    "forward_looking",
    "certain",
    "hawk_dove_neutral",
    "assertive_subj",
    "optimistic_subj",
    "specific_subj"
]

for c in count_cols:
    print("\n", c)
    print(df[c].value_counts(dropna=False))


 forward_looking
forward_looking
1    22
0     8
Name: count, dtype: int64

 certain
certain
0    30
Name: count, dtype: int64

 hawk_dove_neutral
hawk_dove_neutral
1    28
0     2
Name: count, dtype: int64

 assertive_subj
assertive_subj
1    16
2    14
Name: count, dtype: int64

 optimistic_subj
optimistic_subj
1    29
2     1
Name: count, dtype: int64

 specific_subj
specific_subj
1    16
2    14
Name: count, dtype: int64


In [52]:
corr_cols = [
    "forward_looking",
    "certain",
    "hawk_dove_neutral",
    "assertive_subj",
    "optimistic_subj",
    "specific_subj"
]

corr_df = df[corr_cols].copy()

corr_pearson = corr_df.corr(method="pearson")
corr_spearman = corr_df.corr(method="spearman")

corr_pearson.to_csv("powell_label_corr_pearson.csv")
corr_spearman.to_csv("powell_label_corr_spearman.csv")

print("Saved powell_label_corr_pearson.csv and powell_label_corr_spearman.csv")
corr_spearman

Saved powell_label_corr_pearson.csv and powell_label_corr_spearman.csv


Unnamed: 0,forward_looking,certain,hawk_dove_neutral,assertive_subj,optimistic_subj,specific_subj
forward_looking,1.0,,-0.161165,0.412984,0.111979,0.564076
certain,,,,,,
hawk_dove_neutral,-0.161165,,1.0,-0.285714,0.049629,-0.285714
assertive_subj,0.412984,,-0.285714,1.0,0.198517,0.732143
optimistic_subj,0.111979,,0.049629,0.198517,1.0,0.198517
specific_subj,0.564076,,-0.285714,0.732143,0.198517,1.0
