## Gemini Prompting

In [None]:
import json
import math
import re
import time
from datetime import datetime, timezone
from itertools import islice

from datasets import load_from_disk
from google import genai
from google.api_core.exceptions import ResourceExhausted, ServiceUnavailable
from google.cloud import bigquery
from tenacity import (
    retry,
    retry_if_exception,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)
from tqdm.auto import tqdm
from transformers import AutoTokenizer

In [None]:
# === 1. Configuration ===
# Google Cloud project and the BigQuery destination for labelled chunks.
PROJECT_ID = "bamboo-mercury-462915-f0"
BQ_DATASET = "edgar_sentiment"
BQ_TABLE = "filing_scores"

# Locations: one for the Gemini client, one for the BigQuery dataset.
REGION_MODEL = "us-central1"
BQ_REGION = "europe-west2"

# Processing limits.
MAX_FILES = 1_000  # process at most 1 000 filings
MAX_TOKENS = 512   # maximum tokens per chunk produced by the tokenizer
STRIDE = 50        # overlap between chunks

In [None]:
# Filing sections that receive sentiment labels; every other section is
# ignored by the main loop.
TARGET_SECTIONS = {
    "section_7",
    "section_1A",
}

In [None]:
# === 2. Initialize clients & resources ===
# BigQuery client bound to the configured project.
bq = bigquery.Client(project=PROJECT_ID)

# Gen AI client configured for Vertex AI in the model region.
genai_client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=REGION_MODEL,
)

In [None]:
# Ensure BigQuery dataset exists
# The dataset is addressed by its fully-qualified "<project>.<dataset>" ID.
ds_ref = bigquery.Dataset(f"{PROJECT_ID}.{BQ_DATASET}")
# Set the location on the dataset reference before the create call.
ds_ref.location = BQ_REGION
# exists_ok=True: do not fail when the dataset is already there.
bq.create_dataset(ds_ref, exists_ok=True)

In [None]:
# === 2. Prepare the target table ===
# One row per labelled chunk. `blob_name` is the unique chunk key that the
# main loop also uses to skip work that was already done.
table_id = f"{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}"
schema = [
    bigquery.SchemaField("blob_name", "STRING"),
    bigquery.SchemaField("cik", "STRING"),
    bigquery.SchemaField("year", "STRING"),
    bigquery.SchemaField("section", "STRING"),
    bigquery.SchemaField("chunk_text", "STRING"),
    bigquery.SchemaField("sentiment_label", "STRING"),
    bigquery.SchemaField("sentiment_score", "FLOAT"),
]
table = bigquery.Table(table_id, schema=schema)
# The BigQuery client in this file is named `bq`; the previous `bq_client`
# was never defined and raised NameError.
bq.create_table(table, exists_ok=True)

In [None]:
# === 3. FinBERT tokenizer for 512-token chunks ===
# Load the tokenizer published with the "yiyanghkust/finbert-tone" checkpoint.
# token_chunker uses it to split section text into token-bounded chunks.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")

def token_chunker(text: str):
    """Yield ``text`` as overlapping chunks of at most ``MAX_TOKENS`` tokens.

    The text is tokenized once with overflow enabled, so every window of
    ``MAX_TOKENS`` tokens (consecutive windows overlap by ``STRIDE`` tokens)
    comes back as its own ``input_ids`` entry. Each window is decoded back to
    a string with the special tokens removed.

    Args:
        text: Raw section text. Empty or whitespace-only text yields nothing.

    Yields:
        Decoded chunk strings, in document order.
    """
    # Tokenizing an empty string still produces one window holding only
    # special tokens, which decodes to "" and would be sent for labelling.
    if not text or not text.strip():
        return

    enc = tokenizer(
        text,
        return_overflowing_tokens=True,
        truncation=True,
        max_length=MAX_TOKENS,
        stride=STRIDE,
    )
    for toks in enc["input_ids"]:
        yield tokenizer.decode(toks, skip_special_tokens=True)

In [None]:
# === 4. Load the filings corpus ===
# Read the saved corpus from disk, keep its "train" split, and shuffle it
# once with a fixed seed.
ds = load_from_disk("edgar-corpus-full")["train"].shuffle(seed=42)

In [None]:
# HTTP status codes treated as transient (rate limiting / server-side failure).
_RETRYABLE_STATUS_CODES = frozenset({429, 500, 502, 503, 504})


def _is_transient_error(exc: BaseException) -> bool:
    """Return True when ``exc`` looks like a rate-limit or availability error."""
    if isinstance(exc, (ResourceExhausted, ServiceUnavailable)):
        return True
    # NOTE(review): assumes the SDK's API errors expose the HTTP status as an
    # integer ``code`` attribute — confirm against the installed version.
    return getattr(exc, "code", None) in _RETRYABLE_STATUS_CODES


def analyze_sentiment(text: str) -> str:
    """Ask Gemini for a JSON sentiment verdict on ``text``.

    Transient failures (see ``_is_transient_error``) are retried with random
    exponential back-off, up to five attempts in total. Any other error, or
    the last transient one, propagates to the caller unchanged.

    Args:
        text: The excerpt to classify.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``""``
        when the response carries no text.
    """
    prompt = f"""
You are a financial analyst. Respond **only** with JSON matching this schema:
{{"sentiment":"Positive|Neutral|Negative","score":float,"explanation":string}}

Excerpt:
{text}
"""
    generate = retry(
        retry=retry_if_exception(_is_transient_error),
        wait=wait_random_exponential(multiplier=1, max=60),
        stop=stop_after_attempt(5),
        reraise=True,  # surface the original exception, not tenacity.RetryError
    )(genai_client.models.generate_content)

    resp = generate(
        model="gemini-2.0-flash",
        contents=[prompt],
    )
    # `resp.text` may be None when the response has no text part.
    return (resp.text or "").strip()

In [None]:
# === 5. Parser for Gemini JSON response ===
# Leading ```lang / trailing ``` markdown fence around the whole reply.
_CODE_FENCE_RE = re.compile(r"^```[A-Za-z]*\s*|\s*```$")
_SENTIMENT_RE = re.compile(r'"sentiment"\s*:\s*"(\w+)"')
_SCORE_RE = re.compile(r'"score"\s*:\s*([-+]?\d*\.?\d+)')


def _coerce_score(value):
    """Return ``value`` as a finite float, or None when it is not a number."""
    if value is None or isinstance(value, bool):
        return None
    try:
        score = float(value)
    except (TypeError, ValueError):
        return None
    return score if math.isfinite(score) else None


def parse_gemini(resp: str):
    """Extract ``(sentiment_label, sentiment_score)`` from a Gemini reply.

    The reply is first parsed as JSON, after removing a markdown code fence
    that wraps the whole reply if there is one. When that does not produce a
    JSON object, the two fields are pulled out with regular expressions.

    Args:
        resp: Raw reply text; ``None`` or ``""`` is treated as an empty reply.

    Returns:
        A ``(label, score)`` tuple. ``label`` is ``"Unknown"`` when no string
        sentiment is found. ``score`` is a finite float, or ``None`` when it
        is missing or not numeric.
    """
    raw = (resp or "").strip()
    unfenced = _CODE_FENCE_RE.sub("", raw).strip()

    try:
        obj = json.loads(unfenced)
    except json.JSONDecodeError:
        obj = None

    # Valid JSON that is not an object (list, string, number) has no fields
    # to read, so it takes the regex path as well.
    if isinstance(obj, dict):
        label = obj.get("sentiment", "Unknown")
        if not isinstance(label, str):
            label = "Unknown"
        return label, _coerce_score(obj.get("score"))

    # fallback to regex
    sent_m = _SENTIMENT_RE.search(raw)
    score_m = _SCORE_RE.search(raw)
    label = sent_m.group(1) if sent_m else "Unknown"
    score = _coerce_score(score_m.group(1)) if score_m else None
    return label, score

In [None]:
# === 7. Fetch already-processed blob_names from BQ ===
# Collect every chunk key already stored in the table so the main loop can
# skip chunks that were labelled in an earlier run.
processed = set()
processed.update(
    done.blob_name
    for done in bq.query(f"SELECT DISTINCT blob_name FROM `{table_id}`").result()
)

In [None]:
# === 8. Main loop: chunk → label → insert ===
# For each of the first MAX_FILES filings, split every target section into
# token-bounded chunks, ask Gemini for a sentiment verdict per chunk, and
# stream one row per chunk into BigQuery. Chunks whose key is already in
# `processed` are skipped, so the loop can be re-run after an interruption.
count = 0
for idx, row in tqdm(
    islice(enumerate(ds), MAX_FILES),
    # Keep the bar accurate when the dataset holds fewer than MAX_FILES rows.
    total=min(MAX_FILES, len(ds)),
    desc="Filings",
    unit="file"
):
    cik  = row["cik"]
    year = row["year"]
    for section in TARGET_SECTIONS:
        text = row.get(section) or ""
        if not text.strip():
            # Nothing to label; avoids sending an empty excerpt to Gemini.
            continue

        for cidx, chunk in enumerate(token_chunker(text)):
            blob_name = f"{cik}_{year}_{section}_{idx}_{cidx}"
            if blob_name in processed:
                continue
            if not chunk.strip():
                # Skipped in place so later chunks keep their original index.
                continue

            # Call Gemini; on failure, log it and move on to the next chunk.
            try:
                resp = analyze_sentiment(chunk)
            except Exception as e:
                print(f"[ERROR] Gemini failed for {blob_name}: {e}")
                continue

            label, score = parse_gemini(resp)

            # prepare row
            row_out = {
                "blob_name":       blob_name,
                "cik":             cik,
                "year":            year,
                "section":         section,
                "chunk_text":      chunk,
                "sentiment_label": label,
                "sentiment_score": score,
            }

            # insert with dedupe via row_ids
            try:
                errors = bq.insert_rows_json(
                    table_id,
                    [row_out],
                    row_ids=[blob_name]
                )
            except Exception as e:
                # An exception raised by the insert call must not abort the
                # whole run; the chunk stays unprocessed for the next run.
                print(f"[BQ ERROR] {blob_name}: {e}")
                continue

            if errors:
                print(f"[BQ ERROR] {blob_name}: {errors}")
            else:
                processed.add(blob_name)

    count += 1

print(f"✅ Completed sentiment labeling for {count} filings.")