## 0) Imports & Setup
We import the required libraries and read the Gemini API key. For safety, the key is read from the `GEMINI_API_KEY` environment variable instead of being hard-coded in the notebook; the client itself is configured in section 2.



In [None]:
import os
import time
import textwrap
import pandas as pd
import google.generativeai as genai
from tqdm import tqdm

# The Gemini key comes from the environment so it is never stored in the
# notebook itself; later cells read it through the name API_KEY.
API_KEY = os.environ.get("GEMINI_API_KEY")

# Report only whether a non-empty key was found, never the key itself.
print("GEMINI_API_KEY set:", True if API_KEY else False)

## 1) Configuration (repo-relative paths)


In [None]:
# === CONFIGURATION ===
# Input and output CSV locations (repo-relative paths; being relative, they
# resolve against the current working directory).
INPUT_CSV = "../../../data/electoralTerm_19.csv"
OUTPUT_CSV = "../../../data/electoralTerm_19_scored_gemini_2_5.csv"

# Number of rows per chunk; progress is written to OUTPUT_CSV after each chunk.
CHUNK_SIZE = 500

# Gemini model identifier; change the model here if needed.
MODEL_NAME = "gemini-2.5-flash-lite"

# Pause in seconds after each request; adjust if needed.
SLEEP_BETWEEN_REQUESTS = 0.5

## 2) Initialize Gemini
Configures the Gemini client with your key and creates the model object.
The variable names used here (`API_KEY`, `MODEL_NAME`, `model`) are the same ones the other cells rely on.

In [None]:
# === Initialize Gemini ===
# With a key present, configure the client and create the model object used by
# the processing cell; without one, stop here with an explicit error.
if API_KEY:
    genai.configure(api_key=API_KEY)
    model = genai.GenerativeModel(MODEL_NAME)
else:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Please set it in your environment before running this cell."
    )

## 3) Load Data
Reads the CSV to `df` and adds a `custom_id` column to preserve original indexing for joins.

In [None]:
# === Load Data ===
df = pd.read_csv(INPUT_CSV)

# Store each row's index label as a string in `custom_id`, so the original
# position stays available as an ordinary column.
df = df.assign(custom_id=df.index.astype(str))

print(f"📄 Loaded {len(df)} rows from {INPUT_CSV}")

## 4) Load Existing Results (if any)


In [None]:
# === Load existing results if available ===
# Default: no row has a score yet.
previous_scores = pd.NA

# If an earlier run left an output file with a `gemini_score` column, reuse
# those scores so the processing cell can skip rows that are already done.
# The assignment below aligns the saved column with `df` by index label.
if os.path.exists(OUTPUT_CSV):
    existing_df = pd.read_csv(OUTPUT_CSV)
    if 'gemini_score' in existing_df.columns:
        previous_scores = existing_df['gemini_score']

df['gemini_score'] = previous_scores

# Optional: save a safety snapshot of current df state
# df.to_csv(OUTPUT_CSV, index=False)

## 5) Prompt Template
Builds the classification instruction for each `speechContent` text. 

In [None]:
# === Prompt template ===
def build_prompt(text):
    """Build the 1-10 left-right classification prompt for a single text.

    Args:
        text: The speech or excerpt to classify. It is interpolated as-is
            between triple double quotes at the end of the prompt.

    Returns:
        The complete prompt string: the analyst instruction, the ten-point
        reference scale, the "output only the numeric value" instruction and
        the quoted text, followed by a final newline.
    """
    role = (
        "You are a political analyst trained to classify political texts on a "
        "1-10 left-right ideological spectrum. Your task is to read a political "
        "speech or excerpt and assign a number from 1 (far-left) to 10 "
        "(far-right) based on its ideological content."
    )
    scale_heading = (
        "Use this scale as a reference, based on common positions in German politics:"
    )
    scale_entries = (
        "1: Far-left, revolutionary socialism (e.g., MLPD, Antifa)",
        "2: Anti-capitalist democratic socialism (e.g., DIE LINKE, radical wing)",
        "3: Progressive left, social justice-focused (e.g., DIE LINKE, moderate)",
        "4: Center-left, reformist social democracy (e.g., SPD)",
        "5: Centrist, socially liberal or pragmatic (e.g., Volt, FDP, left-liberal)",
        "6: Center-right liberalism, market-oriented (e.g., FDP, neoliberal wing)",
        "7: Conservative mainstream (e.g., CDU/CSU)",
        "8: National-conservative or traditionalist right (e.g., WerteUnion)",
        "9: Right-wing populist, anti-immigration (e.g., AfD, moderate)",
        "10: Far-right nationalist or extremist (e.g., AfD, radical wing)",
    )
    task = (
        "Classify the following political text according to this scale and "
        "output **only the numeric value (1-10)**. Do not add explanations, "
        "comments, or any additional text."
    )

    # Every scale entry except the last ends with two spaces before its
    # newline (presumably Markdown hard line breaks); they are part of the
    # prompt text and are written explicitly so they cannot be lost.
    scale = "  \n".join(scale_entries)

    # Sections are separated by two blank lines, i.e. three newlines.
    body = "\n\n\n".join((role, scale_heading, scale, task))
    return f'{body}\n\n\nText:\n"""{text}"""\n'

## 6) Process in Chunks & Save Progress
Iterates the dataset in `CHUNK_SIZE` blocks, queries Gemini, parses a numeric score, and writes progress to CSV after each chunk.

In [None]:
# === Process Remaining Entries Only ===
# Walks the dataset in CHUNK_SIZE blocks, asks Gemini for a score for every row
# that has none yet, and writes the whole frame to OUTPUT_CSV after each chunk
# so that an interrupted run can be resumed.
import re

# A standalone integer from 1 to 10, i.e. not part of a longer digit run such
# as "2024" or "110".
_SCORE_PATTERN = re.compile(r"(?<!\d)(?:10|[1-9])(?!\d)")


def _parse_score(raw_text):
    """Return the first standalone 1-10 integer in ``raw_text``, or None.

    Taking the first in-range number (instead of concatenating every digit in
    the reply) keeps answers such as "7/10" or "Score: 7 (1-10 scale)" from
    turning into 710 or 7110, and rejects values outside the scale.
    """
    found = _SCORE_PATTERN.search(raw_text)
    return int(found.group()) if found else None


total_rows = len(df)
for i in tqdm(range(0, total_rows, CHUNK_SIZE), desc="🔄 Processing chunks"):
    chunk = df.iloc[i:i + CHUNK_SIZE]

    for idx, row in chunk.iterrows():
        if pd.notna(row['gemini_score']):
            continue  # already done, skip

        text = row['speechContent']
        if pd.isna(text) or not str(text).strip():
            # Empty CSV cells are read as NaN and would be sent to the model
            # as the literal text "nan"; leave such rows unscored instead.
            continue

        prompt = build_prompt(text)

        # Deliberately broad: this is the per-row boundary, so one failed
        # request is logged and the run continues with the next row.
        try:
            response = model.generate_content(prompt, generation_config={"temperature": 0.05})
            # None (no usable 1-10 number in the reply) leaves the row unscored.
            df.at[idx, 'gemini_score'] = _parse_score(response.text)
        except Exception as e:
            print(f"⚠️ Error on row {idx}: {e}")
            df.at[idx, 'gemini_score'] = None

        time.sleep(SLEEP_BETWEEN_REQUESTS)

    # ✅ Save progress after each chunk
    df.to_csv(OUTPUT_CSV, index=False)
    # Clamp to the row count so the final, shorter chunk is reported correctly.
    print(f"💾 Progress saved after chunk ending at row {min(i + CHUNK_SIZE, total_rows)}")