## 0. Imports & Setup
Consolidate imports and set up the OpenAI client. API keys are read from environment variables to avoid accidental commits.



In [None]:
import os
import json
import pandas as pd
import openai
from openai import OpenAI

# Read the key from the environment once so it never has to appear in the notebook.
_api_key = os.getenv("OPENAI_API_KEY")

# Set the key both on an explicit client object and on the `openai` module itself.
client = OpenAI(api_key=_api_key)
openai.api_key = _api_key

# Sanity check: prints True when a key was found, False otherwise.
print("OPENAI_API_KEY set:", bool(openai.api_key))

## 1. Configuration (placeholders)
Update the placeholders below before running. Names are preserved exactly as in your script.

In [None]:
# === CONFIGURATION ===
# Model name placed in every batch request body; adjust if needed.
MODEL = "gpt-4o-mini"

# CSV file that is loaded as the input data; replace with a different path if needed.
INPUT_CSV = r"../../../data/electoralTerm_19.csv"

# Number of rows per JSONL chunk, and the directory the chunk files are written to.
CHUNK_SIZE = 500
CHUNKS_DIR = "batch_chunks_500"

## 2. Ensure Output Directory & Load CSV
Creates the chunks directory (if needed) and loads the input data to `df`.

In [None]:
# === Create the chunk directory (no error if it is already there) ===
os.makedirs(CHUNKS_DIR, exist_ok=True)

# === Read the input CSV into a DataFrame and report its row count ===
df = pd.read_csv(INPUT_CSV)
print(f"📄 Loaded {df.shape[0]} speeches")

## 3. Prompt Template
Defines `build_prompt(text)` to produce a consistent classification instruction.


In [None]:
# === Prompt template ===
def build_prompt(text):
    """Return the 1-10 left-right classification prompt for one text.

    The prompt holds the task description, the ten-point reference scale
    with German party examples, the instruction to answer with the number
    only, and finally ``text`` wrapped in triple double quotes.

    Args:
        text: The speech or excerpt to classify; it is interpolated with
            default string formatting.

    Returns:
        The full prompt as a single newline-terminated string.
    """
    # One entry per prompt line. The two trailing spaces on scale lines 1-9
    # are part of the prompt text (presumably Markdown hard line breaks) and
    # must be kept so the prompt stays character-for-character the same.
    prompt_lines = [
        "You are a political analyst trained to classify political texts on a "
        "1-10 left-right ideological spectrum. Your task is to read a political "
        "speech or excerpt and assign a number from 1 (far-left) to 10 (far-right) "
        "based on its ideological content.",
        "",
        "",
        "Use this scale as a reference, based on common positions in German politics:",
        "",
        "",
        "1: Far-left, revolutionary socialism (e.g., MLPD, Antifa)  ",
        "2: Anti-capitalist democratic socialism (e.g., DIE LINKE, radical wing)  ",
        "3: Progressive left, social justice-focused (e.g., DIE LINKE, moderate)  ",
        "4: Center-left, reformist social democracy (e.g., SPD)  ",
        "5: Centrist, socially liberal or pragmatic (e.g., Volt, FDP, left-liberal)  ",
        "6: Center-right liberalism, market-oriented (e.g., FDP, neoliberal wing)  ",
        "7: Conservative mainstream (e.g., CDU/CSU)  ",
        "8: National-conservative or traditionalist right (e.g., WerteUnion)  ",
        "9: Right-wing populist, anti-immigration (e.g., AfD, moderate)  ",
        "10: Far-right nationalist or extremist (e.g., AfD, radical wing)",
        "",
        "",
        "Classify the following political text according to this scale and output "
        "**only the numeric value (1-10)**. Do not add explanations, comments, or "
        "any additional text.",
        "",
        "",
        "Text:",
        f'"""{text}"""',
        "",  # yields the trailing newline after the quoted text
    ]
    return "\n".join(prompt_lines)

## 4. Create JSONL Chunks
Splits the dataset into `CHUNK_SIZE` slices and writes one JSONL file per slice inside `CHUNKS_DIR`.
Each line contains a request object compatible with the Batch API.

In [None]:
# === Chunk and write JSONL files ===
# Slice the DataFrame into consecutive CHUNK_SIZE-row pieces (the last one may be shorter).
chunks = [df.iloc[i:i + CHUNK_SIZE] for i in range(0, len(df), CHUNK_SIZE)]
print(f"🧩 Splitting into {len(chunks)} chunks of {CHUNK_SIZE} rows each")

for i, chunk in enumerate(chunks):
    jsonl_path = os.path.join(CHUNKS_DIR, f"chunk_{i:03d}.jsonl")
    tasks_written = 0
    skipped_ids = []
    with open(jsonl_path, "w", encoding="utf-8") as f:
        # Pair each index label with its text directly; this avoids building a
        # full row Series per speech the way iterrows() does.
        for row_label, speech in zip(chunk.index, chunk["speechContent"]):
            if pd.isna(speech):
                # A missing speech would be formatted into the prompt as the
                # literal text "nan" and come back with a meaningless score.
                skipped_ids.append(str(row_label))
                continue
            messages = [
                {"role": "system", "content": "You are a political classification assistant."},
                {"role": "user", "content": build_prompt(speech)}
            ]
            task = {
                "custom_id": str(row_label),  # Keep global index
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": MODEL,
                    "temperature": 0.05,
                    "messages": messages
                }
            }
            f.write(json.dumps(task) + "\n")
            tasks_written += 1
    print(f"✅ Saved chunk {i+1} → {jsonl_path} with {tasks_written} tasks")
    if skipped_ids:
        print(f"⚠️ Skipped {len(skipped_ids)} rows without speechContent in chunk {i+1}: {', '.join(skipped_ids)}")

print("🎉 Done! You can now submit each JSONL to OpenAI Batch API.")

## 5. Submit a Batch Job
Uploads a selected JSONL chunk and creates a batch job with a 24-hour completion window. Note: batch jobs were chosen because batch tokens are cheaper. On a paid plan, it is also possible to run the analysis directly, without the batch limits.

In [None]:
# === Submit batch job to OpenAI ===
import openai

# Choose which chunk to upload (edit as needed)
jsonl_to_upload = os.path.join(CHUNKS_DIR, "chunk_000.jsonl")

# Upload the file for batch processing. The `with` block closes the file
# handle once the upload call returns, even if the call raises.
with open(jsonl_to_upload, "rb") as jsonl_file:
    upload = openai.files.create(
        file=jsonl_file,
        purpose="batch"
    )
file_id = upload.id
print(f"✅ File uploaded: {file_id}")

# Create the batch job for the uploaded file against the chat completions endpoint
batch = openai.batches.create(
    input_file_id=file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

# Keep the batch ID: it is needed later to look up the job's status and output file
batch_id = batch.id
print(f"🚀 Batch job submitted! Batch ID: {batch_id}")

## 6. Retrieve Result File ID from an Existing Batch
If you already have a `batch_id`, you can retrieve its status and the `output_file_id` (when completed).

In [None]:
# === Retrieve file id from batch job ===
batch_id = "<REPLACE_WITH_YOUR_BATCH_ID>"  # ← insert batch job ID (printed by the batch submission cell)

# Look up the batch and read its status once.
batch_info = openai.batches.retrieve(batch_id)
current_status = batch_info.status

# `result_file_id` is only assigned when the batch has completed.
if current_status != "completed":
    print("Batch not completed yet. Current status:", current_status)
else:
    result_file_id = batch_info.output_file_id
    print("🎉 Here's the file ID:", result_file_id)

## 7. Merge Scores Back Into CSV
Downloads the batch results and updates the CSV's `gpt_score` column. Only missing values are updated.

**Note**: This cell preserves your variable names and fixes a small function indentation bug.

In [None]:
# === 1. Config ===
csv_path = r"../../../data/electoralTerm_19_scored.csv" #replace with different file path if needed
# `result_file_id` must already be defined, i.e. the batch status cell must
# have run after the batch completed.

# === 2. Load the CSV ===
df = pd.read_csv(csv_path)
df['custom_id'] = df['custom_id'].astype(str)  # Ensure matching format (lookup keys below are strings)

# === 3. Retrieve the result file content from OpenAI ===
# `files.content()` replaces the deprecated `files.retrieve_content()`.
result_text = openai.files.content(result_file_id).text

# === 4. Parse JSONL into a lookup dict: {custom_id: content}
custom_id_to_score = {}
for line in result_text.splitlines():
    if not line.strip():
        continue  # a blank line carries no result and is not an error
    try:
        entry = json.loads(line)
        content = entry['response']['body']['choices'][0]['message']['content']
        # Strip surrounding whitespace so a reply such as "7\n" is stored as "7".
        custom_id_to_score[str(entry.get('custom_id'))] = content.strip()
    except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError) as e:
        # TypeError/AttributeError cover null values along the path (for
        # example a null 'response' or 'content'); other errors propagate.
        print(f"⚠️ Skipping malformed line: {e}")

# === 5. Update only missing gpt_score values ===
def update_score(row):
    """Return the row's existing gpt_score, or the batch result if it has none.

    Rows that already hold a score are left untouched; rows without one get
    the parsed batch content for their custom_id when there is a match.
    """
    existing = row.get('gpt_score')  # None when the CSV has no gpt_score column yet
    if pd.notna(existing):
        return existing
    return custom_id_to_score.get(str(row['custom_id']), existing)

df['gpt_score'] = df.apply(update_score, axis=1)

# === 6. Save the updated CSV ===
df.to_csv(csv_path, index=False)
print("✅ gpt_score column updated using result_file_id.")