In [1]:
import pandas as pd
import json
import os
import re
import ast
from dotenv import load_dotenv
from openai import OpenAI

In [2]:
# Load the full comment table; rows with a non-null `true_label` form the
# pool the correctly-labelled few-shot examples are drawn from.
full_comments = pd.read_csv("../data/full_comments.csv")

# Seeded sample: these ten rows are interpolated into SYSTEM_PROMPT, so an
# unseeded draw would silently change the prompt on every kernel restart.
examples = full_comments.loc[
    full_comments['true_label'].notna(), ["comment_text", "true_label"]
].sample(10, random_state=10)

# Spreadsheet of previously mislabelled comments; `names=` replaces whatever
# header the sheet has with these three column names.
negative_examples = pd.read_excel("negative_examples.xlsx", names=['comment_text', 'original_label', 'corrected_label'])

# Use the same key names ("comment", "label") the prompt text talks about.
examples = examples.rename(columns={"comment_text": "comment", "true_label": "label"})

# Serialise both example sets as pretty-printed JSON record lists.
examples_json = json.dumps(examples.to_dict(orient="records"), indent=2)
negative_examples_json = json.dumps(negative_examples.to_dict(orient="records"), indent=2)

In [3]:
# Draw a reproducible 30,000-row sample; the working set is every row that is
# NOT part of that sample.
sampled = full_comments.sample(30000, random_state=10)

# Index labels of the rows outside the sample, kept in their original order.
indices = full_comments.index[~full_comments.index.isin(sampled.index)].tolist()

# `label` starts out as a copy of `true_label`.
subset = full_comments.loc[indices, :].assign(label=lambda frame: frame['true_label'])

In [None]:
# Split the working set into two positional halves; with an odd number of
# rows the extra row lands in subset2.
half = len(subset) // 2
subset1, subset2 = subset.iloc[:half], subset.iloc[half:]

In [29]:
# Column of the comment table that holds the text sent to the model.
COMMENT_COLUMN = "comment_text"
# CSV file that save_results() reads back and rewrites.
OUTPUT_CSV = "openai_comments_labeled_p2.csv"

# load_dotenv() runs before the lookup so a key supplied through a .env file
# is visible in the environment; api_key is None when the variable is unset.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

In [30]:
# ============================================================
# CONFIGURATION
# ============================================================

# Model name placed in the "model" field of every batch request body.
MODEL = "gpt-5-mini"
# Number of comments grouped into a single chat-completion request.
BATCH_SIZE = 30
# Text file the submitted batch job ID is written to and later read back from.
BATCH_ID_FILE = "openai_batch_id.txt"
# JSONL file of request objects that is uploaded as the batch input.
BATCH_INPUT_FILE = "openai_batch_input.jsonl"

# ============================================================
# SYSTEM PROMPT
# ============================================================
# This is an f-string: {examples_json} and {negative_examples_json} are
# interpolated when the cell runs, and the doubled braces ({{ }}) in the
# "Example:" line produce literal braces in the final prompt text.
SYSTEM_PROMPT = f"""You are a comment classifier. You will be given a batch of comments, each with an ID number. 
Classify each comment into exactly ONE of these five categories:

**Argumentative**
- Makes specific claims, predictions, or assertions supported by reasoning
- Uses evidence, anecdotes, or scenarios to build a case
- The key distinction from Opinion: there's an attempt to *persuade* or *explain why*, not just state a position

**Informational**
- Shares facts, data, links, or context relevant to the discussion
- Low emotional affect — the comment is trying to *inform*, not convince or react
- Includes answering another commenter's question with factual content
- The key distinction from Argumentative: presenting information without advocating for a position

**Opinion**
- States a value judgment, stance, or take without substantial reasoning
- "This is good/bad/wrong/overrated" — the comment *asserts* but doesn't *argue*
- The key distinction from Argumentative: no real attempt to persuade or support the claim
- The key distinction from Expressive: the comment is making a point, not just reacting

**Expressive**
- Emotional reactions, sarcasm, jokes, venting, exclamations
- The comment is primarily *expressing feeling* rather than making a point
- Includes performative agreement/disagreement ("THIS," "lol exactly," "what a joke")
- The key distinction from Opinion: no identifiable stance being taken, just affect

**Neutral**
- Clarifying or rhetorical questions, meta-commentary, off-topic remarks
- Comments that don't clearly fit the other four categories
- Includes simple factual questions directed at other commenters

**Correctly labeled examples** — these demonstrate the correct label for each comment:
{examples_json}

**Incorrectly labeled examples** — these were originally mislabeled. The "original_label" is the wrong label that was assigned, and the "corrected_label" is what the label should have been. Use these to understand common mistakes to avoid:
{negative_examples_json}

Respond with ONLY a valid JSON array where each element has "id", "label" keys and a confidence indicator where 
0 is not confident in the chosen label and 1 is confident in the chosen label.
Example: [{{"id": 0, "label": "Argumentative", "confidence": 1}}, {{"id": 1, "label": "Expressive", "confidence": 0}}]

Do not include any text outside the JSON array. No explanations, no markdown."""

# Label names parse_response() accepts (matched case-insensitively); an item
# carrying any other label is skipped.
VALID_LABELS = {"Argumentative", "Informational", "Opinion", "Expressive", "Neutral"}


def format_batch(comments, max_chars=1500):
    """Render a batch of comments as the user message of one request.

    Args:
        comments: Iterable of ``(idx, comment)`` pairs, where ``comment`` is
            a string.
        max_chars: Maximum number of characters kept from each comment.
            Defaults to 1500, the limit that used to be hard-coded here;
            ``None`` disables truncation.

    Returns:
        A single string with one ``[idx] text`` entry per comment, entries
        separated by a blank line. An empty iterable yields ``""``.
    """
    # Slicing leaves shorter comments untouched, so no explicit length check
    # is needed before truncating.
    return "\n\n".join(
        f"[{idx}] {comment[:max_chars]}" for idx, comment in comments
    )


def _load_json_like(text):
    """Return the Python object encoded in ``text``, or None if nothing parses."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    try:
        # Replies written with single quotes / Python literals are not valid
        # JSON but can still be read safely with literal_eval.
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass
    # Last resort: pull the outermost [...] span out of surrounding prose.
    match = re.search(r'\[.*\]', text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            return None
    return None


def parse_response(response_text, expected_ids):
    """Extract per-comment labels from one model reply.

    Malformed replies never raise: anything that cannot be interpreted is
    skipped, so a single bad reply cannot abort processing of a results file.

    Args:
        response_text: Raw message content returned by the model. A value
            that is not a string (e.g. ``None``) yields an empty result.
        expected_ids: Collection of comment IDs that were sent in the
            request; items with any other ID are ignored.

    Returns:
        Dict mapping each accepted ID to
        ``{"label": <name from VALID_LABELS>, "confidence": <value>}``, where
        the confidence is whatever the reply contained (``""`` when absent).
        Returns ``{}`` when nothing usable could be parsed.
    """
    if not isinstance(response_text, str):
        return {}

    text = response_text.strip()
    if text.startswith("```"):
        # Drop the opening fence line (e.g. ```json) and the closing fence.
        # partition() copes with a fence that has no newline after it, where
        # split("\n", 1)[1] would raise IndexError.
        _, newline, remainder = text.partition("\n")
        text = remainder if newline else text[3:]
        text = text.rsplit("```", 1)[0]

    results = _load_json_like(text)

    if isinstance(results, dict):
        if "id" in results:
            # A single bare item instead of an array.
            results = [results]
        else:
            # A wrapper object such as {"results": [...]}: use its first list.
            results = next(
                (value for value in results.values() if isinstance(value, list)),
                [],
            )
    if not isinstance(results, (list, tuple)):
        return {}

    # Some replies nest the array one level too deep: [[{...}, {...}]].
    if results and isinstance(results[0], list):
        results = results[0]

    labels = {}
    for item in results:
        if not isinstance(item, dict):
            continue

        idx = item.get("id")
        # IDs occasionally come back quoted ("12"); map them onto the integer
        # ID only when the raw value is not itself an expected ID.
        if (
            idx not in expected_ids
            and isinstance(idx, str)
            and re.fullmatch(r"-?\d+", idx.strip())
        ):
            idx = int(idx.strip())
        if idx not in expected_ids:
            continue

        label = item.get("label", "")
        if not isinstance(label, str):
            # null / numeric labels cannot be matched against VALID_LABELS.
            continue
        label = label.strip()
        if label not in VALID_LABELS:
            # Accept a label that differs only in letter case.
            label = next(
                (valid for valid in VALID_LABELS if valid.lower() == label.lower()),
                None,
            )
            if label is None:
                continue

        labels[idx] = {"label": label, "confidence": item.get("confidence", "")}
    return labels


def save_results(df):
    """Append ``df`` to the CSV at OUTPUT_CSV and return the file's full contents.

    When the file already exists its rows are read back and ``df`` is
    concatenated after them with a fresh 0..n-1 index; otherwise ``df`` itself
    is used unchanged. Either way the result is written to OUTPUT_CSV without
    the index column and returned.
    """
    if not os.path.exists(OUTPUT_CSV):
        merged = df
    else:
        previous = pd.read_csv(OUTPUT_CSV)
        merged = pd.concat([previous, df], ignore_index=True)

    merged.to_csv(OUTPUT_CSV, index=False)
    return merged


# Shared OpenAI client; the later cells use it to upload the batch input file,
# create and poll the batch job, and download result/error files.
client = OpenAI(api_key=api_key)

In [31]:
# ============================================================
# STEP 1: Build .jsonl file, upload, and submit batch
# ============================================================

df = subset2.copy()

# Rows whose `label` is still missing are the ones sent to the model.
unlabeled_mask = df["label"].isna()
unlabeled_indices = df[unlabeled_mask].index.tolist()
print(f"{len(unlabeled_indices)} comments to label")

# Chunk the unlabeled rows into lists of (index label, comment text) pairs,
# BATCH_SIZE comments per request.
batches = []
for i in range(0, len(unlabeled_indices), BATCH_SIZE):
    batch_indices = unlabeled_indices[i:i + BATCH_SIZE]
    batch = [(idx, str(df.loc[idx, COMMENT_COLUMN])) for idx in batch_indices]
    batches.append(batch)

# Mapping of batch number (as a string) -> the IDs sent in that request, so
# the retrieval step can validate the IDs the model returns.
batch_mapping = {}

# Write one request object per line; custom_id encodes the batch number.
with open(BATCH_INPUT_FILE, "w", encoding="utf-8") as f:
    for i, batch in enumerate(batches):
        expected_ids = [idx for idx, _ in batch]
        batch_mapping[str(i)] = expected_ids
        request = {
            "custom_id": f"batch_{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": MODEL,
                "max_completion_tokens": 8092,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": format_batch(batch)}
                ]
            }
        }
        f.write(json.dumps(request) + "\n")

print(f"{len(batches)} requests written to {BATCH_INPUT_FILE}")

# Upload the file. The handle is opened in a `with` block so it is closed
# once the upload returns instead of leaking for the life of the kernel.
with open(BATCH_INPUT_FILE, "rb") as upload_handle:
    batch_file = client.files.create(
        file=upload_handle,
        purpose="batch"
    )
print(f"File uploaded: {batch_file.id}")

# Submit the batch
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

# Save batch_id and mapping for retrieval later (both are written only after
# a successful submission, so the two files always describe the same job).
with open(BATCH_ID_FILE, "w") as f:
    f.write(batch_job.id)

with open("openai_batch_mapping.json", "w") as f:
    json.dump(batch_mapping, f)

print(f"Batch submitted! ID: {batch_job.id}")
print(f"Saved batch_id to {BATCH_ID_FILE} and mapping to openai_batch_mapping.json")
print("You can close your computer now.")

23819 comments to label
794 requests written to openai_batch_input.jsonl
File uploaded: file-XtQGENXQWXnLmhFhNFsNeh
Batch submitted! ID: batch_699376e36e388190b6b2f3ec5ea2f993
Saved batch_id to openai_batch_id.txt and mapping to openai_batch_mapping.json
You can close your computer now.


In [34]:
# ============================================================
# STEP 2: Check batch status (run when you come back)
# ============================================================

# Recover the job ID saved at submission time.
with open(BATCH_ID_FILE, "r") as id_file:
    batch_id = id_file.read().strip()

# Fetch the job once and report its state and per-request counts.
status = client.batches.retrieve(batch_id)
print(f"Status: {status.status}", f"Counts: {status.request_counts}", sep="\n")

Status: in_progress
Counts: BatchRequestCounts(completed=778, failed=0, total=794)


In [8]:
# Check error details: when the job has an error file, save it locally and
# print the first five entries.
batch_job = client.batches.retrieve(batch_id)
if batch_job.error_file_id:
    error_content = client.files.content(batch_job.error_file_id).content
    with open("openai_batch_errors.jsonl", "wb") as sink:
        sink.write(error_content)

    with open("openai_batch_errors.jsonl", "r") as errors_in:
        for line_no, raw_line in enumerate(errors_in, start=1):
            err = json.loads(raw_line.strip())
            print(f"{err['custom_id']}: {err['error']}")
            # Stop after the fifth entry and mark the output as truncated.
            if line_no == 5:
                print("...")
                break

In [None]:
# ============================================================
# STEP 3: Retrieve results and save (run once status is "completed")
# ============================================================

with open(BATCH_ID_FILE, "r") as f:
    batch_id = f.read().strip()

with open("openai_batch_mapping.json", "r") as f:
    batch_mapping = json.load(f)

# Download results file. Fail with a clear message when the job has not
# produced one, instead of passing None to the files API.
batch_job = client.batches.retrieve(batch_id)
if batch_job.output_file_id is None:
    raise RuntimeError(
        f"Batch {batch_id} has no output file (status: {batch_job.status}); "
        "re-run this cell once the status is 'completed'."
    )
result_content = client.files.content(batch_job.output_file_id).content

result_file = "openai_batch_results.jsonl"
with open(result_file, "wb") as f:
    f.write(result_content)

print(f"Results downloaded to {result_file}")

# Parse results
df = subset2.copy()
total_labeled = 0
failed = 0

with open(result_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines in the JSONL file.
            continue
        result = json.loads(line)
        # custom_id has the form "batch_<n>"; <n> keys into batch_mapping.
        batch_idx = result["custom_id"].split("_")[1]
        expected_ids = batch_mapping[batch_idx]

        body = (result.get("response") or {}).get("body") or {}
        choices = body.get("choices") or []

        # A request counts as failed when the line carries an error or the
        # response body has no choices to read a message from.
        if result.get("error") is not None or not choices:
            failed += 1
            print(f"  Batch {batch_idx} failed: {result.get('error') or body.get('error')}")
            continue

        # Message content may be None or empty; treat that as "no labels"
        # rather than crashing inside parse_response.
        response_text = choices[0]["message"]["content"]
        labels = parse_response(response_text or "", expected_ids)
        if not labels:
            print(f"  Batch {batch_idx} returned no parsable labels")

        for idx, value in labels.items():
            df.loc[idx, "label"] = value["label"]
            df.loc[idx, "confidence"] = value["confidence"]

        total_labeled += len(labels)

combined = save_results(df)

print(f"\nDONE \u2014 {total_labeled} comments labeled, {failed} batches failed")
print(f"Saved to: {OUTPUT_CSV} ({len(combined)} total rows)")
print("\nLabel distribution:")
print(df["label"].value_counts().to_string())

Results downloaded to openai_batch_results.jsonl

DONE — 23827 comments labeled, 0 batches failed
Saved to: openai_comments_labeled_p1.csv (23857 total rows)

Label distribution:
label
Argumentative    8287
Expressive       5317
Opinion          4908
Informational    3196
Neutral          2146
