In [1]:
import pandas as pd
import json
import os
import re
import ast
import time
from dotenv import load_dotenv
from google import genai
from google.genai import types

In [2]:
# Load every comment; rows with a non-null `true_label` supply the few-shot examples.
full_comments = pd.read_csv("../data/full_comments.csv")

# Ten labeled comments shown to the model as correct examples.
# The sample is seeded so the prompt built from `examples_json` is identical
# after a kernel restart (an unseeded sample changes the prompt on every run).
examples = (
    full_comments
    .loc[full_comments['true_label'].notna(), ["comment_text", "true_label"]]
    .sample(10, random_state=10)
    .rename(columns={"comment_text": "comment", "true_label": "label"})
)

# Previously mislabeled comments with the wrong and the corrected label.
# `names=` replaces the sheet's own header row with these column names.
negative_examples = pd.read_excel(
    "negative_examples.xlsx",
    names=['comment_text', 'original_label', 'corrected_label'],
)

# JSON renderings that get interpolated into the system prompt.
examples_json = json.dumps(examples.to_dict(orient="records"), indent=2)
negative_examples_json = json.dumps(negative_examples.to_dict(orient="records"), indent=2)

In [14]:
# Draw a fixed (seeded) 30,000-row sample, then keep every row that is NOT in it.
# The sample gets its own name so `subset` only ever means the remainder.
sampled = full_comments.sample(30000, random_state=10)

# Index labels of the rows outside the sample, in their original order.
indices = full_comments.index.drop(sampled.index)

# `.assign` returns a new frame, so the `label` column is never written onto a
# view of `full_comments`. `label` starts as a copy of `true_label`.
subset = full_comments.loc[indices, :].assign(label=lambda frame: frame['true_label'])

In [15]:
# Split the remaining rows into two positional halves; the first half gets
# the smaller share when the row count is odd.
half = len(subset) // 2
subset1 = subset.iloc[:half]
subset2 = subset.iloc[half:]

In [16]:
# Column holding the raw comment text, and the CSV that receives labeled rows.
COMMENT_COLUMN = "comment_text"
OUTPUT_CSV = "gemini_comments_labeled_p1.csv"

# Pull variables from a local .env file into the process environment, then
# read the Gemini key from it (None when the variable is not set).
load_dotenv()
api_key = os.getenv('GEMINI_API_KEY')

In [17]:
# ============================================================
# CONFIGURATION
# ============================================================

# Model identifier passed to `client.batches.create` in Step 1.
MODEL = "models/gemini-3-flash-preview"
# Number of comments packed into a single request (one user message).
BATCH_SIZE = 50
# Number of requests grouped into one submitted batch job; Step 1 notes this
# is a tier limit. Step 3 also uses it to skip over jobs that did not succeed.
MAX_REQUESTS_PER_JOB = 100
# JSON file where Step 1 stores the job names and the request -> comment-id
# mapping; Steps 2, 2.5 and 3 read it back.
JOB_NAMES_FILE = "gemini_job_names.json"

# ============================================================
# SYSTEM PROMPT
# ============================================================
# This is an f-string: `examples_json` and `negative_examples_json` are
# interpolated once, when this cell runs, so the cell that builds them must be
# executed first. Literal braces in the inline JSON example are doubled
# ({{ and }}) so the f-string does not treat them as placeholders.
SYSTEM_PROMPT = f"""You are a comment classifier. You will be given a batch of comments, each with an ID number. 
Classify each comment into exactly ONE of these five categories:

**Argumentative**
- Makes specific claims, predictions, or assertions supported by reasoning
- Uses evidence, anecdotes, or scenarios to build a case
- The key distinction from Opinion: there's an attempt to *persuade* or *explain why*, not just state a position

**Informational**
- Shares facts, data, links, or context relevant to the discussion
- Low emotional affect — the comment is trying to *inform*, not convince or react
- Includes answering another commenter's question with factual content
- The key distinction from Argumentative: presenting information without advocating for a position

**Opinion**
- States a value judgment, stance, or take without substantial reasoning
- "This is good/bad/wrong/overrated" — the comment *asserts* but doesn't *argue*
- The key distinction from Argumentative: no real attempt to persuade or support the claim
- The key distinction from Expressive: the comment is making a point, not just reacting

**Expressive**
- Emotional reactions, sarcasm, jokes, venting, exclamations
- The comment is primarily *expressing feeling* rather than making a point
- Includes performative agreement/disagreement ("THIS," "lol exactly," "what a joke")
- The key distinction from Opinion: no identifiable stance being taken, just affect

**Neutral**
- Clarifying or rhetorical questions, meta-commentary, off-topic remarks
- Comments that don't clearly fit the other four categories
- Includes simple factual questions directed at other commenters

**Correctly labeled examples** — these demonstrate the correct label for each comment:
{examples_json}

**Incorrectly labeled examples** — these were originally mislabeled. The "original_label" is the wrong label that was assigned, and the "corrected_label" is what the label should have been. Use these to understand common mistakes to avoid:
{negative_examples_json}

Respond with ONLY a valid JSON array where each element has "id", "label" keys and a confidence indicator where 
0 is not confident in the chosen label and 1 is confident in the chosen label.
Example: [{{"id": 0, "label": "Argumentative", "confidence": 1}}, {{"id": 1, "label": "Expressive", "confidence": 0}}]

Do not include any text outside the JSON array. No explanations, no markdown."""

# The five category names used in the prompt above. `parse_response` accepts a
# returned label only if it matches one of these (case-insensitively).
VALID_LABELS = {"Argumentative", "Informational", "Opinion", "Expressive", "Neutral"}


def format_batch(comments, max_chars=1500):
    """Render one batch of comments as the text of a single user message.

    Args:
        comments: Iterable of ``(idx, comment)`` pairs, where ``comment`` is a
            string. ``idx`` is echoed in square brackets so the model can
            refer back to it in its reply.
        max_chars: Maximum number of characters kept from each comment.
            ``None`` disables truncation. Defaults to 1500.

    Returns:
        The entries formatted as ``"[idx] text"`` and separated by blank
        lines; an empty string when ``comments`` is empty.
    """
    # Slicing already leaves shorter strings untouched, so no length check is needed.
    return "\n\n".join(f"[{idx}] {comment[:max_chars]}" for idx, comment in comments)


def parse_response(response_text, expected_ids):
    """Extract ``{id: {"label", "confidence"}}`` from a model reply.

    The reply is expected to be a JSON array of objects with ``id``,
    ``label`` and ``confidence`` keys. Parsing is deliberately forgiving:

    * a surrounding Markdown code fence is removed, with or without a newline
      after the opening fence;
    * if strict JSON fails, a Python literal is tried, then the outermost
      ``[...]`` span found anywhere in the text;
    * a single top-level object is treated as a one-element array;
    * an array wrapped in another array is unwrapped;
    * individual malformed items (non-dict, unknown id, non-string or unknown
      label) are skipped instead of discarding the whole batch;
    * an id returned as a string (``"12"`` or ``"[12]"``) is matched against
      the expected integer ids.

    Args:
        response_text: Raw text of the model reply (must be a string).
        expected_ids: Ids that were sent in this request; anything else in the
            reply is ignored.

    Returns:
        Dict mapping each recognised id to
        ``{"label": <canonical label>, "confidence": <value as returned>}``.
        Empty when nothing could be parsed. ``confidence`` defaults to ``""``
        when the model omitted it.

    Raises:
        TypeError: If the parsed payload is neither an array nor an object
            (for example a bare number or string).
    """
    text = response_text.strip()
    if text.startswith("```"):
        # Drop the opening fence line ("```" or "```json"). When the model put
        # everything on one line there is no newline to split on, so just
        # remove the three backticks.
        _, newline, remainder = text.partition("\n")
        text = remainder if newline else text[3:]
        text = text.rsplit("```", 1)[0]

    try:
        results = json.loads(text)
    except json.JSONDecodeError:
        try:
            # Handles single-quoted, Python-style output.
            results = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Last resort: pull the outermost [...] out of surrounding prose.
            match = re.search(r'\[.*\]', text, re.DOTALL)
            if not match:
                return {}
            try:
                results = json.loads(match.group())
            except json.JSONDecodeError:
                return {}

    if isinstance(results, dict):
        # A lone object instead of an array of objects.
        results = [results]
    elif not isinstance(results, (list, tuple)):
        raise TypeError(
            f"expected a JSON array of objects, got {type(results).__name__}"
        )

    if results and isinstance(results[0], list):
        results = results[0]

    expected = set(expected_ids)
    canonical = {valid.lower(): valid for valid in VALID_LABELS}

    labels = {}
    for item in results:
        if not isinstance(item, dict):
            continue

        idx = item.get("id")
        try:
            known = idx in expected
        except TypeError:
            # Unhashable id (e.g. a list) can never match.
            continue
        if not known and isinstance(idx, str):
            # Ids are sent as "[<idx>]", so tolerate "12" and "[12]".
            try:
                idx = int(idx.strip().strip("[]"))
            except ValueError:
                continue
            known = idx in expected
        if not known:
            continue

        raw_label = item.get("label", "")
        if not isinstance(raw_label, str):
            continue
        label = canonical.get(raw_label.strip().lower())
        if label is None:
            continue

        labels[idx] = {"label": label, "confidence": item.get("confidence", "")}
    return labels


def save_results(df):
    """Append ``df`` to ``OUTPUT_CSV`` and return everything now stored there.

    If the file already exists its rows are read back and ``df`` is
    concatenated after them with a fresh 0..n-1 index; otherwise ``df`` itself
    is written. The file is rewritten without the index column.

    Note that every call appends: passing the same frame twice stores its rows
    twice.

    Args:
        df: Frame of rows to add to the CSV.

    Returns:
        The frame that was written (``df`` itself when no file existed yet).
    """
    if not os.path.exists(OUTPUT_CSV):
        merged = df
    else:
        previous = pd.read_csv(OUTPUT_CSV)
        merged = pd.concat([previous, df], ignore_index=True)

    merged.to_csv(OUTPUT_CSV, index=False)
    return merged


# Single Gemini API client used by the submit, status-check and retrieval cells.
client = genai.Client(api_key=api_key)

In [19]:
# ============================================================
# STEP 1: Build inline requests and submit batch jobs
#         (100 requests per job due to tier limit)
# ============================================================


def _save_job_state(submitted_job_names, request_mapping):
    """Write the submitted job names and request->ids mapping to JOB_NAMES_FILE."""
    with open(JOB_NAMES_FILE, "w") as f:
        json.dump({"job_names": submitted_job_names, "batch_mapping": request_mapping}, f)


df = subset1.copy()

# Build batches of comments
unlabeled_mask = df["label"].isna()
unlabeled_indices = df[unlabeled_mask].index.tolist()
print(f"{len(unlabeled_indices)} comments to label")

batches = []
for i in range(0, len(unlabeled_indices), BATCH_SIZE):
    batch_indices = unlabeled_indices[i:i + BATCH_SIZE]
    batch = [(idx, str(df.loc[idx, COMMENT_COLUMN])) for idx in batch_indices]
    batches.append(batch)

print(f"{len(batches)} requests to submit")

# Build inline requests. `batch_mapping` records which comment ids went into
# request i (keys are strings because the mapping is round-tripped through JSON).
all_requests = []
batch_mapping = {}
for i, batch in enumerate(batches):
    expected_ids = [idx for idx, _ in batch]
    batch_mapping[str(i)] = expected_ids
    all_requests.append({
        'contents': [{
            'parts': [{'text': format_batch(batch)}],
            'role': 'user'
        }],
        'config': {
            'system_instruction': {'parts': [{'text': SYSTEM_PROMPT}]},
            'thinking_config': {'thinking_level': 'minimal'}
        }
    })

# Submit in groups of MAX_REQUESTS_PER_JOB
job_names = []
total_jobs = (len(all_requests) + MAX_REQUESTS_PER_JOB - 1) // MAX_REQUESTS_PER_JOB
try:
    for chunk_start in range(0, len(all_requests), MAX_REQUESTS_PER_JOB):
        chunk = all_requests[chunk_start:chunk_start + MAX_REQUESTS_PER_JOB]
        chunk_num = chunk_start // MAX_REQUESTS_PER_JOB

        batch_job = client.batches.create(
            model=MODEL,
            src=chunk,
            config={'display_name': f'labeling-chunk-{chunk_num}'}
        )
        job_names.append(batch_job.name)

        # Persist after every successful submission. If a later call fails
        # (e.g. 429 RESOURCE_EXHAUSTED) the jobs already created can still be
        # tracked and retrieved instead of being lost with the kernel state.
        _save_job_state(job_names, batch_mapping)

        print(f"  Chunk {chunk_num}: submitted {len(chunk)} requests -> {batch_job.name}")
        time.sleep(5)
except Exception:
    # Report progress, then let the original error surface unchanged.
    print(f"\nSubmission stopped after {len(job_names)} of {total_jobs} batch jobs.")
    if job_names:
        print(f"Jobs submitted so far were saved to {JOB_NAMES_FILE}")
    raise

# Save job names and mapping for retrieval later (also covers the case where
# there was nothing to submit, so the file always reflects this run).
_save_job_state(job_names, batch_mapping)

print(f"\n{len(job_names)} batch jobs submitted")
print(f"Saved to {JOB_NAMES_FILE}")
print("You can close your computer now.")

23830 comments to label
477 requests to submit


ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. ', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}]}}

In [8]:
# ============================================================
# STEP 2: Check batch status (run when you come back)
# ============================================================

# Reload the job names written by Step 1.
with open(JOB_NAMES_FILE, "r") as fh:
    saved = json.load(fh)
job_names = saved["job_names"]

# One status line per job, pausing between calls to stay under the rate limit.
for name in job_names:
    job = client.batches.get(name=name)
    print(f"{job.name}: {job.state.name}")
    time.sleep(2)

batches/keh46cnqeyjuktw5xu4gtr904g1qu7owo5b5: JOB_STATE_SUCCEEDED
batches/y8aors0wnhldx115ohdt0s48gwin7dsfpfrq: JOB_STATE_SUCCEEDED
batches/9uouipfoblrwb4muh28cdadfh490nre1qnhg: JOB_STATE_SUCCEEDED
batches/prlvcnjl4y95r2yvpvs5lxbvuj7rcr87k7yg: JOB_STATE_SUCCEEDED
batches/gzss7lkccrx40bpzul86sdiwjdwls4kuw21o: JOB_STATE_SUCCEEDED
batches/8ocypzlq34ogy680e23wrg9amzc1alc29735: JOB_STATE_SUCCEEDED


In [None]:
# ============================================================
# STEP 2.5: Check for errors in completed jobs
# ============================================================

with open(JOB_NAMES_FILE, "r") as fh:
    saved = json.load(fh)
job_names = saved["job_names"]

for name in job_names:
    job = client.batches.get(name=name)
    state = job.state.name

    if state == 'JOB_STATE_FAILED':
        print(f"{name}: FAILED")
        job_error = getattr(job, 'error', None)
        if job_error:
            print(f"  Error: {job_error}")
    elif state != 'JOB_STATE_SUCCEEDED':
        print(f"{name}: {state} (still running)")
    else:
        # A succeeded job can still contain individual requests that errored.
        request_errors = [
            (position, response.error)
            for position, response in enumerate(job.dest.inlined_responses)
            if response.error
        ]
        # Show at most the first five in detail, then summarise the rest.
        for position, request_error in request_errors[:5]:
            print(f"  {name} request {position}: {request_error}")

        error_count = len(request_errors)
        if error_count > 5:
            print(f"  ... and {error_count - 5} more errors")
        elif error_count == 0:
            print(f"{name}: all requests succeeded")

    # Delay between job retrievals to avoid 429 rate limit
    time.sleep(2)

In [None]:
# ============================================================
# STEP 3: Retrieve results and save
#         (run once all jobs show JOB_STATE_SUCCEEDED)
# ============================================================

with open(JOB_NAMES_FILE, "r") as f:
    saved = json.load(f)
    job_names = saved["job_names"]
    batch_mapping = saved["batch_mapping"]

df = subset1.copy()
total_labeled = 0
failed = 0
# Position of the next request across ALL jobs; it is the key into
# `batch_mapping`, so it must advance exactly once per request.
request_idx = 0
debug_responses = []  # Save problematic responses for inspection

for name in job_names:
    job = client.batches.get(name=name)
    time.sleep(2)  # Delay between job retrievals to avoid 429 rate limit

    if job.state.name != 'JOB_STATE_SUCCEEDED':
        print(f"  Skipping {name} — state: {job.state.name}")
        # Jump over this job's requests so later jobs stay aligned with the mapping.
        chunk_size = min(MAX_REQUESTS_PER_JOB, len(batch_mapping) - request_idx)
        request_idx += chunk_size
        continue

    for inline_response in job.dest.inlined_responses:
        # Advance the counter up front so every exit path below (each
        # `continue`) leaves it correct without repeating the increment.
        current_idx = request_idx
        request_idx += 1
        expected_ids = batch_mapping[str(current_idx)]

        if not inline_response.response:
            failed += 1
            print(f"  Request {current_idx} failed: {inline_response.error}")
            continue

        response_text = inline_response.response.text

        # Handle None or empty text (e.g. thinking consumed all tokens)
        if not response_text:
            failed += 1
            debug_responses.append({
                "request_idx": current_idx,
                "job_name": name,
                "issue": "response.text is None/empty",
                "response_obj": str(inline_response.response)
            })
            print(f"  Request {current_idx}: response.text is None — saved for debug")
            continue

        try:
            labels = parse_response(response_text, expected_ids)
        except (AttributeError, TypeError, KeyError, IndexError) as e:
            # Unexpected reply shape: record it and move on so one odd
            # response cannot abort retrieval of all the others.
            failed += 1
            debug_responses.append({
                "request_idx": current_idx,
                "job_name": name,
                "issue": str(e),
                "response_text": response_text[:500],
                "response_obj": str(inline_response.response)
            })
            print(f"  Request {current_idx}: parse error ({e}) — saved for debug")
            continue

        if not labels:
            # The reply was readable but produced no usable labels. Count it
            # as failed so the batch is not silently left unlabeled.
            failed += 1
            debug_responses.append({
                "request_idx": current_idx,
                "job_name": name,
                "issue": "no valid labels parsed",
                "response_text": response_text[:500],
                "response_obj": str(inline_response.response)
            })
            print(f"  Request {current_idx}: no valid labels parsed — saved for debug")
            continue

        for idx, value in labels.items():
            df.loc[idx, "label"] = value["label"]
            df.loc[idx, "confidence"] = value["confidence"]

        total_labeled += len(labels)

# NOTE: save_results appends `df` to whatever OUTPUT_CSV already contains, so
# running this cell more than once stores these rows more than once.
combined = save_results(df)

still_unlabeled = int(df["label"].isna().sum())

print(f"\nDONE — {total_labeled} comments labeled, {failed} requests failed")
print(f"{still_unlabeled} rows still have no label")
print(f"Saved to: {OUTPUT_CSV} ({len(combined)} total rows)")
print(f"\nLabel distribution:")
print(df["label"].value_counts().to_string())

if debug_responses:
    print(f"\n{len(debug_responses)} problematic responses saved to 'debug_responses' variable")
    with open("gemini_debug_responses.json", "w") as f:
        json.dump(debug_responses, f, indent=2)
    print("Also saved to gemini_debug_responses.json")

  Request 413: response.text is None — saved for debug

DONE — 29799 comments labeled, 1 requests failed
Saved to: gemini_comments_labeled_p1.csv (77513 total rows)

Label distribution:
label
Expressive       10525
Opinion           8626
Argumentative     6273
Neutral           2633
Informational     1807

1 problematic responses saved to 'debug_responses' variable
Also saved to gemini_debug_responses.json


In [11]:
# Reload the labeled rows from disk. The path is the same literal that the
# config cell assigns to OUTPUT_CSV.
df = pd.read_csv("gemini_comments_labeled_p1.csv")