In [4]:
# imports
import os
import json
import re
import time
import requests
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
import sys

In [5]:
# Config
MODEL = "gpt-4o-mini"
API_URL = "https://openrouter.ai/api/v1/chat/completions"

load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise RuntimeError("OPENROUTER_API_KEY not set")

HEADERS = {
    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
    "Content-Type": "application/json",
}

In [6]:
df = pd.read_csv(
    "data_training_selected_clusters_comments_and_rules.csv",
    usecols=["body", "assigned_rule_cluster"]
)

df["assigned_rule_cluster"] = (
    df["assigned_rule_cluster"].astype(str).str.strip()
)

VALID_LABELS = sorted(df["assigned_rule_cluster"].unique())

In [7]:
def sample_few_shot_examples(
    df,
    n_per_label=2,
    random_state=None
):
    """
    Returns list of (text, label)
    """
    samples = []

    for label, group in df.groupby("assigned_rule_cluster"):
        k = min(n_per_label, len(group))
        sampled = group.sample(k, random_state=random_state)
        for _, row in sampled.iterrows():
            samples.append((row["body"], label))

    return samples

In [8]:
def build_prompt(comment, valid_labels, few_shot_examples=None):
    if few_shot_examples:
        examples = "\n\n".join(
            f"Comment: {text}\nLabel: {label}"
            for text, label in few_shot_examples
        )

        return f"""
Task: Assign exactly ONE label to the comment.

Valid labels:
{chr(10).join(valid_labels)}

Examples:
{examples}

Comment:
{comment}

Output (label only):
""".strip()
    else:
        return f"""
Task: Assign exactly ONE label to the comment.

Valid labels:
{chr(10).join(valid_labels)}

Comment:
{comment}

Output (label only):
""".strip()


In [9]:
def classify_comment(comment, valid_labels, few_shot_examples=None, retries=3):
    prompt = build_prompt(comment, valid_labels, few_shot_examples)

    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
    }

    for _ in range(retries):
        try:
            r = requests.post(API_URL, headers=HEADERS, json=payload, timeout=30)
            r.raise_for_status()

            output = r.json()["choices"][0]["message"]["content"].strip()
            return output if output in valid_labels else "error"

        except Exception:
            time.sleep(2)

    return "error"


In [10]:
def classify_tuples_from_dataset(
    tuples,
    df,
    mode="zero",
    n_per_label=2,
    random_state=None
):
    """
    tuples: list of (id, text)
    Returns: list of JSON-serializable dicts
    """

    valid_labels = sorted(df["assigned_rule_cluster"].unique())

    few_shot_examples = None
    if mode == "few":
        few_shot_examples = sample_few_shot_examples(
            df,
            n_per_label=n_per_label,
            random_state=random_state
        )

    results = []

    for id_, text in tqdm(tuples):
        label = classify_comment(
            text,
            valid_labels,
            few_shot_examples
        )

        results.append({
            "id": id_,
            "text": text,
            "predicted_cluster": label
        })

    return results


In [11]:
subset = df.sample(10, random_state=42)
tuples = list(zip(subset.index, subset["body"]))

results = classify_tuples_from_dataset(
    tuples,
    df,
    mode="few",
    n_per_label=2,
    random_state=1
)
with open("classification_results.json", "w") as f:
    json.dump(results, f, indent=2)

100%|██████████| 10/10 [00:08<00:00,  1.16it/s]


In [None]:
results_df = pd.read_json("classification_results.json")

# join with ground truth
merged = results_df.merge(
    df.reset_index(),
    left_on="id",
    right_on="index"
)

y_true = merged["assigned_rule_cluster"].astype(str)
y_pred = merged["predicted_cluster"].astype(str)

accuracy = (y_true == y_pred).mean()

error_rate = (y_pred == "error").mean()

labels = sorted(y_true.unique())
f1s = []

for label in labels:
    tp = ((y_pred == label) & (y_true == label)).sum()
    fp = ((y_pred == label) & (y_true != label)).sum()
    fn = ((y_pred != label) & (y_true == label)).sum()

    if tp == 0:
        f1 = 0.0
    else:
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (
            2 * precision * recall / (precision + recall)
            if (precision + recall) > 0 else 0.0
        )

    f1s.append(f1)

f1_macro = sum(f1s) / len(f1s)

print(f"Accuracy:   {accuracy:.3f}")
print(f"Macro F1:   {f1_macro:.3f}")
print(f"Error rate: {error_rate:.3f}")

Accuracy:   0.100
Macro F1:   0.056
Error rate: 0.000
