In [43]:
from openai import OpenAI
client = OpenAI(api_key="INSERT_KEY")  # Replace this!

In [45]:
import pandas as pd
import openai
import time
import random
import re
import unicodedata
import json
from sklearn.metrics import classification_report, accuracy_score

# ========================
# Configurable Parameters
# ========================
NUM_TRAIN_PER_CLASS = 5 # Number of training samples per class
NUM_TEST = 100 # Number of test samples
PRINT_SENTENCES = True
RANDOM_SEED = 42
MODEL = "gpt-4o"
TEMPERATURE = 0
LLM_SEED = 42

# Classification labels
LABELS = ["true", "mostly-true", "half-true", "barely-true", "false", "pants-fire"]

# Pricing per 1K tokens
PRICING = {
    "gpt-4o": {"input": 0.0025, "output": 0.0100},
    "gpt-4-turbo": {"input": 0.0100, "output": 0.0300},
    "gpt-3.5-turbo": {"input": 0.0015, "output": 0.0020},
}

# Load datasets
train_df = pd.read_csv("train.tsv", sep="\t", header=None)
test_df = pd.read_csv("test.tsv", sep="\t", header=None)

# Assign correct column names based on the dataset description
columns = [
    "id", "label", "statement", "subjects", "speaker", "job_title", "state", "party",
    "count_barely_true", "count_false", "count_half_true", "count_mostly_true", "count_pants_fire", "context"
]
train_df.columns = test_df.columns = columns

# Set seed
random.seed(RANDOM_SEED)

# Preprocessing for claim
def preprocess_text(text):
    text = unicodedata.normalize('NFKC', str(text))
    text = re.sub(r"[^a-zA-Z0-9.,!?\'\" ]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_df["statement"] = train_df["statement"].apply(preprocess_text)
test_df["statement"] = test_df["statement"].apply(preprocess_text)

# Ensure each label has at least 5 examples
label_counts = train_df["label"].value_counts()
missing = [label for label in LABELS if label_counts.get(label, 0) < NUM_TRAIN_PER_CLASS]
if missing:
    raise ValueError(f"Not enough examples to sample from labels: {missing}")

# Sample 5 per class 
train_sample_list = []
for label in LABELS:
    class_samples = train_df[train_df["label"] == label].sample(
        n=NUM_TRAIN_PER_CLASS, random_state=RANDOM_SEED
    )
    train_sample_list.append(class_samples)

train_sample = pd.concat(train_sample_list).reset_index(drop=True)

# Stratified test sample (balanced across labels)
test_sample = (
    test_df[test_df["label"].isin(LABELS)]
    .groupby("label", group_keys=False)
    .apply(lambda x: x.sample(n=NUM_TEST // len(LABELS), random_state=RANDOM_SEED + 1).loc[:, test_df.columns])
    .reset_index(drop=True)
)

# Convert to dict for prompt use
train_data = train_sample.to_dict(orient="records")
test_data = test_sample.to_dict(orient="records")

# Track usage and cost
token_usage = []
cost_usage = []

import json

def classify_with_llm(instance, training_data):
    examples = "\n".join([
        f'- Statement: "{ex["statement"]}"\n'
        f'  Subjects: {ex["subjects"]}\n'
        f'  Speaker: {ex["speaker"]} ({ex["job_title"]}, {ex["state"]}, {ex["party"]})\n'
        f'  Context: {ex["context"]}\n'
        f'  Credit History: Barely True={ex["count_barely_true"]}, False={ex["count_false"]}, '
        f'Half True={ex["count_half_true"]}, Mostly True={ex["count_mostly_true"]}, Pants on Fire={ex["count_pants_fire"]}\n'
        f'  → {ex["label"]}'
        for ex in training_data
    ])

    system_message = {
        "role": "system",
        "content": (
            "You are a fact-checking AI that classifies political statements "
            "into one of six labels: true, mostly-true, half-true, barely-true, false, pants-fire.\n"
            "Return your answer as a JSON object: {\"label\": \"<one of the six labels>\"}.\n"
            "Do not include any explanation, commentary, or additional text."
        )
    }

    user_prompt = f"""
## 🧠 Classification Labels

Please choose **one and only one** of the following labels:

**✅ true**  
- The statement is **fully accurate and verifiable**.  
- It is supported by **multiple trusted sources** and contains no significant inaccuracies.  
- Use this label only when the statement is **entirely true** with no misleading elements.

**👍 mostly-true**  
- The statement is **mostly accurate** but may contain **minor caveats, missing context, or slight exaggerations**.  
- The core claim is true, but there are **small inaccuracies or omissions** that do not significantly alter the overall meaning.  
- Use this label when the statement is **largely true** but not perfect.

**🤔 half-true**  
- The statement contains a **mix of accurate and inaccurate elements**.  
- It may **leave out important details** or be **misleading** in its presentation.  
- Use this label only when the statement is **roughly equally true and false**. Avoid defaulting to this label unless the statement clearly warrants it.

**⚠️ barely-true**  
- The statement contains **small elements of truth** but is **mostly misleading or inaccurate**.  
- The claim may be **technically true in a narrow sense** but is presented in a way that distorts the overall truth.  
- Use this label when the statement is **more false than true**.

**❌ false**  
- The statement is **factually incorrect** or **unsupported by evidence**.  
- It may contain **clear falsehoods** or **misrepresentations** of facts.  
- Use this label when the statement is **entirely or predominantly false**.

**🔥 pants-fire**  
- The statement is **not only false** but also **blatantly ridiculous or absurd**.  
- It may contain **outlandish claims** that are easily debunked or are **intentionally deceptive**.  
- Use this label for statements that are **clearly and outrageously false**.

---

## 📜 What to Consider

When deciding the label, evaluate:
1. **Statement Content**: The factual accuracy of the claim itself.
2. **Context**: The circumstances or background in which the statement was made.
3. **Speaker Credibility**: The speaker’s past record and alignment (e.g., party affiliation, job title).
4. **Supporting Evidence**: Whether the claim is backed by reliable sources or data.
5. **Intent**: Whether the statement is intentionally misleading or deceptive.

---

## ⚠️ Output Instructions

- Respond with exactly **one of the six labels** above.
- **NO** punctuation, quotes, emojis, or arrows.
- Only return the raw label:  
  `true`, `mostly-true`, `half-true`, `barely-true`, `false`, or `pants-fire`.

---

## Labeled Examples
Use the labeled examples below to learn patterns.
{examples}

---

## Statement to Classify

Statement: "{instance["statement"]}"  
Subjects: {instance["subjects"]}  
Speaker: {instance["speaker"]} ({instance["job_title"]}, {instance["state"]}, {instance["party"]})  
Context: {instance["context"]}  
Credit History: Barely True={instance["count_barely_true"]}, False={instance["count_false"]}, Half True={instance["count_half_true"]}, Mostly True={instance["count_mostly_true"]}, Pants on Fire={instance["count_pants_fire"]}

Return the label as a JSON object like this:
{{"label": "half-true"}}
"""

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                system_message,
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=20,
            temperature=TEMPERATURE,
            seed=LLM_SEED
        )

        usage = response.usage
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens
        total_tokens = usage.total_tokens

        input_cost = (input_tokens / 1000) * PRICING[MODEL]["input"]
        output_cost = (output_tokens / 1000) * PRICING[MODEL]["output"]
        cost = input_cost + output_cost

        token_usage.append(total_tokens)
        cost_usage.append(cost)

        response_text = response.choices[0].message.content.strip()

        try:
            response_json = json.loads(response_text)
            label = response_json.get("label", "").lower()
            if label in LABELS:
                return label
            else:
                return "error"
        except json.JSONDecodeError:
            print("⚠️ JSON parsing error. Raw response:", response_text)
            return "error"

    except Exception as e:
        print(f"Error: {e}")
        return "error"
        
# Run predictions
predictions = []
true_labels = []
start_time = time.time()

for instance in test_data:
    pred = classify_with_llm(instance, train_data)
    predictions.append(pred)
    true_labels.append(instance["label"])

    if PRINT_SENTENCES:
        print(f"Statement: {instance['statement']}\nPredicted: {pred} | Actual: {instance['label']}\n")

from sklearn.metrics import precision_recall_fscore_support
# Evaluation
end_time = time.time()
total_time = end_time - start_time
total_tokens = sum(token_usage)
avg_tokens_per_article = total_tokens / len(test_data)
total_cost = sum(cost_usage)
cost_per_classification = total_cost / len(test_data)

accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro', zero_division=0)

# Collect all evaluation results
eval_results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "Processing Time (s)": total_time,
    "Total Cost ($)": total_cost,
    "Cost per Classification ($)": cost_per_classification,
    "Total Tokens Used": total_tokens,
    "Average Tokens per Article": avg_tokens_per_article,
}

# Print formatted results
print("\n===== Evaluation Metrics =====")
for key, value in eval_results.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

print("\nClassification Report:")
print(classification_report(true_labels, predictions, zero_division=0))

  .apply(lambda x: x.sample(n=NUM_TEST // len(LABELS), random_state=RANDOM_SEED + 1).loc[:, test_df.columns])


Statement: Hillary Clinton said gun confiscation would be worth considering.
Predicted: false | Actual: barely-true

Statement: Congressman Joe Heck has voted 62 of the time with Nancy Pelosi and the Democrats on a variety of important issues.
Predicted: barely-true | Actual: barely-true

Statement: Republicans offered Wisconsin bar patrons free shots of alcohol to sign recall petitions against Democrats.
Predicted: false | Actual: barely-true

Statement: Says he got twice as much money from the sale of County Grounds land than Milwaukee County Executive Scott Walker was willing to accept.
Predicted: barely-true | Actual: barely-true

Statement: Says Barack Obamas favorability rating in Israel once clocked in at 4 percent.
Predicted: false | Actual: barely-true

Statement: Eliminating affirmative action in admissions in Florida led to more African American and Hispanic kids attending our university system than before.
Predicted: half-true | Actual: barely-true

Statement: Says Adam Has

In [None]:
# GPT 4O
===== Evaluation Metrics =====
Accuracy: 0.5000
Precision: 0.5196
Recall: 0.5000
F1-score: 0.4830
Processing Time (s): 67.1404
Total Cost ($): 0.9336
Cost per Classification ($): 0.0097
Total Tokens Used: 371038
Average Tokens per Article: 3864.9792

Classification Report:
              precision    recall  f1-score   support

 barely-true       0.78      0.44      0.56        16
       false       0.45      0.56      0.50        16
   half-true       0.39      0.75      0.51        16
 mostly-true       0.36      0.31      0.33        16
  pants-fire       0.81      0.81      0.81        16
        true       0.33      0.12      0.18        16

    accuracy                           0.50        96
   macro avg       0.52      0.50      0.48        96
weighted avg       0.52      0.50      0.48        96


# GPT 4-Turbo
===== Evaluation Metrics =====
Accuracy: 0.4583
Precision: 0.6000
Recall: 0.4583
F1-score: 0.4472
Processing Time (s): 98.3273
Total Cost ($): 3.7399
Cost per Classification ($): 0.0390
Total Tokens Used: 372320
Average Tokens per Article: 3878.3333

Classification Report:
              precision    recall  f1-score   support

 barely-true       0.53      0.56      0.55        16
       false       0.50      0.50      0.50        16
   half-true       0.27      0.56      0.37        16
 mostly-true       0.39      0.44      0.41        16
  pants-fire       0.91      0.62      0.74        16
        true       1.00      0.06      0.12        16

    accuracy                           0.46        96
   macro avg       0.60      0.46      0.45        96
weighted avg       0.60      0.46      0.45        96

# GPT 3.5-Turbo
===== Evaluation Metrics =====
Accuracy: 0.3958
Precision: 0.6011
Recall: 0.3958
F1-score: 0.3826
Processing Time (s): 53.8922
Total Cost ($): 0.5589
Cost per Classification ($): 0.0058
Total Tokens Used: 372334
Average Tokens per Article: 3878.4792

Classification Report:
              precision    recall  f1-score   support

 barely-true       0.60      0.38      0.46        16
       false       0.60      0.19      0.29        16
   half-true       0.24      0.75      0.36        16
 mostly-true       0.33      0.38      0.35        16
  pants-fire       0.83      0.62      0.71        16
        true       1.00      0.06      0.12        16

    accuracy                           0.40        96
   macro avg       0.60      0.40      0.38        96
weighted avg       0.60      0.40      0.38        96