In [3]:
import os

from openai import OpenAI
# Read the API key from the environment instead of pasting it into the notebook,
# so the secret is never saved or committed with the file.
# Set it before starting Jupyter, e.g. `export OPENAI_API_KEY=sk-...`.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )
client = OpenAI(api_key=_api_key)
# Dataset sizes:
# politifact_fake 432
# politifact_real 624
# gossipcop_fake 5323
# gossipcop_real 16817
# Combined class balance:
# real 17441 (75.2%)
# fake 5755 (24.8%)

In [15]:
import pandas as pd
import openai
import time
import random
import re
import unicodedata
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ========================
# Configurable Parameters
# ========================

# --- LLM request settings ---
MODEL = "gpt-3.5-turbo"   # Chat model to query; also the key used to look up PRICING
TEMPERATURE = 0           # Forwarded as `temperature` on every completion request
LLM_SEED = 42             # Forwarded as `seed` on every completion request

# --- Experiment settings ---
NUM_SAMPLES = 100         # Number of samples for training and testing
RANDOM_SEED = 42          # Seed for Python's global `random` module
PRINT_SENTENCES = True    # Print each headline with its predicted / actual label
USE_FULL_DATASET = False  # NOTE(review): not read anywhere in the visible code

# Seed the global RNG
random.seed(RANDOM_SEED)

# Pricing details (USD per 1K tokens), keyed by model name.
# NOTE(review): these are hard-coded snapshots -- confirm against current pricing.
PRICING = {
    "gpt-4o": dict(input=0.0025, output=0.0100),
    "gpt-4-turbo": dict(input=0.0100, output=0.0300),
    "gpt-3.5-turbo": dict(input=0.0020, output=0.0020),
}

# Load datasets: from each CSV keep only the "title" column, minus missing values.
# The files are read in the order listed, one Series per target name.
politifact_fake, politifact_real, gossipcop_fake, gossipcop_real = (
    pd.read_csv(csv_path)["title"].dropna()
    for csv_path in (
        "politifact_fake.csv",
        "politifact_real.csv",
        "gossipcop_fake.csv",
        "gossipcop_real.csv",
    )
)

# Preprocessing function
def preprocess_text(text):
    """Reduce a headline to ASCII letters, digits, spaces and basic punctuation.

    Steps, in order:
      1. Apply Unicode NFKC normalization.
      2. Map typographic quotes/apostrophes to their ASCII forms so that
         contractions and quotations survive the filter in step 4.
      3. Turn every whitespace run (tabs, newlines, ...) into one space
         *before* filtering, so words separated by non-space whitespace are
         not fused together.
      4. Remove every character outside ``a-z A-Z 0-9 . , ! ? ' "`` and space.
      5. Collapse the spaces left behind by removed characters and trim.

    Args:
        text (str): Raw headline.

    Returns:
        str: The cleaned headline (possibly empty).
    """
    ascii_quotes = str.maketrans({
        "\u2018": "'", "\u2019": "'",   # single curly quotes / apostrophe
        "\u201c": '"', "\u201d": '"',   # double curly quotes
    })

    text = unicodedata.normalize('NFKC', text)
    text = text.translate(ascii_quotes)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z0-9.,!?\'\" ]", "", text)
    # Dropping a symbol that sat between two spaces leaves a double space.
    text = re.sub(r" {2,}", " ", text).strip()
    return text

# Preprocess and merge data.
# The four sources are concatenated in a fixed order; `labels` below is built in
# the same order so that data[k] and labels[k] describe the same headline.
data = [
    headline
    for titles in (politifact_fake, politifact_real, gossipcop_fake, gossipcop_real)
    for headline in titles.apply(preprocess_text).tolist()
]

labels = [
    tag
    for tag, titles in (
        ("fake", politifact_fake),
        ("real", politifact_real),
        ("fake", gossipcop_fake),
        ("real", gossipcop_real),
    )
    for _ in range(len(titles))
]

# Train-test split: shuffle all positions once, then take two disjoint slices of
# NUM_SAMPLES positions each. If fewer than 2 * NUM_SAMPLES headlines exist, the
# slices are simply shorter.
indices = [*range(len(data))]
random.shuffle(indices)

train_indices, test_indices = indices[:NUM_SAMPLES], indices[NUM_SAMPLES:2 * NUM_SAMPLES]

train_data = [data[idx] for idx in train_indices]
train_labels = [labels[idx] for idx in train_indices]

test_data = [data[idx] for idx in test_indices]
test_labels = [labels[idx] for idx in test_indices]

# Per-request accumulators that classify_with_llm appends to.
token_usage, cost_usage = [], []

def classify_with_llm(text, train_data, train_labels):
    """Few-shot LLM classification with system message and JSON output (no domain).

    Builds one chat request containing the classification rules, every
    (headline, label) pair from ``train_data`` / ``train_labels`` as labeled
    examples, and the headline to classify, then parses the JSON reply.

    Args:
        text (str): Headline to classify.
        train_data (list[str]): Example headlines for the few-shot prompt.
        train_labels (list[str]): Labels paired positionally with ``train_data``.

    Returns:
        str: ``"real"`` or ``"fake"``, or ``"error"`` when the request fails or
        the reply is not a JSON object carrying one of those two labels.

    Side effects:
        Appends the request's total token count to the module-level
        ``token_usage`` list and its dollar cost to ``cost_usage``.
        Prints a diagnostic line for every failure.
    """

    examples = list(zip(train_data, train_labels))
    training_examples = "\n".join(
        f'- Headline: "{headline}" → {label.lower()}'
        for headline, label in examples
    )

    system_message = {
        "role": "system",
        "content": (
            "You are a fake news classification assistant. "
            "Your job is to analyze a news headline and decide whether it is real or fake.\n"
            "You must respond in this JSON format: {\"label\": \"real\"} or {\"label\": \"fake\"}.\n"
            "Do not include any extra explanation or text."
        )
    }

    # The example count is interpolated so the prompt stays truthful when the
    # few-shot pool is not exactly 100 headlines.
    user_prompt = f"""
## **Classification Rules**
**✅ Real News (Fact-Based)**
- Reports **actual events, official statements, or verifiable facts**.
- Includes politics, entertainment, and sports **if they report real events**.
- Headlines about **movies, music, or lifestyle topics** are typically **Real** unless they contain false claims.

**❌ Fake News (Misleading)**
- Includes **false, exaggerated, or misleading claims**.
- **Viral rumors, unverified social media posts, and hoaxes** are Fake.
- Political or celebrity gossip **without proof** should be classified as Fake.

## **Handling Special Cases**
- **Social media reactions alone do not confirm truth** → Fake.
- **A shocking event can still be Real if verifiable**.
- **Lifestyle and entertainment news is Real unless it contains false information**.

## **How to Respond**
- **Output only** a JSON object: {{"label": "real"}} or {{"label": "fake"}}.
- Do not include any explanation, commentary, or formatting.

---

Below are {len(examples)} labeled examples to learn from:

{training_examples}

---

Now classify the headline below:

Headline: "{text}"

Respond in JSON format:
{{"label": "real"}} or {{"label": "fake"}}
""".strip()

    # The broad catch is deliberate: this function never raises, so one failed
    # request cannot abort a whole evaluation run.
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                system_message,
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=20,
            temperature=TEMPERATURE,
            seed=LLM_SEED
        )

        usage = response.usage
        token_usage.append(usage.total_tokens)

        # A model missing from PRICING must not throw away a reply that was
        # already paid for; skip only the cost bookkeeping.
        rates = PRICING.get(MODEL)
        if rates is None:
            print(f"⚠️ No pricing entry for model {MODEL!r}; cost not recorded.")
        else:
            cost_usage.append(
                (usage.prompt_tokens / 1000) * rates["input"] +
                (usage.completion_tokens / 1000) * rates["output"]
            )

        # `content` may be None; treat that like an empty (unparseable) reply.
        raw_output = (response.choices[0].message.content or "").strip()
        return _parse_label(raw_output)

    except Exception as e:
        print(f"❌ Error during API call: {e}")
        return "error"


def _parse_label(raw_output):
    """Return "real"/"fake" from the model's raw JSON reply, else "error"."""
    try:
        parsed = json.loads(raw_output)
    except json.JSONDecodeError:
        print("⚠️ JSON parsing error. Raw response:", raw_output)
        return "error"

    # Valid JSON is not necessarily an object, and "label" is not necessarily a
    # string; both cases are invalid labels, not API failures.
    label = parsed.get("label") if isinstance(parsed, dict) else None
    if isinstance(label, str) and label.lower() in {"real", "fake"}:
        return label.lower()

    print("⚠️ Invalid label in JSON:", raw_output)
    return "error"

# Prediction loop: one LLM request per test headline, timed as a whole.
predictions = []
start_time = time.time()

for text, actual in zip(test_data, test_labels):
    pred = classify_with_llm(text, train_data, train_labels)
    predictions.append(pred)

    if PRINT_SENTENCES:
        print(f"Sentence: {text}\nPredicted: {pred} | Actual: {actual}\n")

# Time and cost
total_time = time.time() - start_time
total_tokens = sum(token_usage)
avg_tokens_per_article = total_tokens / len(test_data)
total_cost = sum(cost_usage)
cost_per_classification = total_cost / len(test_data)

# Evaluation
# classify_with_llm returns "error" when a request or its parsing fails. Such
# predictions still count as misclassifications, but they must not become a
# third class in the macro average, so the averaged classes are pinned below.
num_errors = predictions.count("error")
if num_errors:
    print(f"⚠️ {num_errors} of {len(predictions)} headlines could not be classified "
          "and are scored as misclassified.")

accuracy = accuracy_score(test_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels,
    predictions,
    labels=["fake", "real"],  # macro-average over the two real classes only
    average='macro',
    zero_division=0,
)

eval_results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "Processing Time (s)": total_time,
    "Total Cost ($)": total_cost,
    "Cost per Classification ($)": cost_per_classification,
    "Total Tokens Used": total_tokens,
    "Average Tokens per Article": avg_tokens_per_article,
}

print("\nEvaluation Metrics:")
for key, value in eval_results.items():
    print(f"{key}: {value:.6f}")

Sentence: Justin Bieber Tattoo Guide All The Sorry Stars Designs What They Mean
Predicted: real | Actual: real

Sentence: List of awards and nominations received by Emma Stone
Predicted: real | Actual: real

Sentence: Christina Aguilera To Dump Matthew Rutler And Date Colin Farrell Instead
Predicted: fake | Actual: fake

Sentence: Maci Bookout Granted 2 Year Restraining Order Against Ryan Edwards
Predicted: real | Actual: real

Sentence: Stephen Curry
Predicted: real | Actual: real

Sentence: The 1 Surprising Reason John Oliver Wouldnt Blame Meghan Markle for Calling Off the Royal Wedding
Predicted: real | Actual: real

Sentence: 8 of the worlds most expensive wedding dresses
Predicted: real | Actual: real

Sentence: Kristin Cavallari tells Jay Cutler 'I'm with you because I love you not because I need you.'
Predicted: real | Actual: real

Sentence: Fixer Upper Is Ending. What Is Chip and Joanna Gaines' Net Worth?
Predicted: real | Actual: real

Sentence: Weird marriage rules Tom Cruis

In [None]:
# GPT 4O
Evaluation Metrics:
Accuracy: 0.860000
Precision: 0.808114
Recall: 0.808114
F1-score: 0.808114
Processing Time (s): 77.320774
Total Cost ($): 0.664002
Cost per Classification ($): 0.006640
Total Tokens Used: 263501.000000
Average Tokens per Article: 2635.010000


# GPT 4-TURBO
Evaluation Metrics:
Accuracy: 0.670000
Precision: 0.654958
Recall: 0.711623
F1-score: 0.639699
Processing Time (s): 100.285479
Total Cost ($): 2.686480
Cost per Classification ($): 0.026865
Total Tokens Used: 267248.000000
Average Tokens per Article: 2672.480000


# GPT 3.5-TURBO
Evaluation Metrics:
Accuracy: 0.840000
Precision: 0.803571
Recall: 0.723684
F1-score: 0.750000
Processing Time (s): 66.873411
Total Cost ($): 0.534496
Cost per Classification ($): 0.005345
Total Tokens Used: 267248.000000
Average Tokens per Article: 2672.480000


In [None]:
##################
# CLASSIFICATION #
##################
Accuracy = Number of correct predictions/Total predictions

Precision = TP / (TP+ FP)
# How many predicted fake headlines are actually fake
# Low precision means model is mislabelling too many real headlines as fake

Recall = TP / (TP + FN)
# How many fake headlines were correctly identified as fake
# Low recall means model is letting too many fake news headlines slip through as real

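F1 = 2 x (Precision x Recall) / (Precision + Recall)  # Harmonic mean of precision and recall

# Note: TP/FP/FN above treat "fake" as the positive class. The evaluation code
# reports macro-averaged scores (average='macro'), i.e. the unweighted mean of
# the per-class precision, recall and F1.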

##############
# EFFICIENCY #
##############
Tokens per article = How many tokens the LLM processes per request = Total tokens/Number of Sentences
Cost per classification = Total cost/Number of Sentences

#############
# TECHNICAL #
#############
Processing time = How long it takes to run
Cross-dataset performance = Compare performance on different datasets                                                                         