In [1]:
from openai import OpenAI
client = OpenAI(api_key="INSERT_KEY")  # Replace this!
# politifact_fake 432
# politifact_real 624
# gossipcop_fake 5323
# gossipcop_real 16817
# real 17441 75.2%
# fake 5755 24.8%

In [27]:
import pandas as pd
import openai
import time
import random
import re
import unicodedata
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ========================
# Configurable Parameters
# ========================
NUM_SAMPLES = 100
PRINT_SENTENCES = True
RANDOM_SEED = 42
MODEL = "gpt-3.5-turbo"
TEMPERATURE = 0
LLM_SEED = 42

# Pricing per 1K tokens
PRICING = {
    "gpt-4o": {"input": 0.0025, "output": 0.0100},
    "gpt-4-turbo": {"input": 0.0100, "output": 0.0300},
    "gpt-3.5-turbo": {"input": 0.0020, "output": 0.0020},
}

# Load and preprocess
def preprocess_text(text):
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r"[^a-zA-Z0-9.,!?\'\" ]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Load datasets
gossip_fake = pd.read_csv("gossipcop_fake.csv")
gossip_real = pd.read_csv("gossipcop_real.csv")

# Apply preprocessing
gossip_fake["title"] = gossip_fake["title"].apply(preprocess_text)
gossip_real["title"] = gossip_real["title"].apply(preprocess_text)

# Use raw news_url as "domain"
gossip_fake["domain"] = gossip_fake["news_url"]
gossip_real["domain"] = gossip_real["news_url"]

# Combine datasets
fake_data = gossip_fake[["title", "domain"]].values.tolist()
real_data = gossip_real[["title", "domain"]].values.tolist()

data = fake_data + real_data
labels = ["fake"] * len(fake_data) + ["real"] * len(real_data)

# Train-test split
random.seed(RANDOM_SEED)
indices = list(range(len(data)))
random.shuffle(indices)

train_indices = indices[:NUM_SAMPLES]
test_indices = indices[NUM_SAMPLES:NUM_SAMPLES * 2]

train_data = [data[i] for i in train_indices]
train_labels = [labels[i] for i in train_indices]

test_data = [data[i] for i in test_indices]
test_labels = [labels[i] for i in test_indices]

# Track usage and cost
token_usage = []
cost_usage = []


def classify_with_llm(text, domain, train_data, train_labels):
    """Few-shot LLM classification with JSON output."""
    
    training_examples = "\n".join(
        [f'- Headline: "{headline}"\n  Domain: {url}\n  → {label.lower()}'
         for (headline, url), label in zip(train_data, train_labels)]
    )

    system_message = {
        "role": "system",
        "content": (
            "You are a fake news classification assistant. "
            "Your job is to analyze a news headline and decide whether it is real or fake, based on patterns and domains.\n"
            "You must respond in this JSON format: {\"label\": \"real\"} or {\"label\": \"fake\"}.\n"
            "Do not include any extra explanation or text."
        )
    }

    user_prompt = f"""
    ## **Classification Rules**
    **✅ Real News (Fact-Based)**
    - Reports **actual events, official statements, or verifiable facts**.
    - Includes politics, entertainment, and sports **if they report real events**.
    - Headlines about **movies, music, or lifestyle topics** are typically **Real** unless they contain false claims.

    **❌ Fake News (Misleading)**
    - Includes **false, exaggerated, or misleading claims**.
    - **Viral rumors, unverified social media posts, and hoaxes** are Fake.
    - Political or celebrity gossip **without proof** should be classified as Fake.
    
    ## **Handling Special Cases**
    - **Social media reactions alone do not confirm truth** → Fake.
    - **A shocking event can still be Real if verifiable**.
    - **Lifestyle and entertainment news is Real unless it contains false information**.

    ## **How to Respond**
    - **Output only:** "Real" or "Fake" (no explanations).
    - If uncertain, classify based on **verifiable facts**.
    
    ##**Domain Handling**
    - Look for subtle differences in how the URL is structured that may hint at a source’s reliability

Below are 100 examples with their associated domains (i.e., full URLs). Learn from these patterns:

{training_examples}

---

Now classify the headline below:

Headline: "{text}"  
Domain: {domain}

Respond in JSON format:
{{"label": "real"}} or {{"label": "fake"}} 


"""

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                system_message,
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=20,
            temperature=TEMPERATURE,
            seed=LLM_SEED
        )

        usage = response.usage
        token_usage.append(usage.total_tokens)
        cost = (
            (usage.prompt_tokens / 1000) * PRICING[MODEL]["input"] +
            (usage.completion_tokens / 1000) * PRICING[MODEL]["output"]
        )
        cost_usage.append(cost)

        raw_output = response.choices[0].message.content.strip()

        try:
            parsed = json.loads(raw_output)
            label = parsed.get("label", "").lower()
            if label in {"real", "fake"}:
                return label
            else:
                print("⚠️ Invalid label in JSON:", raw_output)
                return "error"
        except json.JSONDecodeError:
            print("⚠️ JSON parsing error. Raw response:", raw_output)
            return "error"

    except Exception as e:
        print(f"Error during API call: {e}")
        return "error"

# Prediction loop
predictions = []
start_time = time.time()

for i, (headline, domain) in enumerate(test_data):
    pred = classify_with_llm(headline, domain, train_data, train_labels)
    predictions.append(pred)

    if PRINT_SENTENCES:
        print(f"Sentence: {headline}\nDomain: {domain}\nPredicted: {pred} | Actual: {test_labels[i]}\n")

# Evaluation
total_time = time.time() - start_time
total_tokens = sum(token_usage)
avg_tokens_per_article = total_tokens / len(test_data)
total_cost = sum(cost_usage)
cost_per_classification = total_cost / len(test_data)

accuracy = accuracy_score(test_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, predictions, average='macro', zero_division=0)

eval_results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "Processing Time (s)": total_time,
    "Total Cost ($)": total_cost,
    "Cost per Classification ($)": cost_per_classification,
    "Total Tokens Used": total_tokens,
    "Average Tokens per Article": avg_tokens_per_article,
}

print("\nEvaluation Metrics:")
for key, value in eval_results.items():
    print(f"{key}: {value:.6f}")

Sentence: Secret in Their Eyes
Domain: en.wikipedia.org/wiki/Secret_in_Their_Eyes
Predicted: real | Actual: fake

Sentence: Kim Kardashian and Katy Perry Hang Out on Taylor Swift Reputation Release Day
Domain: https://www.elle.com/culture/celebrities/a13521386/kim-kardashian-north-west-at-katy-perry-witness-concert/
Predicted: real | Actual: real

Sentence: Jamie Lynn Spears
Domain: https://en.wikipedia.org/wiki/Jamie_Lynn_Spears
Predicted: real | Actual: real

Sentence: Royal complexion Meghan Markles PERFECT SKIN secrets revealed by skin care expert
Domain: https://www.express.co.uk/news/royal/1041414/Meghan-Markle-news-secrets-duchess-of-sussex-skin-latest-royal-news
Predicted: fake | Actual: real

Sentence: Kim Kardashian teases family Christmas card like an advent calendar
Domain: https://www.nickiswift.com/98759/kim-kardashian-teases-family-christmas-card-like-advent-calendar/
Predicted: real | Actual: real

Sentence: Did Issa Rae Turn Drake Down At Golden Globes After Party?
Dom

In [None]:
# GPT 4O
Evaluation Metrics:
Accuracy: 0.900000
Precision: 0.873984
Recall: 0.821678
F1-score: 0.843750
Processing Time (s): 79.894459
Total Cost ($): 1.584060
Cost per Classification ($): 0.015841
Total Tokens Used: 631524.000000
Average Tokens per Article: 6315.240000


# GPT 4-TURBO
Evaluation Metrics:
Accuracy: 0.830000
Precision: 0.752400
Recall: 0.760490
F1-score: 0.756272
Processing Time (s): 112.509617
Total Cost ($): 6.392940
Cost per Classification ($): 0.063929
Total Tokens Used: 637894.000000
Average Tokens per Article: 6378.940000

# GPT 3.5-TURBO
Evaluation Metrics:
Accuracy: 0.860000
Precision: 0.848485
Recall: 0.714452
F1-score: 0.751949
Processing Time (s): 62.184701
Total Cost ($): 1.275788
Cost per Classification ($): 0.012758
Total Tokens Used: 637894.000000
Average Tokens per Article: 6378.940000

In [None]:
##################
# CLASSIFICATION #
##################
Accuracy = Number of correct predictions/Total predictions

Precision = TP / (TP+ FP)
# How many predicted fake headlines are actually fake
# Low precision means model is mislabelling too many real headlines as fake

Recall = TP / (TP + FN)
# How many fake headlines were correctly identified as fake
# Low recall means model is letting too many fake news headlines slip through as real

F1 = 2 X (Precision x Recall)/(Precision + Recall) #Harmonic mean of precision and recall

##############
# EFFICIENCY #
##############
Tokens per article = How many tokens the LLM processes per request = Total tokens/Number of Sentences
Cost per classification = Total cost/Number of Sentences

#############
# TECHNICAL #
#############
Processing time = How long it takes to run
Cross-dataset performance = Compare performance on different datasets                                                       