In [1]:
from openai import OpenAI
import os  # cell-local import: only needed to read the API key from the environment

# Read the key from the OPENAI_API_KEY environment variable so the real secret never has to be
# pasted into (and saved with) the notebook. The "INSERT_KEY" placeholder is kept as a
# backward-compatible fallback and must be replaced if the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or "INSERT_KEY")
# politifact_fake 432
# politifact_real 624
# gossipcop_fake 5323
# gossipcop_real 16817
# real 17441 75.2%
# fake 5755 24.8%

In [2]:
import pandas as pd
import openai
import time
import random
import re
import unicodedata
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ========================
# Configurable Parameters
# ========================
# Everything a reader is expected to tune lives here. NUM_SAMPLES and USE_FULL_DATASET
# together decide how many headlines are classified: a random subset of NUM_SAMPLES
# headlines is used only while NUM_SAMPLES is defined AND USE_FULL_DATASET is False.
NUM_SAMPLES = 100  # Size of the random subset; one API call is made per sampled headline
PRINT_SENTENCES = True  # Print each headline with its predicted and actual label
RANDOM_SEED = 42  # Seed for Python's `random`, which picks the subset
MODEL = "gpt-3.5-turbo"  # Chat model to query; its PRICING entry below is used for cost tracking
TEMPERATURE = 0  # Sampling temperature passed to the API (0 = least random, >0 = more varied)
LLM_SEED = 42  # `seed` argument passed to the OpenAI API for reproducibility
USE_FULL_DATASET = False  # Set to True to classify every headline regardless of NUM_SAMPLES

# Seed the global `random` generator so the sampled subset is the same on every run
random.seed(RANDOM_SEED)

# OpenAI pricing in US dollars per 1,000 tokens, keyed by model name.
# NOTE(review): these are hard-coded snapshots - verify against the current OpenAI pricing page.
# The gpt-3.5-turbo entry bills input and output at the same rate, unlike the other two models;
# confirm that this is intended.
PRICING = {
    "gpt-4o": {"input": 0.0025, "output": 0.0100},  # $ per 1K tokens
    "gpt-4-turbo": {"input": 0.0100, "output": 0.0300},
    "gpt-3.5-turbo": {"input": 0.0020, "output": 0.0020},
}

# Load the four headline files separately (one per source/label combination)
def _read_titles(csv_path):
    """Return the non-null entries of the "title" column of the CSV at ``csv_path``."""
    frame = pd.read_csv(csv_path)
    return frame["title"].dropna()


politifact_fake = _read_titles("politifact_fake.csv")
politifact_real = _read_titles("politifact_real.csv")
gossipcop_fake = _read_titles("gossipcop_fake.csv")
gossipcop_real = _read_titles("gossipcop_real.csv")

# Improved Text Preprocessing Function
# Typographic quotes have no Unicode decomposition, so they are mapped to ASCII explicitly;
# otherwise the ASCII filter in preprocess_text would delete them ("Isn’t" -> "Isnt").
_QUOTE_TABLE = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'", "\u201b": "'",
    "\u201c": '"', "\u201d": '"', "\u201e": '"', "\u201f": '"',
})


def preprocess_text(text):
    """Normalize a headline into a compact ASCII string.

    Steps, in order:
      1. Curly quotes/apostrophes are replaced by their ASCII equivalents.
      2. The text is NFKD-normalized, so an accented letter becomes its base letter
         followed by a combining mark; the filter in step 4 discards the mark and keeps
         the letter ("Beyoncé" -> "Beyonce").
      3. Every whitespace run (tabs and newlines included) becomes one space *before*
         filtering, so words separated by a tab or newline are not glued together.
      4. Every character other than ASCII letters, digits, space and . , ! ? ' " is removed.
      5. Spaces left doubled by step 4 are collapsed and the ends are stripped.

    Args:
        text (str): Raw headline.

    Returns:
        str: The cleaned headline; may be empty if nothing survives the filter.
    """
    text = text.translate(_QUOTE_TABLE)
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r"\s+", " ", text)  # Whitespace -> single space, before the filter
    text = re.sub(r"[^a-zA-Z0-9.,!?\'\" ]", "", text)  # Keep key punctuation
    return re.sub(r" +", " ", text).strip()  # Remove spaces doubled by the filter

# Apply preprocessing; the four sources are concatenated in a fixed order so that
# `labels` below can be built in exactly the same order
data = (
    politifact_fake.apply(preprocess_text).tolist() +
    politifact_real.apply(preprocess_text).tolist() +
    gossipcop_fake.apply(preprocess_text).tolist() +
    gossipcop_real.apply(preprocess_text).tolist()
)

labels = (
    ["fake"] * len(politifact_fake) +
    ["real"] * len(politifact_real) +
    ["fake"] * len(gossipcop_fake) +
    ["real"] * len(gossipcop_real)
)

# Every headline needs exactly one label; a mismatch would silently misalign the evaluation
assert len(data) == len(labels), "data and labels are out of sync"

# Determine dataset size based on the toggle and NUM_SAMPLES.
# NUM_SAMPLES may be undefined (commented out) or set to None; both mean "use everything".
num_requested = globals().get("NUM_SAMPLES")
if num_requested is not None and not USE_FULL_DATASET:
    # Draw without replacement, capped at the dataset size
    sample_indices = random.sample(range(len(data)), min(num_requested, len(data)))
    sampled_data = [data[i] for i in sample_indices]
    sampled_labels = [labels[i] for i in sample_indices]
else:
    sampled_data = data  # Use entire dataset
    sampled_labels = labels

# Filled by classify_with_llm: one entry per successful API call
token_usage = []
cost_usage = []

_unpriced_models_warned = set()  # Models already reported as missing from PRICING


def _estimate_cost(usage):
    """Return the dollar cost of one call, or 0.0 (with a one-time notice) if MODEL has no PRICING entry."""
    rates = PRICING.get(MODEL)
    if rates is None:
        # A missing price must not turn a successful (and paid-for) call into an "error"
        if MODEL not in _unpriced_models_warned:
            _unpriced_models_warned.add(MODEL)
            print(f"No pricing entry for model '{MODEL}'; its cost is recorded as 0.")
        return 0.0
    input_cost = (usage.prompt_tokens / 1000) * rates["input"]
    output_cost = (usage.completion_tokens / 1000) * rates["output"]
    return input_cost + output_cost


def _parse_label(raw_output):
    """Extract "real"/"fake" from the model's reply text, or return "error" after printing why."""
    # Replies are sometimes wrapped in code fences or extra text: try the first {...} span first
    match = re.search(r"\{.*?\}", raw_output, flags=re.DOTALL)
    candidate = match.group(0) if match else raw_output
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        print("⚠️ JSON decoding failed. Raw output:", raw_output)
        return "error"

    # Valid JSON is not necessarily an object, and "label" is not necessarily a string
    label = parsed.get("label") if isinstance(parsed, dict) else None
    if isinstance(label, str):
        label = label.strip().lower()
        if label in {"real", "fake"}:
            return label
    print("⚠️ Invalid label format:", raw_output)
    return "error"


def classify_with_llm(text):
    """Classify one news headline as "real" or "fake" with the configured chat model.

    Sends the headline to ``client.chat.completions.create`` using the module-level
    MODEL, TEMPERATURE and LLM_SEED. For every call that returns a response, the total
    token count is appended to ``token_usage`` and the estimated cost to ``cost_usage``.

    Args:
        text: Headline to classify.

    Returns:
        str: "real" or "fake" on success; "error" if the request fails or the reply
        cannot be interpreted (a message is printed in either case).
    """

    system_message = {
        "role": "system",
        "content": (
            "You are a Fake News Detection AI. You classify headlines as either real or fake based on factual accuracy.\n"
            "Your response must be in this exact format:\n"
            "{\"label\": \"real\"} or {\"label\": \"fake\"}.\n"
            "Do not include any other text, explanation, or formatting."
        )
    }

    user_prompt = f"""
You are a **Fake News Detection AI**. Your job is to classify the following headline as either **Real** or **Fake**, based on the rules below.

---

## **Classification Rules**

**✅ Real News (Fact-Based)**  
- Reports **actual events, official statements, or verifiable facts**.  
- Includes politics, entertainment, and sports **if they report real events**.  
- Headlines about **movies, music, or lifestyle topics** are typically **Real** unless they contain false claims.

**❌ Fake News (Misleading)**  
- Includes **false, exaggerated, or misleading claims**.  
- **Viral rumors, unverified social media posts, and hoaxes** are Fake.  
- Political or celebrity gossip **without proof** should be classified as Fake.

---

## **Handling Special Cases**

- **Social media reactions alone do not confirm truth** → Fake.  
- **A shocking event can still be Real if verifiable**.  
- **Lifestyle and entertainment news is Real unless it contains false information**.

---

## **How to Respond**

- **Output only** a valid JSON object like:  
  `{{"label": "real"}}` or `{{"label": "fake"}}`  
- **Do NOT** include any explanation, commentary, punctuation, or additional text.  
- If uncertain, classify based on **verifiable facts**.

---

## Headline to Classify:

"{text}"
"""

    # Best-effort by design: one failed request must not abort the whole evaluation run
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[system_message, {"role": "user", "content": user_prompt}],
            max_tokens=20,
            temperature=TEMPERATURE,
            seed=LLM_SEED
        )
    except Exception as e:
        print(f"Error during API call: {e}")
        return "error"

    # Response handling is kept apart from the request so its failures are reported as such
    try:
        usage = response.usage
        total_tokens = usage.total_tokens
        cost = _estimate_cost(usage)

        # Append both together so the two tracking lists never get out of step
        token_usage.append(total_tokens)
        cost_usage.append(cost)

        # `content` can be None (e.g. a refusal); treat that as an empty reply
        raw_output = (response.choices[0].message.content or "").strip()
    except Exception as e:
        print(f"Error while reading API response: {e}")
        return "error"

    return _parse_label(raw_output)


# Efficiency & Technical Metrics Tracking
predictions = []
start_time = time.time()

# Classify every sampled headline, keeping predictions in the same order as sampled_labels
for text, actual in zip(sampled_data, sampled_labels):
    pred = classify_with_llm(text)
    predictions.append(pred)

    if PRINT_SENTENCES:
        print(f"Sentence: {text}\nPredicted: {pred} | Actual: {actual}\n")

# Calculate processing time
total_time = time.time() - start_time

num_items = len(sampled_data)
num_errors = predictions.count("error")  # Calls that failed or returned an unusable reply

# Calculate total token usage and cost (guarding the averages against an empty sample)
total_tokens = sum(token_usage)
avg_tokens_per_article = total_tokens / num_items if num_items else 0.0
total_cost = sum(cost_usage)
cost_per_classification = total_cost / num_items if num_items else 0.0

# Evaluation Metrics
if num_items:
    # Macro-average over the genuine classes only. Without `labels`, "error" predictions
    # would form a third class whose zero scores drag precision/recall/F1 down.
    # Errors still count as wrong answers in accuracy and as misses in recall.
    class_labels = sorted({"fake", "real"} & (set(sampled_labels) | set(predictions)))
    accuracy = accuracy_score(sampled_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        sampled_labels, predictions, labels=class_labels, average='macro', zero_division=0
    )
else:
    accuracy = precision = recall = f1 = 0.0

eval_results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "Processing Time (s)": total_time,
    "Total Cost ($)": total_cost,
    "Cost per Classification ($)": cost_per_classification,
    "Total Tokens Used": total_tokens,
    "Average Tokens per Article": avg_tokens_per_article,
}

print("\nEvaluation Metrics:")
for key, value in eval_results.items():
    print(f"{key}: {value:.6f}")

# Make failed classifications visible instead of letting them hide inside the scores
if num_errors:
    print(f"\n{num_errors} of {num_items} classifications returned 'error' and were counted as incorrect.")

Sentence: What Time Was the Royal Wedding in My Time Zone?
Predicted: real | Actual: real

Sentence: Tom Petty Dead Celebrities React on Social Media Variety
Predicted: fake | Actual: fake

Sentence: Donald Trump, Billy Bush Interview Scrambles Race
Predicted: real | Actual: real

Sentence: Kate Spade shared how she wanted to be remembered in a 2002 interview
Predicted: real | Actual: real

Sentence: Sarah Ferguson Shares Open Letter Blasting 'Bullying' Article About Daughter Eugenie's Wedding
Predicted: real | Actual: real

Sentence: Argentinian twins have the biggest butt augmentations ever
Predicted: fake | Actual: real

Sentence: Celebrity Insider Caught Plagiarizing Cop FactCheck About Brad Pitt Jennifer Aniston
Predicted: fake | Actual: fake

Sentence: Jennifer Garner Isn't Ready to Date After Divorce 'Ben Was the Love of Her Life'
Predicted: fake | Actual: fake

Sentence: Julianne Hough recalls being called fat while filming a movie
Predicted: real | Actual: real

Sentence: Liam

In [None]:
# GPT 4O
Evaluation Metrics:
Accuracy: 0.820000
Precision: 0.747888
Recall: 0.715415
F1-score: 0.728752
Processing Time (s): 64.113810
Total Cost ($): 0.101030
Cost per Classification ($): 0.001010
Total Tokens Used: 38312.000000
Average Tokens per Article: 383.120000

# GPT 4-TURBO
Evaluation Metrics:
Accuracy: 0.770000
Precision: 0.682432
Recall: 0.698193
F1-score: 0.689147
Processing Time (s): 93.009448
Total Cost ($): 0.400530
Cost per Classification ($): 0.004005
Total Tokens Used: 38653.000000
Average Tokens per Article: 386.530000

# GPT 3.5-TURBO
Evaluation Metrics:
Accuracy: 0.750000
Precision: 0.650219
Recall: 0.654715
F1-score: 0.652343
Processing Time (s): 58.795654
Total Cost ($): 0.077306
Cost per Classification ($): 0.000773
Total Tokens Used: 38653.000000
Average Tokens per Article: 386.530000

In [None]:
##################
# CLASSIFICATION #
##################
Accuracy = Number of correct predictions/Total predictions

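Precision = TP / (TP + FP)
# How many predicted fake headlines are actually fake (taking "fake" as the positive class)
# Low precision means the model is mislabelling too many real headlines as fake
# Note: the reported Precision, Recall and F1 are macro-averaged (average='macro'), i.e. each
# metric is computed per class and the per-class values are then averaged with equal weight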

Recall = TP / (TP + FN)
# How many fake headlines were correctly identified as fake
# Low recall means model is letting too many fake news headlines slip through as real

F1 = 2 x (Precision x Recall) / (Precision + Recall)  # Harmonic mean of precision and recall

##############
# EFFICIENCY #
##############
Tokens per article = How many tokens the LLM processes per request = Total tokens/Number of Sentences
Cost per classification = Total cost/Number of Sentences

#############
# TECHNICAL #
#############
Processing time = How long it takes to run
Cross-dataset performance = Compare performance on different datasets                                                             